Flow123d  DF_profiler_memory_monitor-0108f36
sys_profiler.cc
Go to the documentation of this file.
1 /*!
2  *
3  * Copyright (C) 2015 Technical University of Liberec. All rights reserved.
4  *
5  * This program is free software; you can redistribute it and/or modify it under
6  * the terms of the GNU General Public License version 3 as published by the
7  * Free Software Foundation. (http://www.gnu.org/licenses/gpl-3.0.en.html)
8  *
9  * This program is distributed in the hope that it will be useful, but WITHOUT
10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
11  * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
12  *
13  *
14  * @file sys_profiler.cc
15  * @ingroup system
16  * @brief Profiler
17  */
18 
19 
20 // Fat header
21 
22 #include <fstream>
23 #include <iomanip>
24 #include <sys/param.h>
25 #include <unordered_map>
26 
27 //#include <pybind11/pybind11.h>
28 
29 #include "sys_profiler.hh"
30 #include "system/system.hh"
31 #include "system/python_loader.hh"
32 #include <iostream>
33 #include <boost/format.hpp>
34 
35 #include "system/file_path.hh"
36 #include "mpi.h"
37 #include "time_point.hh"
38 
39 /*
40  * These should be replaced by using boost MPI interface
41  */
42 int MPI_Functions::sum(int* val, MPI_Comm comm) {
43  int total = 0;
44  MPI_Reduce(val, &total, 1, MPI_INT, MPI_SUM, 0, comm);
45  return total;
46  }
47 
48 double MPI_Functions::sum(double* val, MPI_Comm comm) {
49  double total = 0;
50  MPI_Reduce(val, &total, 1, MPI_DOUBLE, MPI_SUM, 0, comm);
51  return total;
52  }
53 
54 long MPI_Functions::sum(long* val, MPI_Comm comm) {
55  long total = 0;
56  MPI_Reduce(val, &total, 1, MPI_LONG, MPI_SUM, 0, comm);
57  return total;
58  }
59 
60 int MPI_Functions::min(int* val, MPI_Comm comm) {
61  int min = 0;
62  MPI_Reduce(val, &min, 1, MPI_INT, MPI_MIN, 0, comm);
63  return min;
64  }
65 
66 double MPI_Functions::min(double* val, MPI_Comm comm) {
67  double min = 0;
68  MPI_Reduce(val, &min, 1, MPI_DOUBLE, MPI_MIN, 0, comm);
69  return min;
70  }
71 
72 long MPI_Functions::min(long* val, MPI_Comm comm) {
73  long min = 0;
74  MPI_Reduce(val, &min, 1, MPI_LONG, MPI_MIN, 0, comm);
75  return min;
76  }
77 
78 int MPI_Functions::max(int* val, MPI_Comm comm) {
79  int max = 0;
80  MPI_Reduce(val, &max, 1, MPI_INT, MPI_MAX, 0, comm);
81  return max;
82  }
83 
84 double MPI_Functions::max(double* val, MPI_Comm comm) {
85  double max = 0;
86  MPI_Reduce(val, &max, 1, MPI_DOUBLE, MPI_MAX, 0, comm);
87  return max;
88  }
89 
90 long MPI_Functions::max(long* val, MPI_Comm comm) {
91  long max = 0;
92  MPI_Reduce(val, &max, 1, MPI_LONG, MPI_MAX, 0, comm);
93  return max;
94  }
95 
96 
97 #ifdef FLOW123D_DEBUG_PROFILER
98 /*********************************************************************************************
99  * Implementation of class Timer
100  */
101 
102 const int timer_no_child=-1;
103 
104 Timer::Timer(const CodePoint &cp, int parent)
105 : start_time(TimePoint()),
106  cumul_time(0.0),
107  call_count(0),
108  start_count(0),
109  code_point_(&cp),
110  full_hash_(cp.hash_),
111  hash_idx_(cp.hash_idx_),
112  parent_timer(parent),
113  total_allocated_(0),
114  total_deallocated_(0),
115  max_allocated_(0),
116  current_allocated_(0),
117  alloc_called(0),
118  dealloc_called(0),
119  turn_off_memory_monitoring_(false)
120 #ifdef FLOW123D_HAVE_PETSC
121 , petsc_start_memory(0),
122  petsc_end_memory (0),
123  petsc_memory_difference(0),
124  petsc_peak_memory(0),
125  petsc_local_peak_memory(0)
126 #endif // FLOW123D_HAVE_PETSC
127 {
128  for(unsigned int i=0; i< max_n_childs ;i++) child_timers[i]=timer_no_child;
129 }
130 
131 
132 /**
133  * Debug information of the timer
134  */
135 ostream & operator <<(ostream& os, const Timer& timer) {
136  os << " Timer: " << timer.tag() << endl;
137  os << " malloc: " << timer.total_allocated_ << endl;
138  os << " dalloc: " << timer.total_deallocated_ << endl;
139  #ifdef FLOW123D_HAVE_PETSC
140  os << " start: " << timer.petsc_start_memory << endl;
141  os << " stop : " << timer.petsc_end_memory << endl;
142  os << " diff : " << timer.petsc_memory_difference << " (" << timer.petsc_end_memory - timer.petsc_start_memory << ")" << endl;
143  os << " peak : " << timer.petsc_peak_memory << " (" << timer.petsc_local_peak_memory << ")" << endl;
144  #endif // FLOW123D_HAVE_PETSC
145  os << endl;
146  return os;
147 }
148 
149 
150 double Timer::cumulative_time() const {
151  return cumul_time;
152 }
153 
154 void Profiler::accept_from_child(Timer &parent, Timer &child) {
155  int child_timer = 0;
156  for (unsigned int i = 0; i < Timer::max_n_childs; i++) {
157  child_timer = child.child_timers[i];
158  if (child_timer != timer_no_child) {
159  // propagate metrics from child to parent
160  accept_from_child(child, timers_[child_timer]);
161  }
162  }
163  // compute totals by adding values from child
164  parent.total_allocated_ += child.total_allocated_;
165  parent.total_deallocated_ += child.total_deallocated_;
166  parent.alloc_called += child.alloc_called;
167  parent.dealloc_called += child.dealloc_called;
168 
169 #ifdef FLOW123D_HAVE_PETSC
170  if (petsc_monitor_memory) {
171  // add differences from child
172  parent.petsc_memory_difference += child.petsc_memory_difference;
173  parent.current_allocated_ += child.current_allocated_;
174 
175  // when computing maximum, we take greater value from parent and child
176  // PetscMemoryGetCurrentUsage always return absolute (not relative) value
177  parent.petsc_peak_memory = max(parent.petsc_peak_memory, child.petsc_peak_memory);
178  }
179 #endif // FLOW123D_HAVE_PETSC
180 
181  parent.max_allocated_ = max(parent.max_allocated_, child.max_allocated_);
182 }
183 
184 
185 void Timer::pause() {
186 #ifdef FLOW123D_HAVE_PETSC
187  if (Profiler::get_petsc_memory_monitoring()) {
188  // get the maximum resident set size (memory used) for the program.
189  PetscMemoryGetMaximumUsage(&petsc_local_peak_memory);
190  if (petsc_peak_memory < petsc_local_peak_memory)
191  petsc_peak_memory = petsc_local_peak_memory;
192  }
193 #endif // FLOW123D_HAVE_PETSC
194 }
195 
196 void Timer::resume() {
197 #ifdef FLOW123D_HAVE_PETSC
198  if (Profiler::get_petsc_memory_monitoring()) {
199  // tell PETSc to monitor the maximum memory usage so
200  // that PetscMemoryGetMaximumUsage() will work.
201  PetscMemorySetGetMaximumUsage();
202  }
203 #endif // FLOW123D_HAVE_PETSC
204 }
205 
206 void Timer::start() {
207 #ifdef FLOW123D_HAVE_PETSC
208  if (Profiler::get_petsc_memory_monitoring()) {
209  // Tell PETSc to monitor the maximum memory usage so
210  // that PetscMemoryGetMaximumUsage() will work.
211  PetscMemorySetGetMaximumUsage();
212  PetscMemoryGetCurrentUsage (&petsc_start_memory);
213  }
214 #endif // FLOW123D_HAVE_PETSC
215 
216  if (start_count == 0) {
217  start_time = TimePoint();
218  }
219  call_count++;
220  start_count++;
221 }
222 
223 
224 
225 bool Timer::stop(bool forced) {
226 #ifdef FLOW123D_HAVE_PETSC
227  if (Profiler::get_petsc_memory_monitoring()) {
228  // get current memory usage
229  PetscMemoryGetCurrentUsage (&petsc_end_memory);
230  petsc_memory_difference += petsc_end_memory - petsc_start_memory;
231 
232  // get the maximum resident set size (memory used) for the program.
233  PetscMemoryGetMaximumUsage(&petsc_local_peak_memory);
234  if (petsc_peak_memory < petsc_local_peak_memory)
235  petsc_peak_memory = petsc_local_peak_memory;
236  }
237 #endif // FLOW123D_HAVE_PETSC
238 
239  if (turn_off_memory_monitoring_) {
241  }
242 
243  if (forced) start_count=1;
244 
245  if (start_count == 1) {
246  cumul_time += (TimePoint() - start_time);
247  start_count--;
248  return true;
249  } else {
250  start_count--;
251  }
252  return false;
253 }
254 
255 
256 
257 void Timer::add_child(int child_index, const Timer &child)
258 {
259  unsigned int idx = child.hash_idx_;
260  if (child_timers[idx] != timer_no_child) {
261  // hash collision, find first empty place
262  unsigned int i=idx;
263  do {
264  i=( i < max_n_childs ? i+1 : 0);
265  } while (i!=idx && child_timers[i] != timer_no_child);
266  ASSERT_PERMANENT(i!=idx)(tag()).error("Too many children of the timer");
267  idx=i;
268  }
269  //DebugOut().fmt("Adding child {} at index: {}\n", child_index, idx);
270  child_timers[idx] = child_index;
271 }
272 
273 
274 
275 string Timer::code_point_str() const {
276  return fmt::format("{%s}:{%d}, {%s}()",
277  code_point_->file_, code_point_->line_, code_point_->func_ );
278 }
279 
280 
281 /***********************************************************************************************
282  * Implementation of Profiler
283  */
284 
285 
286 Profiler * Profiler::instance(bool clear) {
287  static Profiler * _instance = NULL;
288 
289  if (clear) {
290  if (_instance != NULL) {
291  delete _instance;
292  _instance = NULL;
293  }
294  return _instance;
295  }
296 
297  if (_instance == NULL) {
298  MemoryAlloc::malloc_map().reserve(Profiler::malloc_map_reserve);
299  _instance = new Profiler();
300  }
301 
302  return _instance;
303  }
304 
305 
306 // static CONSTEXPR_ CodePoint main_cp = CODE_POINT("Whole Program");
307 const long Profiler::malloc_map_reserve = 100 * 1000;
308 
310 : actual_node(0),
311  task_size_(1),
312  start_time( time(NULL) ),
313  //json_filepath(""),
314  none_timer_(CODE_POINT("NONE TIMER"), 0),
315  calibration_time_(-1)
316 
317 {
318  static CONSTEXPR_ CodePoint main_cp = CODE_POINT("Whole Program");
319  set_memory_monitoring(false);
320 #ifdef FLOW123D_DEBUG_PROFILER
321  MemoryAlloc::malloc_map().reserve(Profiler::malloc_map_reserve);
322  timers_.push_back( Timer(main_cp, 0) );
323  timers_[0].start();
324 #endif
325 }
326 
327 
328 void Profiler::calibrate() {
329 
330  uint SIZE = 64 * 1024;
331  uint HALF = SIZE / 2;
332  START_TIMER("UNIT_PAYLOAD");
333  Timer &timer = timers_[actual_node];
334 
335  uint count = 0;
336  while(TimePoint() - timer.start_time < 0.1) {
337  double * block = new double[SIZE];
338  for(uint i=0; i<HALF; i++) {
339  block[HALF+i] = block[i]*block[i] + i;
340  }
341  delete[] block;
342  count++;
343  }
344 
345 
346 
347  END_TIMER("UNIT_PAYLOAD");
348 
349  const double reference_count = 7730;
350  // reference count is average of 5 measurements on payload free laptop:
351  // DELL Latitude E5570, Intel(R) Core(TM) i7-6820HQ CPU @ 2.70GHz
352  // new/delete makes about 3% of time
353  calibration_time_ = 10 * timer.cumul_time * reference_count / count;
354  LogOut() << "Profiler calibration count: " << count << std::endl;
355  LogOut() << "Profiler calibration time: " << calibration_time_ << std::endl;
356 
357 }
358 
359 
360 
361 void Profiler::propagate_timers() {
362  for (unsigned int i = 0; i < Timer::max_n_childs; i++) {
363  unsigned int child_timer = timers_[0].child_timers[i];
364  if ((signed int)child_timer != timer_no_child) {
365  // propagate metrics from child to Whole-Program time-frame
366  accept_from_child(timers_[0], timers_[child_timer]);
367  }
368  }
369 }
370 
371 
372 
373 void Profiler::set_task_info(string description, int size) {
374  task_description_ = description;
375  task_size_ = size;
376 }
377 
378 
379 
380 void Profiler::set_program_info(string program_name, string program_version, string branch, string revision, string build) {
381  flow_name_ = program_name;
382  flow_version_ = program_version;
383  flow_branch_ = branch;
384  flow_revision_ = revision;
385  flow_build_ = build;
386 }
387 
388 
389 
390 int Profiler::start_timer(const CodePoint &cp) {
391  unsigned int parent_node = actual_node;
392  //DebugOut().fmt("Start timer: {}\n", cp.tag_);
393  int child_idx = find_child(cp);
394  if (child_idx < 0) {
395  //DebugOut().fmt("Adding timer: {}\n", cp.tag_);
396  // tag not present - create new timer
397  child_idx=timers_.size();
398  timers_.push_back( Timer(cp, actual_node) );
399  timers_[actual_node].add_child(child_idx , timers_.back() );
400  }
401  actual_node=child_idx;
402 
403  // pause current timer
404  timers_[parent_node].pause();
405 
406  timers_[actual_node].start();
407 
408  return actual_node;
409 }
410 
411 
412 
413 int Profiler::find_child(const CodePoint &cp) {
414  Timer &timer =timers_[actual_node];
415  unsigned int idx = cp.hash_idx_;
416  unsigned int child_idx;
417  do {
418  if (timer.child_timers[idx] == timer_no_child) break; // tag is not there
419 
420  child_idx=timer.child_timers[idx];
421  ASSERT_PERMANENT_LT(child_idx, timers_.size()).error();
422  if (timers_[child_idx].full_hash_ == cp.hash_) return child_idx;
423  idx = ( (unsigned int)(idx)==(Timer::max_n_childs - 1) ? 0 : idx+1 );
424  } while ( (unsigned int)(idx) != cp.hash_idx_ ); // passed through whole array
425  return -1;
426 }
427 
428 
429 
430 void Profiler::stop_timer(const CodePoint &cp) {
431 #ifdef FLOW123D_DEBUG_ASSERTS
432  // check that all childrens are closed
433  Timer &timer=timers_[actual_node];
434  for(unsigned int i=0; i < Timer::max_n_childs; i++)
435  if (timer.child_timers[i] != timer_no_child)
436  ASSERT_PERMANENT(! timers_[timer.child_timers[i]].running())(timers_[timer.child_timers[i]].tag())(timer.tag())
437  .error("Child timer running while closing timer.");
438 #endif
439  unsigned int child_timer = actual_node;
440  if ( cp.hash_ != timers_[actual_node].full_hash_) {
441  // timer to close is not actual - we search for it above actual
442  for(unsigned int node=actual_node; node != 0; node=timers_[node].parent_timer) {
443  if ( cp.hash_ == timers_[node].full_hash_) {
444  // found above - close all nodes between
445  for(; (unsigned int)(actual_node) != node; actual_node=timers_[actual_node].parent_timer) {
446  WarningOut() << "Timer to close '" << cp.tag_ << "' do not match actual timer '"
447  << timers_[actual_node].tag() << "'. Force closing actual." << std::endl;
448  timers_[actual_node].stop(true);
449  }
450  // close 'node' itself
451  timers_[actual_node].stop(false);
452  actual_node = timers_[actual_node].parent_timer;
453 
454  // actual_node == child_timer indicates this is root
455  if (actual_node == child_timer)
456  return;
457 
458  // resume current timer
459  timers_[actual_node].resume();
460  return;
461  }
462  }
463  // node not found - do nothing
464  return;
465  }
466  // node to close match the actual
467  timers_[actual_node].stop(false);
468  actual_node = timers_[actual_node].parent_timer;
469 
470  // actual_node == child_timer indicates this is root
471  if (actual_node == child_timer)
472  return;
473 
474  // resume current timer
475  timers_[actual_node].resume();
476 }
477 
478 
479 
480 void Profiler::stop_timer(int timer_index) {
481  // stop_timer with CodePoint type
482  // timer which is still running MUST be the same as actual_node index
483  // if timer is not running index will differ
484  if (timers_[timer_index].running()) {
485  ASSERT_PERMANENT_EQ(timer_index, (int)actual_node).error();
486  stop_timer(*timers_[timer_index].code_point_);
487  }
488 
489 }
490 
491 
492 void Profiler::start_memory_monitoring() {
493  if ( !get_global_memory_monitoring() ) {
494  set_memory_monitoring(true);
495  timers_[actual_node].set_turn_off_memory_monitoring();
496  // add check of flag to stop_timer before line 463
497  }
498 }
499 
500 
501 void Profiler::add_calls(unsigned int n_calls) {
502  timers_[actual_node].call_count += n_calls-1;
503 }
504 
505 
506 
507 void Profiler::notify_malloc(const size_t size, const long p) {
508  MemoryAlloc::malloc_map()[p] = static_cast<int>(size);
509  timers_[actual_node].total_allocated_ += size;
510  timers_[actual_node].current_allocated_ += size;
511  timers_[actual_node].alloc_called++;
512 
513  if (timers_[actual_node].current_allocated_ > timers_[actual_node].max_allocated_)
514  timers_[actual_node].max_allocated_ = timers_[actual_node].current_allocated_;
515 }
516 
517 
518 
519 void Profiler::notify_free(const long p) {
520  int size = sizeof(p);
521  if (MemoryAlloc::malloc_map()[(long)p] > 0) {
522  size = MemoryAlloc::malloc_map()[(long)p];
523  MemoryAlloc::malloc_map().erase((long)p);
524  }
525  timers_[actual_node].total_deallocated_ += size;
526  timers_[actual_node].current_allocated_ -= size;
527  timers_[actual_node].dealloc_called++;
528 }
529 
530 
531 double Profiler::get_resolution () {
532  const int measurements = 100;
533  double result = 0;
534 
535  // perform 100 measurements
536  for (unsigned int i = 1; i < measurements; i++) {
537  TimePoint t1 = TimePoint ();
538  TimePoint t2 = TimePoint ();
539 
540  // double comparison should be avoided
541  while ((t2 - t1) == 0) t2 = TimePoint ();
542  // while ((t2.ticks - t1.ticks) == 0) t2 = TimePoint ();
543 
544  result += t2 - t1;
545  }
546 
547  return (result / measurements) * 1000; // ticks to seconds to microseconds conversion
548 }
549 
550 
551 Timer Profiler::find_timer(string tag) {
552  for(auto t : timers_) {
553  if (t.tag() == tag) return t;
554  }
555  return none_timer_;
556 }
557 
558 
559 std::string common_prefix( std::string a, std::string b ) {
560  if( a.size() > b.size() ) std::swap(a,b) ;
561  return std::string( a.begin(), std::mismatch( a.begin(), a.end(), b.begin() ).first ) ;
562 }
563 
564 
565 
566 template<typename ReduceFunctor>
567 void Profiler::add_timer_info(ReduceFunctor reduce, nlohmann::json* holder, int timer_idx, double parent_time) {
568 
569  // get timer and check preconditions
570  Timer &timer = timers_[timer_idx];
571  ASSERT_PERMANENT(timer_idx >=0)(timer_idx).error("Wrong timer index.");
572  ASSERT_PERMANENT(timer.parent_timer >=0).error("Inconsistent tree.");
573 
574  // fix path
575  string filepath = timer.code_point_->file_;
576 
577  // if constant FLOW123D_SOURCE_DIR is defined, we try to erase it from beginning of each CodePoint's filepath
578  #ifdef FLOW123D_SOURCE_DIR
579  string common_path = common_prefix (string(FLOW123D_SOURCE_DIR), filepath);
580  filepath.erase (0, common_path.size());
581  #endif
582 
583 
584  // generate node representing this timer
585  // add basic information
586  nlohmann::json node;
587  double cumul_time_sum;
588  node["tag"] = timer.tag();
589  node["file-path"] = filepath;
590  node["file-line"] = timer.code_point_->line_;
591  node["function"] = timer.code_point_->func_;
592  cumul_time_sum = reduce(timer, node);
593 
594 
595  // statistical info
596  if (timer_idx == 0) parent_time = cumul_time_sum;
597  double percent = parent_time > 1.0e-10 ? cumul_time_sum / parent_time * 100.0 : 0.0;
598  node["percent"] = percent;
599 
600  // write times children timers using secured child_timers array
601  nlohmann::json children;
602 
603  // sort childs by its index in Profiler::timers_ in order to preserve call order
604  std::vector<int> child_timers_values;
605  for (unsigned int i = 0; i < Timer::max_n_childs; i++) {
606  if (timer.child_timers[i] != timer_no_child)
607  child_timers_values.push_back(timer.child_timers[i]);
608  }
609  std::sort(child_timers_values.begin(), child_timers_values.end());
610 
611  for(int idx : child_timers_values)
612  add_timer_info(reduce, &children, idx, cumul_time_sum);
613 
614  // add children tag and other info if present
615  if (child_timers_values.size() > 0) {
616  node["children"] = children;
617  }
618 
619  // add to the array
620  holder->push_back(node);
621 }
622 
623 
624 template <class T>
625 void save_nonmpi_metric (nlohmann::json &node, T * ptr, string name) {
626  node[name + "-min"] = *ptr;
627  node[name + "-max"] = *ptr;
628  node[name + "-sum"] = *ptr;
629 }
630 
631 
632 
633 string _profiler_output_path(string path) {
634  if (path == "") path = "profiler_info.log.json";
635  return FilePath(path, FilePath::output_file);
636 }
637 
638 std::shared_ptr<std::ostream> _profiler_output_stream(const string &path) {
639  return make_shared<ofstream>(path.c_str());
640 }
641 
642 
643 #ifdef FLOW123D_HAVE_MPI
644 template <class T>
645 void save_mpi_metric (nlohmann::json &node, MPI_Comm comm, T * ptr, string name) {
646  node[name + "-min"] = MPI_Functions::min(ptr, comm);
647  node[name + "-max"] = MPI_Functions::max(ptr, comm);
648  node[name + "-sum"] = MPI_Functions::sum(ptr, comm);
649 }
650 
651 void Profiler::output(MPI_Comm comm, ostream &os) {
652  int mpi_rank, mpi_size;
653  //wait until profiling on all processors is finished
654  MPI_Barrier(comm);
655  stop_timer(0);
656  propagate_timers();
657 
658  // stop monitoring memory
659  bool temp_memory_monitoring = global_monitor_memory;
660  set_memory_monitoring(false);
661 
662  chkerr( MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank) );
663  MPI_Comm_size(comm, &mpi_size);
664 
665  // output header
666  nlohmann::json jsonRoot, jsonChildren;
667 
668  // recursively add all timers info
669  // define lambda function which reduces timer from multiple processors
670  // MPI implementation uses MPI call to reduce values
671  auto reduce = [=] (Timer &timer, nlohmann::json &node) -> double {
672  int call_count = timer.call_count;
673  double cumul_time = timer.cumulative_time ();
674 
675  long memory_allocated = (long)timer.total_allocated_;
676  long memory_deallocated = (long)timer.total_deallocated_;
677  long memory_peak = (long)timer.max_allocated_;
678 
679  int alloc_called = timer.alloc_called;
680  int dealloc_called = timer.dealloc_called;
681 
682 
683  save_mpi_metric<double>(node, comm, &cumul_time, "cumul-time");
684  save_mpi_metric<int>(node, comm, &call_count, "call-count");
685 
686  save_mpi_metric<long>(node, comm, &memory_allocated, "memory-alloc");
687  save_mpi_metric<long>(node, comm, &memory_deallocated, "memory-dealloc");
688  save_mpi_metric<long>(node, comm, &memory_peak, "memory-peak");
689  //
690  save_mpi_metric<int>(node, comm, &alloc_called, "memory-alloc-called");
691  save_mpi_metric<int>(node, comm, &dealloc_called, "memory-dealloc-called");
692 
693 #ifdef FLOW123D_HAVE_PETSC
694  long petsc_memory_difference = (long)timer.petsc_memory_difference;
695  long petsc_peak_memory = (long)timer.petsc_peak_memory;
696  save_mpi_metric<long>(node, comm, &petsc_memory_difference, "memory-petsc-diff");
697  save_mpi_metric<long>(node, comm, &petsc_peak_memory, "memory-petsc-peak");
698 #endif // FLOW123D_HAVE_PETSC
699 
700  return MPI_Functions::sum(&cumul_time, comm);
701  };
702 
703  add_timer_info (reduce, &jsonChildren, 0, 0.0);
704  jsonRoot["children"] = jsonChildren;
705  output_header(jsonRoot, mpi_size);
706 
707 
708  // create profiler output only once (on the first processor)
709  // only active communicator should be the one with mpi_rank 0
710  if (mpi_rank == 0) {
711  try {
712  /**
713  * indent size
714  * results in json human readable format (indents, newlines)
715  */
716  const int FLOW123D_JSON_HUMAN_READABLE = 2;
717  // write result to stream
718  os << jsonRoot.dump(FLOW123D_JSON_HUMAN_READABLE) << endl;
719 
720  } catch (exception & e) {
721  stringstream ss;
722  ss << "nlohmann::json::dump error: " << e.what() << "\n";
723  THROW( ExcMessage() << EI_Message(ss.str()) );
724  }
725  }
726  // restore memory monitoring
727  set_memory_monitoring(temp_memory_monitoring);
728 }
729 
730 
731 string Profiler::output(MPI_Comm comm, string profiler_path /* = "" */) {
732  int mpi_rank;
733  chkerr(MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank));
734 
735  // all processes must call output, but only rank 0 would use the output stream
736  if (mpi_rank == 0) {
737  string out_path = _profiler_output_path(profiler_path);
738  output(comm, *_profiler_output_stream(out_path));
739  return out_path;
740  } else {
741  ostringstream os;
742  output(comm, os);
743  return "";
744  }
745 }
746 
747 #endif /* FLOW123D_HAVE_MPI */
748 
749 void Profiler::output(ostream &os) {
750  // last update
751  stop_timer(0);
752  propagate_timers();
753 
754  // output header
755  nlohmann::json jsonRoot, jsonChildren;
756  /**
757  * Constant representing number of MPI processes
758  * where there is no MPI to work with (so 1 process)
759  */
760  const int FLOW123D_MPI_SINGLE_PROCESS = 1;
761  output_header(jsonRoot, FLOW123D_MPI_SINGLE_PROCESS);
762 
763 
764  // recursively add all timers info
765  // define lambda function which reduces timer from multiple processors
766  // non-MPI implementation is just dummy repetition of initial value
767  auto reduce = [=] (Timer &timer, nlohmann::json &node) -> double {
768  int call_count = timer.call_count;
769  double cumul_time = timer.cumulative_time ();
770 
771  long memory_allocated = (long)timer.total_allocated_;
772  long memory_deallocated = (long)timer.total_deallocated_;
773  long memory_peak = (long)timer.max_allocated_;
774 
775  int alloc_called = timer.alloc_called;
776  int dealloc_called = timer.dealloc_called;
777 
778  save_nonmpi_metric<double>(node, &cumul_time, "cumul-time");
779  save_nonmpi_metric<int>(node, &call_count, "call-count");
780 
781  save_nonmpi_metric<long>(node, &memory_allocated, "memory-alloc");
782  save_nonmpi_metric<long>(node, &memory_deallocated, "memory-dealloc");
783  save_nonmpi_metric<long>(node, &memory_peak, "memory-peak");
784 
785  save_nonmpi_metric<int>(node, &alloc_called, "memory-alloc-called");
786  save_nonmpi_metric<int>(node, &dealloc_called, "memory-dealloc-called");
787 
788 #ifdef FLOW123D_HAVE_PETSC
789  long petsc_memory_difference = (long)timer.petsc_memory_difference;
790  long petsc_peak_memory = (long)timer.petsc_peak_memory;
791  save_nonmpi_metric<long>(node, &petsc_memory_difference, "memory-petsc-diff");
792  save_nonmpi_metric<long>(node, &petsc_peak_memory, "memory-petsc-peak");
793 #endif // FLOW123D_HAVE_PETSC
794 
795  return cumul_time;
796  };
797 
798  add_timer_info(reduce, &jsonChildren, 0, 0.0);
799  jsonRoot["children"] = jsonChildren;
800 
801  try {
802  /**
803  * indent size
804  * results in json human readable format (indents, newlines)
805  */
806  const int FLOW123D_JSON_HUMAN_READABLE = 2;
807  // write result to stream
808  os << jsonRoot.dump(FLOW123D_JSON_HUMAN_READABLE) << endl;
809 
810  } catch (exception & e) {
811  stringstream ss;
812  ss << "nlohmann::json::dump error: " << e.what() << "\n";
813  THROW( ExcMessage() << EI_Message(ss.str()) );
814  }
815 }
816 
817 
818 string Profiler::output(string profiler_path /* = "" */) {
819  string out_path = _profiler_output_path(profiler_path);
820  output(*_profiler_output_stream(out_path));
821  return out_path;
822 }
823 
824 void Profiler::output_header (nlohmann::json &root, int mpi_size) {
825  time_t end_time = time(NULL);
826 
827  const char format[] = "%x %X";
828  char start_time_string[BUFSIZ] = {0};
829  strftime(start_time_string, sizeof (start_time_string) - 1, format, localtime(&start_time));
830 
831  char end_time_string[BUFSIZ] = {0};
832  strftime(end_time_string, sizeof (end_time_string) - 1, format, localtime(&end_time));
833 
834  if (timers_[0].cumul_time > 60) {
835  calibrate();
836  }
837  // generate current run details
838  root["program-name"] = flow_name_;
839  root["program-version"] = flow_version_;
840  root["program-branch"] = flow_branch_;
841  root["program-revision"] = flow_revision_;
842  root["program-build"] = flow_build_;
843  root["timer-resolution"] = Profiler::get_resolution();
844  root["timer-calibration"] = calibration_time_;
845 
846  // print some information about the task at the beginning
847  root["task-description"] = task_description_;
848  root["task-size"] = task_size_;
849 
850  //print some information about the task at the beginning
851  root["run-process-count"] = mpi_size;
852  root["run-started-at"] = start_time_string;
853  root["run-finished-at"] = end_time_string;
854 }
855 
856 
857 
858 
859 void Profiler::uninitialize() {
860  if (Profiler::instance()) {
861  ASSERT_PERMANENT(Profiler::instance()->actual_node==0)(Profiler::instance()->timers_[Profiler::instance()->actual_node].tag())
862  .error("Forbidden to uninitialize the Profiler when actual timer is not zero.");
863  Profiler::instance()->stop_timer(0);
864  set_memory_monitoring(false);
865  Profiler::instance(true);
866  }
867 }
868 bool Profiler::global_monitor_memory = false;
869 bool Profiler::petsc_monitor_memory = false;
870 void Profiler::set_memory_monitoring(const bool global_monitor) {
871  global_monitor_memory = global_monitor;
872  petsc_monitor_memory = global_monitor;
873 }
874 
875 unordered_map_with_alloc & MemoryAlloc::malloc_map() {
876  static unordered_map_with_alloc static_malloc_map;
877  return static_malloc_map;
878 }
879 
880 void * Profiler::operator new (size_t size) {
881  return malloc (size);
882 }
883 
884 void Profiler::operator delete (void* p) {
885  free(p);
886 }
887 
888 void *operator new (std::size_t size) OPERATOR_NEW_THROW_EXCEPTION {
889  void * p = malloc(size);
890  if (Profiler::get_global_memory_monitoring())
891  Profiler::instance()->notify_malloc(size, (long)p);
892  return p;
893 }
894 
895 void *operator new[] (std::size_t size) OPERATOR_NEW_THROW_EXCEPTION {
896  void * p = malloc(size);
897  if (Profiler::get_global_memory_monitoring())
898  Profiler::instance()->notify_malloc(size, (long)p);
899  return p;
900 }
901 
902 void *operator new[] (std::size_t size, const std::nothrow_t&) throw() {
903  void * p = malloc(size);
904  if (Profiler::get_global_memory_monitoring())
905  Profiler::instance()->notify_malloc(size, (long)p);
906  return p;
907 }
908 
909 void operator delete( void *p) throw() {
910  if (Profiler::get_global_memory_monitoring())
911  Profiler::instance()->notify_free((long)p);
912  free(p);
913 }
914 
915 void operator delete( void *p, std::size_t) throw() {
916  if (Profiler::get_global_memory_monitoring())
917  Profiler::instance()->notify_free((long)p);
918  free(p);
919 }
920 
921 void operator delete[]( void *p) throw() {
922  if (Profiler::get_global_memory_monitoring())
923  Profiler::instance()->notify_free((long)p);
924  free(p);
925 }
926 
927 void operator delete[]( void *p, std::size_t) throw() {
928  if (Profiler::get_global_memory_monitoring())
929  Profiler::instance()->notify_free((long)p);
930  free(p);
931 }
932 
933 #else // def FLOW123D_DEBUG_PROFILER
934 
936  static Profiler * _instance = new Profiler();
937  return _instance;
938 }
939 
940 
941 // void Profiler::initialize() {
942 // if (_instance == NULL) {
943 // _instance = new Profiler();
944 // set_memory_monitoring(true, true);
945 // }
946 // }
947 
949 
950 
951 #endif // def FLOW123D_DEBUG_PROFILER
#define ASSERT_PERMANENT_LT(a, b)
Definition of comparative assert macro (Less Than)
Definition: asserts.hh:297
#define ASSERT_PERMANENT(expr)
Allow use shorter versions of macro names if these names is not used with external library.
Definition: asserts.hh:348
#define ASSERT_PERMANENT_EQ(a, b)
Definition of comparative assert macro (EQual)
Definition: asserts.hh:329
Dedicated class for storing path to input and output files.
Definition: file_path.hh:54
@ output_file
Definition: file_path.hh:69
static int sum(int *val, MPI_Comm comm)
Definition: sys_profiler.cc:42
static int min(int *val, MPI_Comm comm)
Definition: sys_profiler.cc:60
static int max(int *val, MPI_Comm comm)
Definition: sys_profiler.cc:78
static void uninitialize()
void notify_malloc(const size_t)
void output(MPI_Comm, ostream &)
void notify_free(const size_t)
static void set_memory_monitoring(bool)
static Profiler * instance(bool clear=false)
void set_program_info(string, string, string, string, string)
void set_task_info(string, int)
void calibrate()
double get_resolution() const
Definition: memory.cc:33
void start()
Definition: memory.cc:39
void stop()
Definition: memory.cc:42
a class to store JSON values
Definition: json.hpp:174
void push_back(basic_json &&val)
add an object to an array
Definition: json.hpp:4686
string_t dump(const int indent=-1) const
serialization
Definition: json.hpp:2079
#define THROW(whole_exception_expr)
Wrapper for throw. Saves the throwing point.
Definition: exceptions.hh:53
#define WarningOut()
Macro defining 'warning' record of log.
Definition: logger.hh:278
#define LogOut()
Macro defining 'log' record of log.
Definition: logger.hh:281
manipulators::Array< T, Delim > format(T const &deduce, Delim delim=", ")
Definition: logger.hh:325
unsigned int uint
#define MPI_COMM_WORLD
Definition: mpi.h:123
#define MPI_INT
Definition: mpi.h:160
#define MPI_Barrier(comm)
Definition: mpi.h:531
#define MPI_Comm_size
Definition: mpi.h:235
int MPI_Comm
Definition: mpi.h:141
#define MPI_MAX
Definition: mpi.h:197
#define MPI_Reduce(sendbuf, recvbuf, count, datatype, op, root, comm)
Definition: mpi.h:608
#define MPI_SUM
Definition: mpi.h:196
#define MPI_MIN
Definition: mpi.h:198
#define MPI_DOUBLE
Definition: mpi.h:156
#define MPI_LONG
Definition: mpi.h:161
#define MPI_Comm_rank
Definition: mpi.h:236
std::string format(CStringRef format_str, ArgList args)
Definition: format.h:3141
void swap(nlohmann::json &j1, nlohmann::json &j2) noexcept(is_nothrow_move_constructible< nlohmann::json >::value and is_nothrow_move_assignable< nlohmann::json >::value)
exchanges the values of two JSON objects
Definition: json.hpp:8688
#define END_TIMER(tag)
Ends a timer with specified tag.
#define START_TIMER(tag)
Starts a timer with specified tag.
#define CONSTEXPR_
Definition: sys_profiler.hh:86
#define OPERATOR_NEW_THROW_EXCEPTION
Definition: system.hh:54
void chkerr(unsigned int ierr)
Replacement of new/delete operator in the spirit of xmalloc.
Definition: system.hh:142
STREAM & operator<<(STREAM &s, UpdateFlags u)