Flow123d  release_3.0.0-1141-g4222d07
sys_profiler.cc
Go to the documentation of this file.
1 /*!
2  *
3  * Copyright (C) 2015 Technical University of Liberec. All rights reserved.
4  *
5  * This program is free software; you can redistribute it and/or modify it under
6  * the terms of the GNU General Public License version 3 as published by the
7  * Free Software Foundation. (http://www.gnu.org/licenses/gpl-3.0.en.html)
8  *
9  * This program is distributed in the hope that it will be useful, but WITHOUT
10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
11  * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
12  *
13  *
14  * @file sys_profiler.cc
15  * @ingroup system
16  * @brief Profiler
17  */
18 
19 
20 // Fat header
21 
22 #include <fstream>
23 #include <iomanip>
24 #include <sys/param.h>
25 
26 #ifdef FLOW123D_HAVE_PYTHON
27  #include "Python.h"
28 #endif // FLOW123D_HAVE_PYTHON
29 
30 #include "sys_profiler.hh"
31 #include "system/system.hh"
32 #include "system/python_loader.hh"
33 #include <iostream>
34 #include <boost/format.hpp>
35 #include <boost/property_tree/ptree.hpp>
36 #include <boost/property_tree/json_parser.hpp>
37 #include <boost/unordered_map.hpp>
38 
39 #include "system/file_path.hh"
40 #include "system/python_loader.hh"
41 #include "mpi.h"
42 #include "time_point.hh"
43 
44 // namespace alias
45 namespace property_tree = boost::property_tree;
46 
47 /*
48  * These should be replaced by using boost MPI interface
49  */
50 int MPI_Functions::sum(int* val, MPI_Comm comm) {
51  int total = 0;
52  MPI_Reduce(val, &total, 1, MPI_INT, MPI_SUM, 0, comm);
53  return total;
54  }
55 
56 double MPI_Functions::sum(double* val, MPI_Comm comm) {
57  double total = 0;
58  MPI_Reduce(val, &total, 1, MPI_DOUBLE, MPI_SUM, 0, comm);
59  return total;
60  }
61 
62 long MPI_Functions::sum(long* val, MPI_Comm comm) {
63  long total = 0;
64  MPI_Reduce(val, &total, 1, MPI_LONG, MPI_SUM, 0, comm);
65  return total;
66  }
67 
68 int MPI_Functions::min(int* val, MPI_Comm comm) {
69  int min = 0;
70  MPI_Reduce(val, &min, 1, MPI_INT, MPI_MIN, 0, comm);
71  return min;
72  }
73 
74 double MPI_Functions::min(double* val, MPI_Comm comm) {
75  double min = 0;
76  MPI_Reduce(val, &min, 1, MPI_DOUBLE, MPI_MIN, 0, comm);
77  return min;
78  }
79 
80 long MPI_Functions::min(long* val, MPI_Comm comm) {
81  long min = 0;
82  MPI_Reduce(val, &min, 1, MPI_LONG, MPI_MIN, 0, comm);
83  return min;
84  }
85 
86 int MPI_Functions::max(int* val, MPI_Comm comm) {
87  int max = 0;
88  MPI_Reduce(val, &max, 1, MPI_INT, MPI_MAX, 0, comm);
89  return max;
90  }
91 
92 double MPI_Functions::max(double* val, MPI_Comm comm) {
93  double max = 0;
94  MPI_Reduce(val, &max, 1, MPI_DOUBLE, MPI_MAX, 0, comm);
95  return max;
96  }
97 
98 long MPI_Functions::max(long* val, MPI_Comm comm) {
99  long max = 0;
100  MPI_Reduce(val, &max, 1, MPI_LONG, MPI_MAX, 0, comm);
101  return max;
102  }
103 
104 
105 #ifdef FLOW123D_DEBUG_PROFILER
106 /*********************************************************************************************
107  * Implementation of class Timer
108  */
109 
110 const int timer_no_child=-1;
111 
112 Timer::Timer(const CodePoint &cp, int parent)
113 : start_time(TimePoint()),
114  cumul_time(0.0),
115  call_count(0),
116  start_count(0),
117  code_point_(&cp),
118  full_hash_(cp.hash_),
119  hash_idx_(cp.hash_idx_),
120  parent_timer(parent),
121  total_allocated_(0),
122  total_deallocated_(0),
123  max_allocated_(0),
124  current_allocated_(0),
125  alloc_called(0),
126  dealloc_called(0)
127 #ifdef FLOW123D_HAVE_PETSC
128 , petsc_start_memory(0),
129  petsc_end_memory (0),
130  petsc_memory_difference(0),
131  petsc_peak_memory(0),
132  petsc_local_peak_memory(0)
133 #endif // FLOW123D_HAVE_PETSC
134 {
135  for(unsigned int i=0; i< max_n_childs ;i++) child_timers[i]=timer_no_child;
136 }
137 
138 
139 /**
140  * Debug information of the timer
141  */
142 ostream & operator <<(ostream& os, const Timer& timer) {
143  os << " Timer: " << timer.tag() << endl;
144  os << " malloc: " << timer.total_allocated_ << endl;
145  os << " dalloc: " << timer.total_deallocated_ << endl;
146  #ifdef FLOW123D_HAVE_PETSC
147  os << " start: " << timer.petsc_start_memory << endl;
148  os << " stop : " << timer.petsc_end_memory << endl;
149  os << " diff : " << timer.petsc_memory_difference << " (" << timer.petsc_end_memory - timer.petsc_start_memory << ")" << endl;
150  os << " peak : " << timer.petsc_peak_memory << " (" << timer.petsc_local_peak_memory << ")" << endl;
151  #endif // FLOW123D_HAVE_PETSC
152  os << endl;
153  return os;
154 }
155 
156 
157 double Timer::cumulative_time() const {
158  return cumul_time;
159 }
160 
161 void Profiler::accept_from_child(Timer &parent, Timer &child) {
162  int child_timer = 0;
163  for (unsigned int i = 0; i < Timer::max_n_childs; i++) {
164  child_timer = child.child_timers[i];
165  if (child_timer != timer_no_child) {
166  // propagate metrics from child to parent
167  accept_from_child(child, timers_[child_timer]);
168  }
169  }
170  // compute totals by adding values from child
171  parent.total_allocated_ += child.total_allocated_;
172  parent.total_deallocated_ += child.total_deallocated_;
173  parent.alloc_called += child.alloc_called;
174  parent.dealloc_called += child.dealloc_called;
175 
176 #ifdef FLOW123D_HAVE_PETSC
177  if (petsc_monitor_memory) {
178  // add differences from child
179  parent.petsc_memory_difference += child.petsc_memory_difference;
180  parent.current_allocated_ += child.current_allocated_;
181 
182  // when computing maximum, we take greater value from parent and child
183  // PetscMemoryGetCurrentUsage always return absolute (not relative) value
184  parent.petsc_peak_memory = max(parent.petsc_peak_memory, child.petsc_peak_memory);
185  }
186 #endif // FLOW123D_HAVE_PETSC
187 
188  parent.max_allocated_ = max(parent.max_allocated_, child.max_allocated_);
189 }
190 
191 
192 void Timer::pause() {
193 #ifdef FLOW123D_HAVE_PETSC
194  if (Profiler::get_petsc_memory_monitoring()) {
195  // get the maximum resident set size (memory used) for the program.
196  PetscMemoryGetMaximumUsage(&petsc_local_peak_memory);
197  if (petsc_peak_memory < petsc_local_peak_memory)
198  petsc_peak_memory = petsc_local_peak_memory;
199  }
200 #endif // FLOW123D_HAVE_PETSC
201 }
202 
203 void Timer::resume() {
204 #ifdef FLOW123D_HAVE_PETSC
205  if (Profiler::get_petsc_memory_monitoring()) {
206  // tell PETSc to monitor the maximum memory usage so
207  // that PetscMemoryGetMaximumUsage() will work.
208  PetscMemorySetGetMaximumUsage();
209  }
210 #endif // FLOW123D_HAVE_PETSC
211 }
212 
213 void Timer::start() {
214 #ifdef FLOW123D_HAVE_PETSC
215  if (Profiler::get_petsc_memory_monitoring()) {
216  // Tell PETSc to monitor the maximum memory usage so
217  // that PetscMemoryGetMaximumUsage() will work.
218  PetscMemorySetGetMaximumUsage();
219  PetscMemoryGetCurrentUsage (&petsc_start_memory);
220  }
221 #endif // FLOW123D_HAVE_PETSC
222 
223  if (start_count == 0) {
224  start_time = TimePoint();
225  }
226  call_count++;
227  start_count++;
228 }
229 
230 
231 
232 bool Timer::stop(bool forced) {
233 #ifdef FLOW123D_HAVE_PETSC
234  if (Profiler::get_petsc_memory_monitoring()) {
235  // get current memory usage
236  PetscMemoryGetCurrentUsage (&petsc_end_memory);
237  petsc_memory_difference += petsc_end_memory - petsc_start_memory;
238 
239  // get the maximum resident set size (memory used) for the program.
240  PetscMemoryGetMaximumUsage(&petsc_local_peak_memory);
241  if (petsc_peak_memory < petsc_local_peak_memory)
242  petsc_peak_memory = petsc_local_peak_memory;
243  }
244 #endif // FLOW123D_HAVE_PETSC
245 
246  if (forced) start_count=1;
247 
248  if (start_count == 1) {
249  cumul_time += (TimePoint() - start_time);
250  start_count--;
251  return true;
252  } else {
253  start_count--;
254  }
255  return false;
256 }
257 
258 
259 
260 void Timer::add_child(int child_index, const Timer &child)
261 {
262  unsigned int idx = child.hash_idx_;
263  if (child_timers[idx] != timer_no_child) {
264  // hash collision, find first empty place
265  unsigned int i=idx;
266  do {
267  i=( i < max_n_childs ? i+1 : 0);
268  } while (i!=idx && child_timers[i] != timer_no_child);
269  ASSERT(i!=idx)(tag()).error("Too many children of the timer");
270  idx=i;
271  }
272  //DebugOut().fmt("Adding child {} at index: {}\n", child_index, idx);
273  child_timers[idx] = child_index;
274 }
275 
276 
277 
278 string Timer::code_point_str() const {
279  return boost::str( boost::format("%s:%d, %s()") % code_point_->file_ % code_point_->line_ % code_point_->func_ );
280 }
281 
282 
283 /***********************************************************************************************
284  * Implementation of Profiler
285  */
286 
287 
289  if (_instance == NULL) {
290  MemoryAlloc::malloc_map().reserve(Profiler::malloc_map_reserve);
291  _instance = new Profiler();
292  }
293  return _instance;
294  }
295 
296 
297 static CONSTEXPR_ CodePoint main_cp = CODE_POINT("Whole Program");
299 const long Profiler::malloc_map_reserve = 100 * 1000;
300 CodePoint Profiler::null_code_point = CodePoint("__no_tag__", "__no_file__", "__no_func__", 0);
301 
302 void Profiler::initialize() {
303  instance();
304  set_memory_monitoring(true, true);
305 }
306 
307 
309 : actual_node(0),
310  task_size_(1),
311  start_time( time(NULL) ),
312  json_filepath("")
313 
314 {
315 #ifdef FLOW123D_DEBUG_PROFILER
316  timers_.push_back( Timer(main_cp, 0) );
317  timers_[0].start();
318 #endif
319 }
320 
321 
322 
323 void Profiler::propagate_timers() {
324  for (unsigned int i = 0; i < Timer::max_n_childs; i++) {
325  unsigned int child_timer = timers_[0].child_timers[i];
326  if ((signed int)child_timer != timer_no_child) {
327  // propagate metrics from child to Whole-Program time-frame
328  accept_from_child(timers_[0], timers_[child_timer]);
329  }
330  }
331 }
332 
333 
334 
335 void Profiler::set_task_info(string description, int size) {
336  task_description_ = description;
337  task_size_ = size;
338 }
339 
340 
341 
342 void Profiler::set_program_info(string program_name, string program_version, string branch, string revision, string build) {
343  flow_name_ = program_name;
344  flow_version_ = program_version;
345  flow_branch_ = branch;
346  flow_revision_ = revision;
347  flow_build_ = build;
348 }
349 
350 
351 
352 int Profiler::start_timer(const CodePoint &cp) {
353  unsigned int parent_node = actual_node;
354  //DebugOut().fmt("Start timer: {}\n", cp.tag_);
355  int child_idx = find_child(cp);
356  if (child_idx < 0) {
357  //DebugOut().fmt("Adding timer: {}\n", cp.tag_);
358  // tag not present - create new timer
359  child_idx=timers_.size();
360  timers_.push_back( Timer(cp, actual_node) );
361  timers_[actual_node].add_child(child_idx , timers_.back() );
362  }
363  actual_node=child_idx;
364 
365  // pause current timer
366  timers_[parent_node].pause();
367 
368  timers_[actual_node].start();
369 
370  return actual_node;
371 }
372 
373 
374 
375 int Profiler::find_child(const CodePoint &cp) {
376  Timer &timer =timers_[actual_node];
377  unsigned int idx = cp.hash_idx_;
378  unsigned int child_idx;
379  do {
380  if (timer.child_timers[idx] == timer_no_child) break; // tag is not there
381 
382  child_idx=timer.child_timers[idx];
383  ASSERT_LT(child_idx, timers_.size()).error();
384  if (timers_[child_idx].full_hash_ == cp.hash_) return child_idx;
385  idx = ( (unsigned int)(idx)==(Timer::max_n_childs - 1) ? 0 : idx+1 );
386  } while ( (unsigned int)(idx) != cp.hash_idx_ ); // passed through whole array
387  return -1;
388 }
389 
390 
391 
392 void Profiler::stop_timer(const CodePoint &cp) {
393 #ifdef FLOW123D_DEBUG
394  // check that all childrens are closed
395  Timer &timer=timers_[actual_node];
396  for(unsigned int i=0; i < Timer::max_n_childs; i++)
397  if (timer.child_timers[i] != timer_no_child)
398  ASSERT(! timers_[timer.child_timers[i]].running())(timers_[timer.child_timers[i]].tag())(timer.tag())
399  .error("Child timer running while closing timer.");
400 #endif
401  unsigned int child_timer = actual_node;
402  if ( cp.hash_ != timers_[actual_node].full_hash_) {
403  // timer to close is not actual - we search for it above actual
404  for(unsigned int node=actual_node; node != 0; node=timers_[node].parent_timer) {
405  if ( cp.hash_ == timers_[node].full_hash_) {
406  // found above - close all nodes between
407  for(; (unsigned int)(actual_node) != node; actual_node=timers_[actual_node].parent_timer) {
408  WarningOut() << "Timer to close '" << cp.tag_ << "' do not match actual timer '"
409  << timers_[actual_node].tag() << "'. Force closing actual." << std::endl;
410  timers_[actual_node].stop(true);
411  }
412  // close 'node' itself
413  timers_[actual_node].stop(false);
414  actual_node = timers_[actual_node].parent_timer;
415 
416  // actual_node == child_timer indicates this is root
417  if (actual_node == child_timer)
418  return;
419 
420  // resume current timer
421  timers_[actual_node].resume();
422  return;
423  }
424  }
425  // node not found - do nothing
426  return;
427  }
428  // node to close match the actual
429  timers_[actual_node].stop(false);
430  actual_node = timers_[actual_node].parent_timer;
431 
432  // actual_node == child_timer indicates this is root
433  if (actual_node == child_timer)
434  return;
435 
436  // resume current timer
437  timers_[actual_node].resume();
438 }
439 
440 
441 
442 void Profiler::stop_timer(int timer_index) {
443  // stop_timer with CodePoint type
444  // timer which is still running MUST be the same as actual_node index
445  // if timer is not running index will differ
446  if (timers_[timer_index].running()) {
447  ASSERT_EQ(timer_index, (int)actual_node).error();
448  stop_timer(*timers_[timer_index].code_point_);
449  }
450 
451 }
452 
453 
454 
455 void Profiler::add_calls(unsigned int n_calls) {
456  timers_[actual_node].call_count += n_calls-1;
457 }
458 
459 
460 
461 void Profiler::notify_malloc(const size_t size, const long p) {
462  if (!global_monitor_memory)
463  return;
464 
465  MemoryAlloc::malloc_map()[p] = static_cast<int>(size);
466  timers_[actual_node].total_allocated_ += size;
467  timers_[actual_node].current_allocated_ += size;
468  timers_[actual_node].alloc_called++;
469 
470  if (timers_[actual_node].current_allocated_ > timers_[actual_node].max_allocated_)
471  timers_[actual_node].max_allocated_ = timers_[actual_node].current_allocated_;
472 }
473 
474 
475 
476 void Profiler::notify_free(const long p) {
477  if (!global_monitor_memory)
478  return;
479 
480  int size = sizeof(p);
481  if (MemoryAlloc::malloc_map()[(long)p] > 0) {
482  size = MemoryAlloc::malloc_map()[(long)p];
483  MemoryAlloc::malloc_map().erase((long)p);
484  }
485  timers_[actual_node].total_deallocated_ += size;
486  timers_[actual_node].current_allocated_ -= size;
487  timers_[actual_node].dealloc_called++;
488 }
489 
490 
491 double Profiler::get_resolution () {
492  const int measurements = 100;
493  double result = 0;
494 
495  // perform 100 measurements
496  for (unsigned int i = 1; i < measurements; i++) {
497  TimePoint t1 = TimePoint ();
498  TimePoint t2 = TimePoint ();
499 
500  // double comparison should be avoided
501  while ((t2 - t1) == 0) t2 = TimePoint ();
502  // while ((t2.ticks - t1.ticks) == 0) t2 = TimePoint ();
503 
504  result += t2 - t1;
505  }
506 
507  return (result / measurements) * 1000; // ticks to seconds to microseconds conversion
508 }
509 
510 
511 std::string common_prefix( std::string a, std::string b ) {
512  if( a.size() > b.size() ) std::swap(a,b) ;
513  return std::string( a.begin(), std::mismatch( a.begin(), a.end(), b.begin() ).first ) ;
514 }
515 
516 
517 
518 template<typename ReduceFunctor>
519 void Profiler::add_timer_info(ReduceFunctor reduce, property_tree::ptree* holder, int timer_idx, double parent_time) {
520 
521  // get timer and check preconditions
522  Timer &timer = timers_[timer_idx];
523  ASSERT(timer_idx >=0)(timer_idx).error("Wrong timer index.");
524  ASSERT(timer.parent_timer >=0).error("Inconsistent tree.");
525 
526  // fix path
527  string filepath = timer.code_point_->file_;
528 
529  // if constant FLOW123D_SOURCE_DIR is defined, we try to erase it from beginning of each CodePoint's filepath
530  #ifdef FLOW123D_SOURCE_DIR
531  string common_path = common_prefix (string(FLOW123D_SOURCE_DIR), filepath);
532  filepath.erase (0, common_path.size());
533  #endif
534 
535 
536  // generate node representing this timer
537  // add basic information
538  property_tree::ptree node;
539  double cumul_time_sum;
540  node.put ("tag", (timer.tag()) );
541  node.put ("file-path", (filepath) );
542  node.put ("file-line", (timer.code_point_->line_) );
543  node.put ("function", (timer.code_point_->func_) );
544  cumul_time_sum = reduce (timer, node);
545 
546 
547  // statistical info
548  if (timer_idx == 0) parent_time = cumul_time_sum;
549  double percent = parent_time > 1.0e-10 ? cumul_time_sum / parent_time * 100.0 : 0.0;
550  node.put<double> ("percent", percent);
551 
552  // when collecting timer info, we need to make sure each processor contains
553  // the same time-frames, for now we simply reduce info to current timer
554  // using operation MPI_MIN, so if some processor does not contain some
555  // time-frame other processors will forget this time-frame (information lost)
556  /*
557  int child_timers[ Timer::max_n_childs];
558  MPI_Allreduce(&timer.child_timers, &child_timers, Timer::max_n_childs, MPI_INT, MPI_MIN, MPI_COMM_WORLD);
559  #ifdef FLOW123D_DEBUG
560  int mpi_rank = MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank);
561  for (int j = 0; j < Timer::max_n_childs; j++) {
562  if (child_timers[j] != timer.child_timers[j]) {
563  // this is severe error, timers indicies are different
564  ASSERT(child_timers[j] == timer_no_child).error("Severe error: timers indicies are different.");
565 
566  // explore the difference in more depth
567  if (timer.child_timers[j] == timer_no_child) {
568  // this processor does not have child timer
569  WarningOut() << "Inconsistent tree in '" << timer.tag() << "': this processor[" << mpi_rank
570  << "] does not have child-timer[" << child_timers[j] << "]" << std::endl;
571  } else {
572  // other processors do not have child timer
573  WarningOut() << "Inconsistent tree in '" << timer.tag() << "': this processor[" << mpi_rank
574  << "] contains child-timer[" << timer.child_timers[j] << "] which others do not" << std::endl;
575  }
576  }
577  }
578  #endif // FLOW123D_DEBUG
579 */
580  // write times children timers using secured child_timers array
581  property_tree::ptree children;
582  bool has_children = false;
583  for (unsigned int i = 0; i < Timer::max_n_childs; i++) {
584  if (timer.child_timers[i] != timer_no_child) {
585  add_timer_info (reduce, &children, timer.child_timers[i], cumul_time_sum);
586  /*
587  if (child_timers[i] != timer_no_child) {
588  add_timer_info (reduce, &children, child_timers[i], cumul_time_sum); */
589  has_children = true;
590  }
591  }
592 
593  // add children tag and other info if present
594  if (has_children)
595  node.add_child ("children", children);
596 
597  // push itself to root ptree 'array'
598  holder->push_back (std::make_pair ("", node));
599 }
600 
601 
602 template <class T>
603 void save_nonmpi_metric (property_tree::ptree &node, T * ptr, string name) {
604  node.put (name+"-min", *ptr);
605  node.put (name+"-max", *ptr);
606  node.put (name+"-sum", *ptr);
607 }
608 
609 std::shared_ptr<std::ostream> Profiler::get_default_output_stream() {
610  char filename[PATH_MAX];
611  strftime(filename, sizeof (filename) - 1, "profiler_info_%y.%m.%d_%H-%M-%S.log.json", localtime(&start_time));
612  json_filepath = FilePath(string(filename), FilePath::output_file);
613 
614  //LogOut() << "output into: " << json_filepath << std::endl;
615  return make_shared<ofstream>(json_filepath.c_str());
616 }
617 
618 
619 #ifdef FLOW123D_HAVE_MPI
620 template <class T>
621 void save_mpi_metric (property_tree::ptree &node, MPI_Comm comm, T * ptr, string name) {
622  node.put (name+"-min", MPI_Functions::min(ptr, comm));
623  node.put (name+"-max", MPI_Functions::max(ptr, comm));
624  node.put (name+"-sum", MPI_Functions::sum(ptr, comm));
625 }
626 
627 void Profiler::output(MPI_Comm comm, ostream &os) {
628  int mpi_rank, mpi_size;
629  //wait until profiling on all processors is finished
630  MPI_Barrier(comm);
631  stop_timer(0);
632  propagate_timers();
633 
634  // stop monitoring memory
635  bool temp_memory_monitoring = global_monitor_memory;
636  set_memory_monitoring(false, petsc_monitor_memory);
637 
638  chkerr( MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank) );
639  MPI_Comm_size(comm, &mpi_size);
640 
641  // output header
642  property_tree::ptree root, children;
643  output_header (root, mpi_size);
644 
645  // recursively add all timers info
646  // define lambda function which reduces timer from multiple processors
647  // MPI implementation uses MPI call to reduce values
648  auto reduce = [=] (Timer &timer, property_tree::ptree &node) -> double {
649  int call_count = timer.call_count;
650  double cumul_time = timer.cumulative_time ();
651 
652  long memory_allocated = (long)timer.total_allocated_;
653  long memory_deallocated = (long)timer.total_deallocated_;
654  long memory_peak = (long)timer.max_allocated_;
655 
656  int alloc_called = timer.alloc_called;
657  int dealloc_called = timer.dealloc_called;
658 
659 
660  save_mpi_metric<double>(node, comm, &cumul_time, "cumul-time");
661  save_mpi_metric<int>(node, comm, &call_count, "call-count");
662 
663  save_mpi_metric<long>(node, comm, &memory_allocated, "memory-alloc");
664  save_mpi_metric<long>(node, comm, &memory_deallocated, "memory-dealloc");
665  save_mpi_metric<long>(node, comm, &memory_peak, "memory-peak");
666  //
667  save_mpi_metric<int>(node, comm, &alloc_called, "memory-alloc-called");
668  save_mpi_metric<int>(node, comm, &dealloc_called, "memory-dealloc-called");
669 
670 #ifdef FLOW123D_HAVE_PETSC
671  long petsc_memory_difference = (long)timer.petsc_memory_difference;
672  long petsc_peak_memory = (long)timer.petsc_peak_memory;
673  save_mpi_metric<long>(node, comm, &petsc_memory_difference, "memory-petsc-diff");
674  save_mpi_metric<long>(node, comm, &petsc_peak_memory, "memory-petsc-peak");
675 #endif // FLOW123D_HAVE_PETSC
676 
677  return MPI_Functions::sum(&cumul_time, comm);
678  };
679 
680  add_timer_info (reduce, &children, 0, 0.0);
681  root.add_child ("children", children);
682 
683 
684  // create profiler output only once (on the first processor)
685  // only active communicator should be the one with mpi_rank 0
686  if (mpi_rank == 0) {
687  try {
688  /**
689  * Flag to property_tree::write_json method
690  * resulting in json human readable format (indents, newlines)
691  */
692  const int FLOW123D_JSON_HUMAN_READABLE = 1;
693  // write result to stream
694  property_tree::write_json (os, root, FLOW123D_JSON_HUMAN_READABLE);
695  } catch (property_tree::json_parser::json_parser_error & e) {
696  stringstream ss;
697  ss << "Throw json_parser_error: " << e.message() << "\nIn file: " << e.filename() << ", at line " << e.line() << "\n";
698  THROW( ExcMessage() << EI_Message(ss.str()) );
699  }
700  }
701  // restore memory monitoring
702  set_memory_monitoring(temp_memory_monitoring, petsc_monitor_memory);
703 }
704 
705 
706 void Profiler::output(MPI_Comm comm) {
707  int mpi_rank;
708  chkerr( MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank) );
709  if (mpi_rank == 0) {
710  output(comm, *get_default_output_stream());
711  } else {
712  ostringstream os;
713  output(comm, os );
714  }
715 }
716 
717 #endif /* FLOW123D_HAVE_MPI */
718 
719 void Profiler::output(ostream &os) {
720  // last update
721  stop_timer(0);
722  propagate_timers();
723 
724  // output header
725  property_tree::ptree root, children;
726  /**
727  * Constant representing number of MPI processes
728  * where there is no MPI to work with (so 1 process)
729  */
730  const int FLOW123D_MPI_SINGLE_PROCESS = 1;
731  output_header (root, FLOW123D_MPI_SINGLE_PROCESS);
732 
733 
734  // recursively add all timers info
735  // define lambda function which reduces timer from multiple processors
736  // non-MPI implementation is just dummy repetition of initial value
737  auto reduce = [=] (Timer &timer, property_tree::ptree &node) -> double {
738  int call_count = timer.call_count;
739  double cumul_time = timer.cumulative_time ();
740 
741  long memory_allocated = (long)timer.total_allocated_;
742  long memory_deallocated = (long)timer.total_deallocated_;
743  long memory_peak = (long)timer.max_allocated_;
744 
745  int alloc_called = timer.alloc_called;
746  int dealloc_called = timer.dealloc_called;
747 
748  save_nonmpi_metric<double>(node, &cumul_time, "cumul-time");
749  save_nonmpi_metric<int>(node, &call_count, "call-count");
750 
751  save_nonmpi_metric<long>(node, &memory_allocated, "memory-alloc");
752  save_nonmpi_metric<long>(node, &memory_deallocated, "memory-dealloc");
753  save_nonmpi_metric<long>(node, &memory_peak, "memory-peak");
754 
755  save_nonmpi_metric<int>(node, &alloc_called, "memory-alloc-called");
756  save_nonmpi_metric<int>(node, &dealloc_called, "memory-dealloc-called");
757 
758 #ifdef FLOW123D_HAVE_PETSC
759  long petsc_memory_difference = (long)timer.petsc_memory_difference;
760  long petsc_peak_memory = (long)timer.petsc_peak_memory;
761  save_nonmpi_metric<long>(node, &petsc_memory_difference, "memory-petsc-diff");
762  save_nonmpi_metric<long>(node, &petsc_peak_memory, "memory-petsc-peak");
763 #endif // FLOW123D_HAVE_PETSC
764 
765  return cumul_time;
766  };
767 
768  add_timer_info (reduce, &children, 0, 0.0);
769  root.add_child ("children", children);
770 
771 
772  try {
773  /**
774  * Flag to property_tree::write_json method
775  * resulting in json human readable format (indents, newlines)
776  */
777  const int FLOW123D_JSON_HUMAN_READABLE = 1;
778  // write result to stream
779  property_tree::write_json (os, root, FLOW123D_JSON_HUMAN_READABLE);
780  } catch (property_tree::json_parser::json_parser_error & e) {
781  stringstream ss;
782  ss << "Throw json_parser_error: " << e.message() << "\nIn file: " << e.filename() << ", at line " << e.line() << "\n";
783  THROW( ExcMessage() << EI_Message(ss.str()) );
784  }
785 }
786 
787 
788 void Profiler::output() {
789  output(*get_default_output_stream());
790 }
791 
792 void Profiler::output_header (property_tree::ptree &root, int mpi_size) {
793  time_t end_time = time(NULL);
794 
795  const char format[] = "%x %X";
796  char start_time_string[BUFSIZ] = {0};
797  strftime(start_time_string, sizeof (start_time_string) - 1, format, localtime(&start_time));
798 
799  char end_time_string[BUFSIZ] = {0};
800  strftime(end_time_string, sizeof (end_time_string) - 1, format, localtime(&end_time));
801 
802  // generate current run details
803 
804  root.put ("program-name", flow_name_);
805  root.put ("program-version", flow_version_);
806  root.put ("program-branch", flow_branch_);
807  root.put ("program-revision", flow_revision_);
808  root.put ("program-build", flow_build_);
809  root.put ("timer-resolution", boost::format("%1.9f") % Profiler::get_resolution());
810  // if constant FLOW123D_SOURCE_DIR is defined, we add this information to profiler (later purposes)
811  #ifdef FLOW123D_SOURCE_DIR
812  #endif
813 
814  // print some information about the task at the beginning
815  root.put ("task-description", task_description_);
816  root.put ("task-size", task_size_);
817 
818  //print some information about the task at the beginning
819  root.put ("run-process-count", mpi_size);
820  root.put ("run-started-at", start_time_string);
821  root.put ("run-finished-at", end_time_string);
822 }
823 
824 #ifdef FLOW123D_HAVE_PYTHON
825 void Profiler::transform_profiler_data (const string &output_file_suffix, const string &formatter) {
826 
827  if (json_filepath=="") return;
828 
829  // error under CYGWIN environment : more details in this repo
830  // https://github.com/x3mSpeedy/cygwin-python-issue
831  //
832  // For now we only support profiler report conversion in UNIX environment
833  // Windows users will have to use a python script located in bin folder
834  //
835 
836  #ifndef FLOW123D_HAVE_CYGWIN
837  // grab module and function by importing module profiler_formatter_module.py
838  PyObject * python_module = PythonLoader::load_module_by_name ("profiler.profiler_formatter_module");
839  //
840  // def convert (json_location, output_file, formatter):
841  //
842  PyObject * convert_method = PythonLoader::get_callable (python_module, "convert" );
843 
844  int argument_index = 0;
845  PyObject * arguments = PyTuple_New (3);
846 
847  // set json path location as first argument
848  PyObject * tmp = PyUnicode_FromString (json_filepath.c_str());
849  PyTuple_SetItem (arguments, argument_index++, tmp);
850 
851  // set output path location as second argument
852  tmp = PyUnicode_FromString ((json_filepath + output_file_suffix).c_str());
853  PyTuple_SetItem (arguments, argument_index++, tmp);
854 
855  // set Formatter class as third value
856  tmp = PyUnicode_FromString (formatter.c_str());
857  PyTuple_SetItem (arguments, argument_index++, tmp);
858 
859  // execute method with arguments
860  PyObject_CallObject (convert_method, arguments);
861 
862  PythonLoader::check_error();
863 
864  #else
865 
866  // print information about windows-cygwin issue and offer manual solution
867  MessageOut() << "# Note: converting json profiler reports is not"
868  << " supported under Windows or Cygwin environment for now.\n"
869  << "# You can use python script located in bin/python folder"
870  << " in order to convert json report to txt or csv format.\n"
871  << "python profiler_formatter_script.py --input \"" << json_filepath
872  << "\" --output \"profiler.txt\"" << std::endl;
873  #endif // FLOW123D_HAVE_CYGWIN
874 }
875 #else
876 void Profiler::transform_profiler_data (const string &output_file_suffix, const string &formatter) {
877 }
878 
879 #endif // FLOW123D_HAVE_PYTHON
880 
881 
882 void Profiler::uninitialize() {
883  if (_instance) {
884  ASSERT(_instance->actual_node==0)(_instance->timers_[_instance->actual_node].tag())
885  .error("Forbidden to uninitialize the Profiler when actual timer is not zero.");
886  _instance->stop_timer(0);
887  set_memory_monitoring(false, false);
888  delete _instance;
889  _instance = NULL;
890  }
891 }
892 bool Profiler::global_monitor_memory = false;
893 bool Profiler::petsc_monitor_memory = true;
894 void Profiler::set_memory_monitoring(const bool global_monitor, const bool petsc_monitor) {
895  global_monitor_memory = global_monitor;
896  petsc_monitor_memory = petsc_monitor;
897 }
898 
899 bool Profiler::get_global_memory_monitoring() {
900  return global_monitor_memory;
901 }
902 
903 bool Profiler::get_petsc_memory_monitoring() {
904  return petsc_monitor_memory;
905 }
906 
907 unordered_map_with_alloc & MemoryAlloc::malloc_map() {
908  static unordered_map_with_alloc static_malloc_map;
909  return static_malloc_map;
910 }
911 
912 void * Profiler::operator new (size_t size) {
913  return malloc (size);
914 }
915 
916 void Profiler::operator delete (void* p) {
917  free(p);
918 }
919 
920 void *operator new (std::size_t size) OPERATOR_NEW_THROW_EXCEPTION {
921  void * p = malloc(size);
922  Profiler::instance()->notify_malloc(size, (long)p);
923  return p;
924 }
925 
926 void *operator new[] (std::size_t size) OPERATOR_NEW_THROW_EXCEPTION {
927  void * p = malloc(size);
928  Profiler::instance()->notify_malloc(size, (long)p);
929  return p;
930 }
931 
932 void *operator new[] (std::size_t size, const std::nothrow_t&) throw() {
933  void * p = malloc(size);
934  Profiler::instance()->notify_malloc(size, (long)p);
935  return p;
936 }
937 
938 void operator delete( void *p) throw() {
939  Profiler::instance()->notify_free((long)p);
940  free(p);
941 }
942 
943 void operator delete[]( void *p) throw() {
944  Profiler::instance()->notify_free((long)p);
945  free(p);
946 }
947 
948 #else // def FLOW123D_DEBUG_PROFILER
949 
951  initialize();
952  return _instance;
953  }
954 
955 Profiler* Profiler::_instance = NULL;
956 
958  if (_instance == NULL) {
959  _instance = new Profiler();
960  set_memory_monitoring(true, true);
961  }
962 }
963 
965  if (_instance) {
966  ASSERT(_instance->actual_node==0)(_instance->timers_[_instance->actual_node].tag())
967  .error("Forbidden to uninitialize the Profiler when actual timer is not zero.");
968  set_memory_monitoring(false, false);
969  _instance->stop_timer(0);
970  delete _instance;
971  _instance = NULL;
972  }
973 }
974 
975 
976 #endif // def FLOW123D_DEBUG_PROFILER
#define MPI_LONG
Definition: mpi.h:161
static int min(int *val, MPI_Comm comm)
Definition: sys_profiler.cc:68
#define CONSTEXPR_
Definition: sys_profiler.hh:88
int MPI_Comm
Definition: mpi.h:141
#define OPERATOR_NEW_THROW_EXCEPTION
Definition: system.hh:53
static int sum(int *val, MPI_Comm comm)
Definition: sys_profiler.cc:50
#define MessageOut()
Macro defining &#39;message&#39; record of log.
Definition: logger.hh:243
double get_resolution() const
std::string format(CStringRef format_str, ArgList args)
Definition: format.h:3141
#define MPI_SUM
Definition: mpi.h:196
void chkerr(unsigned int ierr)
Replacement of new/delete operator in the spirit of xmalloc.
Definition: system.hh:147
#define ASSERT(expr)
Allow use shorter versions of macro names if these names is not used with external library...
Definition: asserts.hh:346
void notify_free(const size_t size)
#define MPI_Reduce(sendbuf, recvbuf, count, datatype, op, root, comm)
Definition: mpi.h:608
static void uninitialize()
#define MPI_MIN
Definition: mpi.h:198
static int max(int *val, MPI_Comm comm)
Definition: sys_profiler.cc:86
void set_program_info(string program_name, string program_version, string branch, string revision, string build)
void start()
Definition: memory.cc:39
void swap(nlohmann::json &j1, nlohmann::json &j2) noexcept(is_nothrow_move_constructible< nlohmann::json >::value andis_nothrow_move_assignable< nlohmann::json >::value)
exchanges the values of two JSON objects
Definition: json.hpp:8688
#define MPI_Comm_size
Definition: mpi.h:235
#define MPI_DOUBLE
Definition: mpi.h:156
STREAM & operator<<(STREAM &s, UpdateFlags u)
void transform_profiler_data(const string &output_file_suffix, const string &formatter)
#define MPI_Comm_rank
Definition: mpi.h:236
Dedicated class for storing path to input and output files.
Definition: file_path.hh:54
#define MPI_INT
Definition: mpi.h:160
static void initialize()
#define ASSERT_LT(a, b)
Definition of comparative assert macro (Less Than)
Definition: asserts.hh:295
static Profiler * instance()
#define WarningOut()
Macro defining &#39;warning&#39; record of log.
Definition: logger.hh:246
#define MPI_COMM_WORLD
Definition: mpi.h:123
#define MPI_MAX
Definition: mpi.h:197
static Profiler * _instance
Definition: memory.cc:33
#define MPI_Barrier(comm)
Definition: mpi.h:531
void stop()
Definition: memory.cc:42
#define THROW(whole_exception_expr)
Wrapper for throw. Saves the throwing point.
Definition: exceptions.hh:53
void output(MPI_Comm comm, ostream &os)
#define ASSERT_EQ(a, b)
Definition of comparative assert macro (EQual)
Definition: asserts.hh:327
void set_task_info(string description, int size)
void notify_malloc(const size_t size)