Flow123d  master-3768d5dec
sys_profiler.cc
Go to the documentation of this file.
1 /*!
2  *
3  * Copyright (C) 2015 Technical University of Liberec. All rights reserved.
4  *
5  * This program is free software; you can redistribute it and/or modify it under
6  * the terms of the GNU General Public License version 3 as published by the
7  * Free Software Foundation. (http://www.gnu.org/licenses/gpl-3.0.en.html)
8  *
9  * This program is distributed in the hope that it will be useful, but WITHOUT
10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
11  * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
12  *
13  *
14  * @file sys_profiler.cc
15  * @ingroup system
16  * @brief Profiler
17  */
18 
19 
20 // Fat header
21 
22 #include <fstream>
23 #include <iomanip>
24 #include <sys/param.h>
25 #include <unordered_map>
26 
27 #ifdef FLOW123D_HAVE_PYTHON
28  #include "Python.h"
29 #endif // FLOW123D_HAVE_PYTHON
30 
31 #include "sys_profiler.hh"
32 #include "system/system.hh"
33 #include "system/python_loader.hh"
34 #include <iostream>
35 #include <boost/format.hpp>
36 
37 #include "system/file_path.hh"
38 #include "mpi.h"
39 #include "time_point.hh"
40 
41 /*
42  * These should be replaced by using boost MPI interface
43  */
44 int MPI_Functions::sum(int* val, MPI_Comm comm) {
45  int total = 0;
46  MPI_Reduce(val, &total, 1, MPI_INT, MPI_SUM, 0, comm);
47  return total;
48  }
49 
50 double MPI_Functions::sum(double* val, MPI_Comm comm) {
51  double total = 0;
52  MPI_Reduce(val, &total, 1, MPI_DOUBLE, MPI_SUM, 0, comm);
53  return total;
54  }
55 
56 long MPI_Functions::sum(long* val, MPI_Comm comm) {
57  long total = 0;
58  MPI_Reduce(val, &total, 1, MPI_LONG, MPI_SUM, 0, comm);
59  return total;
60  }
61 
62 int MPI_Functions::min(int* val, MPI_Comm comm) {
63  int min = 0;
64  MPI_Reduce(val, &min, 1, MPI_INT, MPI_MIN, 0, comm);
65  return min;
66  }
67 
68 double MPI_Functions::min(double* val, MPI_Comm comm) {
69  double min = 0;
70  MPI_Reduce(val, &min, 1, MPI_DOUBLE, MPI_MIN, 0, comm);
71  return min;
72  }
73 
74 long MPI_Functions::min(long* val, MPI_Comm comm) {
75  long min = 0;
76  MPI_Reduce(val, &min, 1, MPI_LONG, MPI_MIN, 0, comm);
77  return min;
78  }
79 
80 int MPI_Functions::max(int* val, MPI_Comm comm) {
81  int max = 0;
82  MPI_Reduce(val, &max, 1, MPI_INT, MPI_MAX, 0, comm);
83  return max;
84  }
85 
86 double MPI_Functions::max(double* val, MPI_Comm comm) {
87  double max = 0;
88  MPI_Reduce(val, &max, 1, MPI_DOUBLE, MPI_MAX, 0, comm);
89  return max;
90  }
91 
92 long MPI_Functions::max(long* val, MPI_Comm comm) {
93  long max = 0;
94  MPI_Reduce(val, &max, 1, MPI_LONG, MPI_MAX, 0, comm);
95  return max;
96  }
97 
98 
99 #ifdef FLOW123D_DEBUG_PROFILER
100 /*********************************************************************************************
101  * Implementation of class Timer
102  */
103 
104 const int timer_no_child=-1;
105 
106 Timer::Timer(const CodePoint &cp, int parent)
107 : start_time(TimePoint()),
108  cumul_time(0.0),
109  call_count(0),
110  start_count(0),
111  code_point_(&cp),
112  full_hash_(cp.hash_),
113  hash_idx_(cp.hash_idx_),
114  parent_timer(parent),
115  total_allocated_(0),
116  total_deallocated_(0),
117  max_allocated_(0),
118  current_allocated_(0),
119  alloc_called(0),
120  dealloc_called(0)
121 #ifdef FLOW123D_HAVE_PETSC
122 , petsc_start_memory(0),
123  petsc_end_memory (0),
124  petsc_memory_difference(0),
125  petsc_peak_memory(0),
126  petsc_local_peak_memory(0)
127 #endif // FLOW123D_HAVE_PETSC
128 {
129  for(unsigned int i=0; i< max_n_childs ;i++) child_timers[i]=timer_no_child;
130 }
131 
132 
133 /**
134  * Debug information of the timer
135  */
136 ostream & operator <<(ostream& os, const Timer& timer) {
137  os << " Timer: " << timer.tag() << endl;
138  os << " malloc: " << timer.total_allocated_ << endl;
139  os << " dalloc: " << timer.total_deallocated_ << endl;
140  #ifdef FLOW123D_HAVE_PETSC
141  os << " start: " << timer.petsc_start_memory << endl;
142  os << " stop : " << timer.petsc_end_memory << endl;
143  os << " diff : " << timer.petsc_memory_difference << " (" << timer.petsc_end_memory - timer.petsc_start_memory << ")" << endl;
144  os << " peak : " << timer.petsc_peak_memory << " (" << timer.petsc_local_peak_memory << ")" << endl;
145  #endif // FLOW123D_HAVE_PETSC
146  os << endl;
147  return os;
148 }
149 
150 
151 double Timer::cumulative_time() const {
152  return cumul_time;
153 }
154 
155 void Profiler::accept_from_child(Timer &parent, Timer &child) {
156  int child_timer = 0;
157  for (unsigned int i = 0; i < Timer::max_n_childs; i++) {
158  child_timer = child.child_timers[i];
159  if (child_timer != timer_no_child) {
160  // propagate metrics from child to parent
161  accept_from_child(child, timers_[child_timer]);
162  }
163  }
164  // compute totals by adding values from child
165  parent.total_allocated_ += child.total_allocated_;
166  parent.total_deallocated_ += child.total_deallocated_;
167  parent.alloc_called += child.alloc_called;
168  parent.dealloc_called += child.dealloc_called;
169 
170 #ifdef FLOW123D_HAVE_PETSC
171  if (petsc_monitor_memory) {
172  // add differences from child
173  parent.petsc_memory_difference += child.petsc_memory_difference;
174  parent.current_allocated_ += child.current_allocated_;
175 
176  // when computing maximum, we take greater value from parent and child
177  // PetscMemoryGetCurrentUsage always return absolute (not relative) value
178  parent.petsc_peak_memory = max(parent.petsc_peak_memory, child.petsc_peak_memory);
179  }
180 #endif // FLOW123D_HAVE_PETSC
181 
182  parent.max_allocated_ = max(parent.max_allocated_, child.max_allocated_);
183 }
184 
185 
186 void Timer::pause() {
187 #ifdef FLOW123D_HAVE_PETSC
188  if (Profiler::get_petsc_memory_monitoring()) {
189  // get the maximum resident set size (memory used) for the program.
190  PetscMemoryGetMaximumUsage(&petsc_local_peak_memory);
191  if (petsc_peak_memory < petsc_local_peak_memory)
192  petsc_peak_memory = petsc_local_peak_memory;
193  }
194 #endif // FLOW123D_HAVE_PETSC
195 }
196 
197 void Timer::resume() {
198 #ifdef FLOW123D_HAVE_PETSC
199  if (Profiler::get_petsc_memory_monitoring()) {
200  // tell PETSc to monitor the maximum memory usage so
201  // that PetscMemoryGetMaximumUsage() will work.
202  PetscMemorySetGetMaximumUsage();
203  }
204 #endif // FLOW123D_HAVE_PETSC
205 }
206 
207 void Timer::start() {
208 #ifdef FLOW123D_HAVE_PETSC
209  if (Profiler::get_petsc_memory_monitoring()) {
210  // Tell PETSc to monitor the maximum memory usage so
211  // that PetscMemoryGetMaximumUsage() will work.
212  PetscMemorySetGetMaximumUsage();
213  PetscMemoryGetCurrentUsage (&petsc_start_memory);
214  }
215 #endif // FLOW123D_HAVE_PETSC
216 
217  if (start_count == 0) {
218  start_time = TimePoint();
219  }
220  call_count++;
221  start_count++;
222 }
223 
224 
225 
226 bool Timer::stop(bool forced) {
227 #ifdef FLOW123D_HAVE_PETSC
228  if (Profiler::get_petsc_memory_monitoring()) {
229  // get current memory usage
230  PetscMemoryGetCurrentUsage (&petsc_end_memory);
231  petsc_memory_difference += petsc_end_memory - petsc_start_memory;
232 
233  // get the maximum resident set size (memory used) for the program.
234  PetscMemoryGetMaximumUsage(&petsc_local_peak_memory);
235  if (petsc_peak_memory < petsc_local_peak_memory)
236  petsc_peak_memory = petsc_local_peak_memory;
237  }
238 #endif // FLOW123D_HAVE_PETSC
239 
240  if (forced) start_count=1;
241 
242  if (start_count == 1) {
243  cumul_time += (TimePoint() - start_time);
244  start_count--;
245  return true;
246  } else {
247  start_count--;
248  }
249  return false;
250 }
251 
252 
253 
254 void Timer::add_child(int child_index, const Timer &child)
255 {
256  unsigned int idx = child.hash_idx_;
257  if (child_timers[idx] != timer_no_child) {
258  // hash collision, find first empty place
259  unsigned int i=idx;
260  do {
261  i=( i < max_n_childs ? i+1 : 0);
262  } while (i!=idx && child_timers[i] != timer_no_child);
263  ASSERT_PERMANENT(i!=idx)(tag()).error("Too many children of the timer");
264  idx=i;
265  }
266  //DebugOut().fmt("Adding child {} at index: {}\n", child_index, idx);
267  child_timers[idx] = child_index;
268 }
269 
270 
271 
272 string Timer::code_point_str() const {
273  return fmt::format("{%s}:{%d}, {%s}()",
274  code_point_->file_, code_point_->line_, code_point_->func_ );
275 }
276 
277 
278 /***********************************************************************************************
279  * Implementation of Profiler
280  */
281 
282 
283 Profiler * Profiler::instance(bool clear) {
284  static Profiler * _instance = NULL;
285 
286  if (clear) {
287  if (_instance != NULL) {
288  delete _instance;
289  _instance = NULL;
290  }
291  return _instance;
292  }
293 
294  if (_instance == NULL) {
295  MemoryAlloc::malloc_map().reserve(Profiler::malloc_map_reserve);
296  _instance = new Profiler();
297  }
298 
299  return _instance;
300  }
301 
302 
303 // static CONSTEXPR_ CodePoint main_cp = CODE_POINT("Whole Program");
304 const long Profiler::malloc_map_reserve = 100 * 1000;
305 
307 : actual_node(0),
308  task_size_(1),
309  start_time( time(NULL) ),
310  json_filepath(""),
311  none_timer_(CODE_POINT("NONE TIMER"), 0),
312  calibration_time_(-1)
313 
314 {
315  static CONSTEXPR_ CodePoint main_cp = CODE_POINT("Whole Program");
316  set_memory_monitoring(true, true);
317 #ifdef FLOW123D_DEBUG_PROFILER
318  MemoryAlloc::malloc_map().reserve(Profiler::malloc_map_reserve);
319  timers_.push_back( Timer(main_cp, 0) );
320  timers_[0].start();
321 #endif
322 }
323 
324 
325 void Profiler::calibrate() {
326 
327  uint SIZE = 64 * 1024;
328  uint HALF = SIZE / 2;
329  START_TIMER("UNIT_PAYLOAD");
330  Timer &timer = timers_[actual_node];
331 
332  uint count = 0;
333  while(TimePoint() - timer.start_time < 0.1) {
334  double * block = new double[SIZE];
335  for(uint i=0; i<HALF; i++) {
336  block[HALF+i] = block[i]*block[i] + i;
337  }
338  delete[] block;
339  count++;
340  }
341 
342 
343 
344  END_TIMER("UNIT_PAYLOAD");
345 
346  const double reference_count = 7730;
347  // reference count is average of 5 measurements on payload free laptop:
348  // DELL Latitude E5570, Intel(R) Core(TM) i7-6820HQ CPU @ 2.70GHz
349  // new/delete makes about 3% of time
350  calibration_time_ = 10 * timer.cumul_time * reference_count / count;
351  LogOut() << "Profiler calibration count: " << count << std::endl;
352  LogOut() << "Profiler calibration time: " << calibration_time_ << std::endl;
353 
354 }
355 
356 
357 
358 void Profiler::propagate_timers() {
359  for (unsigned int i = 0; i < Timer::max_n_childs; i++) {
360  unsigned int child_timer = timers_[0].child_timers[i];
361  if ((signed int)child_timer != timer_no_child) {
362  // propagate metrics from child to Whole-Program time-frame
363  accept_from_child(timers_[0], timers_[child_timer]);
364  }
365  }
366 }
367 
368 
369 
370 void Profiler::set_task_info(string description, int size) {
371  task_description_ = description;
372  task_size_ = size;
373 }
374 
375 
376 
377 void Profiler::set_program_info(string program_name, string program_version, string branch, string revision, string build) {
378  flow_name_ = program_name;
379  flow_version_ = program_version;
380  flow_branch_ = branch;
381  flow_revision_ = revision;
382  flow_build_ = build;
383 }
384 
385 
386 
387 int Profiler::start_timer(const CodePoint &cp) {
388  unsigned int parent_node = actual_node;
389  //DebugOut().fmt("Start timer: {}\n", cp.tag_);
390  int child_idx = find_child(cp);
391  if (child_idx < 0) {
392  //DebugOut().fmt("Adding timer: {}\n", cp.tag_);
393  // tag not present - create new timer
394  child_idx=timers_.size();
395  timers_.push_back( Timer(cp, actual_node) );
396  timers_[actual_node].add_child(child_idx , timers_.back() );
397  }
398  actual_node=child_idx;
399 
400  // pause current timer
401  timers_[parent_node].pause();
402 
403  timers_[actual_node].start();
404 
405  return actual_node;
406 }
407 
408 
409 
410 int Profiler::find_child(const CodePoint &cp) {
411  Timer &timer =timers_[actual_node];
412  unsigned int idx = cp.hash_idx_;
413  unsigned int child_idx;
414  do {
415  if (timer.child_timers[idx] == timer_no_child) break; // tag is not there
416 
417  child_idx=timer.child_timers[idx];
418  ASSERT_PERMANENT_LT(child_idx, timers_.size()).error();
419  if (timers_[child_idx].full_hash_ == cp.hash_) return child_idx;
420  idx = ( (unsigned int)(idx)==(Timer::max_n_childs - 1) ? 0 : idx+1 );
421  } while ( (unsigned int)(idx) != cp.hash_idx_ ); // passed through whole array
422  return -1;
423 }
424 
425 
426 
427 void Profiler::stop_timer(const CodePoint &cp) {
428 #ifdef FLOW123D_DEBUG_ASSERTS
429  // check that all childrens are closed
430  Timer &timer=timers_[actual_node];
431  for(unsigned int i=0; i < Timer::max_n_childs; i++)
432  if (timer.child_timers[i] != timer_no_child)
433  ASSERT_PERMANENT(! timers_[timer.child_timers[i]].running())(timers_[timer.child_timers[i]].tag())(timer.tag())
434  .error("Child timer running while closing timer.");
435 #endif
436  unsigned int child_timer = actual_node;
437  if ( cp.hash_ != timers_[actual_node].full_hash_) {
438  // timer to close is not actual - we search for it above actual
439  for(unsigned int node=actual_node; node != 0; node=timers_[node].parent_timer) {
440  if ( cp.hash_ == timers_[node].full_hash_) {
441  // found above - close all nodes between
442  for(; (unsigned int)(actual_node) != node; actual_node=timers_[actual_node].parent_timer) {
443  WarningOut() << "Timer to close '" << cp.tag_ << "' do not match actual timer '"
444  << timers_[actual_node].tag() << "'. Force closing actual." << std::endl;
445  timers_[actual_node].stop(true);
446  }
447  // close 'node' itself
448  timers_[actual_node].stop(false);
449  actual_node = timers_[actual_node].parent_timer;
450 
451  // actual_node == child_timer indicates this is root
452  if (actual_node == child_timer)
453  return;
454 
455  // resume current timer
456  timers_[actual_node].resume();
457  return;
458  }
459  }
460  // node not found - do nothing
461  return;
462  }
463  // node to close match the actual
464  timers_[actual_node].stop(false);
465  actual_node = timers_[actual_node].parent_timer;
466 
467  // actual_node == child_timer indicates this is root
468  if (actual_node == child_timer)
469  return;
470 
471  // resume current timer
472  timers_[actual_node].resume();
473 }
474 
475 
476 
477 void Profiler::stop_timer(int timer_index) {
478  // stop_timer with CodePoint type
479  // timer which is still running MUST be the same as actual_node index
480  // if timer is not running index will differ
481  if (timers_[timer_index].running()) {
482  ASSERT_PERMANENT_EQ(timer_index, (int)actual_node).error();
483  stop_timer(*timers_[timer_index].code_point_);
484  }
485 
486 }
487 
488 
489 
490 void Profiler::add_calls(unsigned int n_calls) {
491  timers_[actual_node].call_count += n_calls-1;
492 }
493 
494 
495 
496 void Profiler::notify_malloc(const size_t size, const long p) {
497  MemoryAlloc::malloc_map()[p] = static_cast<int>(size);
498  timers_[actual_node].total_allocated_ += size;
499  timers_[actual_node].current_allocated_ += size;
500  timers_[actual_node].alloc_called++;
501 
502  if (timers_[actual_node].current_allocated_ > timers_[actual_node].max_allocated_)
503  timers_[actual_node].max_allocated_ = timers_[actual_node].current_allocated_;
504 }
505 
506 
507 
508 void Profiler::notify_free(const long p) {
509  int size = sizeof(p);
510  if (MemoryAlloc::malloc_map()[(long)p] > 0) {
511  size = MemoryAlloc::malloc_map()[(long)p];
512  MemoryAlloc::malloc_map().erase((long)p);
513  }
514  timers_[actual_node].total_deallocated_ += size;
515  timers_[actual_node].current_allocated_ -= size;
516  timers_[actual_node].dealloc_called++;
517 }
518 
519 
520 double Profiler::get_resolution () {
521  const int measurements = 100;
522  double result = 0;
523 
524  // perform 100 measurements
525  for (unsigned int i = 1; i < measurements; i++) {
526  TimePoint t1 = TimePoint ();
527  TimePoint t2 = TimePoint ();
528 
529  // double comparison should be avoided
530  while ((t2 - t1) == 0) t2 = TimePoint ();
531  // while ((t2.ticks - t1.ticks) == 0) t2 = TimePoint ();
532 
533  result += t2 - t1;
534  }
535 
536  return (result / measurements) * 1000; // ticks to seconds to microseconds conversion
537 }
538 
539 
540 Timer Profiler::find_timer(string tag) {
541  for(auto t : timers_) {
542  if (t.tag() == tag) return t;
543  }
544  return none_timer_;
545 }
546 
547 
548 std::string common_prefix( std::string a, std::string b ) {
549  if( a.size() > b.size() ) std::swap(a,b) ;
550  return std::string( a.begin(), std::mismatch( a.begin(), a.end(), b.begin() ).first ) ;
551 }
552 
553 
554 
555 template<typename ReduceFunctor>
556 void Profiler::add_timer_info(ReduceFunctor reduce, nlohmann::json* holder, int timer_idx, double parent_time) {
557 
558  // get timer and check preconditions
559  Timer &timer = timers_[timer_idx];
560  ASSERT_PERMANENT(timer_idx >=0)(timer_idx).error("Wrong timer index.");
561  ASSERT_PERMANENT(timer.parent_timer >=0).error("Inconsistent tree.");
562 
563  // fix path
564  string filepath = timer.code_point_->file_;
565 
566  // if constant FLOW123D_SOURCE_DIR is defined, we try to erase it from beginning of each CodePoint's filepath
567  #ifdef FLOW123D_SOURCE_DIR
568  string common_path = common_prefix (string(FLOW123D_SOURCE_DIR), filepath);
569  filepath.erase (0, common_path.size());
570  #endif
571 
572 
573  // generate node representing this timer
574  // add basic information
575  nlohmann::json node;
576  double cumul_time_sum;
577  node["tag"] = timer.tag();
578  node["file-path"] = filepath;
579  node["file-line"] = timer.code_point_->line_;
580  node["function"] = timer.code_point_->func_;
581  cumul_time_sum = reduce(timer, node);
582 
583 
584  // statistical info
585  if (timer_idx == 0) parent_time = cumul_time_sum;
586  double percent = parent_time > 1.0e-10 ? cumul_time_sum / parent_time * 100.0 : 0.0;
587  node["percent"] = percent;
588 
589  // write times children timers using secured child_timers array
590  nlohmann::json children;
591 
592  // sort childs by its index in Profiler::timers_ in order to preserve call order
593  std::vector<int> child_timers_values;
594  for (unsigned int i = 0; i < Timer::max_n_childs; i++) {
595  if (timer.child_timers[i] != timer_no_child)
596  child_timers_values.push_back(timer.child_timers[i]);
597  }
598  std::sort(child_timers_values.begin(), child_timers_values.end());
599 
600  for(int idx : child_timers_values)
601  add_timer_info(reduce, &children, idx, cumul_time_sum);
602 
603  // add children tag and other info if present
604  if (child_timers_values.size() > 0) {
605  node["children"] = children;
606  }
607 
608  // add to the array
609  holder->push_back(node);
610 }
611 
612 
613 template <class T>
614 void save_nonmpi_metric (nlohmann::json &node, T * ptr, string name) {
615  node[name + "-min"] = *ptr;
616  node[name + "-max"] = *ptr;
617  node[name + "-sum"] = *ptr;
618 }
619 
620 std::shared_ptr<std::ostream> Profiler::get_default_output_stream() {
621  json_filepath = FilePath("profiler_info.log.json", FilePath::output_file);
622 
623  //LogOut() << "output into: " << json_filepath << std::endl;
624  return make_shared<ofstream>(json_filepath.c_str());
625 }
626 
627 
628 #ifdef FLOW123D_HAVE_MPI
629 template <class T>
630 void save_mpi_metric (nlohmann::json &node, MPI_Comm comm, T * ptr, string name) {
631  node[name + "-min"] = MPI_Functions::min(ptr, comm);
632  node[name + "-max"] = MPI_Functions::max(ptr, comm);
633  node[name + "-sum"] = MPI_Functions::sum(ptr, comm);
634 }
635 
636 void Profiler::output(MPI_Comm comm, ostream &os) {
637  int mpi_rank, mpi_size;
638  //wait until profiling on all processors is finished
639  MPI_Barrier(comm);
640  stop_timer(0);
641  propagate_timers();
642 
643  // stop monitoring memory
644  bool temp_memory_monitoring = global_monitor_memory;
645  set_memory_monitoring(false, petsc_monitor_memory);
646 
647  chkerr( MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank) );
648  MPI_Comm_size(comm, &mpi_size);
649 
650  // output header
651  nlohmann::json jsonRoot, jsonChildren;
652 
653  // recursively add all timers info
654  // define lambda function which reduces timer from multiple processors
655  // MPI implementation uses MPI call to reduce values
656  auto reduce = [=] (Timer &timer, nlohmann::json &node) -> double {
657  int call_count = timer.call_count;
658  double cumul_time = timer.cumulative_time ();
659 
660  long memory_allocated = (long)timer.total_allocated_;
661  long memory_deallocated = (long)timer.total_deallocated_;
662  long memory_peak = (long)timer.max_allocated_;
663 
664  int alloc_called = timer.alloc_called;
665  int dealloc_called = timer.dealloc_called;
666 
667 
668  save_mpi_metric<double>(node, comm, &cumul_time, "cumul-time");
669  save_mpi_metric<int>(node, comm, &call_count, "call-count");
670 
671  save_mpi_metric<long>(node, comm, &memory_allocated, "memory-alloc");
672  save_mpi_metric<long>(node, comm, &memory_deallocated, "memory-dealloc");
673  save_mpi_metric<long>(node, comm, &memory_peak, "memory-peak");
674  //
675  save_mpi_metric<int>(node, comm, &alloc_called, "memory-alloc-called");
676  save_mpi_metric<int>(node, comm, &dealloc_called, "memory-dealloc-called");
677 
678 #ifdef FLOW123D_HAVE_PETSC
679  long petsc_memory_difference = (long)timer.petsc_memory_difference;
680  long petsc_peak_memory = (long)timer.petsc_peak_memory;
681  save_mpi_metric<long>(node, comm, &petsc_memory_difference, "memory-petsc-diff");
682  save_mpi_metric<long>(node, comm, &petsc_peak_memory, "memory-petsc-peak");
683 #endif // FLOW123D_HAVE_PETSC
684 
685  return MPI_Functions::sum(&cumul_time, comm);
686  };
687 
688  add_timer_info (reduce, &jsonChildren, 0, 0.0);
689  jsonRoot["children"] = jsonChildren;
690  output_header(jsonRoot, mpi_size);
691 
692 
693  // create profiler output only once (on the first processor)
694  // only active communicator should be the one with mpi_rank 0
695  if (mpi_rank == 0) {
696  try {
697  /**
698  * indent size
699  * results in json human readable format (indents, newlines)
700  */
701  const int FLOW123D_JSON_HUMAN_READABLE = 2;
702  // write result to stream
703  os << jsonRoot.dump(FLOW123D_JSON_HUMAN_READABLE) << endl;
704 
705  } catch (exception & e) {
706  stringstream ss;
707  ss << "nlohmann::json::dump error: " << e.what() << "\n";
708  THROW( ExcMessage() << EI_Message(ss.str()) );
709  }
710  }
711  // restore memory monitoring
712  set_memory_monitoring(temp_memory_monitoring, petsc_monitor_memory);
713 }
714 
715 
716 void Profiler::output(MPI_Comm comm, string profiler_path /* = "" */) {
717  int mpi_rank;
718  chkerr(MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank));
719 
720  if (mpi_rank == 0) {
721  if (profiler_path == "") {
722  output(comm, *get_default_output_stream());
723  } else {
724  json_filepath = profiler_path;
725  std::shared_ptr<std::ostream> os = make_shared<ofstream>(profiler_path.c_str());
726  output(comm, *os);
727  }
728  } else {
729  ostringstream os;
730  output(comm, os);
731  }
732 }
733 
734 #endif /* FLOW123D_HAVE_MPI */
735 
736 void Profiler::output(ostream &os) {
737  // last update
738  stop_timer(0);
739  propagate_timers();
740 
741  // output header
742  nlohmann::json jsonRoot, jsonChildren;
743  /**
744  * Constant representing number of MPI processes
745  * where there is no MPI to work with (so 1 process)
746  */
747  const int FLOW123D_MPI_SINGLE_PROCESS = 1;
748  output_header(jsonRoot, FLOW123D_MPI_SINGLE_PROCESS);
749 
750 
751  // recursively add all timers info
752  // define lambda function which reduces timer from multiple processors
753  // non-MPI implementation is just dummy repetition of initial value
754  auto reduce = [=] (Timer &timer, nlohmann::json &node) -> double {
755  int call_count = timer.call_count;
756  double cumul_time = timer.cumulative_time ();
757 
758  long memory_allocated = (long)timer.total_allocated_;
759  long memory_deallocated = (long)timer.total_deallocated_;
760  long memory_peak = (long)timer.max_allocated_;
761 
762  int alloc_called = timer.alloc_called;
763  int dealloc_called = timer.dealloc_called;
764 
765  save_nonmpi_metric<double>(node, &cumul_time, "cumul-time");
766  save_nonmpi_metric<int>(node, &call_count, "call-count");
767 
768  save_nonmpi_metric<long>(node, &memory_allocated, "memory-alloc");
769  save_nonmpi_metric<long>(node, &memory_deallocated, "memory-dealloc");
770  save_nonmpi_metric<long>(node, &memory_peak, "memory-peak");
771 
772  save_nonmpi_metric<int>(node, &alloc_called, "memory-alloc-called");
773  save_nonmpi_metric<int>(node, &dealloc_called, "memory-dealloc-called");
774 
775 #ifdef FLOW123D_HAVE_PETSC
776  long petsc_memory_difference = (long)timer.petsc_memory_difference;
777  long petsc_peak_memory = (long)timer.petsc_peak_memory;
778  save_nonmpi_metric<long>(node, &petsc_memory_difference, "memory-petsc-diff");
779  save_nonmpi_metric<long>(node, &petsc_peak_memory, "memory-petsc-peak");
780 #endif // FLOW123D_HAVE_PETSC
781 
782  return cumul_time;
783  };
784 
785  add_timer_info(reduce, &jsonChildren, 0, 0.0);
786  jsonRoot["children"] = jsonChildren;
787 
788  try {
789  /**
790  * indent size
791  * results in json human readable format (indents, newlines)
792  */
793  const int FLOW123D_JSON_HUMAN_READABLE = 2;
794  // write result to stream
795  os << jsonRoot.dump(FLOW123D_JSON_HUMAN_READABLE) << endl;
796 
797  } catch (exception & e) {
798  stringstream ss;
799  ss << "nlohmann::json::dump error: " << e.what() << "\n";
800  THROW( ExcMessage() << EI_Message(ss.str()) );
801  }
802 }
803 
804 
805 void Profiler::output(string profiler_path /* = "" */) {
806  if(profiler_path == "") {
807  output(*get_default_output_stream());
808  } else {
809  json_filepath = profiler_path;
810  std::shared_ptr<std::ostream> os = make_shared<ofstream>(profiler_path.c_str());
811  output(*os);
812  }
813 }
814 
815 void Profiler::output_header (nlohmann::json &root, int mpi_size) {
816  time_t end_time = time(NULL);
817 
818  const char format[] = "%x %X";
819  char start_time_string[BUFSIZ] = {0};
820  strftime(start_time_string, sizeof (start_time_string) - 1, format, localtime(&start_time));
821 
822  char end_time_string[BUFSIZ] = {0};
823  strftime(end_time_string, sizeof (end_time_string) - 1, format, localtime(&end_time));
824 
825  if (timers_[0].cumul_time > 60) {
826  calibrate();
827  }
828  // generate current run details
829  root["program-name"] = flow_name_;
830  root["program-version"] = flow_version_;
831  root["program-branch"] = flow_branch_;
832  root["program-revision"] = flow_revision_;
833  root["program-build"] = flow_build_;
834  root["timer-resolution"] = Profiler::get_resolution();
835  root["timer-calibration"] = calibration_time_;
836 
837  // print some information about the task at the beginning
838  root["task-description"] = task_description_;
839  root["task-size"] = task_size_;
840 
841  //print some information about the task at the beginning
842  root["run-process-count"] = mpi_size;
843  root["run-started-at"] = start_time_string;
844  root["run-finished-at"] = end_time_string;
845 }
846 
847 #ifdef FLOW123D_HAVE_PYTHON
848 void Profiler::transform_profiler_data (const string &output_file_suffix, const string &formatter) {
849 
850  if (json_filepath == "") return;
851 
852  // error under CYGWIN environment : more details in this repo
853  // https://github.com/x3mSpeedy/cygwin-python-issue
854  //
855  // For now we only support profiler report conversion in UNIX environment
856  // Windows users will have to use a python script located in bin folder
857  //
858 
859  #ifndef FLOW123D_HAVE_CYGWIN
860  // grab module and function by importing module profiler_formatter_module.py
861  PyObject * python_module = PythonLoader::load_module_by_name ("profiler.profiler_formatter_module");
862  //
863  // def convert (json_location, output_file, formatter):
864  //
865  PyObject * convert_method = PythonLoader::get_callable (python_module, "convert" );
866 
867  int argument_index = 0;
868  PyObject * arguments = PyTuple_New (3);
869 
870  // set json path location as first argument
871  PyObject * tmp = PyUnicode_FromString (json_filepath.c_str());
872  PyTuple_SetItem (arguments, argument_index++, tmp);
873 
874  // set output path location as second argument
875  tmp = PyUnicode_FromString ((json_filepath + output_file_suffix).c_str());
876  PyTuple_SetItem (arguments, argument_index++, tmp);
877 
878  // set Formatter class as third value
879  tmp = PyUnicode_FromString (formatter.c_str());
880  PyTuple_SetItem (arguments, argument_index++, tmp);
881 
882  // execute method with arguments
883  PyObject_CallObject (convert_method, arguments);
884 
885  PythonLoader::check_error();
886 
887  #else
888 
889  // print information about windows-cygwin issue and offer manual solution
890  MessageOut() << "# Note: converting json profiler reports is not"
891  << " supported under Windows or Cygwin environment for now.\n"
892  << "# You can use python script located in bin/python folder"
893  << " in order to convert json report to txt or csv format.\n"
894  << "python profiler_formatter_script.py --input \"" << json_filepath
895  << "\" --output \"profiler.txt\"" << std::endl;
896  #endif // FLOW123D_HAVE_CYGWIN
897 }
898 #else
899 void Profiler::transform_profiler_data (const string &, const string &) {
900 }
901 
902 #endif // FLOW123D_HAVE_PYTHON
903 
904 
905 void Profiler::uninitialize() {
906  if (Profiler::instance()) {
907  ASSERT_PERMANENT(Profiler::instance()->actual_node==0)(Profiler::instance()->timers_[Profiler::instance()->actual_node].tag())
908  .error("Forbidden to uninitialize the Profiler when actual timer is not zero.");
909  Profiler::instance()->stop_timer(0);
910  set_memory_monitoring(false, false);
911  Profiler::instance(true);
912  }
913 }
914 bool Profiler::global_monitor_memory = false;
915 bool Profiler::petsc_monitor_memory = true;
916 void Profiler::set_memory_monitoring(const bool global_monitor, const bool petsc_monitor) {
917  global_monitor_memory = global_monitor;
918  petsc_monitor_memory = petsc_monitor;
919 }
920 
921 unordered_map_with_alloc & MemoryAlloc::malloc_map() {
922  static unordered_map_with_alloc static_malloc_map;
923  return static_malloc_map;
924 }
925 
926 void * Profiler::operator new (size_t size) {
927  return malloc (size);
928 }
929 
930 void Profiler::operator delete (void* p) {
931  free(p);
932 }
933 
934 void *operator new (std::size_t size) OPERATOR_NEW_THROW_EXCEPTION {
935  void * p = malloc(size);
936  if (Profiler::get_global_memory_monitoring())
937  Profiler::instance()->notify_malloc(size, (long)p);
938  return p;
939 }
940 
941 void *operator new[] (std::size_t size) OPERATOR_NEW_THROW_EXCEPTION {
942  void * p = malloc(size);
943  if (Profiler::get_global_memory_monitoring())
944  Profiler::instance()->notify_malloc(size, (long)p);
945  return p;
946 }
947 
948 void *operator new[] (std::size_t size, const std::nothrow_t&) throw() {
949  void * p = malloc(size);
950  if (Profiler::get_global_memory_monitoring())
951  Profiler::instance()->notify_malloc(size, (long)p);
952  return p;
953 }
954 
955 void operator delete( void *p) throw() {
956  if (Profiler::get_global_memory_monitoring())
957  Profiler::instance()->notify_free((long)p);
958  free(p);
959 }
960 
961 void operator delete( void *p, std::size_t) throw() {
962  if (Profiler::get_global_memory_monitoring())
963  Profiler::instance()->notify_free((long)p);
964  free(p);
965 }
966 
967 void operator delete[]( void *p) throw() {
968  if (Profiler::get_global_memory_monitoring())
969  Profiler::instance()->notify_free((long)p);
970  free(p);
971 }
972 
973 void operator delete[]( void *p, std::size_t) throw() {
974  if (Profiler::get_global_memory_monitoring())
975  Profiler::instance()->notify_free((long)p);
976  free(p);
977 }
978 
979 #else // def FLOW123D_DEBUG_PROFILER
980 
982  static Profiler * _instance = new Profiler();
983  return _instance;
984 }
985 
986 
987 // void Profiler::initialize() {
988 // if (_instance == NULL) {
989 // _instance = new Profiler();
990 // set_memory_monitoring(true, true);
991 // }
992 // }
993 
995 
996 
997 #endif // def FLOW123D_DEBUG_PROFILER
format
manipulators::Array< T, Delim > format(T const &deduce, Delim delim=", ")
Definition: logger.hh:325
MPI_MIN
#define MPI_MIN
Definition: mpi.h:198
Profiler::instance
static Profiler * instance(bool clear=false)
Definition: sys_profiler.cc:981
python_loader.hh
std::swap
void swap(nlohmann::json &j1, nlohmann::json &j2) noexcept(is_nothrow_move_constructible< nlohmann::json >::value and is_nothrow_move_assignable< nlohmann::json >::value)
exchanges the values of two JSON objects
Definition: json.hpp:8688
Profiler::transform_profiler_data
void transform_profiler_data(const string &, const string &)
Definition: sys_profiler.hh:900
OPERATOR_NEW_THROW_EXCEPTION
#define OPERATOR_NEW_THROW_EXCEPTION
Definition: system.hh:54
file_path.hh
ASSERT_PERMANENT_LT
#define ASSERT_PERMANENT_LT(a, b)
Definition of comparative assert macro (Less Than)
Definition: asserts.hh:297
FilePath
Dedicated class for storing path to input and output files.
Definition: file_path.hh:54
MPI_MAX
#define MPI_MAX
Definition: mpi.h:197
Profiler::notify_malloc
void notify_malloc(const size_t)
Definition: sys_profiler.hh:884
chkerr
void chkerr(unsigned int ierr)
Replacement of new/delete operator in the spirit of xmalloc.
Definition: system.hh:142
MPI_DOUBLE
#define MPI_DOUBLE
Definition: mpi.h:156
THROW
#define THROW(whole_exception_expr)
Wrapper for throw. Saves the throwing point.
Definition: exceptions.hh:53
std::vector< int >
system.hh
Profiler::calibrate
void calibrate()
Profiler::uninitialize
static void uninitialize()
Definition: sys_profiler.cc:994
uint
unsigned int uint
Definition: mh_dofhandler.hh:101
CONSTEXPR_
#define CONSTEXPR_
Definition: sys_profiler.hh:86
fmt::format
std::string format(CStringRef format_str, ArgList args)
Definition: format.h:3141
Profiler::get_resolution
double get_resolution() const
Definition: sys_profiler.hh:902
Profiler::set_program_info
void set_program_info(string, string, string, string, string)
Definition: sys_profiler.hh:882
MPI_Comm_rank
#define MPI_Comm_rank
Definition: mpi.h:236
Timer
Definition: memory.cc:33
MPI_Comm_size
#define MPI_Comm_size
Definition: mpi.h:235
TimePoint
Definition: time_point.hh:69
LogOut
#define LogOut()
Macro defining 'log' record of log.
Definition: logger.hh:281
Profiler
Definition: sys_profiler.hh:876
sys_profiler.hh
ASSERT_PERMANENT
#define ASSERT_PERMANENT(expr)
Allow use shorter versions of macro names if these names is not used with external library.
Definition: asserts.hh:348
Timer::start
void start()
Definition: memory.cc:39
mpi.h
operator<<
STREAM & operator<<(STREAM &s, UpdateFlags u)
Definition: update_flags.hh:164
MPI_Functions::sum
static int sum(int *val, MPI_Comm comm)
Definition: sys_profiler.cc:44
MPI_Reduce
#define MPI_Reduce(sendbuf, recvbuf, count, datatype, op, root, comm)
Definition: mpi.h:608
MPI_INT
#define MPI_INT
Definition: mpi.h:160
nlohmann::basic_json::push_back
void push_back(basic_json &&val)
add an object to an array
Definition: json.hpp:4686
ASSERT_PERMANENT_EQ
#define ASSERT_PERMANENT_EQ(a, b)
Definition of comparative assert macro (EQual)
Definition: asserts.hh:329
MPI_SUM
#define MPI_SUM
Definition: mpi.h:196
MPI_Comm
int MPI_Comm
Definition: mpi.h:141
time_point.hh
nlohmann::basic_json::dump
string_t dump(const int indent=-1) const
serialization
Definition: json.hpp:2079
Profiler::Profiler
Profiler()
Definition: sys_profiler.hh:916
nlohmann::basic_json
a class to store JSON values
Definition: json.hpp:173
WarningOut
#define WarningOut()
Macro defining 'warning' record of log.
Definition: logger.hh:278
MPI_COMM_WORLD
#define MPI_COMM_WORLD
Definition: mpi.h:123
Profiler::set_task_info
void set_task_info(string, int)
Definition: sys_profiler.hh:880
MPI_Functions::max
static int max(int *val, MPI_Comm comm)
Definition: sys_profiler.cc:80
FilePath::output_file
@ output_file
Definition: file_path.hh:69
Profiler::output
void output()
Definition: sys_profiler.hh:898
MPI_LONG
#define MPI_LONG
Definition: mpi.h:161
START_TIMER
#define START_TIMER(tag)
Starts a timer with specified tag.
Definition: sys_profiler.hh:115
Profiler::notify_free
void notify_free(const size_t)
Definition: sys_profiler.hh:886
MPI_Barrier
#define MPI_Barrier(comm)
Definition: mpi.h:531
Timer::stop
void stop()
Definition: memory.cc:42
END_TIMER
#define END_TIMER(tag)
Ends a timer with specified tag.
Definition: sys_profiler.hh:149
MPI_Functions::min
static int min(int *val, MPI_Comm comm)
Definition: sys_profiler.cc:62
MessageOut
#define MessageOut()
Macro defining 'message' record of log.
Definition: logger.hh:275