Flow123d  JB_transport-d4c8564
sys_profiler.cc
Go to the documentation of this file.
1 /*!
2  *
3  * Copyright (C) 2015 Technical University of Liberec. All rights reserved.
4  *
5  * This program is free software; you can redistribute it and/or modify it under
6  * the terms of the GNU General Public License version 3 as published by the
7  * Free Software Foundation. (http://www.gnu.org/licenses/gpl-3.0.en.html)
8  *
9  * This program is distributed in the hope that it will be useful, but WITHOUT
10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
11  * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
12  *
13  *
14  * @file sys_profiler.cc
15  * @ingroup system
16  * @brief Profiler
17  */
18 
19 
20 // Fat header
21 
22 #include <fstream>
23 #include <iomanip>
24 #include <sys/param.h>
25 #include <unordered_map>
26 
27 #include <pybind11.h>
28 
29 #include "sys_profiler.hh"
30 #include "system/system.hh"
31 #include "system/python_loader.hh"
32 #include <iostream>
33 #include <boost/format.hpp>
34 
35 #include "system/file_path.hh"
36 #include "mpi.h"
37 #include "time_point.hh"
38 
39 /*
40  * These should be replaced by using boost MPI interface
41  */
42 int MPI_Functions::sum(int* val, MPI_Comm comm) {
43  int total = 0;
44  MPI_Reduce(val, &total, 1, MPI_INT, MPI_SUM, 0, comm);
45  return total;
46  }
47 
48 double MPI_Functions::sum(double* val, MPI_Comm comm) {
49  double total = 0;
50  MPI_Reduce(val, &total, 1, MPI_DOUBLE, MPI_SUM, 0, comm);
51  return total;
52  }
53 
54 long MPI_Functions::sum(long* val, MPI_Comm comm) {
55  long total = 0;
56  MPI_Reduce(val, &total, 1, MPI_LONG, MPI_SUM, 0, comm);
57  return total;
58  }
59 
60 int MPI_Functions::min(int* val, MPI_Comm comm) {
61  int min = 0;
62  MPI_Reduce(val, &min, 1, MPI_INT, MPI_MIN, 0, comm);
63  return min;
64  }
65 
66 double MPI_Functions::min(double* val, MPI_Comm comm) {
67  double min = 0;
68  MPI_Reduce(val, &min, 1, MPI_DOUBLE, MPI_MIN, 0, comm);
69  return min;
70  }
71 
72 long MPI_Functions::min(long* val, MPI_Comm comm) {
73  long min = 0;
74  MPI_Reduce(val, &min, 1, MPI_LONG, MPI_MIN, 0, comm);
75  return min;
76  }
77 
78 int MPI_Functions::max(int* val, MPI_Comm comm) {
79  int max = 0;
80  MPI_Reduce(val, &max, 1, MPI_INT, MPI_MAX, 0, comm);
81  return max;
82  }
83 
84 double MPI_Functions::max(double* val, MPI_Comm comm) {
85  double max = 0;
86  MPI_Reduce(val, &max, 1, MPI_DOUBLE, MPI_MAX, 0, comm);
87  return max;
88  }
89 
90 long MPI_Functions::max(long* val, MPI_Comm comm) {
91  long max = 0;
92  MPI_Reduce(val, &max, 1, MPI_LONG, MPI_MAX, 0, comm);
93  return max;
94  }
95 
96 
97 #ifdef FLOW123D_DEBUG_PROFILER
98 /*********************************************************************************************
99  * Implementation of class Timer
100  */
101 
102 const int timer_no_child=-1;
103 
104 Timer::Timer(const CodePoint &cp, int parent)
105 : start_time(TimePoint()),
106  cumul_time(0.0),
107  call_count(0),
108  start_count(0),
109  code_point_(&cp),
110  full_hash_(cp.hash_),
111  hash_idx_(cp.hash_idx_),
112  parent_timer(parent),
113  total_allocated_(0),
114  total_deallocated_(0),
115  max_allocated_(0),
116  current_allocated_(0),
117  alloc_called(0),
118  dealloc_called(0)
119 #ifdef FLOW123D_HAVE_PETSC
120 , petsc_start_memory(0),
121  petsc_end_memory (0),
122  petsc_memory_difference(0),
123  petsc_peak_memory(0),
124  petsc_local_peak_memory(0)
125 #endif // FLOW123D_HAVE_PETSC
126 {
127  for(unsigned int i=0; i< max_n_childs ;i++) child_timers[i]=timer_no_child;
128 }
129 
130 
131 /**
132  * Debug information of the timer
133  */
134 ostream & operator <<(ostream& os, const Timer& timer) {
135  os << " Timer: " << timer.tag() << endl;
136  os << " malloc: " << timer.total_allocated_ << endl;
137  os << " dalloc: " << timer.total_deallocated_ << endl;
138  #ifdef FLOW123D_HAVE_PETSC
139  os << " start: " << timer.petsc_start_memory << endl;
140  os << " stop : " << timer.petsc_end_memory << endl;
141  os << " diff : " << timer.petsc_memory_difference << " (" << timer.petsc_end_memory - timer.petsc_start_memory << ")" << endl;
142  os << " peak : " << timer.petsc_peak_memory << " (" << timer.petsc_local_peak_memory << ")" << endl;
143  #endif // FLOW123D_HAVE_PETSC
144  os << endl;
145  return os;
146 }
147 
148 
149 double Timer::cumulative_time() const {
150  return cumul_time;
151 }
152 
153 void Profiler::accept_from_child(Timer &parent, Timer &child) {
154  int child_timer = 0;
155  for (unsigned int i = 0; i < Timer::max_n_childs; i++) {
156  child_timer = child.child_timers[i];
157  if (child_timer != timer_no_child) {
158  // propagate metrics from child to parent
159  accept_from_child(child, timers_[child_timer]);
160  }
161  }
162  // compute totals by adding values from child
163  parent.total_allocated_ += child.total_allocated_;
164  parent.total_deallocated_ += child.total_deallocated_;
165  parent.alloc_called += child.alloc_called;
166  parent.dealloc_called += child.dealloc_called;
167 
168 #ifdef FLOW123D_HAVE_PETSC
169  if (petsc_monitor_memory) {
170  // add differences from child
171  parent.petsc_memory_difference += child.petsc_memory_difference;
172  parent.current_allocated_ += child.current_allocated_;
173 
174  // when computing maximum, we take greater value from parent and child
175  // PetscMemoryGetCurrentUsage always return absolute (not relative) value
176  parent.petsc_peak_memory = max(parent.petsc_peak_memory, child.petsc_peak_memory);
177  }
178 #endif // FLOW123D_HAVE_PETSC
179 
180  parent.max_allocated_ = max(parent.max_allocated_, child.max_allocated_);
181 }
182 
183 
184 void Timer::pause() {
185 #ifdef FLOW123D_HAVE_PETSC
186  if (Profiler::get_petsc_memory_monitoring()) {
187  // get the maximum resident set size (memory used) for the program.
188  PetscMemoryGetMaximumUsage(&petsc_local_peak_memory);
189  if (petsc_peak_memory < petsc_local_peak_memory)
190  petsc_peak_memory = petsc_local_peak_memory;
191  }
192 #endif // FLOW123D_HAVE_PETSC
193 }
194 
195 void Timer::resume() {
196 #ifdef FLOW123D_HAVE_PETSC
197  if (Profiler::get_petsc_memory_monitoring()) {
198  // tell PETSc to monitor the maximum memory usage so
199  // that PetscMemoryGetMaximumUsage() will work.
200  PetscMemorySetGetMaximumUsage();
201  }
202 #endif // FLOW123D_HAVE_PETSC
203 }
204 
205 void Timer::start() {
206 #ifdef FLOW123D_HAVE_PETSC
207  if (Profiler::get_petsc_memory_monitoring()) {
208  // Tell PETSc to monitor the maximum memory usage so
209  // that PetscMemoryGetMaximumUsage() will work.
210  PetscMemorySetGetMaximumUsage();
211  PetscMemoryGetCurrentUsage (&petsc_start_memory);
212  }
213 #endif // FLOW123D_HAVE_PETSC
214 
215  if (start_count == 0) {
216  start_time = TimePoint();
217  }
218  call_count++;
219  start_count++;
220 }
221 
222 
223 
224 bool Timer::stop(bool forced) {
225 #ifdef FLOW123D_HAVE_PETSC
226  if (Profiler::get_petsc_memory_monitoring()) {
227  // get current memory usage
228  PetscMemoryGetCurrentUsage (&petsc_end_memory);
229  petsc_memory_difference += petsc_end_memory - petsc_start_memory;
230 
231  // get the maximum resident set size (memory used) for the program.
232  PetscMemoryGetMaximumUsage(&petsc_local_peak_memory);
233  if (petsc_peak_memory < petsc_local_peak_memory)
234  petsc_peak_memory = petsc_local_peak_memory;
235  }
236 #endif // FLOW123D_HAVE_PETSC
237 
238  if (forced) start_count=1;
239 
240  if (start_count == 1) {
241  cumul_time += (TimePoint() - start_time);
242  start_count--;
243  return true;
244  } else {
245  start_count--;
246  }
247  return false;
248 }
249 
250 
251 
252 void Timer::add_child(int child_index, const Timer &child)
253 {
254  unsigned int idx = child.hash_idx_;
255  if (child_timers[idx] != timer_no_child) {
256  // hash collision, find first empty place
257  unsigned int i=idx;
258  do {
259  i=( i < max_n_childs ? i+1 : 0);
260  } while (i!=idx && child_timers[i] != timer_no_child);
261  ASSERT_PERMANENT(i!=idx)(tag()).error("Too many children of the timer");
262  idx=i;
263  }
264  //DebugOut().fmt("Adding child {} at index: {}\n", child_index, idx);
265  child_timers[idx] = child_index;
266 }
267 
268 
269 
270 string Timer::code_point_str() const {
271  return fmt::format("{%s}:{%d}, {%s}()",
272  code_point_->file_, code_point_->line_, code_point_->func_ );
273 }
274 
275 
276 /***********************************************************************************************
277  * Implementation of Profiler
278  */
279 
280 
281 Profiler * Profiler::instance(bool clear) {
282  static Profiler * _instance = NULL;
283 
284  if (clear) {
285  if (_instance != NULL) {
286  delete _instance;
287  _instance = NULL;
288  }
289  return _instance;
290  }
291 
292  if (_instance == NULL) {
293  MemoryAlloc::malloc_map().reserve(Profiler::malloc_map_reserve);
294  _instance = new Profiler();
295  }
296 
297  return _instance;
298  }
299 
300 
301 // static CONSTEXPR_ CodePoint main_cp = CODE_POINT("Whole Program");
302 const long Profiler::malloc_map_reserve = 100 * 1000;
303 
305 : actual_node(0),
306  task_size_(1),
307  start_time( time(NULL) ),
308  json_filepath(""),
309  none_timer_(CODE_POINT("NONE TIMER"), 0),
310  calibration_time_(-1)
311 
312 {
313  static CONSTEXPR_ CodePoint main_cp = CODE_POINT("Whole Program");
314  set_memory_monitoring(true, true);
315 #ifdef FLOW123D_DEBUG_PROFILER
316  MemoryAlloc::malloc_map().reserve(Profiler::malloc_map_reserve);
317  timers_.push_back( Timer(main_cp, 0) );
318  timers_[0].start();
319 #endif
320 }
321 
322 
323 void Profiler::calibrate() {
324 
325  uint SIZE = 64 * 1024;
326  uint HALF = SIZE / 2;
327  START_TIMER("UNIT_PAYLOAD");
328  Timer &timer = timers_[actual_node];
329 
330  uint count = 0;
331  while(TimePoint() - timer.start_time < 0.1) {
332  double * block = new double[SIZE];
333  for(uint i=0; i<HALF; i++) {
334  block[HALF+i] = block[i]*block[i] + i;
335  }
336  delete[] block;
337  count++;
338  }
339 
340 
341 
342  END_TIMER("UNIT_PAYLOAD");
343 
344  const double reference_count = 7730;
345  // reference count is average of 5 measurements on payload free laptop:
346  // DELL Latitude E5570, Intel(R) Core(TM) i7-6820HQ CPU @ 2.70GHz
347  // new/delete makes about 3% of time
348  calibration_time_ = 10 * timer.cumul_time * reference_count / count;
349  LogOut() << "Profiler calibration count: " << count << std::endl;
350  LogOut() << "Profiler calibration time: " << calibration_time_ << std::endl;
351 
352 }
353 
354 
355 
356 void Profiler::propagate_timers() {
357  for (unsigned int i = 0; i < Timer::max_n_childs; i++) {
358  unsigned int child_timer = timers_[0].child_timers[i];
359  if ((signed int)child_timer != timer_no_child) {
360  // propagate metrics from child to Whole-Program time-frame
361  accept_from_child(timers_[0], timers_[child_timer]);
362  }
363  }
364 }
365 
366 
367 
368 void Profiler::set_task_info(string description, int size) {
369  task_description_ = description;
370  task_size_ = size;
371 }
372 
373 
374 
375 void Profiler::set_program_info(string program_name, string program_version, string branch, string revision, string build) {
376  flow_name_ = program_name;
377  flow_version_ = program_version;
378  flow_branch_ = branch;
379  flow_revision_ = revision;
380  flow_build_ = build;
381 }
382 
383 
384 
385 int Profiler::start_timer(const CodePoint &cp) {
386  unsigned int parent_node = actual_node;
387  //DebugOut().fmt("Start timer: {}\n", cp.tag_);
388  int child_idx = find_child(cp);
389  if (child_idx < 0) {
390  //DebugOut().fmt("Adding timer: {}\n", cp.tag_);
391  // tag not present - create new timer
392  child_idx=timers_.size();
393  timers_.push_back( Timer(cp, actual_node) );
394  timers_[actual_node].add_child(child_idx , timers_.back() );
395  }
396  actual_node=child_idx;
397 
398  // pause current timer
399  timers_[parent_node].pause();
400 
401  timers_[actual_node].start();
402 
403  return actual_node;
404 }
405 
406 
407 
408 int Profiler::find_child(const CodePoint &cp) {
409  Timer &timer =timers_[actual_node];
410  unsigned int idx = cp.hash_idx_;
411  unsigned int child_idx;
412  do {
413  if (timer.child_timers[idx] == timer_no_child) break; // tag is not there
414 
415  child_idx=timer.child_timers[idx];
416  ASSERT_PERMANENT_LT(child_idx, timers_.size()).error();
417  if (timers_[child_idx].full_hash_ == cp.hash_) return child_idx;
418  idx = ( (unsigned int)(idx)==(Timer::max_n_childs - 1) ? 0 : idx+1 );
419  } while ( (unsigned int)(idx) != cp.hash_idx_ ); // passed through whole array
420  return -1;
421 }
422 
423 
424 
425 void Profiler::stop_timer(const CodePoint &cp) {
426 #ifdef FLOW123D_DEBUG_ASSERTS
427  // check that all childrens are closed
428  Timer &timer=timers_[actual_node];
429  for(unsigned int i=0; i < Timer::max_n_childs; i++)
430  if (timer.child_timers[i] != timer_no_child)
431  ASSERT_PERMANENT(! timers_[timer.child_timers[i]].running())(timers_[timer.child_timers[i]].tag())(timer.tag())
432  .error("Child timer running while closing timer.");
433 #endif
434  unsigned int child_timer = actual_node;
435  if ( cp.hash_ != timers_[actual_node].full_hash_) {
436  // timer to close is not actual - we search for it above actual
437  for(unsigned int node=actual_node; node != 0; node=timers_[node].parent_timer) {
438  if ( cp.hash_ == timers_[node].full_hash_) {
439  // found above - close all nodes between
440  for(; (unsigned int)(actual_node) != node; actual_node=timers_[actual_node].parent_timer) {
441  WarningOut() << "Timer to close '" << cp.tag_ << "' do not match actual timer '"
442  << timers_[actual_node].tag() << "'. Force closing actual." << std::endl;
443  timers_[actual_node].stop(true);
444  }
445  // close 'node' itself
446  timers_[actual_node].stop(false);
447  actual_node = timers_[actual_node].parent_timer;
448 
449  // actual_node == child_timer indicates this is root
450  if (actual_node == child_timer)
451  return;
452 
453  // resume current timer
454  timers_[actual_node].resume();
455  return;
456  }
457  }
458  // node not found - do nothing
459  return;
460  }
461  // node to close match the actual
462  timers_[actual_node].stop(false);
463  actual_node = timers_[actual_node].parent_timer;
464 
465  // actual_node == child_timer indicates this is root
466  if (actual_node == child_timer)
467  return;
468 
469  // resume current timer
470  timers_[actual_node].resume();
471 }
472 
473 
474 
475 void Profiler::stop_timer(int timer_index) {
476  // stop_timer with CodePoint type
477  // timer which is still running MUST be the same as actual_node index
478  // if timer is not running index will differ
479  if (timers_[timer_index].running()) {
480  ASSERT_PERMANENT_EQ(timer_index, (int)actual_node).error();
481  stop_timer(*timers_[timer_index].code_point_);
482  }
483 
484 }
485 
486 
487 
488 void Profiler::add_calls(unsigned int n_calls) {
489  timers_[actual_node].call_count += n_calls-1;
490 }
491 
492 
493 
494 void Profiler::notify_malloc(const size_t size, const long p) {
495  MemoryAlloc::malloc_map()[p] = static_cast<int>(size);
496  timers_[actual_node].total_allocated_ += size;
497  timers_[actual_node].current_allocated_ += size;
498  timers_[actual_node].alloc_called++;
499 
500  if (timers_[actual_node].current_allocated_ > timers_[actual_node].max_allocated_)
501  timers_[actual_node].max_allocated_ = timers_[actual_node].current_allocated_;
502 }
503 
504 
505 
506 void Profiler::notify_free(const long p) {
507  int size = sizeof(p);
508  if (MemoryAlloc::malloc_map()[(long)p] > 0) {
509  size = MemoryAlloc::malloc_map()[(long)p];
510  MemoryAlloc::malloc_map().erase((long)p);
511  }
512  timers_[actual_node].total_deallocated_ += size;
513  timers_[actual_node].current_allocated_ -= size;
514  timers_[actual_node].dealloc_called++;
515 }
516 
517 
518 double Profiler::get_resolution () {
519  const int measurements = 100;
520  double result = 0;
521 
522  // perform 100 measurements
523  for (unsigned int i = 1; i < measurements; i++) {
524  TimePoint t1 = TimePoint ();
525  TimePoint t2 = TimePoint ();
526 
527  // double comparison should be avoided
528  while ((t2 - t1) == 0) t2 = TimePoint ();
529  // while ((t2.ticks - t1.ticks) == 0) t2 = TimePoint ();
530 
531  result += t2 - t1;
532  }
533 
534  return (result / measurements) * 1000; // ticks to seconds to microseconds conversion
535 }
536 
537 
538 Timer Profiler::find_timer(string tag) {
539  for(auto t : timers_) {
540  if (t.tag() == tag) return t;
541  }
542  return none_timer_;
543 }
544 
545 
546 std::string common_prefix( std::string a, std::string b ) {
547  if( a.size() > b.size() ) std::swap(a,b) ;
548  return std::string( a.begin(), std::mismatch( a.begin(), a.end(), b.begin() ).first ) ;
549 }
550 
551 
552 
553 template<typename ReduceFunctor>
554 void Profiler::add_timer_info(ReduceFunctor reduce, nlohmann::json* holder, int timer_idx, double parent_time) {
555 
556  // get timer and check preconditions
557  Timer &timer = timers_[timer_idx];
558  ASSERT_PERMANENT(timer_idx >=0)(timer_idx).error("Wrong timer index.");
559  ASSERT_PERMANENT(timer.parent_timer >=0).error("Inconsistent tree.");
560 
561  // fix path
562  string filepath = timer.code_point_->file_;
563 
564  // if constant FLOW123D_SOURCE_DIR is defined, we try to erase it from beginning of each CodePoint's filepath
565  #ifdef FLOW123D_SOURCE_DIR
566  string common_path = common_prefix (string(FLOW123D_SOURCE_DIR), filepath);
567  filepath.erase (0, common_path.size());
568  #endif
569 
570 
571  // generate node representing this timer
572  // add basic information
573  nlohmann::json node;
574  double cumul_time_sum;
575  node["tag"] = timer.tag();
576  node["file-path"] = filepath;
577  node["file-line"] = timer.code_point_->line_;
578  node["function"] = timer.code_point_->func_;
579  cumul_time_sum = reduce(timer, node);
580 
581 
582  // statistical info
583  if (timer_idx == 0) parent_time = cumul_time_sum;
584  double percent = parent_time > 1.0e-10 ? cumul_time_sum / parent_time * 100.0 : 0.0;
585  node["percent"] = percent;
586 
587  // write times children timers using secured child_timers array
588  nlohmann::json children;
589 
590  // sort childs by its index in Profiler::timers_ in order to preserve call order
591  std::vector<int> child_timers_values;
592  for (unsigned int i = 0; i < Timer::max_n_childs; i++) {
593  if (timer.child_timers[i] != timer_no_child)
594  child_timers_values.push_back(timer.child_timers[i]);
595  }
596  std::sort(child_timers_values.begin(), child_timers_values.end());
597 
598  for(int idx : child_timers_values)
599  add_timer_info(reduce, &children, idx, cumul_time_sum);
600 
601  // add children tag and other info if present
602  if (child_timers_values.size() > 0) {
603  node["children"] = children;
604  }
605 
606  // add to the array
607  holder->push_back(node);
608 }
609 
610 
611 template <class T>
612 void save_nonmpi_metric (nlohmann::json &node, T * ptr, string name) {
613  node[name + "-min"] = *ptr;
614  node[name + "-max"] = *ptr;
615  node[name + "-sum"] = *ptr;
616 }
617 
618 std::shared_ptr<std::ostream> Profiler::get_default_output_stream() {
619  json_filepath = FilePath("profiler_info.log.json", FilePath::output_file);
620 
621  //LogOut() << "output into: " << json_filepath << std::endl;
622  return make_shared<ofstream>(json_filepath.c_str());
623 }
624 
625 
626 #ifdef FLOW123D_HAVE_MPI
627 template <class T>
628 void save_mpi_metric (nlohmann::json &node, MPI_Comm comm, T * ptr, string name) {
629  node[name + "-min"] = MPI_Functions::min(ptr, comm);
630  node[name + "-max"] = MPI_Functions::max(ptr, comm);
631  node[name + "-sum"] = MPI_Functions::sum(ptr, comm);
632 }
633 
634 void Profiler::output(MPI_Comm comm, ostream &os) {
635  int mpi_rank, mpi_size;
636  //wait until profiling on all processors is finished
637  MPI_Barrier(comm);
638  stop_timer(0);
639  propagate_timers();
640 
641  // stop monitoring memory
642  bool temp_memory_monitoring = global_monitor_memory;
643  set_memory_monitoring(false, petsc_monitor_memory);
644 
645  chkerr( MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank) );
646  MPI_Comm_size(comm, &mpi_size);
647 
648  // output header
649  nlohmann::json jsonRoot, jsonChildren;
650 
651  // recursively add all timers info
652  // define lambda function which reduces timer from multiple processors
653  // MPI implementation uses MPI call to reduce values
654  auto reduce = [=] (Timer &timer, nlohmann::json &node) -> double {
655  int call_count = timer.call_count;
656  double cumul_time = timer.cumulative_time ();
657 
658  long memory_allocated = (long)timer.total_allocated_;
659  long memory_deallocated = (long)timer.total_deallocated_;
660  long memory_peak = (long)timer.max_allocated_;
661 
662  int alloc_called = timer.alloc_called;
663  int dealloc_called = timer.dealloc_called;
664 
665 
666  save_mpi_metric<double>(node, comm, &cumul_time, "cumul-time");
667  save_mpi_metric<int>(node, comm, &call_count, "call-count");
668 
669  save_mpi_metric<long>(node, comm, &memory_allocated, "memory-alloc");
670  save_mpi_metric<long>(node, comm, &memory_deallocated, "memory-dealloc");
671  save_mpi_metric<long>(node, comm, &memory_peak, "memory-peak");
672  //
673  save_mpi_metric<int>(node, comm, &alloc_called, "memory-alloc-called");
674  save_mpi_metric<int>(node, comm, &dealloc_called, "memory-dealloc-called");
675 
676 #ifdef FLOW123D_HAVE_PETSC
677  long petsc_memory_difference = (long)timer.petsc_memory_difference;
678  long petsc_peak_memory = (long)timer.petsc_peak_memory;
679  save_mpi_metric<long>(node, comm, &petsc_memory_difference, "memory-petsc-diff");
680  save_mpi_metric<long>(node, comm, &petsc_peak_memory, "memory-petsc-peak");
681 #endif // FLOW123D_HAVE_PETSC
682 
683  return MPI_Functions::sum(&cumul_time, comm);
684  };
685 
686  add_timer_info (reduce, &jsonChildren, 0, 0.0);
687  jsonRoot["children"] = jsonChildren;
688  output_header(jsonRoot, mpi_size);
689 
690 
691  // create profiler output only once (on the first processor)
692  // only active communicator should be the one with mpi_rank 0
693  if (mpi_rank == 0) {
694  try {
695  /**
696  * indent size
697  * results in json human readable format (indents, newlines)
698  */
699  const int FLOW123D_JSON_HUMAN_READABLE = 2;
700  // write result to stream
701  os << jsonRoot.dump(FLOW123D_JSON_HUMAN_READABLE) << endl;
702 
703  } catch (exception & e) {
704  stringstream ss;
705  ss << "nlohmann::json::dump error: " << e.what() << "\n";
706  THROW( ExcMessage() << EI_Message(ss.str()) );
707  }
708  }
709  // restore memory monitoring
710  set_memory_monitoring(temp_memory_monitoring, petsc_monitor_memory);
711 }
712 
713 
714 void Profiler::output(MPI_Comm comm, string profiler_path /* = "" */) {
715  int mpi_rank;
716  chkerr(MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank));
717 
718  if (mpi_rank == 0) {
719  if (profiler_path == "") {
720  output(comm, *get_default_output_stream());
721  } else {
722  json_filepath = profiler_path;
723  std::shared_ptr<std::ostream> os = make_shared<ofstream>(profiler_path.c_str());
724  output(comm, *os);
725  }
726  } else {
727  ostringstream os;
728  output(comm, os);
729  }
730 }
731 
732 #endif /* FLOW123D_HAVE_MPI */
733 
734 void Profiler::output(ostream &os) {
735  // last update
736  stop_timer(0);
737  propagate_timers();
738 
739  // output header
740  nlohmann::json jsonRoot, jsonChildren;
741  /**
742  * Constant representing number of MPI processes
743  * where there is no MPI to work with (so 1 process)
744  */
745  const int FLOW123D_MPI_SINGLE_PROCESS = 1;
746  output_header(jsonRoot, FLOW123D_MPI_SINGLE_PROCESS);
747 
748 
749  // recursively add all timers info
750  // define lambda function which reduces timer from multiple processors
751  // non-MPI implementation is just dummy repetition of initial value
752  auto reduce = [=] (Timer &timer, nlohmann::json &node) -> double {
753  int call_count = timer.call_count;
754  double cumul_time = timer.cumulative_time ();
755 
756  long memory_allocated = (long)timer.total_allocated_;
757  long memory_deallocated = (long)timer.total_deallocated_;
758  long memory_peak = (long)timer.max_allocated_;
759 
760  int alloc_called = timer.alloc_called;
761  int dealloc_called = timer.dealloc_called;
762 
763  save_nonmpi_metric<double>(node, &cumul_time, "cumul-time");
764  save_nonmpi_metric<int>(node, &call_count, "call-count");
765 
766  save_nonmpi_metric<long>(node, &memory_allocated, "memory-alloc");
767  save_nonmpi_metric<long>(node, &memory_deallocated, "memory-dealloc");
768  save_nonmpi_metric<long>(node, &memory_peak, "memory-peak");
769 
770  save_nonmpi_metric<int>(node, &alloc_called, "memory-alloc-called");
771  save_nonmpi_metric<int>(node, &dealloc_called, "memory-dealloc-called");
772 
773 #ifdef FLOW123D_HAVE_PETSC
774  long petsc_memory_difference = (long)timer.petsc_memory_difference;
775  long petsc_peak_memory = (long)timer.petsc_peak_memory;
776  save_nonmpi_metric<long>(node, &petsc_memory_difference, "memory-petsc-diff");
777  save_nonmpi_metric<long>(node, &petsc_peak_memory, "memory-petsc-peak");
778 #endif // FLOW123D_HAVE_PETSC
779 
780  return cumul_time;
781  };
782 
783  add_timer_info(reduce, &jsonChildren, 0, 0.0);
784  jsonRoot["children"] = jsonChildren;
785 
786  try {
787  /**
788  * indent size
789  * results in json human readable format (indents, newlines)
790  */
791  const int FLOW123D_JSON_HUMAN_READABLE = 2;
792  // write result to stream
793  os << jsonRoot.dump(FLOW123D_JSON_HUMAN_READABLE) << endl;
794 
795  } catch (exception & e) {
796  stringstream ss;
797  ss << "nlohmann::json::dump error: " << e.what() << "\n";
798  THROW( ExcMessage() << EI_Message(ss.str()) );
799  }
800 }
801 
802 
803 void Profiler::output(string profiler_path /* = "" */) {
804  if(profiler_path == "") {
805  output(*get_default_output_stream());
806  } else {
807  json_filepath = profiler_path;
808  std::shared_ptr<std::ostream> os = make_shared<ofstream>(profiler_path.c_str());
809  output(*os);
810  }
811 }
812 
813 void Profiler::output_header (nlohmann::json &root, int mpi_size) {
814  time_t end_time = time(NULL);
815 
816  const char format[] = "%x %X";
817  char start_time_string[BUFSIZ] = {0};
818  strftime(start_time_string, sizeof (start_time_string) - 1, format, localtime(&start_time));
819 
820  char end_time_string[BUFSIZ] = {0};
821  strftime(end_time_string, sizeof (end_time_string) - 1, format, localtime(&end_time));
822 
823  if (timers_[0].cumul_time > 60) {
824  calibrate();
825  }
826  // generate current run details
827  root["program-name"] = flow_name_;
828  root["program-version"] = flow_version_;
829  root["program-branch"] = flow_branch_;
830  root["program-revision"] = flow_revision_;
831  root["program-build"] = flow_build_;
832  root["timer-resolution"] = Profiler::get_resolution();
833  root["timer-calibration"] = calibration_time_;
834 
835  // print some information about the task at the beginning
836  root["task-description"] = task_description_;
837  root["task-size"] = task_size_;
838 
839  //print some information about the task at the beginning
840  root["run-process-count"] = mpi_size;
841  root["run-started-at"] = start_time_string;
842  root["run-finished-at"] = end_time_string;
843 }
844 
845 void Profiler::transform_profiler_data (const string &output_file_suffix, const string &formatter) {
846  namespace py = pybind11;
847 
848  if (json_filepath == "") return;
849 
850  // error under CYGWIN environment : more details in this repo
851  // https://github.com/x3mSpeedy/cygwin-python-issue
852  //
853  // For now we only support profiler report conversion in UNIX environment
854  // Windows users will have to use a python script located in bin folder
855  //
856 
857  // grab module and function by importing module profiler_formatter_module.py
858  auto python_module = PythonLoader::load_module_by_name ("profiler.profiler_formatter_module");
859  //
860  // def convert (json_location, output_file, formatter):
861  //
862  auto convert_method = python_module.attr("convert");
863  // execute method with arguments
864  convert_method(json_filepath, (json_filepath + output_fiel_suffix), formatter);
865 
866 }
867 
868 
869 void Profiler::uninitialize() {
870  if (Profiler::instance()) {
871  ASSERT_PERMANENT(Profiler::instance()->actual_node==0)(Profiler::instance()->timers_[Profiler::instance()->actual_node].tag())
872  .error("Forbidden to uninitialize the Profiler when actual timer is not zero.");
873  Profiler::instance()->stop_timer(0);
874  set_memory_monitoring(false, false);
875  Profiler::instance(true);
876  }
877 }
878 bool Profiler::global_monitor_memory = false;
879 bool Profiler::petsc_monitor_memory = true;
880 void Profiler::set_memory_monitoring(const bool global_monitor, const bool petsc_monitor) {
881  global_monitor_memory = global_monitor;
882  petsc_monitor_memory = petsc_monitor;
883 }
884 
885 unordered_map_with_alloc & MemoryAlloc::malloc_map() {
886  static unordered_map_with_alloc static_malloc_map;
887  return static_malloc_map;
888 }
889 
890 void * Profiler::operator new (size_t size) {
891  return malloc (size);
892 }
893 
894 void Profiler::operator delete (void* p) {
895  free(p);
896 }
897 
898 void *operator new (std::size_t size) OPERATOR_NEW_THROW_EXCEPTION {
899  void * p = malloc(size);
900  if (Profiler::get_global_memory_monitoring())
901  Profiler::instance()->notify_malloc(size, (long)p);
902  return p;
903 }
904 
905 void *operator new[] (std::size_t size) OPERATOR_NEW_THROW_EXCEPTION {
906  void * p = malloc(size);
907  if (Profiler::get_global_memory_monitoring())
908  Profiler::instance()->notify_malloc(size, (long)p);
909  return p;
910 }
911 
912 void *operator new[] (std::size_t size, const std::nothrow_t&) throw() {
913  void * p = malloc(size);
914  if (Profiler::get_global_memory_monitoring())
915  Profiler::instance()->notify_malloc(size, (long)p);
916  return p;
917 }
918 
919 void operator delete( void *p) throw() {
920  if (Profiler::get_global_memory_monitoring())
921  Profiler::instance()->notify_free((long)p);
922  free(p);
923 }
924 
925 void operator delete( void *p, std::size_t) throw() {
926  if (Profiler::get_global_memory_monitoring())
927  Profiler::instance()->notify_free((long)p);
928  free(p);
929 }
930 
931 void operator delete[]( void *p) throw() {
932  if (Profiler::get_global_memory_monitoring())
933  Profiler::instance()->notify_free((long)p);
934  free(p);
935 }
936 
937 void operator delete[]( void *p, std::size_t) throw() {
938  if (Profiler::get_global_memory_monitoring())
939  Profiler::instance()->notify_free((long)p);
940  free(p);
941 }
942 
943 #else // def FLOW123D_DEBUG_PROFILER
944 
946  static Profiler * _instance = new Profiler();
947  return _instance;
948 }
949 
950 
951 // void Profiler::initialize() {
952 // if (_instance == NULL) {
953 // _instance = new Profiler();
954 // set_memory_monitoring(true, true);
955 // }
956 // }
957 
959 
960 
961 #endif // def FLOW123D_DEBUG_PROFILER
format
manipulators::Array< T, Delim > format(T const &deduce, Delim delim=", ")
Definition: logger.hh:325
MPI_MIN
#define MPI_MIN
Definition: mpi.h:198
Profiler::instance
static Profiler * instance(bool clear=false)
Definition: sys_profiler.cc:945
python_loader.hh
std::swap
void swap(nlohmann::json &j1, nlohmann::json &j2) noexcept(is_nothrow_move_constructible< nlohmann::json >::value and is_nothrow_move_assignable< nlohmann::json >::value)
exchanges the values of two JSON objects
Definition: json.hpp:8688
Profiler::transform_profiler_data
void transform_profiler_data(const string &, const string &)
Definition: sys_profiler.hh:900
OPERATOR_NEW_THROW_EXCEPTION
#define OPERATOR_NEW_THROW_EXCEPTION
Definition: system.hh:54
file_path.hh
ASSERT_PERMANENT_LT
#define ASSERT_PERMANENT_LT(a, b)
Definition of comparative assert macro (Less Than)
Definition: asserts.hh:297
FilePath
Dedicated class for storing path to input and output files.
Definition: file_path.hh:54
MPI_MAX
#define MPI_MAX
Definition: mpi.h:197
Profiler::notify_malloc
void notify_malloc(const size_t)
Definition: sys_profiler.hh:884
chkerr
void chkerr(unsigned int ierr)
Replacement of new/delete operator in the spirit of xmalloc.
Definition: system.hh:142
MPI_DOUBLE
#define MPI_DOUBLE
Definition: mpi.h:156
THROW
#define THROW(whole_exception_expr)
Wrapper for throw. Saves the throwing point.
Definition: exceptions.hh:53
std::vector< int >
system.hh
Profiler::calibrate
void calibrate()
Profiler::uninitialize
static void uninitialize()
Definition: sys_profiler.cc:958
uint
unsigned int uint
Definition: mh_dofhandler.hh:101
CONSTEXPR_
#define CONSTEXPR_
Definition: sys_profiler.hh:86
fmt::format
std::string format(CStringRef format_str, ArgList args)
Definition: format.h:3141
Profiler::get_resolution
double get_resolution() const
Definition: sys_profiler.hh:902
Profiler::set_program_info
void set_program_info(string, string, string, string, string)
Definition: sys_profiler.hh:882
MPI_Comm_rank
#define MPI_Comm_rank
Definition: mpi.h:236
Timer
Definition: memory.cc:33
MPI_Comm_size
#define MPI_Comm_size
Definition: mpi.h:235
TimePoint
Definition: time_point.hh:69
LogOut
#define LogOut()
Macro defining 'log' record of log.
Definition: logger.hh:281
Profiler
Definition: sys_profiler.hh:876
sys_profiler.hh
ASSERT_PERMANENT
#define ASSERT_PERMANENT(expr)
Allow use shorter versions of macro names if these names is not used with external library.
Definition: asserts.hh:348
Timer::start
void start()
Definition: memory.cc:39
mpi.h
operator<<
STREAM & operator<<(STREAM &s, UpdateFlags u)
Definition: update_flags.hh:164
MPI_Functions::sum
static int sum(int *val, MPI_Comm comm)
Definition: sys_profiler.cc:42
MPI_Reduce
#define MPI_Reduce(sendbuf, recvbuf, count, datatype, op, root, comm)
Definition: mpi.h:608
MPI_INT
#define MPI_INT
Definition: mpi.h:160
nlohmann::basic_json::push_back
void push_back(basic_json &&val)
add an object to an array
Definition: json.hpp:4686
ASSERT_PERMANENT_EQ
#define ASSERT_PERMANENT_EQ(a, b)
Definition of comparative assert macro (EQual)
Definition: asserts.hh:329
MPI_SUM
#define MPI_SUM
Definition: mpi.h:196
MPI_Comm
int MPI_Comm
Definition: mpi.h:141
time_point.hh
nlohmann::basic_json::dump
string_t dump(const int indent=-1) const
serialization
Definition: json.hpp:2079
Profiler::Profiler
Profiler()
Definition: sys_profiler.hh:916
nlohmann::basic_json
a class to store JSON values
Definition: json.hpp:173
WarningOut
#define WarningOut()
Macro defining 'warning' record of log.
Definition: logger.hh:278
PythonLoader::load_module_by_name
static pybind11::module_ load_module_by_name(const std::string &module_name)
Definition: python_loader.cc:78
MPI_COMM_WORLD
#define MPI_COMM_WORLD
Definition: mpi.h:123
Profiler::set_task_info
void set_task_info(string, int)
Definition: sys_profiler.hh:880
MPI_Functions::max
static int max(int *val, MPI_Comm comm)
Definition: sys_profiler.cc:78
FilePath::output_file
@ output_file
Definition: file_path.hh:69
Profiler::output
void output()
Definition: sys_profiler.hh:898
MPI_LONG
#define MPI_LONG
Definition: mpi.h:161
START_TIMER
#define START_TIMER(tag)
Starts a timer with specified tag.
Definition: sys_profiler.hh:115
Profiler::notify_free
void notify_free(const size_t)
Definition: sys_profiler.hh:886
MPI_Barrier
#define MPI_Barrier(comm)
Definition: mpi.h:531
Timer::stop
void stop()
Definition: memory.cc:42
END_TIMER
#define END_TIMER(tag)
Ends a timer with specified tag.
Definition: sys_profiler.hh:149
MPI_Functions::min
static int min(int *val, MPI_Comm comm)
Definition: sys_profiler.cc:60