Flow123d  jenkins-Flow123d-windows-release-multijob-285
sys_profiler.hh
Go to the documentation of this file.
1 /*!
2  *
3  * Copyright (C) 2007 Technical University of Liberec. All rights reserved.
4  *
5  * Please make a following refer to Flow123d on your project site if you use the program for any purpose,
6  * especially for academic research:
7  * Flow123d, Research Centre: Advanced Remedial Technologies, Technical University of Liberec, Czech Republic
8  *
9  * This program is free software; you can redistribute it and/or modify it under the terms
10  * of the GNU General Public License version 3 as published by the Free Software Foundation.
11  *
12  * This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
13  * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
14  * See the GNU General Public License for more details.
15  *
16  * You should have received a copy of the GNU General Public License along with this program; if not,
17  * write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 021110-1307, USA.
18  *
19  *
20  * $Id: profiler.hh 842 2011-01-08 17:58:15Z tomas.bambuch $
21  * $Revision: 842 $
22  * $LastChangedBy: tomas.bambuch $
23  * $LastChangedDate: 2011-01-08 18:58:15 +0100 (So, 08 led 2011) $
24  *
25  *
26  *
27  * TODO:
28  * - START_GLOBAL_TIMER(tag) - this calls the start_timer, which creates local timer on the correct place in the hierarchy,
29  * further this timer is added to the list of global timers, this contains groups of timers with same tag, and
30  * collect/sum data from these timers in the report.
31  *
32  * - Allow output even during calculation (not complete, but at least some thing)
33  * Report should contain time of start as well as time of creation of the report or time from start of the program.
34  *
35  * - When generating report we has to deal with possibly different trees at every MPI process.
36  *
37  * - test memory profiling
38  * in our own new and xmalloc functions - register allocatied and deallocated memory to active Profiler frame.
39  *
40  * - test in parallel
41  * - extended output:
42  * cas na jedno volani (jina redukce nez pro kumulativni cas, pokud je pocet volani ruzny)
43  * procenta vuci predkovi
44  * code point (az nekde na konci radky)
45  *
46  *
47  * !!! Unfortunately using constexpr is worse (without optimization).
48  * This is probably due to use of static variable for
49  * CodePoint, the access could be slow, and computation of hash is done only once. Actually timing results
50  * are:
51  *
52  * OPTIONS OVERHEAD (compared to call 2x clock())
53  * -g, no c++11 : 18%
54  * -g, c++11 : 60%
55  * -O3,no c++11 : 6%
56  * -O3, c++11 : 6%
57  */
58 
59 
60 #ifndef PROFILER_H
61 #define PROFILER_H
62 
63 #include "global_defs.h"
64 #include "system/system.hh"
65 #include <mpi.h>
66 
67 //instead of #include "mpi.h"
68 //mpi declarations follows:
70 public:
71  static int sum(int* val, MPI_Comm comm);
72  static double sum(double* val, MPI_Comm comm);
73  static int min(int* val, MPI_Comm comm);
74  static double min(double* val, MPI_Comm comm);
75  static int max(int* val, MPI_Comm comm);
76  static double max(double* val, MPI_Comm comm);
77 };
78 
79 // Workaround for older compilers, that do not support constexpr feature
80 #if defined(HAVE_CXX11_FULL) || defined(HAVE_CXX11_DRAFT)
81  #define CONSTEXPR_ constexpr
82 #else
83  #define CONSTEXPR_
84 #endif
85 
86 
87 using namespace std;
88 
89 
90 // These helper macros are necessary due to use of _LINE_ variable in START_TIMER macro.
91 #define _PASTE(a,b) a ## b
92 #define PASTE(a,b) _PASTE(a, b)
93 
94 
95 
96 /**
97  * \def START_TIMER(tag)
98  *
99  * @brief Starts a timer with specified tag.
100  *
101  * In fact it creates an static constant expression that identifies the point in the code and
102  * contains tag of the involved timer and its hash. Then it creates local variable that
103  * calls @p Profiler::start_timer() in constructor and @p Profiler::stop_timer() in destructor.
104  * This way the timer is automatically closed at the end of current block.
105  *
106  * ATTENTION: This macro expands to two statements so following code is illegal:
107  * @code
108  * if (some_condition) START_TIMER(tag);
109  * @endcode
110  */
111 #ifdef DEBUG_PROFILER
112 #define START_TIMER(tag) static CONSTEXPR_ CodePoint PASTE(cp_,__LINE__) = CODE_POINT(tag); TimerFrame PASTE(timer_,__LINE__) = TimerFrame( PASTE(cp_,__LINE__) )
113 #else
114 #define START_TIMER(tag)
115 #endif
116 
117 /**
118  * \def END_TIMER(tag)
119  *
120  * @brief Ends a timer with specified tag.
121  *
122  * Use only if you want to end timer before the end of block. Again this expands into two lines, see ATTENTION in previous macro.
123  */
124 #ifdef DEBUG_PROFILER
125 #define END_TIMER(tag) static CONSTEXPR_ CodePoint PASTE(cp_,__LINE__) = CODE_POINT(tag); Profiler::instance()->stop_timer( PASTE(cp_,__LINE__) )
126 #else
127 #define END_TIMER(tag)
128 #endif
129 
130 /**
131  * \def END_START_TIMER(tag)
132  *
133  * Ends current timer and starts the new one with given tag. Again this expands into two lines, see ATTENTION in previous macro.
134  */
135 #ifdef DEBUG_PROFILER
136 #define END_START_TIMER(tag) Profiler::instance()->stop_timer(); START_TIMER(tag);
137 #else
138 #define END_START_TIMER(tag)
139 #endif
140 
141 
142 /**
143  * \def ADD_CALLS(n_calls)
144  *
145  * @brief Increase number of calls in actual timer.
146  *
147  * Some time you want to measure a performance of a cycle with body that is below resolution of the Timer implementation.
148  * If you know number of cycles, you can use this macro in following way:
149  *
150  * @code
151  * START_TIMER("cycle");
152  * unsigned int i;
153  * for(i =0; i<1000000; i++) i*i*i;
154  * ADD_CALLS(i);
155  * END_TIMER("cycle");
156  * @endcode
157  *
158  * In the profiler report you get the total time spent in the cycle, and time per one call which will be average
159  * time spent in the body of the cycle.
160  */
161 #ifdef DEBUG_PROFILER
162 #define ADD_CALLS(n_calls) Profiler::instance()->add_calls(n_calls)
163 #else
164 #define ADD_CALLS(n_calls)
165 #endif
166 
167 
168 
169 
170 //////////////////////////////////////////////////////////////////////////////////////////////
171 #ifdef DEBUG_PROFILER
172 
173 /**
174  * @brief Function for compile-time hash computation. (Needs C++x11 standard.)
175  * Input, @p str, is constant null terminated string, result is unsigned int (usually 4 bytes).
176  * Function has to be recursive, since standard requires that the body consists only from the return statement.
177  *
178  * SALT is hash for the empty string. Currently zero for simpler testing.
179  */
180 inline CONSTEXPR_ unsigned int str_hash(const char * str) {
181  #define SALT 0 //0xef50e38f
182  return (*str == 0 ? SALT : str_hash(str+1) * 101 + (unsigned int)(*str) );
183 }
184 
185 /**
186  * Macro to generate constexpr CodePoint object.
187  */
188 #define CODE_POINT(tag) CodePoint(tag, __FILE__, __func__, __LINE__)
189 
190 /**
191  * @brief Class that represents point in the code.
192  *
193  * This class allow construction at compile time. And includes the information about the code point as well
194  * as the 'tag' of the timer and cimpile-time computed hashes of this 'tag'. The @p hash_ is long one with
195  * very small probability of collisions - this we use for comparison of tags. The @p hash_idx_ is the long hash modulo
196  * length of the array of Timer's children, this is used for fast loop up into this array that servers as a simple hash table.
197  */
198 class CodePoint {
199 public:
200  CONSTEXPR_ CodePoint(const char *tag, const char * file, const char * func, const unsigned int line)
201  : tag_(tag), file_(file), func_(func), line_(line),
202  hash_(str_hash(tag)), hash_idx_( str_hash(tag)%max_n_timer_childs )
203  {}
204 
205  /// Size of child arrays in timer nodes.
206  static const unsigned int max_n_timer_childs=13;
207 
208  /// Tag of the code point.
209  const char * const tag_;
210 
211  /// file name of the code point
212  const char * const file_;
213 
214  /// file name of the code point
215  const char * const func_;
216 
217  /// file name of the code point
218  const unsigned int line_;
219 
220  /// Full 32-bit hash of the tag ( practically no chance of collision)
221  unsigned int hash_;
222 
223  /// Hash modulo size of array of timer childs ( we have to check full hash to prevent collision)
224  unsigned int hash_idx_;
225 };
226 
227 
228 
229 /**
230  * @brief Class for profiling tree nodes.
231  *
232  * One Timer represents one particular time frame in the execution tree.
233  * It collects information about total time, number of calls, allocated and deallocated memory.
234  *
235  * It should be accessed only through Profiler, which is its friend class.
236  *
237  * TODO: for better performance: move copy hash_ and hash_idx_ into Timer since CodePoint are in static
238  * variables, that may be slower to acces.
239  *
240  */
241 class Timer {
242 
243 
244 public:
245  /// Size of array @p child_timers, the hash table containing descendants in the call tree.
246  static const unsigned int max_n_childs=CodePoint::max_n_timer_childs;
247 
248  /**
249  * TimeData is a type returned by method get_time and used to store time data in timer nodes.
250  * Timer node also provides method @p cumulative_time to retrieve true cumulative time in ms.
251  */
252  typedef clock_t TimeData;
253 
254  /**
255  * Creates the timer node object. Should not be called directly, but through the START_TIMER macro.
256  */
257  Timer(const CodePoint &cp, int parent);
258 
259 
260  /**
261  * @brief Returns measure of the time spent in the process since the start.
262  *
263  * TODO: We should detect precision of the clock() command and report it with timing data.
264  * It is usually about 10ms. If this is not enough one is forced to use high-resolution timers
265  * which are system dependent.
266  */
267  static TimeData get_time();
268 
269  /**
270  * Start the timer. If it is already started, just increase number of starts (recursions) and calls.
271  */
272  void start();
273 
274  /**
275  * Updates cumulative time of the timer.
276  */
277  void update();
278 
279  /**
280  * If number of starts (recursions) drop back to zero, we stop the timer and add the period to the cumulative time.
281  * This method do not take care of its childs (it has no access to the other timers).
282  * When the parameter 2p forced is 'true', we stop the timer immediately regardless the number of recursions.
283  * Returns true if the timer is not closed (recursions didn't drop to zero yet).
284  */
285  bool stop(bool forced = false);
286 
287 
288  /// Getter for the 'tag'.
289  inline const char *tag() const
290  { return code_point_->tag_; }
291 
292  /// Returns true if the timer is open, number of starts (recursions) is nonzero.
293  inline bool running() const
294  { return start_count >0; }
295 
296  /// Returns string with description of the code point where the timer was first started.
297  std::string code_point_str() const;
298 
299  /**
300  * Returns cumulative time of the timer in 'ms'.
301  */
302  double cumulative_time() const;
303 
304  /*
305  * Adds given index @p child_index of the timer @p child to the correct place in the hash table.
306  */
307  void add_child(int child_index, const Timer &child);
308 
309 
310 protected:
311  /**
312  * Start time when frame opens.
313  */
314  TimeData start_time;
315  /**
316  * Cumulative time spent in the frame.
317  */
318  TimeData cumul_time;
319  /**
320  * Total number of opening of the frame.
321  */
322  unsigned int call_count;
323  /**
324  * Number of recursive openings.
325  */
326  unsigned int start_count;
327 
328 
329  /**
330  * Code point of the first START_TIMER for the particular tag. The 'tag' identifies timer
331  * and is used in reported profiler table.
332  */
333  const CodePoint *code_point_;
334  /// Full tag hash. Copy from code_point_.
335  unsigned int full_hash_;
336  /// Hash modulo size of array of timer childs. Copy from code_point_.
337  unsigned int hash_idx_;
338 
339  /**
340  * Index of the parent timer node in the tree. Negative value means 'not set'.
341  */
342  int parent_timer;
343  /**
344  * Indices of the child timers in the Profiler::timers_ vector. Negative values means 'not set'.
345  */
346  int child_timers[max_n_childs];
347 
348  /**
349  * Total number of bytes allocated directly in this frame (not include subframes).
350  */
351  size_t total_allocated_;
352  /**
353  * Total number of bytes deallocated directly in this frame (not include subframes).
354  */
355  size_t total_deallocated_;
356 
357 
358  friend class Profiler;
359 
360 };
361 
362 
363 /**
364  *
365  * @brief Main class for profiling by measuring time intervals.
366  *
367  * These time intervals form a tree structure where each interval is represented
368  * by a Timer object. The root node of the tree is automatically created and
369  * started after creating the Profiler object and cannot be stopped manually.
370  *
371  * The class implements a singleton pattern and all the functions are accessible trough
372  * Profiler::instance(), but in most cases the programmer will access the profiler
373  * functions via the #START_TIMER and #END_TIMER macros. The #START_TIMER macro
374  * is responsible for the fact that we don't have to call #END_TIMER macro to stop the timer and
375  * the timer will be stopped at the end of the block in which #START_TIMER was used.
376  * These macros internally use the TimerFrame objects and the programmer should
377  * not use the TimerFrame objects directly.
378  *
379  * By using #SET_TIMER_SUBFRAMES macro, the programmer can specify the number of subframes (eg. iterations)
380  * for the currently active timer.
381  *
382  *
383  * Currently the Profiler system is not thread safe. No idea how to do this.
384  *
385  */
386 class Profiler {
387 public:
388 
389  /**
390  * Initializes the Profiler with specific MPI communicator object
391  */
392  //static void initialize(MPI_Comm communicator = MPI_COMM_WORLD);
393  static void initialize();
394  /**
395  * Returns unique Profiler object.
396  */
397  inline static Profiler* instance() {
398  ASSERT( _instance , "Can not get Profiler instance. Profiler not initialized yet.\n");
399  return _instance;
400  }
401  /**
402  * Sets task specific information. The string @p description with textual description of the task and the
403  * number of elements of the mesh (parameter @p size). This is used for weak scaling graphs so it should
404  * measure size of the task of the same type (same description).
405  *
406  */
407  void set_task_info(string description, int size);
408  /**
409  * Sets informations about program version. This consists of @p program_version (includes program name), @p branch in the repository or rather full URL of the branch,
410  * and SVN @p revision (or hash for GIT).
411  *
412  */
413  void set_program_info(string program_name, string program_version, string branch, string revision, string build);
414 
415 
416  /**
417  * Starts a timer with code point, tag and hashes specified by CodePoint object @p cp.
418  * If the timer is not already created, it creates a new one. It returns index of
419  * the actual timer.
420  */
421  int start_timer(const CodePoint &cp);
422  /**
423  * Stops actual timer. It check if the hash of given code point match hash of the
424  * tag of actual timer node. If not we print out warning and try to find the correct tag
425  * towards the tree root closing all nodes we pass through.
426  *
427  * If Flow123d_DEBUG is set, we check that all children are closed.
428  */
429  void stop_timer(const CodePoint &cp);
430 
431  /**
432  * Stop timer with index given by @p timer_index. If this is not equal to @p actual_node, we
433  * traverse the tree towards root while force closing nodes by the way.
434  *
435  * Negative @p timer_index means close @p actual_node
436  */
437  void stop_timer(int timer_index = -1);
438 
439 
440 
441  /**
442  * Adds @p n_calls - 1 to the total number of calls of the current timer. Minus one, since one call is counted when
443  * timer was started. You should use macro ADD_CALLS above.
444  */
445  void add_calls(unsigned int n_calls);
446  /**
447  * Notification about allocation of given size.
448  * Increase total allocated memory in current profiler frame.
449  */
450  void notify_malloc(const size_t size );
451  /**
452  * Notification about freeing memory of given size.
453  * Increase total deallocated memory in current profiler frame.
454  */
455  void notify_free(const size_t size );
456 
457 
458  /**
459  * Returns tag of current timer.
460  */
461  inline const char *actual_tag() const
462  { return timers_[actual_node].tag(); }
463  /**
464  * Returns total number of calls of current timer.
465  */
466  inline unsigned int actual_count() const
467  { return timers_[actual_node].call_count; }
468  /**
469  * Returns total time of current timer.
470  */
471  inline double actual_cumulative_time() const
472  { return timers_[actual_node].cumulative_time(); }
473 
474 
475  /**
476  * @brief Output current timing information into the given stream.
477  *
478  * COLECTIVE - all processes in the communicator have to call this
479  * method. It temporally stops all timers, synchronize all processes, collect
480  * profiling informations and write it to the given stream.
481  *
482  * Pass through the profiling tree (collective over processors)
483  * Print cumulative times average, balance (max/min), count (denote differences)
484  *
485  */
486  void output(MPI_Comm comm, ostream &os);
487  /**
488  * Same as previous, but output to the file with default name: "profiler_info_YYMMDD_HH::MM:SS.log".
489  * Empty body if macro DEBUG_PROFILER is not defined.
490  *
491  * TODO: move this outside to minimize dependencies
492  */
493  void output(MPI_Comm comm);
494  /**
495  * Stop all timers and destroys the Profiler object.
496  * If you want some output call @p output method just before.
497  */
498  static void uninitialize();
499 
500  /**
501  * Check if the instance was created.
502  */
503  static bool is_initialized() { return (_instance != NULL); }
504 
505 
506 private:
507  /**
508  * Try to find timer with tag (in fact only its 32-bit hash) from given code point @p cp.
509  * Returns -1 if it is not found otherwise it returns its index.
510  */
511  int find_child(const CodePoint &cp);
512 
513  /**
514  * Used to update cumulative times of running timers
515  * in the @p output method in order to get actual results.
516  */
517  void update_running_timers();
518 
519 
520  /// Default code point.
521  static CodePoint null_code_point;
522 
523  /// Pointer to the unique instance of singleton Profiler class.
524  static Profiler* _instance;
525 
526  /// Vector of all timers. Whole tree is stored in this array.
527  vector<Timer> timers_;
528 
529  /// Index of the actual timer node. Negative value means 'unset'.
530  int actual_node;
531 
532  /// MPI communicator used for final reduce of the timer node tree.
533  //MPI_Comm communicator_;
534  /// MPI_rank
535  //int mpi_rank_;
536 
537  // header informations
538 
539  /// Some measure of the size of the task in the set of the tasks that differs
540  /// only by size - used for scaling tests.
541  int task_size_;
542  /// Task description and identifier in possible database of all Profiler results.
543  string task_description_;
544  /// Time and date of the start of the task solution. In fact start of the Profiler.
545  time_t start_time;
546 
547  /// Name of the program.
548  string flow_name_;
549  /// Version of the program.
550  string flow_version_;
551  /// Http address of the branch in a repository.
552  string flow_branch_;
553  /// Revision or GIT hash.
554  string flow_revision_;
555  /// Build date and time.
556  string flow_build_;
557 
558 
559  /**
560  * Use DFS to pass through the tree and collect information about all timers reduced from the processes in the communicator.
561  * For every timer the information strings are stored in the struct TimerInfo in order to pad fields correctly
562  * to have alligned columns on the output. The alligning is performed in the output() method.
563  */
564  void add_timer_info(MPI_Comm comm, vector<vector<string> > &timersInfo, int timer_idx, int indent, double parent_time);
565 
566  //Profiler(MPI_Comm comm); // private constructor
567  Profiler(); // private constructor
568  Profiler(Profiler const&); // copy constructor is private
569  Profiler & operator=(Profiler const&); // assignment operator is private
570 };
571 
572 
573 
574 
575 
576 
577 /**
578  *
579  * @brief Class for automatic timer closing. This class is used by #START_TIMER macro
580  * and is responsible for the fact that we don't have to call #END_TIMER macro to stop the timer,
581  * the timer will be stopped at the end of the block in which #START_TIMER was used.
582  *
583  * The main idea of the approach described is that the TimerFrame variable will be destroyed
584  * at the end of the block where #START_TIMER macro was used. In order to work properly
585  * in situations where #END_TIMER was used to stop the timer manually before (but there is still the
586  * variable which will be later destroyed), we have to store references to these variables and
587  * destroy them on-demand.
588  *
589  * TODO:
590  * Should only contain pointer to the Timer. And destructor, that close the timer.
591  */
592 class TimerFrame {
593 private:
594  int const timer_index_;
595 public:
596  inline TimerFrame(const CodePoint &cp)
597  : timer_index_( Profiler::instance()->start_timer(cp) )
598  {}
599 
600  ~TimerFrame() {
601  Profiler::instance()->stop_timer(timer_index_);
602  }
603 };
604 
605 #else // DEBUG_PROFILER
606 
607 
608 // dummy declaration of Profiler class
609 class Profiler {
610 public:
611  static void initialize();
612  inline static Profiler* instance() {
613  ASSERT( _instance , "Can not get Profiler instance. Profiler not initialized yet.\n");
614  return _instance;
615  }
616 
617  void set_task_info(string description, int size)
618  {}
619  void set_program_info(string program_name, string program_version, string branch, string revision, string build)
620  {}
621  void notify_malloc(const size_t size )
622  {}
623  void notify_free(const size_t size )
624  {}
625  void output(MPI_Comm comm, ostream &os)
626  {}
627  void output(MPI_Comm comm)
628  {}
629  const char *actual_tag() const
630  { return NULL; }
631  inline unsigned int actual_count() const
632  { return 0; }
633  inline double actual_cumulative_time() const
634  { return 0.0; }
635  static void uninitialize();
636 
637  static bool is_initialized() { return (_instance != NULL); }
638 
639 private:
641  Profiler() {}
642 };
643 
644 
645 
646 
647 #endif
648 
649 
650 #endif
static int min(int *val, MPI_Comm comm)
Definition: sys_profiler.cc:59
#define CONSTEXPR_
Definition: sys_profiler.hh:83
int MPI_Comm
Definition: mpi.h:141
static int sum(int *val, MPI_Comm comm)
Definition: sys_profiler.cc:47
static Profiler * instance()
unsigned int actual_count() const
double actual_cumulative_time() const
void notify_free(const size_t size)
void output(MPI_Comm comm)
Global macros to enhance readability and debugging, general constants.
#define ASSERT(...)
Definition: global_defs.h:121
static int max(int *val, MPI_Comm comm)
Definition: sys_profiler.cc:71
void set_program_info(string program_name, string program_version, string branch, string revision, string build)
static Profiler * _instance
static bool is_initialized()
void output(MPI_Comm comm, ostream &os)
void set_task_info(string description, int size)
void notify_malloc(const size_t size)
const char * actual_tag() const