Flow123d
tokenizer.hh
/*
 * tokenizer.hh
 *
 * Created on: Nov 9, 2012
 * Author: jb
 */

#ifndef TOKENIZER_HH_
#define TOKENIZER_HH_

#include <boost/tokenizer.hpp>
#include <fstream>
#include <istream>
#include <string>


class FilePath;

/**
 * @brief Simple class for parsing text files.
 *
 * The Boost library provides a nice tokenizer: the string is viewed as a container of tokens and
 * you can iterate over them. This class simplifies the usage of the Boost tokenizer and further simplifies
 * reading of text files. The underlying tokenizer uses backslash '\\' as the escape character, double quote '"' as the quotation
 * character, and space ' ' or tab '\\t' as the token separator.
 *
 * !! The used token separator @p escaped_list_separator does not provide the possibility to merge several consecutive
 * separator characters into one separator. Consequently, empty tokens appear when tokens are separated
 * by more than one space. To overcome this, we drop every empty token.
 *
 * Provides:
 * - method @p next_line to read the next line, automatically skipping empty lines
 * - iterating over tokens on the current line
 * - number of lines that the tokenizer has read -- method @p line_num
 *
 * Example of usage:
 * @code
 * Tokenizer tok(in_stream);
 * @endcode
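 *
 * A fuller usage sketch (the file name, comment pattern, and loop body below are illustrative only,
 * not part of this interface):
 * @code
 * std::ifstream in_stream("data.txt");      // hypothetical input file
 * Tokenizer tok(in_stream);
 * tok.set_comment_pattern("#");             // skip lines starting with '#' (example pattern)
 * while ( tok.next_line() ) {               // read lines until the end of file
 *     while ( !tok.eol() ) {
 *         std::string token = *tok;         // copy of the current token
 *         ++tok;                            // advance to the next token
 *     }
 * }
 * @endcode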
 *
 * TODO:
 * - method to reopen - skip to the beginning
 *
 *
 */
class Tokenizer {
public:
    /**
     * Shortcut for boost tokenizer.
     */
    typedef boost::escaped_list_separator<char> Separator;
    typedef boost::tokenizer<Separator> BT;

    /**
     * Opens the file given by the file path @p fp and constructs the tokenizer over the
     * input stream of this file.
     * The stream is read from its actual position. The token separator is
     * either tab '\\t' or space ' '.
     */
    Tokenizer(const FilePath &fp);
    /**
     * Constructs the tokenizer over the given input stream @p in.
     * The stream is read from its actual position. The token separator is
     * either tab '\\t' or space ' '.
     *
     * Unfortunately, std::istream can be passed neither by value nor by const reference. Thus you cannot write, e.g.
     *
     * @code Tokenizer( std::ifstream("my_file") ); @endcode
     */
    Tokenizer( std::istream &in);

    /**
     * Skips the whole line if the beginning of the trimmed line matches the given @p pattern string.
     *
     * TODO: Allow end line comments.
     */
    void set_comment_pattern( const std::string &pattern);
    /**
     * Skips forward to the line that matches the given string.
     * The tokenizer is set to the beginning of that line.
     * Returns true if the @p pattern has been found before the end of file.
     * Optionally, if the parameter @p end_search_pattern is provided, the search
     * stops after looking for @p pattern on the line where @p end_search_pattern was detected.
     * The next line is not read.
     *
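     * A hypothetical usage sketch (the section name "$Nodes" is an illustrative placeholder,
     * not something this class defines):
     * @code
     * if ( tok.skip_to("$Nodes") ) {
     *     tok.next_line();    // move past the matched header line
     *     // ... read the section content here
     * }
     * @endcode
     *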
     * TODO: similar method that uses regular expressions (e.g. from boost)
     * TODO: add option to find the pattern in the whole file (wrap over the end of file)
     */
    bool skip_to(const std::string &pattern, const std::string &end_search_pattern="");

    /**
     * Drops remaining tokens on the current line and reads the new one.
     * A warning is reported in the case of unprocessed tokens.
     * Lines without any tokens are skipped, but they are counted into the
     * number reported by @p line_num. Returns false if we reach the end of file,
     * otherwise returns true.
     *
     * The optional parameter @p assert_for_remaining_tokens can be set to false if you
     * want to ignore remaining tokens on the current line. Otherwise a warning for the user is
     * produced, since there is possibly an error in the data format.
     */
    bool next_line(bool assert_for_remaining_tokens=true);
    /**
     * Dereference of the tokenizer iterator. Returns a reference to the string
     * that contains the current token.
     */
    const std::string & operator *() const;

    /**
     * Moves to the next token on the line.
     */
    inline BT::iterator & operator ++() {
        if (! eol()) { position_++; ++tok_; }
        // skip empty tokens (consecutive separators)
        while (! eol() && (*tok_).size() == 0 ) { position_++; ++tok_; }
        return tok_;
    }

    /**
     * Returns true if the iterator is over the last token on the current line.
     */
    inline bool eol() const
        { return tok_ == line_tokenizer_.end(); }

    /**
     * Returns true if at the end of the input stream.
     */
    inline bool eof() const
        { return in_->eof(); }

    /**
     * Returns the position on the current line.
     */
    inline unsigned int pos() const
        { return position_; }

    /**
     * Returns the number of lines read by the tokenizer.
     * After the first call of @p next_line this returns '1'.
     */
    inline unsigned int line_num() const
        { return line_counter_; }

    /**
     * Returns the file name.
     */
    inline const std::string &f_name() const
        { return f_name_; }

    /**
     * Returns a full description of the current position.
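     *
     * A hypothetical sketch of how this might be used in an error report (the stream and message
     * are illustrative only):
     * @code
     * if ( tok.eol() )
     *     std::cerr << "Missing token at " << tok.position_msg() << std::endl;
     * @endcode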
     */
    std::string position_msg() const;

    /**
     * Read access to the current input line.
     */
    inline const std::string &line() const
        { return line_; }

    /**
     * The destructor closes the file if it was opened by the tokenizer itself.
     */
    ~Tokenizer();

private:
    // reset the tokenizer for the current line
    void set_tokenizer();

    /// File name (for better error messages).
    std::string f_name_;
    /// Pointer to the internal stream, if the tokenizer is constructed from a FilePath object.
    std::ifstream *own_stream_;
    /// Input stream.
    std::istream *in_;
    /// Current line.
    std::string line_;
    /// Possible comment pattern.
    std::string comment_pattern_;

    /// Number of lines read by the tokenizer.
    unsigned int line_counter_;
    /// Position on the current line.
    unsigned int position_;

    /// Line token iterator.
    BT::iterator tok_;
    /// Separator function used by the tokenizer.
    Separator separator_;
    /// Line tokenizer (container-like object).
    BT line_tokenizer_;
};



#endif /* TOKENIZER_HH_ */