Flow123d  jenkins-Flow123d-linux-release-multijob-282
tokenizer.hh
Go to the documentation of this file.
1 /*
2  * tokenizer.hh
3  *
4  * Created on: Nov 9, 2012
5  * Author: jb
6  */
7 
8 #ifndef TOKENIZER_HH_
9 #define TOKENIZER_HH_
10 
11 #include <boost/tokenizer.hpp>
12 #include <istream>
13 
14 
15 class FilePath;
16 
17 
18 /**
19  * @brief Simple class for parsing text files.
20  *
21  * Boost library provides nice tokenizer. The string is viewed as a container of tokens and
22  * you can iterate over them. This class simplify the usage of the boost's tokenizer and further simplify
23  * reading of the text files. Actual tokenizer use backslash '\\' as the escape character, double quotas '"'as quotation
24  * character, and space ' ' or tabelator '\\t' as the separator of tokens.
25  *
26  * !! Used token separator @p escaped_list_separator do not provide possibility to merge several consecutive
27  * separator characters into one separator. Consequently, there appears empty tokens when there more spaces
28  * then one separating tokens. To overcome this, we drop every empty token.
29  *
30  * Provides:
31  * - method to read @p next_line, automatically skipping empty lines
32  * - iterating over tokens on current line
33  * - number of lines that the tokenizer has read -- method line_num
34  *
35  * Example of usage:
36  * @code
37  * Tokenizer(in_stream);
38  * @endcode
39  *
40  * TODO:
41  * - method to reopen - skip to the beginning
42  *
43  *
44  */
45 class Tokenizer {
46 public:
47  /**
48  * Shortcut for boost tokenizer.
49  */
50  typedef boost::escaped_list_separator<char> Separator;
51  typedef boost::tokenizer<Separator> BT;
52 
53  /**
54  * Struct represents actual position of Tokenizer in file.
55  *
56  * It is necessary to check if stored values are correct. Out of Tokenizer values can be set
57  * only during construction. Entered file_position_ must correspond with values line_counter_
58  * and line_position_. Unfortunately, any control mechanism of entered values doesn't exist.
59  * If Position object is returned out of Tokenizer, value of file_position_ must be set
60  * according to the position of Tokenizer.
61  */
62  struct Position {
63  std::streampos file_position_; ///< Actual (global) position in file.
64  unsigned int line_counter_; ///< Actual line in file.
65  unsigned int line_position_; ///< Actual position in line.
66 
67  /// Empty constructor
68  Position():
69  file_position_(0), line_counter_(0), line_position_(0) {}
70 
71  /// Constructor
72  Position(std::streampos file_pos, unsigned int line, unsigned int line_pos):
73  file_position_(file_pos), line_counter_(line), line_position_(line_pos) {}
74  };
75 
76 
77  /**
78  * Opens a file given by file path @p fp. And construct the tokenizer over the
79  * input stream for this file.
80  * The stream is read from its actual position. The separator of the tokens is
81  * either tabelator '\\t' or space ' '.
82  *
83  */
84  Tokenizer(const FilePath &fp);
85  /**
86  * Construct the tokenizer over given input stream @p in.
87  * The stream is read from its actual position. The separator of the tokens is
88  * either tabelator '\\t' or space ' '.
89  *
90  * Unfortunately, std::istream can not be passed by value nor by const reference. Thus you can not write, e.g.
91  *
92  * @code Tokenizer( ifstream("my_file") );
93  *
94  */
95  Tokenizer( std::istream &in);
96 
97  /**
98  * Skip whole line if the beginning of the trimmed line match the given @p pattern string.
99  *
100  * TODO: Allow end line comments.
101  */
102  void set_comment_pattern( const std::string &pattern);
103  /**
104  * Skip forward to the line that match given string.
105  * The tokenizer is set to the begining of that line.
106  * Returns true if the @p pattern has been found before the end of file.
107  * Optionally, if the parameter @p end_search_pattern is provided, the search is
108  * stopped after search for @p pattern on the line where @p end_search_pattern was detected.
109  * Next line is not read.
110  *
111  * TODO: similar method that use regular expressions (e.g. from boost)
112  * TODO: add option to find the pattern in the whole file (wrap over the end of file)
113  */
114  bool skip_to(const std::string &pattern, const std::string &end_search_pattern="");
115 
116  /**
117  * Drops remaining tokens on the current line and reads the new one.
118  * A warning is reported in the case of unprocessed tokens.
119  * The lines without any tokens are skipped, but counted into
120  * number reported by @p line_num. Retuns false if we reach the end of file
121  * otherwise returns true.
122  *
123  * Optional parameter @p assert_for_remaining_tokens can be set false if you
124  * want to ignore remaining tokens on current line. Otherwise an warning for the user is
125  * produced since possibly there is error in the data format.
126  */
127  bool next_line(bool assert_for_remaining_tokens=true);
128  /**
129  * Dereference of the tokenizer iterator. Returns reference to the string
130  * that contains current token.
131  */
132  const std::string & operator *() const;
133 
134  /**
135  * Moves to the next token on the line.
136  */
137  inline BT::iterator & operator ++() {
138  if (! eol()) {position_.line_position_++; ++tok_;}
139  // skip empty tokens (consecutive separators)
140  while (! eol() && (*tok_).size()==0 ) {position_.line_position_++; ++tok_;}
141  return tok_;
142  }
143 
144  /**
145  * Returns true if the iterator is over the last token on the current line.
146  */
147  inline bool eol() const
148  { return tok_ == line_tokenizer_.end(); }
149 
150  /**
151  * Returns true if at the end of the input stream.
152  */
153  inline bool eof() const
154  { return in_->eof(); }
155 
156  /**
157  * Returns position on line.
158  */
159  inline unsigned int pos() const
160  { return position_.line_position_;}
161 
162  /**
163  * Returns number of lines read by the tokenizer.
164  * After first call of @p next_line this returns '1'.
165  */
166  inline unsigned int line_num() const
167  {return position_.line_counter_;}
168 
169  /**
170  * Returns file name.
171  */
172  inline const std::string &f_name() const
173  {return f_name_;}
174 
175  /**
176  * Returns full position description.
177  */
178  std::string position_msg() const;
179 
180  /**
181  * Read access to current input line.
182  */
183  inline const std::string &line() const
184  { return line_;}
185 
186  /**
187  * Returns actual position in file.
188  */
189  const Tokenizer::Position get_position();
190 
191  /**
192  * Set new position of tokenizer in file.
193  *
194  * Warning! Actual file_position_ must correspond with values line_counter_
195  * and line_position_. Method can't check if the values are entered correctly.
196  */
197  void set_position(const Tokenizer::Position pos);
198 
199  /**
200  * Destructor close the file if it was opened by tokenizer itself.
201  */
202  ~Tokenizer();
203 
204 private:
205  // reset tokenizer for actual line
206  void set_tokenizer();
207 
208  /// File name (for better error messages)
209  std::string f_name_;
210  /// Pointer to internal stream , if tokenizer is constructed form FilePath object.
211  std::ifstream *own_stream_;
212  /// Input stream.
213  std::istream *in_;
214  /// Current line
215  std::string line_;
216  /// Possible comment pattern
217  std::string comment_pattern_;
218 
219  /// Number of liner read by the tokenizer.
220  Position position_;
221 
222  /// Line token iterator
223  BT::iterator tok_;
224  /// Separator function used by the tokenizer
225  Separator separator_;
226  /// Line tokenizer (container like object).
227  BT line_tokenizer_;
228 };
229 
230 
231 
232 
233 #endif /* TOKENIZER_HH_ */
UnitSI operator*(const UnitSI &a, const UnitSI &b)
Product of two units.
Definition: unit_si.cc:172
Dedicated class for storing path to input and output files.
Definition: file_path.hh:32