Flow123d  release_2.2.0-914-gf1a3a4f
tokenizer.hh
Go to the documentation of this file.
1 /*!
2  *
3  * Copyright (C) 2015 Technical University of Liberec. All rights reserved.
4  *
5  * This program is free software; you can redistribute it and/or modify it under
6  * the terms of the GNU General Public License version 3 as published by the
7  * Free Software Foundation. (http://www.gnu.org/licenses/gpl-3.0.en.html)
8  *
9  * This program is distributed in the hope that it will be useful, but WITHOUT
10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
11  * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
12  *
13  *
14  * @file tokenizer.hh
15  * @brief
16  */
17 
18 #ifndef TOKENIZER_HH_
19 #define TOKENIZER_HH_
20 
21 #include <boost/tokenizer.hpp>
22 #include <istream>
23 
24 
25 class FilePath;
26 
27 
28 /**
29  * @brief Simple class for parsing text files.
30  *
31  * Boost library provides nice tokenizer. The string is viewed as a container of tokens and
32  * you can iterate over them. This class simplify the usage of the boost's tokenizer and further simplify
33  * reading of the text files. Actual tokenizer use backslash '\\' as the escape character and double quotas '"' as quotation
34  * character. The separator of tokens can be set in constructor, default value is space ' ' or tabelator '\\t'.
35  *
36  * !! Used token separator @p escaped_list_separator do not provide possibility to merge several consecutive
37  * separator characters into one separator. Consequently, there appears empty tokens when there more spaces
38  * then one separating tokens. To overcome this, we drop every empty token.
39  *
40  * Provides:
41  * - method to read @p next_line, automatically skipping empty lines
42  * - iterating over tokens on current line
43  * - number of lines that the tokenizer has read -- method line_num
44  *
45  * Example of usage:
46  * @code
47  * Tokenizer(in_stream);
48  * @endcode
49  *
50  * TODO:
51  * - method to reopen - skip to the beginning
52  *
53  *
54  */
55 class Tokenizer {
56 public:
57  /**
58  * Shortcut for boost tokenizer.
59  */
60  typedef boost::escaped_list_separator<char> Separator;
61  typedef boost::tokenizer<Separator> BT;
62 
63  /**
64  * Struct represents actual position of Tokenizer in file.
65  *
66  * It is necessary to check if stored values are correct. Out of Tokenizer values can be set
67  * only during construction. Entered file_position_ must correspond with values line_counter_
68  * and line_position_. Unfortunately, any control mechanism of entered values doesn't exist.
69  * If Position object is returned out of Tokenizer, value of file_position_ must be set
70  * according to the position of Tokenizer.
71  */
72  struct Position {
73  std::streampos file_position_; ///< Actual (global) position in file.
74  unsigned int line_counter_; ///< Actual line in file.
75  unsigned int line_position_; ///< Actual position in line.
76 
77  /// Empty constructor
78  Position():
79  file_position_(0), line_counter_(0), line_position_(0) {}
80 
81  /// Constructor
82  Position(std::streampos file_pos, unsigned int line, unsigned int line_pos):
83  file_position_(file_pos), line_counter_(line), line_position_(line_pos) {}
84  };
85 
86 
87  /**
88  * Opens a file given by file path @p fp. And construct the tokenizer over the
89  * input stream for this file.
90  * The stream is read from its actual position. Default value of the separator of the tokens is
91  * either tabelator '\\t' or space ' ' and can be overwrite.
92  *
93  */
94  Tokenizer(const FilePath &fp, Separator separator = Separator("\\"," \t","\"") );
95  /**
96  * Construct the tokenizer over given input stream @p in.
97  * The stream is read from its actual position. Default value of the separator of the tokens is
98  * either tabelator '\\t' or space ' ' and can be overwrite.
99  *
100  * Unfortunately, std::istream can not be passed by value nor by const reference. Thus you can not write, e.g.
101  *
102  * @code Tokenizer( ifstream("my_file") );
103  *
104  */
105  Tokenizer( std::istream &in, Separator separator = Separator("\\"," \t","\"") );
106 
107  /**
108  * Skip whole line if the beginning of the trimmed line match the given @p pattern string.
109  *
110  * TODO: Allow end line comments.
111  */
112  void set_comment_pattern( const std::string &pattern);
113  /**
114  * Skip forward to the line that match given string.
115  * The tokenizer is set to the begining of that line.
116  * Returns true if the @p pattern has been found before the end of file.
117  * Optionally, if the parameter @p end_search_pattern is provided, the search is
118  * stopped after search for @p pattern on the line where @p end_search_pattern was detected.
119  * Next line is not read.
120  *
121  * TODO: similar method that use regular expressions (e.g. from boost)
122  * TODO: add option to find the pattern in the whole file (wrap over the end of file)
123  */
124  bool skip_to(const std::string &pattern, const std::string &end_search_pattern="");
125 
126  /**
127  * Drops remaining tokens on the current line and reads the new one.
128  * A warning is reported in the case of unprocessed tokens.
129  * The lines without any tokens are skipped, but counted into
130  * number reported by @p line_num. Retuns false if we reach the end of file
131  * otherwise returns true.
132  *
133  * Optional parameter @p assert_for_remaining_tokens can be set false if you
134  * want to ignore remaining tokens on current line. Otherwise an warning for the user is
135  * produced since possibly there is error in the data format.
136  */
137  bool next_line(bool assert_for_remaining_tokens=true);
138  /**
139  * Dereference of the tokenizer iterator. Returns reference to the string
140  * that contains current token.
141  */
142  const std::string & operator *() const;
143 
144  /**
145  * Moves to the next token on the line.
146  */
147  inline BT::iterator & operator ++() {
148  if (! eol()) {position_.line_position_++; ++tok_;}
149  // skip empty tokens (consecutive separators)
150  while (! eol() && (*tok_).size()==0 ) {position_.line_position_++; ++tok_;}
151  return tok_;
152  }
153 
154  /**
155  * Returns true if the iterator is over the last token on the current line.
156  */
157  inline bool eol() const
158  { return tok_ == line_tokenizer_.end(); }
159 
160  /**
161  * Returns true if at the end of the input stream.
162  */
163  inline bool eof() const
164  { return in_->eof(); }
165 
166  /**
167  * Returns position on line.
168  */
169  inline unsigned int pos() const
170  { return position_.line_position_;}
171 
172  /**
173  * Returns number of lines read by the tokenizer.
174  * After first call of @p next_line this returns '1'.
175  */
176  inline unsigned int line_num() const
177  {return position_.line_counter_;}
178 
179  /**
180  * Returns file name.
181  */
182  inline const std::string &f_name() const
183  {return f_name_;}
184 
185  /**
186  * Returns full position description.
187  */
188  std::string position_msg() const;
189 
190  /**
191  * Read access to current input line.
192  */
193  inline const std::string &line() const
194  { return line_;}
195 
196  /**
197  * Returns actual position in file.
198  */
199  const Tokenizer::Position get_position();
200 
201  /**
202  * Set new position of tokenizer in file.
203  *
204  * Warning! Actual file_position_ must correspond with values line_counter_
205  * and line_position_. Method can't check if the values are entered correctly.
206  */
207  void set_position(const Tokenizer::Position pos);
208 
209  /**
210  * Destructor close the file if it was opened by tokenizer itself.
211  */
212  ~Tokenizer();
213 
214 protected:
215  // reset tokenizer for actual line
216  void set_tokenizer();
217 
218  /// File name (for better error messages)
219  std::string f_name_;
220  /**
221  * Internal stream , if tokenizer is constructed form FilePath object.
222  * Automatic destruction.
223  */
224  std::ifstream *own_stream_;
225  /// Input stream.
226  std::istream *in_;
227  /// Current line
228  std::string line_;
229  /// Possible comment pattern
230  std::string comment_pattern_;
231 
232  /// Number of liner read by the tokenizer.
233  Position position_;
234 
235  /// Line token iterator
236  BT::iterator tok_;
237  /// Separator function used by the tokenizer
238  Separator separator_;
239  /// Line tokenizer (container like object).
240  BT line_tokenizer_;
241 };
242 
243 
244 
245 
246 #endif /* TOKENIZER_HH_ */
UnitSI operator*(const UnitSI &a, const UnitSI &b)
Product of two units.
Definition: unit_si.cc:235
Dedicated class for storing path to input and output files.
Definition: file_path.hh:54