Flow123d  JS_before_hm-1621-g63a12c7
tokenizer.hh
/*!
 *
 * Copyright (C) 2015 Technical University of Liberec. All rights reserved.
 *
 * This program is free software; you can redistribute it and/or modify it under
 * the terms of the GNU General Public License version 3 as published by the
 * Free Software Foundation. (http://www.gnu.org/licenses/gpl-3.0.en.html)
 *
 * This program is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 *
 *
 * @file tokenizer.hh
 * @brief
 */

#ifndef TOKENIZER_HH_
#define TOKENIZER_HH_

#include <boost/tokenizer.hpp>
#include <istream>
#include "system/exceptions.hh"


class FilePath;


/**
 * @brief Simple class for parsing text files.
 *
 * The Boost library provides a nice tokenizer: a string is viewed as a container of tokens
 * that you can iterate over. This class simplifies the usage of Boost's tokenizer and further
 * simplifies reading of text files. The tokenizer uses backslash '\\' as the escape character
 * and double quote '"' as the quotation character. The token separator can be set in the
 * constructor; the default separators are space ' ' and tabulator '\\t'.
 *
 * !! The used token separator @p escaped_list_separator does not merge several consecutive
 * separator characters into one. Consequently, empty tokens appear whenever more than one
 * separator character lies between two tokens. To overcome this, we drop every empty token.
 *
 * Provides:
 * - method @p next_line to read the next line, automatically skipping empty lines
 * - iterating over tokens on the current line
 * - number of lines that the tokenizer has read -- method @p line_num
 *
 * Example of usage:
 * @code
 * Tokenizer tok(in_stream);
 * @endcode
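 *
 * A slightly fuller sketch of the intended read loop (the variable names and the
 * token handling below are illustrative, not prescribed by this header):
 * @code
 * Tokenizer tok(in_stream);
 * while (tok.next_line(false)) {            // read lines, ignore leftover tokens
 *     while (!tok.eol()) {
 *         const std::string &token = *tok;  // current token on the line
 *         // ... process token ...
 *         ++tok;                            // move to the next token
 *     }
 * }
 * @endcode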
 *
 * TODO:
 * - method to reopen - skip to the beginning
 *
 *
 */
class Tokenizer {
public:
    /**
     * Shortcut for the Boost tokenizer.
     */
    typedef boost::escaped_list_separator<char> Separator;
    typedef boost::tokenizer<Separator> BT;

    TYPEDEF_ERR_INFO( EI_File, std::string);
    TYPEDEF_ERR_INFO( EI_Line, unsigned int);
    TYPEDEF_ERR_INFO( EI_Pos, unsigned int);
    DECLARE_EXCEPTION( ExcMissingToken, << "Missing token, file: " << EI_File::qval << ", line: " << EI_Line::qval << ", position: " << EI_Pos::qval << ".\n" );
    DECLARE_EXCEPTION( ExcCannotRead, << "Can not read from stream, file: " << EI_File::qval << ", line: " << EI_Line::qval << ".\n" );

    /**
     * Struct representing the actual position of the Tokenizer in the file.
     *
     * The user is responsible for the consistency of the stored values. Outside of the
     * Tokenizer the values can be set only during construction; the entered file_position_
     * must correspond to the values line_counter_ and line_position_, and there is no
     * mechanism that checks the entered values. If a Position object is obtained from the
     * Tokenizer, file_position_ is set according to the actual position of the Tokenizer.
     */
    struct Position {
        std::streampos file_position_;  ///< Actual (global) position in the file.
        unsigned int line_counter_;     ///< Actual line in the file.
        unsigned int line_position_;    ///< Actual position in the line.

        /// Empty constructor
        Position():
            file_position_(0), line_counter_(0), line_position_(0) {}

        /// Constructor
        Position(std::streampos file_pos, unsigned int line, unsigned int line_pos):
            file_position_(file_pos), line_counter_(line), line_position_(line_pos) {}
    };


    /**
     * Opens the file given by the file path @p fp and constructs the tokenizer over the
     * input stream of this file.
     * The stream is read from its actual position. The default token separators are
     * tabulator '\\t' and space ' ' and can be overridden.
     *
     */
    Tokenizer(const FilePath &fp, Separator separator = Separator("\\"," \t","\"") );
    /**
     * Constructs the tokenizer over the given input stream @p in.
     * The stream is read from its actual position. The default token separators are
     * tabulator '\\t' and space ' ' and can be overridden.
     *
     * Unfortunately, std::istream can be passed neither by value nor by const reference. Thus you can not write, e.g.
     *
     * @code Tokenizer( ifstream("my_file") ); @endcode
     *
     */
    Tokenizer( std::istream &in, Separator separator = Separator("\\"," \t","\"") );
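    /*
     * A minimal usage sketch of the stream constructor (the file name "my_file" and the
     * variable names are illustrative only):
     * @code
     * #include <fstream>
     *
     * std::ifstream data_stream("my_file");
     * Tokenizer tok(data_stream);     // the named stream must outlive the tokenizer's use
     * @endcode
     */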

    /**
     * Skips the whole line if the beginning of the trimmed line matches the given @p pattern string.
     *
     * TODO: Allow end-of-line comments.
     */
    void set_comment_pattern( const std::string &pattern);
    /**
     * Skips forward to the line that matches the given string.
     * The tokenizer is set to the beginning of that line.
     * Returns true if the @p pattern has been found before the end of file.
     * Optionally, if the parameter @p end_search_pattern is provided, the search is stopped
     * after the line where @p end_search_pattern was detected has been searched for @p pattern;
     * the next line is not read.
     *
     * TODO: similar method that uses regular expressions (e.g. from Boost)
     * TODO: add option to find the pattern in the whole file (wrap over the end of file)
     */
    bool skip_to(const std::string &pattern, const std::string &end_search_pattern="");
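    /*
     * A usage sketch for @p skip_to (the "$Nodes" section marker is illustrative only,
     * not prescribed by this class):
     * @code
     * if (tok.skip_to("$Nodes")) {
     *     // the matched line is now the current line
     *     tok.next_line(false);     // move past it, ignoring its remaining tokens
     *     // ... read the section ...
     * }
     * @endcode
     */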

    /**
     * Drops the remaining tokens on the current line and reads a new one.
     * A warning is reported if there are unprocessed tokens.
     * Lines without any tokens are skipped, but they are counted into the
     * number reported by @p line_num. Returns false if we reach the end of file,
     * otherwise returns true.
     *
     * The optional parameter @p assert_for_remaining_tokens can be set to false if you
     * want to ignore remaining tokens on the current line. Otherwise a warning for the user
     * is produced, since there is possibly an error in the data format.
     */
    bool next_line(bool assert_for_remaining_tokens=true);
    /**
     * Dereference of the tokenizer iterator. Returns a reference to the string
     * that contains the current token.
     */
    const std::string & operator *() const;

    /**
     * Moves to the next token on the line.
     */
    inline BT::iterator & operator ++() {
        if (! eol()) {position_.line_position_++; ++tok_;}
        // skip empty tokens (consecutive separators)
        while (! eol() && (*tok_).size()==0 ) {position_.line_position_++; ++tok_;}
        return tok_;
    }

    /**
     * Returns true if the iterator is past the last token on the current line.
     */
    inline bool eol() const
        { return tok_ == line_tokenizer_.end(); }

    /**
     * Returns true if at the end of the input stream.
     */
    inline bool eof() const
        { return in_->eof(); }

    /**
     * Returns the position on the current line.
     */
    inline unsigned int pos() const
        { return position_.line_position_;}

    /**
     * Returns the number of lines read by the tokenizer.
     * After the first call of @p next_line this returns '1'.
     */
    inline unsigned int line_num() const
        {return position_.line_counter_;}

    /**
     * Returns the file name.
     */
    inline const std::string &f_name() const
        {return f_name_;}

    /**
     * Returns the full position description.
     */
    std::string position_msg() const;

    /**
     * Read access to the current input line.
     */
    inline const std::string &line() const
        { return line_;}

    /**
     * Returns the actual position in the file.
     */
    const Tokenizer::Position get_position();


    /**
     * Sets a new position of the tokenizer in the file.
     *
     * Warning! The actual file_position_ must correspond to the values line_counter_
     * and line_position_. The method can't check whether the values are entered correctly.
     */
    void set_position(const Tokenizer::Position pos);
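    /*
     * A sketch of saving and restoring a spot in the file (both calls are assumed to be
     * made on the same tokenizer, so the stored counters stay mutually consistent):
     * @code
     * Tokenizer::Position saved = tok.get_position();
     * // ... read ahead ...
     * tok.set_position(saved);      // return to the remembered line and token
     * @endcode
     */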

    /**
     * The destructor closes the file if it was opened by the tokenizer itself.
     */
    ~Tokenizer();

protected:
    // reset the tokenizer for the actual line
    void set_tokenizer();

    /// File name (for better error messages)
    std::string f_name_;
    /**
     * Internal stream, used if the tokenizer is constructed from a FilePath object.
     * Destroyed automatically.
     */
    std::ifstream *own_stream_;
    /// Input stream.
    std::istream *in_;
    /// Current line
    std::string line_;
    /// Possible comment pattern
    std::string comment_pattern_;

    /// Actual position of the tokenizer in the file.
    Position position_;

    /// Line token iterator
    BT::iterator tok_;
    /// Separator function used by the tokenizer
    Separator separator_;
    /// Line tokenizer (container-like object).
    BT line_tokenizer_;
};




#endif /* TOKENIZER_HH_ */