Flow123d
tokenizer.hh
Go to the documentation of this file.
1 /*
2  * tokenizer.hh
3  *
4  * Created on: Nov 9, 2012
5  * Author: jb
6  */
7 
8 #ifndef TOKENIZER_HH_
9 #define TOKENIZER_HH_
10 
#include <istream>
#include <string>

#include <boost/tokenizer.hpp>
13 
14 
15 class FilePath;
16 
17 
18 /**
19  * @brief Simple class for parsing text files.
20  *
 * The Boost library provides a nice tokenizer. The string is viewed as a container of tokens and
 * you can iterate over them. This class simplifies the usage of the boost tokenizer and further simplifies
 * reading of text files. The actual tokenizer uses backslash '\\' as the escape character, double quote '"' as the quotation
 * character, and space ' ' or tabulator '\\t' as the separator of tokens.
25  *
 * !! The token separator @p escaped_list_separator does not provide the possibility to merge several consecutive
 * separator characters into one. Consequently, empty tokens appear whenever more than one space
 * separates two tokens. To overcome this, we drop every empty token.
29  *
30  * Provides:
31  * - method to read @p next_line, automatically skipping empty lines
32  * - iterating over tokens on current line
33  * - number of lines that the tokenizer has read -- method line_num
34  *
35  * Example of usage:
36  * @code
37  * Tokenizer(in_stream);
38  * @endcode
39  *
40  * TODO:
41  * - method to reopen - skip to the beginning
42  *
43  *
44  */
class Tokenizer {
public:
    /**
     * Shortcut for the boost tokenizer type used to split one line into tokens.
     */
    typedef boost::escaped_list_separator<char> Separator;
    //typedef boost::tokenizer<boost::char_separator<char> > BT;
    typedef boost::tokenizer<Separator> BT;

    /**
     * Opens the file given by file path @p fp and constructs the tokenizer over the
     * input stream for this file.
     * The stream is read from its actual position. The separator of the tokens is
     * either tabulator '\\t' or space ' '.
     */
    Tokenizer(const FilePath &fp);
    /**
     * Constructs the tokenizer over the given input stream @p in.
     * The stream is read from its actual position. The separator of the tokens is
     * either tabulator '\\t' or space ' '.
     *
     * Unfortunately, std::istream can not be passed by value nor by const reference. Thus you can not write, e.g.
     *
     * @code Tokenizer( ifstream("my_file") );
     */
    Tokenizer( std::istream &in);

    /**
     * Skip the whole line if the beginning of the trimmed line matches the given @p pattern string.
     *
     * TODO: Allow end line comments.
     */
    void set_comment_pattern( const std::string &pattern);
    /**
     * Skip forward to the line that matches the given string.
     * The tokenizer is set to the beginning of that line.
     * Returns true if the @p pattern has been found before the end of file.
     * Optionally, if the parameter @p end_search_pattern is provided, the search is
     * stopped after searching for @p pattern on the line where @p end_search_pattern was detected.
     * The next line is not read.
     *
     * TODO: similar method that uses regular expressions (e.g. from boost)
     * TODO: add option to find the pattern in the whole file (wrap over the end of file)
     */
    bool skip_to(const std::string &pattern, const std::string &end_search_pattern="");

    /**
     * Drops remaining tokens on the current line and reads a new one.
     * A warning is reported in the case of unprocessed tokens.
     * Lines without any tokens are skipped, but are counted into the
     * number reported by @p line_num. Returns false if we reach the end of file,
     * otherwise returns true.
     *
     * The optional parameter @p assert_for_remaining_tokens can be set false if you
     * want to ignore remaining tokens on the current line. Otherwise a warning for the user is
     * produced since possibly there is an error in the data format.
     */
    bool next_line(bool assert_for_remaining_tokens=true);
    /**
     * Dereference of the tokenizer iterator. Returns a reference to the string
     * that contains the current token.
     */
    const std::string & operator *() const;

    /**
     * Moves to the next token on the line, skipping empty tokens that result
     * from consecutive separator characters.
     */
    inline BT::iterator & operator ++() {
      if (! eol()) {position_++; ++tok_;}
      // skip empty tokens (consecutive separators)
      while (! eol() && (*tok_).size()==0 ) {position_++; ++tok_;}
      return tok_;
    }

    /**
     * Returns true if the iterator is over the last token on the current line.
     */
    inline bool eol() const
        { return tok_ == line_tokenizer_.end(); }

    /**
     * Returns true if at the end of the input stream.
     */
    inline bool eof() const
        { return in_->eof(); }

    /**
     * Returns the position on the line (advanced once per token step in operator ++).
     */
    inline unsigned int pos() const
        { return position_;}

    /**
     * Returns the number of lines read by the tokenizer.
     * After the first call of @p next_line this returns '1'.
     */
    inline unsigned int line_num() const
        {return line_counter_;}

    /**
     * Returns the file name.
     */
    inline const std::string &f_name() const
        {return f_name_;}

    /**
     * Returns the full position description (file name, line, position on line).
     */
    std::string position_msg() const;

    /**
     * Read access to the current input line.
     */
    inline const std::string &line() const
        { return line_;}

    /**
     * The destructor closes the file if it was opened by the tokenizer itself.
     */
    ~Tokenizer();

private:
    // reset tokenizer for actual line
    void set_tokenizer();

    /// File name (for better error messages)
    std::string f_name_;
    /// Pointer to the internal stream, if the tokenizer is constructed from a FilePath object; owned and closed by this class.
    std::ifstream *own_stream_;
    /// Input stream.
    std::istream *in_;
    /// Current line
    std::string line_;
    /// Possible comment pattern
    std::string comment_pattern_;

    /// Number of lines read by the tokenizer.
    unsigned int line_counter_;
    /// Position on the current line; incremented once per token step in operator ++.
    unsigned int position_;

    /// Line token iterator
    BT::iterator tok_;
    /// Separator function used by the tokenizer
    Separator separator_;
    /// Line tokenizer (container like object).
    BT line_tokenizer_;
};
194 
195 
196 
197 
198 #endif /* TOKENIZER_HH_ */