3.9.1_703bcf/doxygen/tokenizer_8hh_source.html

/*!

 *

 * Copyright (C) 2015 Technical University of Liberec.  All rights reserved.

 *

 * This program is free software; you can redistribute it and/or modify it under

 * the terms of the GNU General Public License version 3 as published by the

 * Free Software Foundation. (http://www.gnu.org/licenses/gpl-3.0.en.html)

 *

 * This program is distributed in the hope that it will be useful, but WITHOUT

 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS

 * FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more details.

 *

 *

 * @file    tokenizer.hh

 * @brief

 */


#ifndef TOKENIZER_HH_

#define TOKENIZER_HH_


#include <boost/tokenizer.hpp>

#include <istream>

#include "system/exceptions.hh"


class FilePath;


/**

 * @brief Simple class for parsing text files.

 *

 * The Boost library provides a nice tokenizer. The string is viewed as a container of tokens and

 * you can iterate over them. This class simplifies the usage of the Boost tokenizer and further simplifies

 * reading of text files. The current tokenizer uses backslash '\\' as the escape character and double quotes '"' as the quotation

 * character. The separator of tokens can be set in the constructor; the default is a space ' ' or a tab '\\t'.

 *

 * !! The token separator used, @p escaped_list_separator, does not provide a way to merge several consecutive

 * separator characters into one separator. Consequently, empty tokens appear when more than one space

 * separates two tokens. To overcome this, we drop every empty token.

 *

 * Provides:

 * - method to read @p next_line, automatically skipping empty lines

 * - iterating over tokens on current line

 * - number of lines that the tokenizer has read -- method line_num

 *

 * Example of usage:

 * @code

 * Tokenizer(in_stream);

 * @endcode

 *

 * TODO:

 * - method to reopen - skip to the beginning

 *

 *

 */

class Tokenizer {
public:
    /// Separator functor of the Boost tokenizer (escape, separator and quote characters).
    typedef boost::escaped_list_separator<char> Separator;
    /// Shortcut for the Boost tokenizer using @p Separator.
    typedef boost::tokenizer<Separator> BT;

    TYPEDEF_ERR_INFO( EI_File, std::string);
    TYPEDEF_ERR_INFO( EI_Line, unsigned int);
    TYPEDEF_ERR_INFO( EI_Pos, unsigned int);
    DECLARE_EXCEPTION( ExcMissingToken, << "Missing token, file: " << EI_File::qval << ", line: " << EI_Line::qval << ", position: " << EI_Pos::qval << ".\n" );
    DECLARE_EXCEPTION( ExcCannotRead, << "Can not read from stream, file: " << EI_File::qval << ", line: " << EI_Line::qval << ".\n" );

    /**
     * Struct representing the actual position of the Tokenizer in a file.
     *
     * The user is responsible for the consistency of the stored values. Outside of the
     * Tokenizer the values are intended to be set only during construction, and the entered
     * @p file_position_ must correspond to @p line_counter_ and @p line_position_.
     * There is no mechanism that checks the entered values.
     * If a Position object is returned from the Tokenizer, the value of @p file_position_
     * must be set according to the position of the Tokenizer.
     */
    struct Position {
        std::streampos file_position_;     ///< Actual (global) position in file.
        unsigned int line_counter_;        ///< Actual line in file.
        unsigned int line_position_;       ///< Actual position in line.

        /// Default constructor, sets all three values to zero.
        Position():
            file_position_(0), line_counter_(0), line_position_(0) {}

        /// Constructor setting the file position, the line counter and the position in line.
        Position(std::streampos file_pos, unsigned int line, unsigned int line_pos):
            file_position_(file_pos), line_counter_(line), line_position_(line_pos) {}
    };

    /**
     * Opens the file given by file path @p fp and constructs the tokenizer over the
     * input stream for this file.
     * The stream is read from its actual position. The default separator of tokens is
     * either a tab '\\t' or a space ' ' and can be overridden by @p separator.
     */
    Tokenizer(const  FilePath &fp, Separator separator = Separator("\\"," \t","\"") );

    /**
     * Constructs the tokenizer over the given input stream @p in.
     * The stream is read from its actual position. The default separator of tokens is
     * either a tab '\\t' or a space ' ' and can be overridden by @p separator.
     *
     * Unfortunately, std::istream can be passed neither by value nor by const reference.
     * Thus you can not write, e.g.
     *
     * @code
     * Tokenizer( ifstream("my_file") );
     * @endcode
     */
    Tokenizer( std::istream &in, Separator separator = Separator("\\"," \t","\"") );

    /**
     * Skip the whole line if the beginning of the trimmed line matches the given @p pattern string.
     *
     * TODO: Allow end line comments.
     */
    void set_comment_pattern( const std::string &pattern);

    /**
     * Skip forward to the line that matches the given string.
     * The tokenizer is set to the beginning of that line.
     * Returns true if the @p pattern has been found before the end of file.
     * Optionally, if the parameter @p end_search_pattern is provided, the search is
     * stopped after the search for @p pattern on the line where @p end_search_pattern was detected.
     * The next line is not read.
     *
     * TODO: similar method that use regular expressions (e.g. from boost)
     * TODO: add option to find the pattern in the whole file (wrap over the end of file)
     */
    bool skip_to(const std::string &pattern, const std::string &end_search_pattern="");

    /**
     * Drops remaining tokens on the current line and reads a new one.
     * A warning is reported in the case of unprocessed tokens.
     * Lines without any tokens are skipped, but counted into the
     * number reported by @p line_num. Returns false if we reach the end of file,
     * otherwise returns true.
     *
     * The optional parameter @p assert_for_remaining_tokens can be set to false if you
     * want to ignore remaining tokens on the current line. Otherwise a warning for the user is
     * produced since there is possibly an error in the data format.
     */
    bool next_line(bool assert_for_remaining_tokens=true);

    /**
     * Dereference of the tokenizer iterator. Returns a reference to the string
     * that contains the current token.
     */
    const std::string & operator *() const;

    /**
     * Moves to the next non-empty token on the line.
     *
     * Does nothing when the iterator is already at the end of the line. Empty tokens
     * (produced by consecutive separators) are skipped; the position in line is
     * incremented for every token passed, including the skipped empty ones.
     * Returns a reference to the internal token iterator.
     */
    inline BT::iterator & operator ++() {
        if (! eol()) { ++position_.line_position_; ++tok_; }
        // skip empty tokens (consecutive separators)
        while (! eol() && (*tok_).empty()) { ++position_.line_position_; ++tok_; }
        return tok_;
    }

    /**
     * Returns true if the iterator is past the last token on the current line.
     */
    inline bool eol() const
        { return tok_ == line_tokenizer_.end(); }

    /**
     * Returns true if at the end of the input stream.
     */
    inline bool eof() const
        { return in_->eof(); }

    /**
     * Returns position on line.
     */
    inline unsigned int pos() const
        { return position_.line_position_; }

    /**
     * Returns number of lines read by the tokenizer.
     * After the first call of @p next_line this returns '1'.
     */
    inline unsigned int line_num() const
        { return position_.line_counter_; }

    /**
     * Returns file name.
     */
    inline const std::string &f_name() const
        { return f_name_; }

    /**
     * Returns full position description.
     */
    std::string position_msg() const;

    /**
     * Read access to current input line.
     */
    inline const std::string &line() const
        { return line_; }

    /**
     * Returns actual position in file.
     */
    const Tokenizer::Position get_position();

    /**
     * Set new position of tokenizer in file.
     *
     * Warning! Actual file_position_ must correspond with values line_counter_
     * and line_position_. The method can't check if the values are entered correctly.
     */
    void set_position(const Tokenizer::Position pos);

    /**
     * Destructor closes the file if it was opened by the tokenizer itself.
     */
    ~Tokenizer();

    /**
     * Copying is disabled.
     *
     * The tokenizer may own the stream kept in @p own_stream_ (released by the destructor),
     * so an implicit member-wise copy would make two objects responsible for the same stream.
     * NOTE(review): the token iterator presumably also refers to the line buffer of the
     * source object, which a member-wise copy would not rebind - confirm in the implementation.
     */
    Tokenizer(const Tokenizer &) = delete;
    Tokenizer & operator=(const Tokenizer &) = delete;

protected:
    /// Reset the line tokenizer for the actual line.
    void set_tokenizer();

    /// File name (for better error messages)
    std::string f_name_;

    /**
     * Internal stream, used if the tokenizer is constructed from a FilePath object.
     * Automatic destruction.
     */
    std::ifstream *own_stream_;

    /// Input stream.
    std::istream *in_;

    /// Current line
    std::string line_;

    /// Possible comment pattern
    std::string comment_pattern_;

    /// Actual position of the tokenizer (position in file, line counter, position in line).
    Position position_;

    /// Line token iterator
    BT::iterator tok_;

    /// Separator function used by the tokenizer
    Separator separator_;

    /// Line tokenizer (container like object).
    BT line_tokenizer_;
};


#endif /* TOKENIZER_HH_ */