#include #include "SBuf.h" /** * Efficiently converts raw input into a stream of basic tokens. * Custom token boundary/separation rules are supported via caller-provided, * pre-computed character sets. The caller (a parser of some kind) defines * the input grammar by using an appropriate sequence of token(), prefix(), * and skip() calls, with the right parameters restricting token composition. */ class Tokenizer { public: /// a collection of unique characters; TODO: support negation, merging typedef std::set CharacterSet; // TODO: optimize using a bool array explicit Tokenizer(const SBuf &inBuf); bool atEnd() const { return !buf_.length(); } const SBuf &remaining() const { return buf_; } void reset(const SBuf &newBuf) { buf_ = newBuf; } /* The following methods start from the beginning of the input buffer. * They return true and consume parsed chars if a non-empty token is found. * Otherwise, they return false without any side-effects. */ /** Basic strtok(3): * Skips all leading delimiters (if any), * accumulates all characters up to the first delimiter (a token), and * skips all trailing delimiters (if any). * Want to extract delimiters? Use three prefix() calls instead. */ bool token(SBuf &token, const CharacterSet &whitespace); /// Accumulates all sequential permitted characters (a token). bool prefix(SBuf &token, const CharacterSet &tokenChars); /// Skips all sequential permitted characters (a token). bool skip(const CharacterSet &tokenChars); /// Skips a given token. bool skip(const SBuf &token); /// Skips a given character (a token). bool skip(const char token); private: SBuf buf_; ///< yet unparsed input };