From 5661b404b0104c3065a40ad622bdd3c11d748a99 Mon Sep 17 00:00:00 2001 From: Karen Arutyunov Date: Thu, 20 Apr 2017 17:31:26 +0300 Subject: Implement string_parser --- butl/buildfile | 1 + butl/string-parser | 23 ++++++--- butl/string-parser.cxx | 132 +++++++++++++++++++++++++++++++++++++++++++++++++ butl/tab-parser | 16 ++---- butl/tab-parser.cxx | 116 ++++++++++++------------------------------- 5 files changed, 186 insertions(+), 102 deletions(-) create mode 100644 butl/string-parser.cxx (limited to 'butl') diff --git a/butl/buildfile b/butl/buildfile index 328afc0..75217fa 100644 --- a/butl/buildfile +++ b/butl/buildfile @@ -28,6 +28,7 @@ lib{butl}: \ {hxx ixx cxx}{ sendmail } \ {hxx cxx}{ sha256 } \ {hxx }{ small-vector } \ + {hxx cxx}{ string-parser } \ {hxx txx }{ string-table } \ {hxx cxx}{ tab-parser } \ {hxx cxx}{ target-triplet } \ diff --git a/butl/string-parser b/butl/string-parser index e3fa507..74391cd 100644 --- a/butl/string-parser +++ b/butl/string-parser @@ -8,6 +8,7 @@ #include #include #include // size_t +#include // pair #include // invalid_argument #include @@ -18,9 +19,9 @@ namespace butl { public: invalid_string (std::size_t p, const std::string& d) - : invalid_argument (d), pos (p) {} + : invalid_argument (d), position (p) {} - std::size_t pos; // Zero-based. + std::size_t position; // Zero-based. }; class LIBBUTL_EXPORT string_parser @@ -31,14 +32,24 @@ namespace butl // return one-level unquoted values. Throw invalid_string in case of // invalid quoting. // - std::vector - parse_quoted (const string&, bool unquote); + static std::vector + parse_quoted (const std::string&, bool unquote); + + // As above but return a list of string and zero-based position pairs. + // Position is useful for issuing diagnostics about an invalid string + // during second-level parsing. + // + static std::vector> + parse_quoted_position (const std::string&, bool unquote); // Remove a single level of quotes. Note that the format or the // correctness of the quotation is not validated. // - std::string - unquote (const string&); + static std::string + unquote (const std::string&); + + static std::vector + unquote (const std::vector&); }; } diff --git a/butl/string-parser.cxx b/butl/string-parser.cxx new file mode 100644 index 0000000..9228d66 --- /dev/null +++ b/butl/string-parser.cxx @@ -0,0 +1,132 @@ +// file : butl/string-parser.cxx -*- C++ -*- +// copyright : Copyright (c) 2014-2017 Code Synthesis Ltd +// license : MIT; see accompanying LICENSE file + +#include + +#include // move() + +using namespace std; + +namespace butl +{ + // Utility functions + // + inline static bool + space (char c) noexcept + { + return c == ' ' || c == '\t'; + } + + // string_parser + // + vector> string_parser:: + parse_quoted_position (const string& s, bool unquote) + { + vector> r; + for (auto b (s.begin ()), i (b), e (s.end ()); i != e; ) + { + for (; i != e && space (*i); ++i) ; // Skip spaces. + + if (i == e) // No more strings. + break; + + string s; + char quoting ('\0'); // Current quoting mode, can be used as bool. + size_t pos (i - b); // String position. + + for (; i != e; ++i) + { + char c (*i); + + if (!quoting) + { + if (space (c)) // End of string. + break; + + if (c == '"' || c == '\'') // Begin of quoted substring. + { + quoting = c; + + if (!unquote) + s += c; + + continue; + } + } + else if (c == quoting) // End of quoted substring. + { + quoting = '\0'; + + if (!unquote) + s += c; + + continue; + } + + s += c; + } + + if (quoting) + throw invalid_string (i - b, "unterminated quoted string"); + + r.emplace_back (move (s), pos); + } + + return r; + } + + vector string_parser:: + parse_quoted (const string& s, bool unquote) + { + vector> sp (parse_quoted_position (s, unquote)); + + vector r; + r.reserve (sp.size ()); + for (auto& s: sp) + r.emplace_back (move (s.first)); + + return r; + } + + string string_parser:: + unquote (const string& s) + { + string r; + char quoting ('\0'); // Current quoting mode, can be used as bool. + + for (auto i (s.begin ()), e (s.end ()); i != e; ++i) + { + char c (*i); + + if (!quoting) + { + if (c == '"' || c == '\'') // Begin of quoted substring. + { + quoting = c; + continue; + } + } + else if (c == quoting) // End of quoted substring. + { + quoting = '\0'; + continue; + } + + r += c; + } + + return r; + } + + vector string_parser:: + unquote (const vector& v) + { + vector r; + r.reserve (v.size ()); + for (auto& s: v) + r.emplace_back (unquote (s)); + + return r; + } +} diff --git a/butl/tab-parser b/butl/tab-parser index 6aa0705..f140b71 100644 --- a/butl/tab-parser +++ b/butl/tab-parser @@ -7,14 +7,12 @@ #include #include -#include +#include #include // uint64_t #include // runtime_error #include -#include - namespace butl { class LIBBUTL_EXPORT tab_parsing: public std::runtime_error @@ -52,11 +50,11 @@ namespace butl // supported. Blank lines and lines that start with # (collectively called // empty lines) are ignored. // - class LIBBUTL_EXPORT tab_parser: protected butl::char_scanner + class LIBBUTL_EXPORT tab_parser { public: tab_parser (std::istream& is, const std::string& name) - : char_scanner (is), name_ (name) {} + : is_ (is), name_ (name) {} // Return next line of fields. Skip empty lines. Empty result denotes the // end of stream. @@ -65,13 +63,9 @@ namespace butl next (); private: - // Skip spaces and return the first peeked non-space character. - // - xchar - skip_spaces (); - - private: + std::istream& is_; const std::string name_; + std::uint64_t line_ = 0; }; } diff --git a/butl/tab-parser.cxx b/butl/tab-parser.cxx index bae9327..4743e69 100644 --- a/butl/tab-parser.cxx +++ b/butl/tab-parser.cxx @@ -7,6 +7,8 @@ #include #include +#include + using namespace std; namespace butl @@ -19,106 +21,50 @@ namespace butl next () { tab_fields r; - xchar c (skip_spaces ()); // Skip empty lines and leading spaces. - - auto eol = [&c] () -> bool {return eos (c) || c == '\n';}; - auto space = [&c] () -> bool {return c == ' ' || c == '\t';}; - auto next = [&c, this] () {get (); c = peek ();}; - - r.line = c.line; - // Read line fields until eos or the newline character. + // Read lines until a non-empty one or EOF is encountered. In the first + // case parse the line and bail out. // - while (!eol ()) + // Note that we check for character presence in the stream prior to the + // getline() call, to prevent it from setting the failbit. + // + while (!is_.eof () && is_.peek () != istream::traits_type::eof ()) { - for (; !eol () && space (); next ()) ; // Skip space characters. + string s; + getline (is_, s); - if (eol ()) // No more fields. - break; + ++line_; - // Read the field. Here we scan until the first whitespace character that - // appears out of quotes. + // Skip empty line. // - tab_field tf ({string (), c.column}); - char quoting ('\0'); // Current quoting mode, can be used as bool. - - for (; !eol (); next ()) - { - if (!quoting) - { - if (space ()) // End of the field. - break; - else if (c == '"' || c == '\'') // Begin of quoted string. - quoting = c; - } - else if (c == quoting) // End of quoted string. - quoting = '\0'; - - tf.value += c; - } + auto i (s.begin ()); + auto e (s.end ()); + for (; i != e && (*i == ' ' || *i == '\t'); ++i) ; // Skip spaces. - if (quoting) - throw parsing (name_, c.line, c.column, "unterminated quoted string"); + if (i == e || *i == '#') + continue; - r.emplace_back (move (tf)); - } + r.line = line_; + r.end_column = s.size () + 1; // Newline position. - r.end_column = c.column; + vector> sp; - // Read out eof or newline character from the stream. Note that "reading" - // eof multiple times is safe. - // - get (); - return r; - } - - tab_parser::xchar tab_parser:: - skip_spaces () - { - xchar c (peek ()); - bool start (c.column == 1); - - for (; !eos (c); c = peek ()) - { - switch (c) + try { - case ' ': - case '\t': - break; - case '\n': - { - // Skip empty lines. - // - if (!start) - return c; - - break; - } - case '#': - { - // We only recognize '#' as a start of a comment at the beginning - // of the line (sans leading spaces). - // - if (!start) - return c; - - get (); - - // Read until newline or eos. - // - for (c = peek (); !eos (c) && c != '\n'; c = peek ()) - get (); - - continue; - } - default: - return c; // Not a space. + sp = string_parser::parse_quoted_position (s, false); + } + catch (const invalid_string& e) + { + throw parsing (name_, line_, e.position + 1, e.what ()); } - get (); + for (auto& s: sp) + r.emplace_back (tab_field ({move (s.first), s.second + 1})); + + break; } - return c; + return r; } // tab_parsing -- cgit v1.1