From 5661b404b0104c3065a40ad622bdd3c11d748a99 Mon Sep 17 00:00:00 2001 From: Karen Arutyunov Date: Thu, 20 Apr 2017 17:31:26 +0300 Subject: Implement string_parser --- butl/buildfile | 1 + butl/string-parser | 23 +++++-- butl/string-parser.cxx | 132 +++++++++++++++++++++++++++++++++++++++++ butl/tab-parser | 16 ++--- butl/tab-parser.cxx | 116 ++++++++++-------------------------- tests/string-parser/buildfile | 7 +++ tests/string-parser/driver.cxx | 93 +++++++++++++++++++++++++++++ tests/string-parser/testscript | 42 +++++++++++++ tests/tab-parser/testscript | 7 ++- 9 files changed, 334 insertions(+), 103 deletions(-) create mode 100644 butl/string-parser.cxx create mode 100644 tests/string-parser/buildfile create mode 100644 tests/string-parser/driver.cxx create mode 100644 tests/string-parser/testscript diff --git a/butl/buildfile b/butl/buildfile index 328afc0..75217fa 100644 --- a/butl/buildfile +++ b/butl/buildfile @@ -28,6 +28,7 @@ lib{butl}: \ {hxx ixx cxx}{ sendmail } \ {hxx cxx}{ sha256 } \ {hxx }{ small-vector } \ + {hxx cxx}{ string-parser } \ {hxx txx }{ string-table } \ {hxx cxx}{ tab-parser } \ {hxx cxx}{ target-triplet } \ diff --git a/butl/string-parser b/butl/string-parser index e3fa507..74391cd 100644 --- a/butl/string-parser +++ b/butl/string-parser @@ -8,6 +8,7 @@ #include #include #include // size_t +#include // pair #include // invalid_argument #include @@ -18,9 +19,9 @@ namespace butl { public: invalid_string (std::size_t p, const std::string& d) - : invalid_argument (d), pos (p) {} + : invalid_argument (d), position (p) {} - std::size_t pos; // Zero-based. + std::size_t position; // Zero-based. }; class LIBBUTL_EXPORT string_parser @@ -31,14 +32,24 @@ namespace butl // return one-level unquoted values. Throw invalid_string in case of // invalid quoting. // - std::vector - parse_quoted (const string&, bool unquote); + static std::vector + parse_quoted (const std::string&, bool unquote); + + // As above but return a list of string and zero-based position pairs. + // Position is useful for issuing diagnostics about an invalid string + // during second-level parsing. + // + static std::vector> + parse_quoted_position (const std::string&, bool unquote); // Remove a single level of quotes. Note that the format or the // correctness of the quotation is not validated. // - std::string - unquote (const string&); + static std::string + unquote (const std::string&); + + static std::vector + unquote (const std::vector&); }; } diff --git a/butl/string-parser.cxx b/butl/string-parser.cxx new file mode 100644 index 0000000..9228d66 --- /dev/null +++ b/butl/string-parser.cxx @@ -0,0 +1,132 @@ +// file : butl/string-parser.cxx -*- C++ -*- +// copyright : Copyright (c) 2014-2017 Code Synthesis Ltd +// license : MIT; see accompanying LICENSE file + +#include + +#include // move() + +using namespace std; + +namespace butl +{ + // Utility functions + // + inline static bool + space (char c) noexcept + { + return c == ' ' || c == '\t'; + } + + // string_parser + // + vector> string_parser:: + parse_quoted_position (const string& s, bool unquote) + { + vector> r; + for (auto b (s.begin ()), i (b), e (s.end ()); i != e; ) + { + for (; i != e && space (*i); ++i) ; // Skip spaces. + + if (i == e) // No more strings. + break; + + string s; + char quoting ('\0'); // Current quoting mode, can be used as bool. + size_t pos (i - b); // String position. + + for (; i != e; ++i) + { + char c (*i); + + if (!quoting) + { + if (space (c)) // End of string. + break; + + if (c == '"' || c == '\'') // Begin of quoted substring. + { + quoting = c; + + if (!unquote) + s += c; + + continue; + } + } + else if (c == quoting) // End of quoted substring. + { + quoting = '\0'; + + if (!unquote) + s += c; + + continue; + } + + s += c; + } + + if (quoting) + throw invalid_string (i - b, "unterminated quoted string"); + + r.emplace_back (move (s), pos); + } + + return r; + } + + vector string_parser:: + parse_quoted (const string& s, bool unquote) + { + vector> sp (parse_quoted_position (s, unquote)); + + vector r; + r.reserve (sp.size ()); + for (auto& s: sp) + r.emplace_back (move (s.first)); + + return r; + } + + string string_parser:: + unquote (const string& s) + { + string r; + char quoting ('\0'); // Current quoting mode, can be used as bool. + + for (auto i (s.begin ()), e (s.end ()); i != e; ++i) + { + char c (*i); + + if (!quoting) + { + if (c == '"' || c == '\'') // Begin of quoted substring. + { + quoting = c; + continue; + } + } + else if (c == quoting) // End of quoted substring. + { + quoting = '\0'; + continue; + } + + r += c; + } + + return r; + } + + vector string_parser:: + unquote (const vector& v) + { + vector r; + r.reserve (v.size ()); + for (auto& s: v) + r.emplace_back (unquote (s)); + + return r; + } +} diff --git a/butl/tab-parser b/butl/tab-parser index 6aa0705..f140b71 100644 --- a/butl/tab-parser +++ b/butl/tab-parser @@ -7,14 +7,12 @@ #include #include -#include +#include #include // uint64_t #include // runtime_error #include -#include - namespace butl { class LIBBUTL_EXPORT tab_parsing: public std::runtime_error @@ -52,11 +50,11 @@ namespace butl // supported. Blank lines and lines that start with # (collectively called // empty lines) are ignored. // - class LIBBUTL_EXPORT tab_parser: protected butl::char_scanner + class LIBBUTL_EXPORT tab_parser { public: tab_parser (std::istream& is, const std::string& name) - : char_scanner (is), name_ (name) {} + : is_ (is), name_ (name) {} // Return next line of fields. Skip empty lines. Empty result denotes the // end of stream. @@ -65,13 +63,9 @@ namespace butl next (); private: - // Skip spaces and return the first peeked non-space character. - // - xchar - skip_spaces (); - - private: + std::istream& is_; const std::string name_; + std::uint64_t line_ = 0; }; } diff --git a/butl/tab-parser.cxx b/butl/tab-parser.cxx index bae9327..4743e69 100644 --- a/butl/tab-parser.cxx +++ b/butl/tab-parser.cxx @@ -7,6 +7,8 @@ #include #include +#include + using namespace std; namespace butl @@ -19,106 +21,50 @@ namespace butl next () { tab_fields r; - xchar c (skip_spaces ()); // Skip empty lines and leading spaces. - - auto eol = [&c] () -> bool {return eos (c) || c == '\n';}; - auto space = [&c] () -> bool {return c == ' ' || c == '\t';}; - auto next = [&c, this] () {get (); c = peek ();}; - - r.line = c.line; - // Read line fields until eos or the newline character. + // Read lines until a non-empty one or EOF is encountered. In the first + // case parse the line and bail out. // - while (!eol ()) + // Note that we check for character presence in the stream prior to the + // getline() call, to prevent it from setting the failbit. + // + while (!is_.eof () && is_.peek () != istream::traits_type::eof ()) { - for (; !eol () && space (); next ()) ; // Skip space characters. + string s; + getline (is_, s); - if (eol ()) // No more fields. - break; + ++line_; - // Read the field. Here we scan until the first whitespace character that - // appears out of quotes. + // Skip empty line. // - tab_field tf ({string (), c.column}); - char quoting ('\0'); // Current quoting mode, can be used as bool. - - for (; !eol (); next ()) - { - if (!quoting) - { - if (space ()) // End of the field. - break; - else if (c == '"' || c == '\'') // Begin of quoted string. - quoting = c; - } - else if (c == quoting) // End of quoted string. - quoting = '\0'; - - tf.value += c; - } + auto i (s.begin ()); + auto e (s.end ()); + for (; i != e && (*i == ' ' || *i == '\t'); ++i) ; // Skip spaces. - if (quoting) - throw parsing (name_, c.line, c.column, "unterminated quoted string"); + if (i == e || *i == '#') + continue; - r.emplace_back (move (tf)); - } + r.line = line_; + r.end_column = s.size () + 1; // Newline position. - r.end_column = c.column; + vector> sp; - // Read out eof or newline character from the stream. Note that "reading" - // eof multiple times is safe. - // - get (); - return r; - } - - tab_parser::xchar tab_parser:: - skip_spaces () - { - xchar c (peek ()); - bool start (c.column == 1); - - for (; !eos (c); c = peek ()) - { - switch (c) + try { - case ' ': - case '\t': - break; - case '\n': - { - // Skip empty lines. - // - if (!start) - return c; - - break; - } - case '#': - { - // We only recognize '#' as a start of a comment at the beginning - // of the line (sans leading spaces). - // - if (!start) - return c; - - get (); - - // Read until newline or eos. - // - for (c = peek (); !eos (c) && c != '\n'; c = peek ()) - get (); - - continue; - } - default: - return c; // Not a space. + sp = string_parser::parse_quoted_position (s, false); + } + catch (const invalid_string& e) + { + throw parsing (name_, line_, e.position + 1, e.what ()); } - get (); + for (auto& s: sp) + r.emplace_back (tab_field ({move (s.first), s.second + 1})); + + break; } - return c; + return r; } // tab_parsing diff --git a/tests/string-parser/buildfile b/tests/string-parser/buildfile new file mode 100644 index 0000000..9ccf480 --- /dev/null +++ b/tests/string-parser/buildfile @@ -0,0 +1,7 @@ +# file : tests/string-parser/buildfile +# copyright : Copyright (c) 2014-2017 Code Synthesis Ltd +# license : MIT; see accompanying LICENSE file + +exe{driver}: cxx{driver} ../../butl/lib{butl} test{testscript} + +include ../../butl/ diff --git a/tests/string-parser/driver.cxx b/tests/string-parser/driver.cxx new file mode 100644 index 0000000..2aad3a7 --- /dev/null +++ b/tests/string-parser/driver.cxx @@ -0,0 +1,93 @@ +// file : tests/string-parser/driver.cxx -*- C++ -*- +// copyright : Copyright (c) 2014-2017 Code Synthesis Ltd +// license : MIT; see accompanying LICENSE file + +#include // ios::failbit, ios::badbit +#include +#include +#include +#include + +#include // operator<<(ostream,exception) +#include + +using namespace std; +using namespace butl; + +// Usage: argv[0] [-l] [-u] [-p] +// +// Read and parse lines into strings from STDIN and print them to STDOUT. +// +// -l output each string on a separate line +// -u unquote strings +// -p output positions +// +int +main (int argc, char* argv[]) +try +{ + bool spl (false); // Print string per line. + bool unquote (false); + bool pos (false); + + for (int i (1); i != argc; ++i) + { + string o (argv[i]); + + if (o == "-l") + spl = true; + else if (o == "-u") + unquote = true; + else if (o == "-p") + pos = true; + else + assert (false); + } + + // Do not throw when eofbit is set (end of stream reached), and when failbit + // is set (getline() failed to extract any character). + // + cin.exceptions (ios::badbit); + + cout.exceptions (ios::failbit | ios::badbit); + + string l; + while (getline (cin, l)) + { + vector> v ( + string_parser::parse_quoted_position (l, unquote)); + + if (!spl) + { + for (auto b (v.cbegin ()), i (b), e (v.cend ()); i != e; ++i) + { + if (i != b) + cout << ' '; + + if (pos) + cout << i->second << ":"; + + cout << i->first; + } + + cout << endl; + } + else + { + for (const auto& s: v) + { + if (pos) + cout << s.second << ":"; + + cout << s.first << endl; + } + } + } + + return 0; +} +catch (const invalid_string& e) +{ + cerr << e.position << ": " << e << endl; + return 1; +} diff --git a/tests/string-parser/testscript b/tests/string-parser/testscript new file mode 100644 index 0000000..83c8346 --- /dev/null +++ b/tests/string-parser/testscript @@ -0,0 +1,42 @@ +# file : tests/string-parser/testscript +# copyright : Copyright (c) 2014-2017 Code Synthesis Ltd +# license : MIT; see accompanying LICENSE file + +: valid +: +{ + test.options += -l + sp=' ' # For line-terminating spaces. + + : quoted + : + { + $* -p <<"EOI" >>EOO + abc "d 'ef " 'x "y z'$sp + EOI + 0:abc + 5:"d 'ef " + 14:'x "y z' + EOO + } + + : unquoted + : + { + $* -u <<"EOI" >>"EOO" + abc "d 'ef " 'x "y z'$sp + EOI + abc + d 'ef$sp + x "y z + EOO + } +} + +: invalid +: +{ + : unterm-quoting + : + $* <'ab"c' 2>'4: unterminated quoted string' == 1 +} diff --git a/tests/tab-parser/testscript b/tests/tab-parser/testscript index 1b0a816..de3b167 100644 --- a/tests/tab-parser/testscript +++ b/tests/tab-parser/testscript @@ -45,5 +45,10 @@ { : unterm-quoting : - $* <'ab"c' 2>'cin:1:5: error: unterminated quoted string' == 1 + $* <'123' 2>'cin:3:5: error: unterminated quoted string' == 1 + + 123 + ab"c + xyz + EOI } -- cgit v1.1