diff options
author | Karen Arutyunov <karen@codesynthesis.com> | 2017-03-31 23:29:49 +0300 |
---|---|---|
committer | Karen Arutyunov <karen@codesynthesis.com> | 2017-04-04 13:28:18 +0300 |
commit | d53c8a6ce3d868da66d97a9243365e88d0879343 (patch) | |
tree | 7d1268f62808f706c3a4d5631456afb4cbe24fb2 | |
parent | f4f30ed51b9bcd84cf25b601fab0a0064aae7af8 (diff) |
Add tab_parser
-rw-r--r-- | butl/buildfile | 1 | ||||
-rw-r--r-- | butl/tab-parser | 78 | ||||
-rw-r--r-- | butl/tab-parser.cxx | 142 | ||||
-rw-r--r-- | tests/tab-parser/buildfile | 7 | ||||
-rw-r--r-- | tests/tab-parser/driver.cxx | 67 | ||||
-rw-r--r-- | tests/tab-parser/testscript | 49 |
6 files changed, 344 insertions, 0 deletions
diff --git a/butl/buildfile b/butl/buildfile index 0aa0aae..b54a887 100644 --- a/butl/buildfile +++ b/butl/buildfile @@ -25,6 +25,7 @@ lib{butl}: \ {hxx cxx}{ sha256 } \ {hxx }{ small-vector } \ {hxx txx }{ string-table } \ + {hxx cxx}{ tab-parser } \ {hxx cxx}{ target-triplet } \ {hxx cxx}{ timestamp } \ {hxx ixx cxx}{ utility } \ diff --git a/butl/tab-parser b/butl/tab-parser new file mode 100644 index 0000000..6aa0705 --- /dev/null +++ b/butl/tab-parser @@ -0,0 +1,78 @@ +// file : butl/tab-parser -*- C++ -*- +// copyright : Copyright (c) 2014-2017 Code Synthesis Ltd +// license : MIT; see accompanying LICENSE file + +#ifndef BUTL_TAB_PARSER +#define BUTL_TAB_PARSER + +#include <string> +#include <vector> +#include <iosfwd> +#include <cstdint> // uint64_t +#include <stdexcept> // runtime_error + +#include <butl/export> + +#include <butl/char-scanner> + +namespace butl +{ + class LIBBUTL_EXPORT tab_parsing: public std::runtime_error + { + public: + tab_parsing (const std::string& name, + std::uint64_t line, + std::uint64_t column, + const std::string& description); + + std::string name; + std::uint64_t line; + std::uint64_t column; + std::string description; + }; + + // Line and columns are useful for issuing diagnostics about invalid or + // missing fields. + // + struct tab_field + { + std::string value; // Field string (quoting preserved). + std::uint64_t column; // Field start column number (one-based). + }; + + struct tab_fields: std::vector<tab_field> + { + std::uint64_t line; // Line number (one-based). + std::uint64_t end_column; // End-of-line column (line length). + }; + + // Read and parse lines consisting of space-separated fields. Field can + // contain single or double quoted substrings (with spaces) which are + // interpreted but preserved. No escaping of the quote characters is + // supported. Blank lines and lines that start with # (collectively called + // empty lines) are ignored. + // + class LIBBUTL_EXPORT tab_parser: protected butl::char_scanner + { + public: + tab_parser (std::istream& is, const std::string& name) + : char_scanner (is), name_ (name) {} + + // Return next line of fields. Skip empty lines. Empty result denotes the + // end of stream. + // + tab_fields + next (); + + private: + // Skip spaces and return the first peeked non-space character. + // + xchar + skip_spaces (); + + private: + const std::string name_; + }; +} + +#endif // BUTL_TAB_PARSER diff --git a/butl/tab-parser.cxx b/butl/tab-parser.cxx new file mode 100644 index 0000000..bae9327 --- /dev/null +++ b/butl/tab-parser.cxx @@ -0,0 +1,142 @@ +// file : butl/tab-parser.cxx -*- C++ -*- +// copyright : Copyright (c) 2014-2017 Code Synthesis Ltd +// license : MIT; see accompanying LICENSE file + +#include <butl/tab-parser> + +#include <cassert> +#include <sstream> + +using namespace std; + +namespace butl +{ + using parsing = tab_parsing; + + // tab_parser + // + tab_fields tab_parser:: + next () + { + tab_fields r; + xchar c (skip_spaces ()); // Skip empty lines and leading spaces. + + auto eol = [&c] () -> bool {return eos (c) || c == '\n';}; + auto space = [&c] () -> bool {return c == ' ' || c == '\t';}; + auto next = [&c, this] () {get (); c = peek ();}; + + r.line = c.line; + + // Read line fields until eos or the newline character. + // + while (!eol ()) + { + for (; !eol () && space (); next ()) ; // Skip space characters. + + if (eol ()) // No more fields. + break; + + // Read the field. Here we scan until the first whitespace character that + // appears out of quotes. + // + tab_field tf ({string (), c.column}); + char quoting ('\0'); // Current quoting mode, can be used as bool. + + for (; !eol (); next ()) + { + if (!quoting) + { + if (space ()) // End of the field. + break; + else if (c == '"' || c == '\'') // Begin of quoted string. + quoting = c; + } + else if (c == quoting) // End of quoted string. + quoting = '\0'; + + tf.value += c; + } + + if (quoting) + throw parsing (name_, c.line, c.column, "unterminated quoted string"); + + r.emplace_back (move (tf)); + } + + r.end_column = c.column; + + // Read out eof or newline character from the stream. Note that "reading" + // eof multiple times is safe. + // + get (); + return r; + } + + tab_parser::xchar tab_parser:: + skip_spaces () + { + xchar c (peek ()); + bool start (c.column == 1); + + for (; !eos (c); c = peek ()) + { + switch (c) + { + case ' ': + case '\t': + break; + case '\n': + { + // Skip empty lines. + // + if (!start) + return c; + + break; + } + case '#': + { + // We only recognize '#' as a start of a comment at the beginning + // of the line (sans leading spaces). + // + if (!start) + return c; + + get (); + + // Read until newline or eos. + // + for (c = peek (); !eos (c) && c != '\n'; c = peek ()) + get (); + + continue; + } + default: + return c; // Not a space. + } + + get (); + } + + return c; + } + + // tab_parsing + // + static string + format (const string& n, uint64_t l, uint64_t c, const string& d) + { + ostringstream os; + if (!n.empty ()) + os << n << ':'; + os << l << ':' << c << ": error: " << d; + return os.str (); + } + + tab_parsing:: + tab_parsing (const string& n, uint64_t l, uint64_t c, const string& d) + : runtime_error (format (n, l, c, d)), + name (n), line (l), column (c), description (d) + { + } +} diff --git a/tests/tab-parser/buildfile b/tests/tab-parser/buildfile new file mode 100644 index 0000000..4afb691 --- /dev/null +++ b/tests/tab-parser/buildfile @@ -0,0 +1,7 @@ +# file : tests/tab-parser/buildfile +# copyright : Copyright (c) 2014-2017 Code Synthesis Ltd +# license : MIT; see accompanying LICENSE file + +exe{driver}: cxx{driver} ../../butl/lib{butl} test{testscript} + +include ../../butl/ diff --git a/tests/tab-parser/driver.cxx b/tests/tab-parser/driver.cxx new file mode 100644 index 0000000..8b0cc4d --- /dev/null +++ b/tests/tab-parser/driver.cxx @@ -0,0 +1,67 @@ +// file : tests/tab-parser/driver.cxx -*- C++ -*- +// copyright : Copyright (c) 2014-2017 Code Synthesis Ltd +// license : MIT; see accompanying LICENSE file + +#include <ios> // ios::failbit, ios::badbit +#include <cassert> +#include <iostream> + +#include <butl/utility> // operator<<(ostream,exception) +#include <butl/tab-parser> + +using namespace std; +using namespace butl; + +// Usage: argv[0] [-l] +// +// Read and parse tab-file from STDIN and print fields to STDOUT. +// +// -l output each field on a separate line +// +int +main (int argc, char* argv[]) +try +{ + assert (argc <= 2); + bool fpl (false); // Print field per line. + + if (argc == 2) + { + assert (argv[1] == string ("-l")); + fpl = true; + } + + cin.exceptions (ios::failbit | ios::badbit); + cout.exceptions (ios::failbit | ios::badbit); + + tab_fields tl; + tab_parser parser (cin, "cin"); + + while (!(tl = parser.next ()).empty ()) + { + if (!fpl) + { + for (auto b (tl.cbegin ()), i (b), e (tl.cend ()); i != e; ++i) + { + if (i != b) + cout << ' '; + + cout << i->value; + } + + cout << '\n'; + } + else + { + for (const auto& tf: tl) + cout << tf.value << '\n'; + } + } + + return 0; +} +catch (const tab_parsing& e) +{ + cerr << e << endl; + return 1; +} diff --git a/tests/tab-parser/testscript b/tests/tab-parser/testscript new file mode 100644 index 0000000..1b0a816 --- /dev/null +++ b/tests/tab-parser/testscript @@ -0,0 +1,49 @@ +# file : tests/tab-parser/testscript +# copyright : Copyright (c) 2014-2017 Code Synthesis Ltd +# license : MIT; see accompanying LICENSE file + +: valid +: +: Roundtrip tab-files. +: +{ + : newline-term + : + $* <<EOF >>EOF + abc + def xyz + fff + EOF + + : eos-term + : + $* <:'abc' >'abc' + + : empty-lines + : + $* <<EOI >'def' + # abc + + # abc + def + + EOI + + : quoting + : + $* -l <<EOI >>EOO + def k" l'"'m n"' xyz + EOI + def + k" l'"'m n"' + xyz + EOO +} + +: invalid +: +{ + : unterm-quoting + : + $* <'ab"c' 2>'cin:1:5: error: unterminated quoted string' == 1 +} |