aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorKaren Arutyunov <karen@codesynthesis.com>2017-03-31 23:29:49 +0300
committerKaren Arutyunov <karen@codesynthesis.com>2017-04-04 13:28:18 +0300
commitd53c8a6ce3d868da66d97a9243365e88d0879343 (patch)
tree7d1268f62808f706c3a4d5631456afb4cbe24fb2
parentf4f30ed51b9bcd84cf25b601fab0a0064aae7af8 (diff)
Add tab_parser
-rw-r--r--butl/buildfile1
-rw-r--r--butl/tab-parser78
-rw-r--r--butl/tab-parser.cxx142
-rw-r--r--tests/tab-parser/buildfile7
-rw-r--r--tests/tab-parser/driver.cxx67
-rw-r--r--tests/tab-parser/testscript49
6 files changed, 344 insertions, 0 deletions
diff --git a/butl/buildfile b/butl/buildfile
index 0aa0aae..b54a887 100644
--- a/butl/buildfile
+++ b/butl/buildfile
@@ -25,6 +25,7 @@ lib{butl}: \
{hxx cxx}{ sha256 } \
{hxx }{ small-vector } \
{hxx txx }{ string-table } \
+ {hxx cxx}{ tab-parser } \
{hxx cxx}{ target-triplet } \
{hxx cxx}{ timestamp } \
{hxx ixx cxx}{ utility } \
diff --git a/butl/tab-parser b/butl/tab-parser
new file mode 100644
index 0000000..6aa0705
--- /dev/null
+++ b/butl/tab-parser
@@ -0,0 +1,78 @@
+// file : butl/tab-parser -*- C++ -*-
+// copyright : Copyright (c) 2014-2017 Code Synthesis Ltd
+// license : MIT; see accompanying LICENSE file
+
+#ifndef BUTL_TAB_PARSER
+#define BUTL_TAB_PARSER
+
+#include <string>
+#include <vector>
+#include <iosfwd>
+#include <cstdint> // uint64_t
+#include <stdexcept> // runtime_error
+
+#include <butl/export>
+
+#include <butl/char-scanner>
+
+namespace butl
+{
+ class LIBBUTL_EXPORT tab_parsing: public std::runtime_error // Parsing error that carries the input position.
+ {
+ public:
+ tab_parsing (const std::string& name,
+ std::uint64_t line,
+ std::uint64_t column,
+ const std::string& description); // Stores the arguments in the members below.
+
+ std::string name; // Input name; omitted from the what() prefix if empty.
+ std::uint64_t line; // Line number of the error position.
+ std::uint64_t column; // Column number of the error position.
+ std::string description; // Error description (without the position prefix).
+ };
+
+ // A parsed field. Line (see tab_fields) and column numbers are useful for
+ // issuing diagnostics about invalid or missing fields.
+ //
+ struct tab_field
+ {
+ std::string value; // Field string (quoting preserved).
+ std::uint64_t column; // Field start column number (one-based).
+ };
+
+ struct tab_fields: std::vector<tab_field> // Fields of one line, in order of appearance.
+ {
+ std::uint64_t line; // Line number (one-based). No default initializer; set by tab_parser::next().
+ std::uint64_t end_column; // End-of-line column: that of the newline/eos character that ended the line.
+ };
+
+ // Read and parse lines consisting of fields separated by spaces or tabs. A
+ // field can contain single or double quoted substrings (with spaces) which
+ // are interpreted but preserved. No escaping of the quote characters is
+ // supported. Blank lines and lines that start with # (collectively called
+ // empty lines) are ignored. An unterminated quote is a tab_parsing error.
+ //
+ class LIBBUTL_EXPORT tab_parser: protected butl::char_scanner
+ {
+ public:
+ tab_parser (std::istream& is, const std::string& name) // The name is only used in tab_parsing diagnostics.
+ : char_scanner (is), name_ (name) {}
+
+ // Return the next line of fields, skipping empty lines. Empty result denotes
+ // the end of stream. Throw tab_parsing if a quoted string is not terminated.
+ //
+ tab_fields
+ next ();
+
+ private:
+ // Skip spaces, tabs, blank and # comment lines; return the first peeked character not skipped (can be eos).
+ //
+ xchar
+ skip_spaces ();
+
+ private:
+ const std::string name_; // Input name passed to tab_parsing on errors.
+ };
+}
+
+#endif // BUTL_TAB_PARSER
diff --git a/butl/tab-parser.cxx b/butl/tab-parser.cxx
new file mode 100644
index 0000000..bae9327
--- /dev/null
+++ b/butl/tab-parser.cxx
@@ -0,0 +1,142 @@
+// file : butl/tab-parser.cxx -*- C++ -*-
+// copyright : Copyright (c) 2014-2017 Code Synthesis Ltd
+// license : MIT; see accompanying LICENSE file
+
+#include <butl/tab-parser>
+
+#include <cassert>
+#include <sstream>
+
+using namespace std;
+
+namespace butl
+{
+ using parsing = tab_parsing;
+
+ // tab_parser
+ //
+ tab_fields tab_parser::
+ next () // Parse and return the fields of the next non-empty line.
+ {
+ tab_fields r;
+ xchar c (skip_spaces ()); // Skip empty lines and leading spaces.
+
+ auto eol = [&c] () -> bool {return eos (c) || c == '\n';}; // At end of line or stream?
+ auto space = [&c] () -> bool {return c == ' ' || c == '\t';}; // At a field separator?
+ auto next = [&c, this] () {get (); c = peek ();}; // Consume c and peek the following character (local lambda).
+
+ r.line = c.line; // Line of the first peeked non-skipped character (or of eos).
+
+ // Read line fields until eos or the newline character.
+ //
+ while (!eol ())
+ {
+ for (; !eol () && space (); next ()) ; // Skip space characters.
+
+ if (eol ()) // No more fields.
+ break;
+
+ // Read the field. Here we scan until the first whitespace character that
+ // appears out of quotes.
+ //
+ tab_field tf ({string (), c.column}); // Empty value, column of the field's first character.
+ char quoting ('\0'); // Current quoting mode, can be used as bool.
+
+ for (; !eol (); next ())
+ {
+ if (!quoting)
+ {
+ if (space ()) // End of the field.
+ break;
+ else if (c == '"' || c == '\'') // Begin of quoted string.
+ quoting = c;
+ }
+ else if (c == quoting) // End of quoted string.
+ quoting = '\0';
+
+ tf.value += c; // Quote characters are appended too (quoting preserved).
+ }
+
+ if (quoting) // Still inside quotes, so the loop stopped at newline/eos.
+ throw parsing (name_, c.line, c.column, "unterminated quoted string");
+
+ r.emplace_back (move (tf));
+ }
+
+ r.end_column = c.column; // Column of the peeked newline/eos character.
+
+ // Read out eof or newline character from the stream. Note that "reading"
+ // eof multiple times is safe.
+ //
+ get ();
+ return r;
+ }
+
+ tab_parser::xchar tab_parser::
+ skip_spaces () // Skip spaces, tabs, blank lines, and comment lines; return the peeked character that stopped the scan.
+ {
+ xchar c (peek ());
+ bool start (c.column == 1); // Called at the beginning of a line? Not updated afterwards.
+
+ for (; !eos (c); c = peek ())
+ {
+ switch (c)
+ {
+ case ' ':
+ case '\t':
+ break; // Leave the switch to consume the character below.
+ case '\n':
+ {
+ // Skip empty lines.
+ //
+ if (!start)
+ return c;
+
+ break; // Leave the switch to consume the newline below.
+ }
+ case '#':
+ {
+ // We only recognize '#' as a start of a comment at the beginning
+ // of the line (sans leading spaces).
+ //
+ if (!start)
+ return c;
+
+ get (); // Consume '#'.
+
+ // Read until newline or eos.
+ //
+ for (c = peek (); !eos (c) && c != '\n'; c = peek ())
+ get ();
+
+ continue; // The newline (if any) is left for the next iteration.
+ }
+ default:
+ return c; // Not a space.
+ }
+
+ get (); // Consume the space, tab, or newline.
+ }
+
+ return c; // End of stream.
+ }
+
+ // tab_parsing
+ //
+ static string
+ format (const string& n, uint64_t l, uint64_t c, const string& d) // Compose "[<n>:]<l>:<c>: error: <d>".
+ {
+ ostringstream os;
+ if (!n.empty ()) // The name prefix is omitted for an empty name.
+ os << n << ':';
+ os << l << ':' << c << ": error: " << d;
+ return os.str ();
+ }
+
+ tab_parsing::
+ tab_parsing (const string& n, uint64_t l, uint64_t c, const string& d)
+ : runtime_error (format (n, l, c, d)), // The exception message is the formatted diagnostics line.
+ name (n), line (l), column (c), description (d) // Keep the individual components accessible as well.
+ {
+ }
+}
diff --git a/tests/tab-parser/buildfile b/tests/tab-parser/buildfile
new file mode 100644
index 0000000..4afb691
--- /dev/null
+++ b/tests/tab-parser/buildfile
@@ -0,0 +1,7 @@
+# file : tests/tab-parser/buildfile
+# copyright : Copyright (c) 2014-2017 Code Synthesis Ltd
+# license : MIT; see accompanying LICENSE file
+
+exe{driver}: cxx{driver} ../../butl/lib{butl} test{testscript}
+
+include ../../butl/
diff --git a/tests/tab-parser/driver.cxx b/tests/tab-parser/driver.cxx
new file mode 100644
index 0000000..8b0cc4d
--- /dev/null
+++ b/tests/tab-parser/driver.cxx
@@ -0,0 +1,67 @@
+// file : tests/tab-parser/driver.cxx -*- C++ -*-
+// copyright : Copyright (c) 2014-2017 Code Synthesis Ltd
+// license : MIT; see accompanying LICENSE file
+
+#include <ios> // ios::failbit, ios::badbit
+#include <cassert>
+#include <iostream>
+
+#include <butl/utility> // operator<<(ostream,exception)
+#include <butl/tab-parser>
+
+using namespace std;
+using namespace butl;
+
+// Usage: argv[0] [-l]
+//
+// Read and parse tab-file from STDIN and print fields to STDOUT.
+//
+// -l output each field on a separate line
+//
+int
+main (int argc, char* argv[])
+try
+{
+ assert (argc <= 2); // At most one option is accepted.
+ bool fpl (false); // Print field per line.
+
+ if (argc == 2)
+ {
+ assert (argv[1] == string ("-l")); // The only supported option.
+ fpl = true;
+ }
+
+ cin.exceptions (ios::failbit | ios::badbit); // Make stream failures throw.
+ cout.exceptions (ios::failbit | ios::badbit);
+
+ tab_fields tl;
+ tab_parser parser (cin, "cin"); // "cin" is the input name used in diagnostics.
+
+ while (!(tl = parser.next ()).empty ()) // Empty result denotes the end of stream.
+ {
+ if (!fpl)
+ {
+ for (auto b (tl.cbegin ()), i (b), e (tl.cend ()); i != e; ++i)
+ {
+ if (i != b) // Separate fields with a single space.
+ cout << ' ';
+
+ cout << i->value;
+ }
+
+ cout << '\n';
+ }
+ else
+ {
+ for (const auto& tf: tl)
+ cout << tf.value << '\n';
+ }
+ }
+
+ return 0;
+}
+catch (const tab_parsing& e) // Only parsing errors are handled here.
+{
+ cerr << e << endl; // Uses operator<< for exceptions from butl/utility (see the include above).
+ return 1;
+}
diff --git a/tests/tab-parser/testscript b/tests/tab-parser/testscript
new file mode 100644
index 0000000..1b0a816
--- /dev/null
+++ b/tests/tab-parser/testscript
@@ -0,0 +1,49 @@
+# file : tests/tab-parser/testscript
+# copyright : Copyright (c) 2014-2017 Code Synthesis Ltd
+# license : MIT; see accompanying LICENSE file
+
+: valid
+:
+: Roundtrip tab-files.
+:
+{
+ : newline-term
+ :
+ $* <<EOF >>EOF
+ abc
+ def xyz
+ fff
+ EOF
+
+ : eos-term
+ :
+ $* <:'abc' >'abc'
+
+ : empty-lines
+ :
+ $* <<EOI >'def'
+ # abc
+
+ # abc
+ def
+
+ EOI
+
+ : quoting
+ :
+ $* -l <<EOI >>EOO
+ def k" l'"'m n"' xyz
+ EOI
+ def
+ k" l'"'m n"'
+ xyz
+ EOO
+}
+
+: invalid
+:
+{
+ : unterm-quoting
+ :
+ $* <'ab"c' 2>'cin:1:5: error: unterminated quoted string' == 1
+}