From d53c8a6ce3d868da66d97a9243365e88d0879343 Mon Sep 17 00:00:00 2001 From: Karen Arutyunov Date: Fri, 31 Mar 2017 23:29:49 +0300 Subject: Add tab_parser --- butl/buildfile | 1 + butl/tab-parser | 78 +++++++++++++++++++++++++++++ butl/tab-parser.cxx | 142 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 221 insertions(+) create mode 100644 butl/tab-parser create mode 100644 butl/tab-parser.cxx (limited to 'butl') diff --git a/butl/buildfile b/butl/buildfile index 0aa0aae..b54a887 100644 --- a/butl/buildfile +++ b/butl/buildfile @@ -25,6 +25,7 @@ lib{butl}: \ {hxx cxx}{ sha256 } \ {hxx }{ small-vector } \ {hxx txx }{ string-table } \ + {hxx cxx}{ tab-parser } \ {hxx cxx}{ target-triplet } \ {hxx cxx}{ timestamp } \ {hxx ixx cxx}{ utility } \ diff --git a/butl/tab-parser b/butl/tab-parser new file mode 100644 index 0000000..6aa0705 --- /dev/null +++ b/butl/tab-parser @@ -0,0 +1,78 @@ +// file : butl/tab-parser -*- C++ -*- +// copyright : Copyright (c) 2014-2017 Code Synthesis Ltd +// license : MIT; see accompanying LICENSE file + +#ifndef BUTL_TAB_PARSER +#define BUTL_TAB_PARSER + +#include <string> +#include <vector> +#include <istream> +#include <cstdint> // uint64_t +#include <stdexcept> // runtime_error + +#include <butl/export> + +#include <butl/char-scanner> + +namespace butl +{ + class LIBBUTL_EXPORT tab_parsing: public std::runtime_error + { + public: + tab_parsing (const std::string& name, + std::uint64_t line, + std::uint64_t column, + const std::string& description); + + std::string name; + std::uint64_t line; + std::uint64_t column; + std::string description; + }; + + // Line and column numbers are useful for issuing diagnostics about invalid + // or missing fields. + // + struct tab_field + { + std::string value; // Field string (quoting preserved). + std::uint64_t column; // Field start column number (one-based). + }; + + struct tab_fields: std::vector<tab_field> + { + std::uint64_t line; // Line number (one-based). + std::uint64_t end_column; // End-of-line column (line length). 
+ }; + // Read and parse lines consisting of space-separated fields. A field can + // contain single or double quoted substrings (with spaces) which are + // interpreted but preserved. No escaping of the quote characters is + // supported. Blank lines and lines that start with # (collectively called + // empty lines) are ignored. + // + class LIBBUTL_EXPORT tab_parser: protected butl::char_scanner + { + public: + tab_parser (std::istream& is, const std::string& name) + : char_scanner (is), name_ (name) {} + + // Return the next line of fields. Skip empty lines. Empty result denotes + // the end of stream. + // + tab_fields + next (); + + private: + // Skip spaces and return the first peeked non-space character. + // + xchar + skip_spaces (); + + private: + const std::string name_; + }; +} + +#endif // BUTL_TAB_PARSER diff --git a/butl/tab-parser.cxx b/butl/tab-parser.cxx new file mode 100644 index 0000000..bae9327 --- /dev/null +++ b/butl/tab-parser.cxx @@ -0,0 +1,142 @@ +// file : butl/tab-parser.cxx -*- C++ -*- +// copyright : Copyright (c) 2014-2017 Code Synthesis Ltd +// license : MIT; see accompanying LICENSE file + +#include <butl/tab-parser> + +#include <cassert> +#include <sstream> + +using namespace std; + +namespace butl +{ + using parsing = tab_parsing; + + // tab_parser + // + tab_fields tab_parser:: + next () + { + tab_fields r; + xchar c (skip_spaces ()); // Skip empty lines and leading spaces. + + auto eol = [&c] () -> bool {return eos (c) || c == '\n';}; + auto space = [&c] () -> bool {return c == ' ' || c == '\t';}; + auto next = [&c, this] () {get (); c = peek ();}; + + r.line = c.line; + + // Read line fields until eos or the newline character. + // + while (!eol ()) + { + for (; !eol () && space (); next ()) ; // Skip space characters. + + if (eol ()) // No more fields. + break; + + // Read the field. Here we scan until the first whitespace character that + // appears outside of quotes. 
+ // + tab_field tf ({string (), c.column}); + char quoting ('\0'); // Current quoting mode, can be used as bool. + + for (; !eol (); next ()) + { + if (!quoting) + { + if (space ()) // End of the field. + break; + else if (c == '"' || c == '\'') // Begin of quoted string. + quoting = c; + } + else if (c == quoting) // End of quoted string. + quoting = '\0'; + + tf.value += c; + } + + if (quoting) + throw parsing (name_, c.line, c.column, "unterminated quoted string"); + + r.emplace_back (move (tf)); + } + + r.end_column = c.column; + + // Read out eof or newline character from the stream. Note that "reading" + // eof multiple times is safe. + // + get (); + return r; + } + + tab_parser::xchar tab_parser:: + skip_spaces () + { + xchar c (peek ()); + bool start (c.column == 1); + + for (; !eos (c); c = peek ()) + { + switch (c) + { + case ' ': + case '\t': + break; + case '\n': + { + // Skip empty lines. + // + if (!start) + return c; + + break; + } + case '#': + { + // We only recognize '#' as a start of a comment at the beginning + // of the line (sans leading spaces). + // + if (!start) + return c; + + get (); + + // Read until newline or eos. + // + for (c = peek (); !eos (c) && c != '\n'; c = peek ()) + get (); + + continue; + } + default: + return c; // Not a space. + } + + get (); + } + + return c; + } + + // tab_parsing + // + static string + format (const string& n, uint64_t l, uint64_t c, const string& d) + { + ostringstream os; + if (!n.empty ()) + os << n << ':'; + os << l << ':' << c << ": error: " << d; + return os.str (); + } + + tab_parsing:: + tab_parsing (const string& n, uint64_t l, uint64_t c, const string& d) + : runtime_error (format (n, l, c, d)), + name (n), line (l), column (c), description (d) + { + } +} -- cgit v1.1