aboutsummaryrefslogtreecommitdiff
path: root/butl
diff options
context:
space:
mode:
author Karen Arutyunov <karen@codesynthesis.com> 2017-04-20 17:31:26 +0300
committer Karen Arutyunov <karen@codesynthesis.com> 2017-04-20 21:08:32 +0300
commit 5661b404b0104c3065a40ad622bdd3c11d748a99 (patch)
tree b1e1d7aefa9fda7214fa0fcce92cf1b85f87fc76 /butl
parent 972f89d5a1d0c094241eb6ce1b8f499e3fcf151b (diff)
Implement string_parser
Diffstat (limited to 'butl')
-rw-r--r-- butl/buildfile 1
-rw-r--r-- butl/string-parser 23
-rw-r--r-- butl/string-parser.cxx 132
-rw-r--r-- butl/tab-parser 16
-rw-r--r-- butl/tab-parser.cxx 116
5 files changed, 186 insertions, 102 deletions
diff --git a/butl/buildfile b/butl/buildfile
index 328afc0..75217fa 100644
--- a/butl/buildfile
+++ b/butl/buildfile
@@ -28,6 +28,7 @@ lib{butl}: \
{hxx ixx cxx}{ sendmail } \
{hxx cxx}{ sha256 } \
{hxx }{ small-vector } \
+ {hxx cxx}{ string-parser } \
{hxx txx }{ string-table } \
{hxx cxx}{ tab-parser } \
{hxx cxx}{ target-triplet } \
diff --git a/butl/string-parser b/butl/string-parser
index e3fa507..74391cd 100644
--- a/butl/string-parser
+++ b/butl/string-parser
@@ -8,6 +8,7 @@
#include <string>
#include <vector>
#include <cstddef> // size_t
+#include <utility> // pair
#include <stdexcept> // invalid_argument
#include <butl/export>
@@ -18,9 +19,9 @@ namespace butl
{
public:
invalid_string (std::size_t p, const std::string& d)
- : invalid_argument (d), pos (p) {}
+ : invalid_argument (d), position (p) {}
- std::size_t pos; // Zero-based.
+ std::size_t position; // Zero-based.
};
class LIBBUTL_EXPORT string_parser
@@ -31,14 +32,24 @@ namespace butl
// return one-level unquoted values. Throw invalid_string in case of
// invalid quoting.
//
- std::vector<string>
- parse_quoted (const string&, bool unquote);
+ static std::vector<std::string>
+ parse_quoted (const std::string&, bool unquote);
+
+ // As above but return a list of string and zero-based position pairs.
+ // Position is useful for issuing diagnostics about an invalid string
+ // during second-level parsing.
+ //
+ static std::vector<std::pair<std::string, std::size_t>>
+ parse_quoted_position (const std::string&, bool unquote);
// Remove a single level of quotes. Note that the format or the
// correctness of the quotation is not validated.
//
- std::string
- unquote (const string&);
+ static std::string
+ unquote (const std::string&);
+
+ static std::vector<std::string>
+ unquote (const std::vector<std::string>&);
};
}
diff --git a/butl/string-parser.cxx b/butl/string-parser.cxx
new file mode 100644
index 0000000..9228d66
--- /dev/null
+++ b/butl/string-parser.cxx
@@ -0,0 +1,132 @@
+// file : butl/string-parser.cxx -*- C++ -*-
+// copyright : Copyright (c) 2014-2017 Code Synthesis Ltd
+// license : MIT; see accompanying LICENSE file
+
+#include <butl/string-parser>
+
+#include <utility> // move()
+
+using namespace std;
+
+namespace butl
+{
+ // Utility functions
+ //
+ inline static bool
+ space (char c) noexcept // True if c separates strings: a space or a tab.
+ {
+ return c == ' ' || c == '\t'; // No other character counts as a space here.
+ }
+
+ // string_parser
+ //
+ vector<pair<string, size_t>> string_parser::
+ parse_quoted_position (const string& s, bool unquote) // Split s into strings.
+ {
+ vector<pair<string, size_t>> r; // (string, zero-based start offset) pairs.
+ for (auto b (s.begin ()), i (b), e (s.end ()); i != e; )
+ {
+ for (; i != e && space (*i); ++i) ; // Skip spaces (and tabs) between strings.
+
+ if (i == e) // No more strings.
+ break;
+
+ string s; // String being accumulated; shadows the parameter s.
+ char quoting ('\0'); // Current quoting mode, can be used as bool.
+ size_t pos (i - b); // Zero-based offset of the string's first character.
+
+ for (; i != e; ++i)
+ {
+ char c (*i);
+
+ if (!quoting)
+ {
+ if (space (c)) // End of string (unquoted space or tab).
+ break;
+
+ if (c == '"' || c == '\'') // Begin of quoted substring.
+ {
+ quoting = c; // Remember which quote character must close it.
+
+ if (!unquote) // Keep the opening quote unless unquoting.
+ s += c;
+
+ continue;
+ }
+ }
+ else if (c == quoting) // End of quoted substring.
+ {
+ quoting = '\0';
+
+ if (!unquote) // Keep the closing quote unless unquoting.
+ s += c;
+
+ continue;
+ }
+
+ s += c; // Regular character, or anything but the closing quote inside quotes.
+ }
+
+ if (quoting) // Input ended inside a quoted substring (i == e here).
+ throw invalid_string (i - b, "unterminated quoted string");
+
+ r.emplace_back (move (s), pos);
+ }
+
+ return r;
+ }
+
+ vector<string> string_parser::
+ parse_quoted (const string& s, bool unquote) // Same split, positions dropped.
+ {
+ vector<pair<string, size_t>> sp (parse_quoted_position (s, unquote)); // May throw invalid_string.
+
+ vector<string> r;
+ r.reserve (sp.size ()); // One result per parsed string.
+ for (auto& s: sp) // Note: this s shadows the parameter s.
+ r.emplace_back (move (s.first)); // Take the string, discard the position.
+
+ return r;
+ }
+
+ string string_parser::
+ unquote (const string& s) // Return s with one level of quotes removed.
+ {
+ string r;
+ char quoting ('\0'); // Current quoting mode, can be used as bool.
+
+ for (auto i (s.begin ()), e (s.end ()); i != e; ++i)
+ {
+ char c (*i);
+
+ if (!quoting)
+ {
+ if (c == '"' || c == '\'') // Begin of quoted substring.
+ {
+ quoting = c; // Drop the opening quote.
+ continue;
+ }
+ }
+ else if (c == quoting) // End of quoted substring.
+ {
+ quoting = '\0'; // Drop the closing quote.
+ continue;
+ }
+
+ r += c; // Everything else is kept, including the other kind of quote inside quotes.
+ }
+
+ return r; // Note: an unterminated quote is not reported as an error.
+ }
+
+ vector<string> string_parser::
+ unquote (const vector<string>& v) // Unquote each element of v.
+ {
+ vector<string> r;
+ r.reserve (v.size ()); // The result has exactly one entry per input string.
+ for (auto& s: v)
+ r.emplace_back (unquote (s)); // Calls the single-string overload.
+
+ return r;
+ }
+}
diff --git a/butl/tab-parser b/butl/tab-parser
index 6aa0705..f140b71 100644
--- a/butl/tab-parser
+++ b/butl/tab-parser
@@ -7,14 +7,12 @@
#include <string>
#include <vector>
-#include <iosfwd>
+#include <istream>
#include <cstdint> // uint64_t
#include <stdexcept> // runtime_error
#include <butl/export>
-#include <butl/char-scanner>
-
namespace butl
{
class LIBBUTL_EXPORT tab_parsing: public std::runtime_error
@@ -52,11 +50,11 @@ namespace butl
// supported. Blank lines and lines that start with # (collectively called
// empty lines) are ignored.
//
- class LIBBUTL_EXPORT tab_parser: protected butl::char_scanner
+ class LIBBUTL_EXPORT tab_parser
{
public:
tab_parser (std::istream& is, const std::string& name)
- : char_scanner (is), name_ (name) {}
+ : is_ (is), name_ (name) {}
// Return next line of fields. Skip empty lines. Empty result denotes the
// end of stream.
@@ -65,13 +63,9 @@ namespace butl
next ();
private:
- // Skip spaces and return the first peeked non-space character.
- //
- xchar
- skip_spaces ();
-
- private:
+ std::istream& is_;
const std::string name_;
+ std::uint64_t line_ = 0;
};
}
diff --git a/butl/tab-parser.cxx b/butl/tab-parser.cxx
index bae9327..4743e69 100644
--- a/butl/tab-parser.cxx
+++ b/butl/tab-parser.cxx
@@ -7,6 +7,8 @@
#include <cassert>
#include <sstream>
+#include <butl/string-parser>
+
using namespace std;
namespace butl
@@ -19,106 +21,50 @@ namespace butl
next ()
{
tab_fields r;
- xchar c (skip_spaces ()); // Skip empty lines and leading spaces.
-
- auto eol = [&c] () -> bool {return eos (c) || c == '\n';};
- auto space = [&c] () -> bool {return c == ' ' || c == '\t';};
- auto next = [&c, this] () {get (); c = peek ();};
-
- r.line = c.line;
- // Read line fields until eos or the newline character.
+ // Read lines until a non-empty one or EOF is encountered. In the first
+ // case parse the line and bail out.
//
- while (!eol ())
+ // Note that we check for character presence in the stream prior to the
+ // getline() call, to prevent it from setting the failbit.
+ //
+ while (!is_.eof () && is_.peek () != istream::traits_type::eof ())
{
- for (; !eol () && space (); next ()) ; // Skip space characters.
+ string s;
+ getline (is_, s);
- if (eol ()) // No more fields.
- break;
+ ++line_;
- // Read the field. Here we scan until the first whitespace character that
- // appears out of quotes.
+ // Skip empty line.
//
- tab_field tf ({string (), c.column});
- char quoting ('\0'); // Current quoting mode, can be used as bool.
-
- for (; !eol (); next ())
- {
- if (!quoting)
- {
- if (space ()) // End of the field.
- break;
- else if (c == '"' || c == '\'') // Begin of quoted string.
- quoting = c;
- }
- else if (c == quoting) // End of quoted string.
- quoting = '\0';
-
- tf.value += c;
- }
+ auto i (s.begin ());
+ auto e (s.end ());
+ for (; i != e && (*i == ' ' || *i == '\t'); ++i) ; // Skip spaces.
- if (quoting)
- throw parsing (name_, c.line, c.column, "unterminated quoted string");
+ if (i == e || *i == '#')
+ continue;
- r.emplace_back (move (tf));
- }
+ r.line = line_;
+ r.end_column = s.size () + 1; // Newline position.
- r.end_column = c.column;
+ vector<std::pair<string, size_t>> sp;
- // Read out eof or newline character from the stream. Note that "reading"
- // eof multiple times is safe.
- //
- get ();
- return r;
- }
-
- tab_parser::xchar tab_parser::
- skip_spaces ()
- {
- xchar c (peek ());
- bool start (c.column == 1);
-
- for (; !eos (c); c = peek ())
- {
- switch (c)
+ try
{
- case ' ':
- case '\t':
- break;
- case '\n':
- {
- // Skip empty lines.
- //
- if (!start)
- return c;
-
- break;
- }
- case '#':
- {
- // We only recognize '#' as a start of a comment at the beginning
- // of the line (sans leading spaces).
- //
- if (!start)
- return c;
-
- get ();
-
- // Read until newline or eos.
- //
- for (c = peek (); !eos (c) && c != '\n'; c = peek ())
- get ();
-
- continue;
- }
- default:
- return c; // Not a space.
+ sp = string_parser::parse_quoted_position (s, false);
+ }
+ catch (const invalid_string& e)
+ {
+ throw parsing (name_, line_, e.position + 1, e.what ());
}
- get ();
+ for (auto& s: sp)
+ r.emplace_back (tab_field ({move (s.first), s.second + 1}));
+
+ break;
}
- return c;
+ return r;
}
// tab_parsing