aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorKaren Arutyunov <karen@codesynthesis.com>2017-04-20 17:31:26 +0300
committerKaren Arutyunov <karen@codesynthesis.com>2017-04-20 21:08:32 +0300
commit5661b404b0104c3065a40ad622bdd3c11d748a99 (patch)
treeb1e1d7aefa9fda7214fa0fcce92cf1b85f87fc76
parent972f89d5a1d0c094241eb6ce1b8f499e3fcf151b (diff)
Implement string_parser
-rw-r--r--butl/buildfile1
-rw-r--r--butl/string-parser23
-rw-r--r--butl/string-parser.cxx132
-rw-r--r--butl/tab-parser16
-rw-r--r--butl/tab-parser.cxx116
-rw-r--r--tests/string-parser/buildfile7
-rw-r--r--tests/string-parser/driver.cxx93
-rw-r--r--tests/string-parser/testscript42
-rw-r--r--tests/tab-parser/testscript7
9 files changed, 334 insertions, 103 deletions
diff --git a/butl/buildfile b/butl/buildfile
index 328afc0..75217fa 100644
--- a/butl/buildfile
+++ b/butl/buildfile
@@ -28,6 +28,7 @@ lib{butl}: \
{hxx ixx cxx}{ sendmail } \
{hxx cxx}{ sha256 } \
{hxx }{ small-vector } \
+ {hxx cxx}{ string-parser } \
{hxx txx }{ string-table } \
{hxx cxx}{ tab-parser } \
{hxx cxx}{ target-triplet } \
diff --git a/butl/string-parser b/butl/string-parser
index e3fa507..74391cd 100644
--- a/butl/string-parser
+++ b/butl/string-parser
@@ -8,6 +8,7 @@
#include <string>
#include <vector>
#include <cstddef> // size_t
+#include <utility> // pair
#include <stdexcept> // invalid_argument
#include <butl/export>
@@ -18,9 +19,9 @@ namespace butl
{
public:
invalid_string (std::size_t p, const std::string& d)
- : invalid_argument (d), pos (p) {}
+ : invalid_argument (d), position (p) {}
- std::size_t pos; // Zero-based.
+ std::size_t position; // Zero-based.
};
class LIBBUTL_EXPORT string_parser
@@ -31,14 +32,24 @@ namespace butl
// return one-level unquoted values. Throw invalid_string in case of
// invalid quoting.
//
- std::vector<string>
- parse_quoted (const string&, bool unquote);
+ static std::vector<std::string>
+ parse_quoted (const std::string&, bool unquote);
+
+ // As above but return a list of string and zero-based position pairs.
+ // Position is useful for issuing diagnostics about an invalid string
+ // during second-level parsing.
+ //
+ static std::vector<std::pair<std::string, std::size_t>>
+ parse_quoted_position (const std::string&, bool unquote);
// Remove a single level of quotes. Note that the format or the
// correctness of the quotation is not validated.
//
- std::string
- unquote (const string&);
+ static std::string
+ unquote (const std::string&);
+
+ static std::vector<std::string>
+ unquote (const std::vector<std::string>&);
};
}
diff --git a/butl/string-parser.cxx b/butl/string-parser.cxx
new file mode 100644
index 0000000..9228d66
--- /dev/null
+++ b/butl/string-parser.cxx
@@ -0,0 +1,132 @@
+// file : butl/string-parser.cxx -*- C++ -*-
+// copyright : Copyright (c) 2014-2017 Code Synthesis Ltd
+// license : MIT; see accompanying LICENSE file
+
+#include <butl/string-parser>
+
+#include <utility> // move()
+
+using namespace std;
+
+namespace butl
+{
+ // Utility functions
+ //
+ inline static bool // True if c separates strings: a space or a tab only.
+ space (char c) noexcept
+ {
+ return c == ' ' || c == '\t'; // Newline and other whitespace do not count.
+ }
+
+ // string_parser
+ //
+ vector<pair<string, size_t>> string_parser:: // Split s on unquoted spaces/tabs.
+ parse_quoted_position (const string& s, bool unquote) // If unquote, drop quotes.
+ { // Throw invalid_string if a quoted substring is left unterminated.
+ vector<pair<string, size_t>> r; // Parsed strings with their start offsets.
+ for (auto b (s.begin ()), i (b), e (s.end ()); i != e; ) // i advances inside.
+ {
+ for (; i != e && space (*i); ++i) ; // Skip separators before the next string.
+
+ if (i == e) // No more strings (only trailing separators were left).
+ break;
+
+ string s; // String being built; shadows the parameter s in this scope.
+ char quoting ('\0'); // Active quote character, or NUL outside quotes (bool-like).
+ size_t pos (i - b); // Zero-based offset of the first character of the string.
+
+ for (; i != e; ++i) // Scan to the first unquoted separator or end of input.
+ {
+ char c (*i);
+
+ if (!quoting)
+ {
+ if (space (c)) // End of string.
+ break;
+
+ if (c == '"' || c == '\'') // Beginning of a quoted substring.
+ {
+ quoting = c; // Remember which quote character must close it.
+
+ if (!unquote) // Keep the opening quote unless unquoting.
+ s += c;
+
+ continue;
+ }
+ }
+ else if (c == quoting) // End of quoted substring (same quote character).
+ {
+ quoting = '\0';
+
+ if (!unquote) // Keep the closing quote unless unquoting.
+ s += c;
+
+ continue;
+ }
+
+ s += c; // Literal character, including the other quote kind inside quotes.
+ }
+
+ if (quoting) // Input ended inside quotes; i == e, so position is the input length.
+ throw invalid_string (i - b, "unterminated quoted string");
+
+ r.emplace_back (move (s), pos);
+ }
+
+ return r;
+ }
+
+ vector<string> string_parser:: // Same as parse_quoted_position() minus positions.
+ parse_quoted (const string& s, bool unquote)
+ {
+ vector<pair<string, size_t>> sp (parse_quoted_position (s, unquote)); // May throw.
+
+ vector<string> r;
+ r.reserve (sp.size ()); // Size the result up front, one slot per parsed string.
+ for (auto& s: sp) // Here s is a (string, position) pair shadowing the parameter.
+ r.emplace_back (move (s.first)); // Move the string out, discard the position.
+
+ return r;
+ }
+
+ string string_parser:: // Return s with one level of quote characters removed.
+ unquote (const string& s) // Never throws on bad quoting: nothing is validated.
+ {
+ string r; // Result: every character of s except the quote delimiters.
+ char quoting ('\0'); // Active quote character, or NUL outside quotes (bool-like).
+
+ for (auto i (s.begin ()), e (s.end ()); i != e; ++i)
+ {
+ char c (*i);
+
+ if (!quoting)
+ {
+ if (c == '"' || c == '\'') // Beginning of a quoted substring.
+ {
+ quoting = c; // Drop the opening quote and remember its kind.
+ continue;
+ }
+ }
+ else if (c == quoting) // End of quoted substring (same quote character).
+ {
+ quoting = '\0'; // Drop the closing quote.
+ continue;
+ }
+
+ r += c; // Kept as is: spaces and the other quote kind inside quotes too.
+ }
+
+ return r; // An unterminated quote just loses its opening character.
+ }
+
+ vector<string> string_parser:: // Element-wise version of unquote (const string&).
+ unquote (const vector<string>& v)
+ {
+ vector<string> r;
+ r.reserve (v.size ()); // The result has exactly one entry per input string.
+ for (auto& s: v)
+ r.emplace_back (unquote (s)); // Resolves to the single-string overload.
+
+ return r; // Same order and size as v.
+ }
+}
diff --git a/butl/tab-parser b/butl/tab-parser
index 6aa0705..f140b71 100644
--- a/butl/tab-parser
+++ b/butl/tab-parser
@@ -7,14 +7,12 @@
#include <string>
#include <vector>
-#include <iosfwd>
+#include <istream>
#include <cstdint> // uint64_t
#include <stdexcept> // runtime_error
#include <butl/export>
-#include <butl/char-scanner>
-
namespace butl
{
class LIBBUTL_EXPORT tab_parsing: public std::runtime_error
@@ -52,11 +50,11 @@ namespace butl
// supported. Blank lines and lines that start with # (collectively called
// empty lines) are ignored.
//
- class LIBBUTL_EXPORT tab_parser: protected butl::char_scanner
+ class LIBBUTL_EXPORT tab_parser
{
public:
tab_parser (std::istream& is, const std::string& name)
- : char_scanner (is), name_ (name) {}
+ : is_ (is), name_ (name) {}
// Return next line of fields. Skip empty lines. Empty result denotes the
// end of stream.
@@ -65,13 +63,9 @@ namespace butl
next ();
private:
- // Skip spaces and return the first peeked non-space character.
- //
- xchar
- skip_spaces ();
-
- private:
+ std::istream& is_;
const std::string name_;
+ std::uint64_t line_ = 0;
};
}
diff --git a/butl/tab-parser.cxx b/butl/tab-parser.cxx
index bae9327..4743e69 100644
--- a/butl/tab-parser.cxx
+++ b/butl/tab-parser.cxx
@@ -7,6 +7,8 @@
#include <cassert>
#include <sstream>
+#include <butl/string-parser>
+
using namespace std;
namespace butl
@@ -19,106 +21,50 @@ namespace butl
next ()
{
tab_fields r;
- xchar c (skip_spaces ()); // Skip empty lines and leading spaces.
-
- auto eol = [&c] () -> bool {return eos (c) || c == '\n';};
- auto space = [&c] () -> bool {return c == ' ' || c == '\t';};
- auto next = [&c, this] () {get (); c = peek ();};
-
- r.line = c.line;
- // Read line fields until eos or the newline character.
+ // Read lines until a non-empty one or EOF is encountered. In the first
+ // case parse the line and bail out.
//
- while (!eol ())
+ // Note that we check for character presence in the stream prior to the
+ // getline() call, to prevent it from setting the failbit.
+ //
+ while (!is_.eof () && is_.peek () != istream::traits_type::eof ())
{
- for (; !eol () && space (); next ()) ; // Skip space characters.
+ string s;
+ getline (is_, s);
- if (eol ()) // No more fields.
- break;
+ ++line_;
- // Read the field. Here we scan until the first whitespace character that
- // appears out of quotes.
+ // Skip empty line.
//
- tab_field tf ({string (), c.column});
- char quoting ('\0'); // Current quoting mode, can be used as bool.
-
- for (; !eol (); next ())
- {
- if (!quoting)
- {
- if (space ()) // End of the field.
- break;
- else if (c == '"' || c == '\'') // Begin of quoted string.
- quoting = c;
- }
- else if (c == quoting) // End of quoted string.
- quoting = '\0';
-
- tf.value += c;
- }
+ auto i (s.begin ());
+ auto e (s.end ());
+ for (; i != e && (*i == ' ' || *i == '\t'); ++i) ; // Skip spaces.
- if (quoting)
- throw parsing (name_, c.line, c.column, "unterminated quoted string");
+ if (i == e || *i == '#')
+ continue;
- r.emplace_back (move (tf));
- }
+ r.line = line_;
+ r.end_column = s.size () + 1; // Newline position.
- r.end_column = c.column;
+ vector<std::pair<string, size_t>> sp;
- // Read out eof or newline character from the stream. Note that "reading"
- // eof multiple times is safe.
- //
- get ();
- return r;
- }
-
- tab_parser::xchar tab_parser::
- skip_spaces ()
- {
- xchar c (peek ());
- bool start (c.column == 1);
-
- for (; !eos (c); c = peek ())
- {
- switch (c)
+ try
{
- case ' ':
- case '\t':
- break;
- case '\n':
- {
- // Skip empty lines.
- //
- if (!start)
- return c;
-
- break;
- }
- case '#':
- {
- // We only recognize '#' as a start of a comment at the beginning
- // of the line (sans leading spaces).
- //
- if (!start)
- return c;
-
- get ();
-
- // Read until newline or eos.
- //
- for (c = peek (); !eos (c) && c != '\n'; c = peek ())
- get ();
-
- continue;
- }
- default:
- return c; // Not a space.
+ sp = string_parser::parse_quoted_position (s, false);
+ }
+ catch (const invalid_string& e)
+ {
+ throw parsing (name_, line_, e.position + 1, e.what ());
}
- get ();
+ for (auto& s: sp)
+ r.emplace_back (tab_field ({move (s.first), s.second + 1}));
+
+ break;
}
- return c;
+ return r;
}
// tab_parsing
diff --git a/tests/string-parser/buildfile b/tests/string-parser/buildfile
new file mode 100644
index 0000000..9ccf480
--- /dev/null
+++ b/tests/string-parser/buildfile
@@ -0,0 +1,7 @@
+# file : tests/string-parser/buildfile
+# copyright : Copyright (c) 2014-2017 Code Synthesis Ltd
+# license : MIT; see accompanying LICENSE file
+
+exe{driver}: cxx{driver} ../../butl/lib{butl} test{testscript}
+
+include ../../butl/
diff --git a/tests/string-parser/driver.cxx b/tests/string-parser/driver.cxx
new file mode 100644
index 0000000..2aad3a7
--- /dev/null
+++ b/tests/string-parser/driver.cxx
@@ -0,0 +1,93 @@
+// file : tests/string-parser/driver.cxx -*- C++ -*-
+// copyright : Copyright (c) 2014-2017 Code Synthesis Ltd
+// license : MIT; see accompanying LICENSE file
+
+#include <ios> // ios::failbit, ios::badbit
+#include <vector>
+#include <string>
+#include <cassert>
+#include <iostream>
+
+#include <butl/utility> // operator<<(ostream,exception)
+#include <butl/string-parser>
+
+using namespace std;
+using namespace butl;
+
+// Usage: argv[0] [-l] [-u] [-p]
+//
+// Read and parse lines into strings from STDIN and print them to STDOUT.
+//
+// -l output each string on a separate line
+// -u unquote strings
+// -p output positions
+//
+int
+main (int argc, char* argv[])
+try // Function try block: invalid_string is handled by the catch at the bottom.
+{
+ bool spl (false); // Print string per line (-l).
+ bool unquote (false); // Strip one level of quotes from the output (-u).
+ bool pos (false); // Prefix each string with its zero-based position (-p).
+
+ for (int i (1); i != argc; ++i)
+ {
+ string o (argv[i]);
+
+ if (o == "-l")
+ spl = true;
+ else if (o == "-u")
+ unquote = true;
+ else if (o == "-p")
+ pos = true;
+ else
+ assert (false); // Unknown option.
+ }
+
+ // Do not throw when eofbit is set (end of stream reached), and when failbit
+ // is set (getline() failed to extract any character).
+ //
+ cin.exceptions (ios::badbit);
+
+ cout.exceptions (ios::failbit | ios::badbit);
+
+ string l;
+ while (getline (cin, l)) // One input line per iteration, until EOF or failure.
+ {
+ vector<pair<string, size_t>> v (
+ string_parser::parse_quoted_position (l, unquote));
+
+ if (!spl) // All strings of the line on one output line, space-separated.
+ {
+ for (auto b (v.cbegin ()), i (b), e (v.cend ()); i != e; ++i)
+ {
+ if (i != b) // Separator goes between strings, not before the first one.
+ cout << ' ';
+
+ if (pos)
+ cout << i->second << ":";
+
+ cout << i->first;
+ }
+
+ cout << endl; // Printed even if the line yielded no strings.
+ }
+ else // One string per output line; nothing is printed for a blank line.
+ {
+ for (const auto& s: v)
+ {
+ if (pos)
+ cout << s.second << ":";
+
+ cout << s.first << endl;
+ }
+ }
+ }
+
+ return 0;
+}
+catch (const invalid_string& e) // Invalid quoting in some input line.
+{
+ cerr << e.position << ": " << e << endl; // Zero-based position, then description.
+ return 1;
+}
diff --git a/tests/string-parser/testscript b/tests/string-parser/testscript
new file mode 100644
index 0000000..83c8346
--- /dev/null
+++ b/tests/string-parser/testscript
@@ -0,0 +1,42 @@
+# file : tests/string-parser/testscript
+# copyright : Copyright (c) 2014-2017 Code Synthesis Ltd
+# license : MIT; see accompanying LICENSE file
+
+: valid
+:
+{
+ test.options += -l
+ sp=' ' # For line-terminating spaces.
+
+ : quoted
+ :
+ {
+ $* -p <<"EOI" >>EOO
+ abc "d 'ef " 'x "y z'$sp
+ EOI
+ 0:abc
+ 5:"d 'ef "
+ 14:'x "y z'
+ EOO
+ }
+
+ : unquoted
+ :
+ {
+ $* -u <<"EOI" >>"EOO"
+ abc "d 'ef " 'x "y z'$sp
+ EOI
+ abc
+ d 'ef$sp
+ x "y z
+ EOO
+ }
+}
+
+: invalid
+:
+{
+ : unterm-quoting
+ :
+ $* <'ab"c' 2>'4: unterminated quoted string' == 1
+}
diff --git a/tests/tab-parser/testscript b/tests/tab-parser/testscript
index 1b0a816..de3b167 100644
--- a/tests/tab-parser/testscript
+++ b/tests/tab-parser/testscript
@@ -45,5 +45,10 @@
{
: unterm-quoting
:
- $* <'ab"c' 2>'cin:1:5: error: unterminated quoted string' == 1
+ $* <<EOI >'123' 2>'cin:3:5: error: unterminated quoted string' == 1
+
+ 123
+ ab"c
+ xyz
+ EOI
}