aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorBoris Kolpackov <boris@codesynthesis.com>2024-03-19 13:24:44 +0200
committerBoris Kolpackov <boris@codesynthesis.com>2024-03-21 06:44:06 +0200
commit3a55e033e4fc9a18ede99c4f9dd69fd30c383cf7 (patch)
tree1c35062021a261479ff5f38d95dd22cfe34fbb38
parent736b0f25003c92b3903798ce0a768230480d8f4b (diff)
Add next_word() overload that doesn't skip consecutive delimiters
In particular, this version can be used to parse lines while observing blanks.
-rw-r--r--libbutl/utility.hxx23
-rw-r--r--libbutl/utility.ixx60
-rw-r--r--tests/next-word/buildfile6
-rw-r--r--tests/next-word/driver.cxx46
4 files changed, 135 insertions, 0 deletions
diff --git a/libbutl/utility.hxx b/libbutl/utility.hxx
index a129276..779a0aa 100644
--- a/libbutl/utility.hxx
+++ b/libbutl/utility.hxx
@@ -190,6 +190,24 @@ namespace butl
//
// The second version examines up to the n'th character in the string.
//
+ // The third version, instead of skipping consecutive delimiters, treats
+ // them as separating empty words. The additional m variable contains an
+ // unspecified internal state and should be initialized to 0. Note that in
+ // this case you should use the (b == n) condition to detect the end. Note
+ // also that a leading delimiter is considered as separating an empty word
+ // from the rest and the trailing delimiter is considered as separating the
+ // rest from an empty word. For example, this is how to parse lines while
+ // observing blanks:
+ //
+ // for (size_t b (0), e (0), m (0), n (s.size ());
+ // next_word (s, n, b, e, m, '\n', '\r'), b != n; )
+ // {
+ // string l (s, b, e - b);
+ // }
+ //
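+ // For string "\na\n" this code will observe the {"", "a", ""} words. And
+ // for just "\n" it will observe the {"", ""} words. Note, however, that
+ // each delimiter character separates words on its own, so for "a\r\nb" it
+ // will observe {"a", "", "b"} (an empty word between '\r' and '\n').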
+ //
std::size_t
next_word (const std::string&, std::size_t& b, std::size_t& e,
char d1 = ' ', char d2 = '\0');
@@ -198,6 +216,11 @@ namespace butl
next_word (const std::string&, std::size_t n, std::size_t& b, std::size_t& e,
char d1 = ' ', char d2 = '\0');
+ std::size_t
+ next_word (const std::string&, std::size_t n,
+ std::size_t& b, std::size_t& e, std::size_t& m,
+ char d1 = ' ', char d2 = '\0');
+
// Sanitize a string to only contain characters valid in an identifier
// (ASCII alphanumeric plus `_`) replacing all others with `_`.
//
diff --git a/libbutl/utility.ixx b/libbutl/utility.ixx
index 0ce33a7..fda1ce5 100644
--- a/libbutl/utility.ixx
+++ b/libbutl/utility.ixx
@@ -210,6 +210,66 @@ namespace butl
return e - b;
}
+ inline std::size_t
+ next_word (const std::string& s,
+ std::size_t n, std::size_t& b, std::size_t& e, std::size_t& m,
+ char d1, char d2)
+ {
+ // Return the length of the next word in s[0, n), storing its range in
+ // [b, e). Unlike the other overloads, every delimiter (d1 or d2) ends a
+ // word, so adjacent, leading, and trailing delimiters yield empty words.
+ //
+ // An empty word is necessarily represented as b and e both being the
+ // position of a delimiter. Normally this is the word's trailing
+ // delimiter. The exception is a delimiter that is the last character:
+ // the empty word that follows it has no trailing delimiter and is instead
+ // represented by the position of that (leading) delimiter. For example,
+ // each of "\n", "a\n", and "\na" should produce two words.
+ //
+ // Thus the additional m state: 0 or 1 is the number of delimiters to skip
+ // before parsing the next word, while 2 means the word just found ends
+ // at the last character and an empty word must still be faked after it.
+
+ if (b != e) // Continue from the end of the previous word.
+ b = e;
+
+ if (m > 1) // Fake the empty word after the last-character delimiter.
+ {
+ --m; // Leaves m == 1 so that the next call steps past it to n.
+ return 0; // Here b == e == position of that delimiter.
+ }
+
+ // Skip the leading delimiter, if any (m is 0 or 1 at this point).
+ //
+ b += m;
+
+ if (b == n) // No more words: signal the end with b == e == n.
+ {
+ e = n;
+ return 0;
+ }
+
+ // Find the first trailing delimiter.
+ //
+ m = 0; // Stays 0 if no delimiter is found before n.
+ for (e = b; e != n; ++e)
+ {
+ if (s[e] == d1 || s[e] == d2)
+ {
+ m = 1; // One delimiter to skip on the next call.
+
+ // Handle the special case of the delimiter being the last character.
+ //
+ if (e + 1 == n)
+ ++m;
+
+ break;
+ }
+ }
+
+ return e - b; // 0 for an empty word (e stopped on a delimiter at b).
+ }
+
inline std::string&
sanitize_identifier (std::string& s)
{
diff --git a/tests/next-word/buildfile b/tests/next-word/buildfile
new file mode 100644
index 0000000..e06cd88
--- /dev/null
+++ b/tests/next-word/buildfile
@@ -0,0 +1,6 @@
+# file : tests/next-word/buildfile
+# license : MIT; see accompanying LICENSE file
+
+import libs = libbutl%lib{butl}
+
+exe{driver}: {hxx cxx}{*} $libs
diff --git a/tests/next-word/driver.cxx b/tests/next-word/driver.cxx
new file mode 100644
index 0000000..4ebe1a5
--- /dev/null
+++ b/tests/next-word/driver.cxx
@@ -0,0 +1,46 @@
+// file : tests/next-word/driver.cxx -*- C++ -*-
+// license : MIT; see accompanying LICENSE file
+
+#include <vector>
+#include <string>
+//#include <iostream>
+
+#include <libbutl/utility.hxx>
+
+#undef NDEBUG
+#include <cassert>
+
+using namespace std;
+using namespace butl;
+
+using strings = vector<string>;
+
+static strings
+parse_lines (const string& s) // Split s on '\n' and '\r', keeping empty words.
+{
+ strings r; // Words in the order observed.
+ for (size_t b (0), e (0), m (0), n (s.size ()); // m is next_word() state.
+ next_word (s, n, b, e, m, '\n', '\r'), b != n; ) // b == n means the end.
+ {
+ //cerr << "'" << string (s, b, e - b) << "'" << endl;
+ r.push_back (string (s, b, e - b)); // Copy the [b, e) range as a word.
+ }
+ return r;
+}
+
+int
+main () // Check parse_lines() (and thus next_word()) on corner cases.
+{
+ assert ((parse_lines("") == strings {})); // Empty input: no words.
+ assert ((parse_lines("a") == strings {"a"})); // No delimiters: one word.
+ assert ((parse_lines("\n") == strings {"", ""})); // Lone delimiter.
+ assert ((parse_lines("\n\n") == strings {"", "", ""})); // Consecutive delimiters.
+ assert ((parse_lines("\n\n\n") == strings {"", "", "", ""}));
+ assert ((parse_lines("\na") == strings {"", "a"})); // Leading delimiter.
+ assert ((parse_lines("\n\na") == strings {"", "", "a"}));
+ assert ((parse_lines("a\n") == strings {"a", ""})); // Trailing delimiter.
+ assert ((parse_lines("a\n\n") == strings {"a", "", ""}));
+ assert ((parse_lines("a\nb") == strings {"a", "b"})); // Delimiter in the middle.
+ assert ((parse_lines("a\n\nb") == strings {"a", "", "b"})); // Blank in the middle.
+ assert ((parse_lines("\na\nb\n") == strings {"", "a", "b", ""})); // Both ends.
+}