From 3a55e033e4fc9a18ede99c4f9dd69fd30c383cf7 Mon Sep 17 00:00:00 2001 From: Boris Kolpackov Date: Tue, 19 Mar 2024 13:24:44 +0200 Subject: Add next_word() overload that doesn't skip consecutive delimiters In particular, this version can be used to parse lines while observing blanks. --- libbutl/utility.hxx | 23 ++++++++++++++++++ libbutl/utility.ixx | 60 ++++++++++++++++++++++++++++++++++++++++++++++ tests/next-word/buildfile | 6 +++++ tests/next-word/driver.cxx | 46 +++++++++++++++++++++++++++++++++++ 4 files changed, 135 insertions(+) create mode 100644 tests/next-word/buildfile create mode 100644 tests/next-word/driver.cxx diff --git a/libbutl/utility.hxx b/libbutl/utility.hxx index a129276..779a0aa 100644 --- a/libbutl/utility.hxx +++ b/libbutl/utility.hxx @@ -190,6 +190,24 @@ namespace butl // // The second version examines up to the n'th character in the string. // + // The third version, instead of skipping consecutive delimiters, treats + // them as separating empty words. The additional m variable contains an + // unspecified internal state and should be initialized to 0. Note that in + // this case you should use the (b == n) condition to detect the end. Note + // also that a leading delimiter is considered as separating an empty word + // from the rest and the trailing delimiter is considered as separating the + // rest from an empty word. For example, this is how to parse lines while + // observing blanks: + // + // for (size_t b (0), e (0), m (0), n (s.size ()); + // next_word (s, n, b, e, m, '\n', '\r'), b != n; ) + // { + // string l (s, b, e - b); + // } + // + // For string "\na\n" this code will observe the {"", "a", ""} words. And + // for just "\n" it will observe the {"", ""} words. 
#include <cstddef>
#include <string>
#include <vector>
//#include <iostream>

namespace butl
{
  // Declaration (libbutl/utility.hxx).
  //
  // Find the next word in the first n characters of s, treating every
  // occurrence of the d1 or d2 delimiter as a separator. Unlike the
  // overloads without the m argument, consecutive delimiters are not
  // skipped: they separate empty words. A leading delimiter separates an
  // empty word from the rest and a trailing delimiter separates the rest
  // from an empty word.
  //
  // On return [b, e) is the word and the result is its length (e - b).
  // Initialize b, e, and m to 0 before the first call and pass them back
  // unmodified on each subsequent call (m is unspecified internal state).
  // Because an empty word is a valid result, detect the end with the
  // (b == n) condition rather than with a zero return value:
  //
  // for (size_t b (0), e (0), m (0), n (s.size ());
  //      next_word (s, n, b, e, m, '\n', '\r'), b != n; )
  // {
  //   string l (s, b, e - b);
  // }
  //
  // For string "\na\n" this code will observe the {"", "a", ""} words. And
  // for just "\n" it will observe the {"", ""} words.
  //
  std::size_t
  next_word (const std::string&, std::size_t n,
             std::size_t& b, std::size_t& e, std::size_t& m,
             char d1 = ' ', char d2 = '\0');

  // Definition (libbutl/utility.ixx).
  //
  inline std::size_t
  next_word (const std::string& s,
             std::size_t n, std::size_t& b, std::size_t& e, std::size_t& m,
             char d1, char d2)
  {
    // An empty word will necessarily be represented as b and e being the
    // position of a delimiter. Consider these corner cases (in all three we
    // should produce two words):
    //
    // \n
    // a\n
    // \na
    //
    // It feels sensible to represent an empty word as the position of the
    // trailing delimiter except if it is the last character (the first two
    // cases). Thus the additional m state, which, if 0 or 1 indicates the
    // number of delimiters to skip before parsing the next word and 2 if
    // this is a trailing delimiter for which we need to fake an empty word
    // with the leading delimiter.

    // Continue from the end of the previous word.
    //
    if (b != e)
      b = e;

    // The previous word ended with the last character being a delimiter:
    // report the final empty word at that delimiter's position and downgrade
    // the state so that the next call steps over it and hits the end.
    //
    if (m > 1)
    {
      --m;
      return 0;
    }

    // Skip the leading delimiter, if any.
    //
    b += m;

    if (b == n)
    {
      e = n;
      return 0;
    }

    // Find first trailing delimiter.
    //
    m = 0;
    for (e = b; e != n; ++e)
    {
      if (s[e] == d1 || s[e] == d2)
      {
        m = 1;

        // Handle the special delimiter as the last character case.
        //
        if (e + 1 == n)
          ++m;

        break;
      }
    }

    return e - b;
  }
}

// Test helper (tests/next-word/driver.cxx).
//
using namespace std;
using namespace butl;

using strings = vector<string>;

// Split s into lines on '\n' and '\r' while observing blanks, returning the
// lines in order. Expected results:
//
// ""            {}
// "a"           {"a"}
// "\n"          {"", ""}
// "\n\n"        {"", "", ""}
// "\n\n\n"      {"", "", "", ""}
// "\na"         {"", "a"}
// "\n\na"       {"", "", "a"}
// "a\n"         {"a", ""}
// "a\n\n"       {"a", "", ""}
// "a\nb"        {"a", "b"}
// "a\n\nb"      {"a", "", "b"}
// "\na\nb\n"    {"", "a", "b", ""}
//
static strings
parse_lines (const string& s)
{
  strings r;
  for (size_t b (0), e (0), m (0), n (s.size ());
       next_word (s, n, b, e, m, '\n', '\r'), b != n; )
  {
    //cerr << "'" << string (s, b, e - b) << "'" << endl;
    r.push_back (string (s, b, e - b));
  }
  return r;
}