aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorKaren Arutyunov <karen@codesynthesis.com>2018-06-19 15:30:22 +0300
committerKaren Arutyunov <karen@codesynthesis.com>2018-06-19 15:30:22 +0300
commit06e915be138b0638e30083f84cecda0eb1bfc895 (patch)
tree50f7eca40de25033116c6f6f75524ae5801dcc78
parent338d8065f1b681da841fa0d79cc9265776ff1e1e (diff)
Add regex_replace_match() and rename regex_replace_ex() to regex_replace_search()
-rw-r--r--libbutl/regex.ixx14
-rw-r--r--libbutl/regex.mxx51
-rw-r--r--libbutl/regex.txx376
-rw-r--r--tests/regex/driver.cxx14
-rw-r--r--tests/regex/testscript11
5 files changed, 262 insertions, 204 deletions
diff --git a/libbutl/regex.ixx b/libbutl/regex.ixx
index 15189fb..8e3286f 100644
--- a/libbutl/regex.ixx
+++ b/libbutl/regex.ixx
@@ -6,19 +6,19 @@ LIBBUTL_MODEXPORT namespace butl //@@ MOD Clang needs this for some reason.
{
template <typename C>
inline std::pair<std::basic_string<C>, bool>
- regex_replace_ex (const std::basic_string<C>& s,
- const std::basic_regex<C>& re,
- const std::basic_string<C>& fmt,
- std::regex_constants::match_flag_type flags)
+ regex_replace_search (const std::basic_string<C>& s,
+ const std::basic_regex<C>& re,
+ const std::basic_string<C>& fmt,
+ std::regex_constants::match_flag_type flags)
{
using namespace std;
using it = typename basic_string<C>::const_iterator;
basic_string<C> r;
- bool match (regex_replace_ex (s, re, fmt,
- [&r] (it b, it e) {r.append (b, e);},
- flags));
+ bool match (regex_replace_search (s, re, fmt,
+ [&r] (it b, it e) {r.append (b, e);},
+ flags));
return make_pair (move (r), match);
}
diff --git a/libbutl/regex.mxx b/libbutl/regex.mxx
index 741b818..7fa0155 100644
--- a/libbutl/regex.mxx
+++ b/libbutl/regex.mxx
@@ -40,16 +40,9 @@ import std.regex; // @@ MOD TODO should probably be re-exported.
LIBBUTL_MODEXPORT namespace butl
{
- // Call specified append() function for non-matched substrings and matched
- // substring replacements returning true if search succeeded. The function
- // must be callable with the following signature:
- //
- // void
- // append(basic_string<C>::iterator begin, basic_string<C>::iterator end);
- //
- // The regex semantics is like that of std::regex_replace() extended the
- // standard ECMA-262 substitution escape sequences with a subset of Perl
- // sequences:
+ // The regex semantics of the following functions is like that of
+ // std::regex_replace(), with the standard ECMA-262 substitution escape
+ // sequences extended with a subset of Perl sequences:
//
// \\, \u, \l, \U, \L, \E, \1, ..., \9
//
@@ -65,14 +58,22 @@ LIBBUTL_MODEXPORT namespace butl
// C++ locale (which is, unless changed, is the same as C locale and
// both default to the POSIX locale aka "C").
//
+
+ // Call the specified append() function for non-matched substrings and matched
+ // substring replacements, returning true if the search succeeded. The function
+ // must be callable with the following signature:
+ //
+ // void
+ // append(basic_string<C>::iterator begin, basic_string<C>::iterator end);
+ //
template <typename C, typename F>
bool
- regex_replace_ex (const std::basic_string<C>&,
- const std::basic_regex<C>&,
- const std::basic_string<C>& fmt,
- F&& append,
- std::regex_constants::match_flag_type =
- std::regex_constants::match_default);
+ regex_replace_search (const std::basic_string<C>&,
+ const std::basic_regex<C>&,
+ const std::basic_string<C>& fmt,
+ F&& append,
+ std::regex_constants::match_flag_type =
+ std::regex_constants::match_default);
// As above but concatenate non-matched substrings and matched substring
// replacements into a string returning it as well as whether the search
@@ -80,11 +81,19 @@ LIBBUTL_MODEXPORT namespace butl
//
template <typename C>
std::pair<std::basic_string<C>, bool>
- regex_replace_ex (const std::basic_string<C>&,
- const std::basic_regex<C>&,
- const std::basic_string<C>& fmt,
- std::regex_constants::match_flag_type =
- std::regex_constants::match_default);
+ regex_replace_search (const std::basic_string<C>&,
+ const std::basic_regex<C>&,
+ const std::basic_string<C>& fmt,
+ std::regex_constants::match_flag_type =
+ std::regex_constants::match_default);
+
+ // Match the entire string and, if it matches, return the string replacement.
+ //
+ template <typename C>
+ std::pair<std::basic_string<C>, bool>
+ regex_replace_match (const std::basic_string<C>&,
+ const std::basic_regex<C>&,
+ const std::basic_string<C>& fmt);
}
LIBBUTL_MODEXPORT namespace std
diff --git a/libbutl/regex.txx b/libbutl/regex.txx
index a7a6c9a..fbe2885 100644
--- a/libbutl/regex.txx
+++ b/libbutl/regex.txx
@@ -4,13 +4,209 @@
LIBBUTL_MODEXPORT namespace butl //@@ MOD Clang needs this for some reason.
{
+ // Format the regex match results according to the format string and return
+ // the resulting replacement.
+ //
+ // The format string is interpreted as follows:
+ //
+ // $$         - literal '$'
+ // $&         - the matched substring (m[0])
+ // $`         - the match prefix
+ // $'         - the match suffix
+ // $n, $nn    - the n-th (1-based) sub-expression
+ // \\         - literal '\'
+ // \1 ... \9  - the n-th (1-based) sub-expression
+ // \u, \l     - convert the next appended character to upper/lower case
+ // \U, \L     - convert all the subsequent characters to upper/lower case
+ // \E         - end the case conversion
+ //
+ // A '$' that doesn't start any of the above sequences is appended as is while
+ // an unrecognized '\' is skipped. References to unmatched sub-expressions and
+ // to indexes not less than m.size() expand to nothing. The case conversion
+ // is performed according to the global C++ locale.
+ //
+ template <typename C>
+ std::basic_string<C>
+ regex_replace_match_results (
+   const std::match_results<typename std::basic_string<C>::const_iterator>& m,
+   const std::basic_string<C>& fmt)
+ {
+   using namespace std;
+
+   using string_type = basic_string<C>;
+   using str_it      = typename string_type::const_iterator;
+
+   string_type r;
+
+   // Note that we are using char type literals with the assumption that
+   // being ASCII characters they will be properly "widened" to the
+   // corresponding literals of the C template parameter type.
+   //
+   auto digit = [] (C c) -> int
+   {
+     return c >= '0' && c <= '9' ? c - '0' : -1;
+   };
+
+   enum class case_conv {none, upper, lower, upper_once, lower_once}
+   mode (case_conv::none);
+
+   locale cl; // Copy of the global C++ locale.
+
+   // Convert the character according to the current case conversion mode,
+   // resetting the one-shot (\u, \l) modes once applied.
+   //
+   auto conv_chr = [&mode, &cl] (C c) -> C
+   {
+     switch (mode)
+     {
+     case case_conv::upper_once: mode = case_conv::none; // Fall through.
+     case case_conv::upper:      c = toupper (c, cl); break;
+     case case_conv::lower_once: mode = case_conv::none; // Fall through.
+     case case_conv::lower:      c = tolower (c, cl); break;
+     case case_conv::none:       break;
+     }
+     return c;
+   };
+
+   auto append_chr = [&r, &conv_chr] (C c) {r.push_back (conv_chr (c));};
+
+   auto append_str = [&r, &mode, &conv_chr] (str_it b, str_it e)
+   {
+     // Optimize for the common case.
+     //
+     if (mode == case_conv::none)
+       r.append (b, e);
+     else
+     {
+       for (str_it i (b); i != e; ++i)
+         r.push_back (conv_chr (*i));
+     }
+   };
+
+   size_t n (fmt.size ());
+   for (size_t i (0); i < n; ++i)
+   {
+     C c (fmt[i]);
+
+     switch (c)
+     {
+     case '$':
+       {
+         // Check if this is a $-based escape sequence. Interpret it
+         // accordingly if that's the case, treat '$' as a regular character
+         // otherwise.
+         //
+         c = fmt[++i]; // '\0' if last.
+
+         switch (c)
+         {
+         case '$': append_chr (c); break;
+         case '&': append_str (m[0].first, m[0].second); break;
+         case '`':
+           {
+             append_str (m.prefix ().first, m.prefix ().second);
+             break;
+           }
+         case '\'':
+           {
+             append_str (m.suffix ().first, m.suffix ().second);
+             break;
+           }
+         default:
+           {
+             // Check if this is a sub-expression 1-based index ($n or $nn).
+             // Append the matching substring if that's the case. Treat '$'
+             // as a regular character otherwise. Index greater than the
+             // sub-expression count is silently ignored.
+             //
+             int si (digit (c));
+             if (si >= 0)
+             {
+               // Only consume the second digit if the resulting index is
+               // non-zero. Otherwise (the "$00" case), we would rewind just
+               // one of the two digits below and lose the other one.
+               //
+               int d (digit (fmt[i + 1])); // '\0' if last.
+               if (d >= 0 && si * 10 + d > 0)
+               {
+                 si = si * 10 + d;
+                 ++i;
+               }
+             }
+
+             if (si > 0)
+             {
+               // m[0] refers to the matched substring. Note that we ignore
+               // unmatched sub-expression references.
+               //
+               if (static_cast<size_t> (si) < m.size () && m[si].matched)
+                 append_str (m[si].first, m[si].second);
+             }
+             else
+             {
+               // Not a $-based escape sequence so treat '$' as a regular
+               // character. Rewind so that the character following '$' is
+               // processed on the next iteration.
+               //
+               --i;
+               append_chr ('$');
+             }
+
+             break;
+           }
+         }
+
+         break;
+       }
+     case '\\':
+       {
+         c = fmt[++i]; // '\0' if last.
+
+         switch (c)
+         {
+         case '\\': append_chr (c); break;
+
+         case 'u': mode = case_conv::upper_once; break;
+         case 'l': mode = case_conv::lower_once; break;
+         case 'U': mode = case_conv::upper; break;
+         case 'L': mode = case_conv::lower; break;
+         case 'E': mode = case_conv::none; break;
+         default:
+           {
+             // Check if this is a sub-expression 1-based index. Append the
+             // matching substring if that's the case, skip '\\' otherwise.
+             // Index greater than the sub-expression count is silently
+             // ignored.
+             //
+             int si (digit (c));
+             if (si > 0)
+             {
+               // m[0] refers to the matched substring. Note that we ignore
+               // unmatched sub-expression references.
+               //
+               if (static_cast<size_t> (si) < m.size () && m[si].matched)
+                 append_str (m[si].first, m[si].second);
+             }
+             else
+               --i;
+
+             break;
+           }
+         }
+
+         break;
+       }
+     default:
+       {
+         // Append a regular character.
+         //
+         append_chr (c);
+         break;
+       }
+     }
+   }
+
+   return r;
+ }
+
+ // Match the entire string against the regex and, if it matches, format the
+ // match results according to the format string using
+ // regex_replace_match_results().
+ //
+ // Return the replacement and true if the string matches and an empty string
+ // and false otherwise.
+ //
+ template <typename C>
+ std::pair<std::basic_string<C>, bool>
+ regex_replace_match (const std::basic_string<C>& s,
+                      const std::basic_regex<C>& re,
+                      const std::basic_string<C>& fmt)
+ {
+   using namespace std;
+
+   using string_type = basic_string<C>;
+   using str_it      = typename string_type::const_iterator;
+
+   match_results<str_it> m;
+   bool match (regex_match (s, m, re));
+
+   // Note that both operands of the conditional expression must be of the
+   // same string type (using std::string here would fail to compile for
+   // character types other than char).
+   //
+   return make_pair (
+     match ? regex_replace_match_results (m, fmt) : string_type (),
+     match);
+ }
+
template <typename C, typename F>
bool
- regex_replace_ex (const std::basic_string<C>& s,
- const std::basic_regex<C>& re,
- const std::basic_string<C>& fmt,
- F&& append,
- std::regex_constants::match_flag_type flags)
+ regex_replace_search (const std::basic_string<C>& s,
+ const std::basic_regex<C>& re,
+ const std::basic_string<C>& fmt,
+ F&& append,
+ std::regex_constants::match_flag_type flags)
{
using namespace std;
@@ -18,10 +214,8 @@ LIBBUTL_MODEXPORT namespace butl //@@ MOD Clang needs this for some reason.
using str_it = typename string_type::const_iterator;
using regex_it = regex_iterator<str_it>;
- bool first_only ((flags & std::regex_constants::format_first_only) != 0);
- bool no_copy ((flags & std::regex_constants::format_no_copy) != 0);
-
- locale cl; // Copy of the global C++ locale.
+ bool first_only ((flags & regex_constants::format_first_only) != 0);
+ bool no_copy ((flags & regex_constants::format_no_copy) != 0);
// Beginning of the last unmatched substring.
//
@@ -72,169 +266,7 @@ LIBBUTL_MODEXPORT namespace butl //@@ MOD Clang needs this for some reason.
// The standard implementation calls m.format() here. We perform our
// own formatting.
//
- // Note that we are using char type literals with the assumption that
- // being ASCII characters they will be properly "widened" to the
- // corresponding literals of the C template parameter type.
- //
- auto digit = [] (C c) -> int
- {
- return c >= '0' && c <= '9' ? c - '0' : -1;
- };
-
- enum class case_conv {none, upper, lower, upper_once, lower_once}
- mode (case_conv::none);
-
- auto conv_chr = [&mode, &cl] (C c) -> C
- {
- switch (mode)
- {
- case case_conv::upper_once: mode = case_conv::none; // Fall through.
- case case_conv::upper: c = toupper (c, cl); break;
- case case_conv::lower_once: mode = case_conv::none; // Fall through.
- case case_conv::lower: c = tolower (c, cl); break;
- case case_conv::none: break;
- }
- return c;
- };
-
- string_type r;
-
- auto append_chr = [&r, &conv_chr] (C c)
- {
- r.push_back (conv_chr (c));
- };
-
- auto append_str = [&r, &mode, &conv_chr] (str_it b, str_it e)
- {
- // Optimize for the common case.
- //
- if (mode == case_conv::none)
- r.append (b, e);
- else
- {
- for (str_it i (b); i != e; ++i)
- r.push_back (conv_chr (*i));
- }
- };
-
- size_t n (fmt.size ());
- for (size_t i (0); i < n; ++i)
- {
- C c (fmt[i]);
-
- switch (c)
- {
- case '$':
- {
- // Check if this is a $-based escape sequence. Interpret it
- // accordingly if that's the case, treat '$' as a regular
- // character otherwise.
- //
- c = fmt[++i]; // '\0' if last.
-
- switch (c)
- {
- case '$': append_chr (c); break;
- case '&': append_str (m[0].first, m[0].second); break;
- case '`':
- {
- append_str (m.prefix ().first, m.prefix ().second);
- break;
- }
- case '\'':
- {
- append_str (m.suffix ().first, m.suffix ().second);
- break;
- }
- default:
- {
- // Check if this is a sub-expression 1-based index ($n or
- // $nn). Append the matching substring if that's the case.
- // Treat '$' as a regular character otherwise. Index greater
- // than the sub-expression count is silently ignored.
- //
- int si (digit (c));
- if (si >= 0)
- {
- int d;
- if ((d = digit (fmt[i + 1])) >= 0) // '\0' if last.
- {
- si = si * 10 + d;
- ++i;
- }
- }
-
- if (si > 0)
- {
- // m[0] refers to the matched substring. Note that we
- // ignore unmatched sub-expression references.
- //
- if (static_cast<size_t> (si) < m.size () && m[si].matched)
- append_str (m[si].first, m[si].second);
- }
- else
- {
- // Not a $-based escape sequence so treat '$' as a
- // regular character.
- //
- --i;
- append_chr ('$');
- }
-
- break;
- }
- }
-
- break;
- }
- case '\\':
- {
- c = fmt[++i]; // '\0' if last.
-
- switch (c)
- {
- case '\\': append_chr (c); break;
-
- case 'u': mode = case_conv::upper_once; break;
- case 'l': mode = case_conv::lower_once; break;
- case 'U': mode = case_conv::upper; break;
- case 'L': mode = case_conv::lower; break;
- case 'E': mode = case_conv::none; break;
- default:
- {
- // Check if this is a sub-expression 1-based index. Append
- // the matching substring if that's the case, Skip '\\'
- // otherwise. Index greater than the sub-expression count is
- // silently ignored.
- //
- int si (digit (c));
- if (si > 0)
- {
- // m[0] refers to the matched substring. Note that we
- // ignore unmatched sub-expression references.
- //
- if (static_cast<size_t> (si) < m.size () && m[si].matched)
- append_str (m[si].first, m[si].second);
- }
- else
- --i;
-
- break;
- }
- }
-
- break;
- }
- default:
- {
- // Append a regular character.
- //
- append_chr (c);
- break;
- }
- }
- }
-
+ string_type r (regex_replace_match_results (m, fmt));
append (r.begin (), r.end ());
}
}
diff --git a/tests/regex/driver.cxx b/tests/regex/driver.cxx
index 0f6a385..fb41ba2 100644
--- a/tests/regex/driver.cxx
+++ b/tests/regex/driver.cxx
@@ -28,10 +28,10 @@ import butl.utility; // operator<<(ostream, exception)
using namespace std;
using namespace butl;
-// Usage: argv[0] [-ffo] [-fnc] <string> <regex> <format>
+// Usage: argv[0] [-ffo] [-fnc] [-m] <string> <regex> <format>
//
// Perform substitution of matched substrings with formatted replacement
-// strings using regex_replace_ex() function. If the string matches the regex
+// strings using regex_replace_*() functions. If the string matches the regex
// then print the replacement to STDOUT and exit with zero code. Exit with
// code one if it doesn't match, and with code two on failure (print error
// description to STDERR).
@@ -42,6 +42,9 @@ using namespace butl;
// -fnc
// Use format_no_copy replacement flag.
//
+// -m
+// Match the entire string, rather than its sub-strings.
+//
int
main (int argc, const char* argv[])
try
@@ -49,6 +52,7 @@ try
regex_constants::match_flag_type fl (regex_constants::match_default);
int i (1);
+ bool match (false);
for (; i != argc; ++i)
{
string op (argv[i]);
@@ -57,6 +61,8 @@ try
fl |= regex_constants::format_first_only;
else if (op == "-fnc")
fl |= regex_constants::format_no_copy;
+ else if (op == "-m")
+ match = true;
else
break;
}
@@ -67,7 +73,9 @@ try
regex re (argv[i++]);
string fmt (argv[i]);
- auto r (regex_replace_ex (s, re, fmt, fl));
+ auto r (match
+ ? regex_replace_match (s, re, fmt)
+ : regex_replace_search (s, re, fmt, fl));
if (r.second)
cout << r.first << endl;
diff --git a/tests/regex/testscript b/tests/regex/testscript
index 4b03e45..d431756 100644
--- a/tests/regex/testscript
+++ b/tests/regex/testscript
@@ -2,7 +2,7 @@
# copyright : Copyright (c) 2014-2018 Code Synthesis Ltd
# license : MIT; see accompanying LICENSE file
-: match
+: replace-search
:
{
$* abcbd b x >axcxd : all
@@ -58,3 +58,12 @@
$* xay a '\lVZ' >xvZy
}
}
+
+: replace-match
+:
+{
+ test.options += -m
+
+ $* abc 'a(b)c' 'x\1y' >xby : match
+ $* abcd 'a(b)c' 'x\1yd' == 1 : no-match
+}