diff options
-rw-r--r-- | libbutl/regex.ixx | 14 | ||||
-rw-r--r-- | libbutl/regex.mxx | 51 | ||||
-rw-r--r-- | libbutl/regex.txx | 376 | ||||
-rw-r--r-- | tests/regex/driver.cxx | 14 | ||||
-rw-r--r-- | tests/regex/testscript | 11 |
5 files changed, 262 insertions, 204 deletions
diff --git a/libbutl/regex.ixx b/libbutl/regex.ixx index 15189fb..8e3286f 100644 --- a/libbutl/regex.ixx +++ b/libbutl/regex.ixx @@ -6,19 +6,19 @@ LIBBUTL_MODEXPORT namespace butl //@@ MOD Clang needs this for some reason. { template <typename C> inline std::pair<std::basic_string<C>, bool> - regex_replace_ex (const std::basic_string<C>& s, - const std::basic_regex<C>& re, - const std::basic_string<C>& fmt, - std::regex_constants::match_flag_type flags) + regex_replace_search (const std::basic_string<C>& s, + const std::basic_regex<C>& re, + const std::basic_string<C>& fmt, + std::regex_constants::match_flag_type flags) { using namespace std; using it = typename basic_string<C>::const_iterator; basic_string<C> r; - bool match (regex_replace_ex (s, re, fmt, - [&r] (it b, it e) {r.append (b, e);}, - flags)); + bool match (regex_replace_search (s, re, fmt, + [&r] (it b, it e) {r.append (b, e);}, + flags)); return make_pair (move (r), match); } diff --git a/libbutl/regex.mxx b/libbutl/regex.mxx index 741b818..7fa0155 100644 --- a/libbutl/regex.mxx +++ b/libbutl/regex.mxx @@ -40,16 +40,9 @@ import std.regex; // @@ MOD TODO should probably be re-exported. LIBBUTL_MODEXPORT namespace butl { - // Call specified append() function for non-matched substrings and matched - // substring replacements returning true if search succeeded. The function - // must be callable with the following signature: - // - // void - // append(basic_string<C>::iterator begin, basic_string<C>::iterator end); - // - // The regex semantics is like that of std::regex_replace() extended the - // standard ECMA-262 substitution escape sequences with a subset of Perl - // sequences: + // The regex semantics for the following functions is like that of + // std::regex_replace() extended the standard ECMA-262 substitution escape + // sequences with a subset of Perl sequences: // // \\, \u, \l, \U, \L, \E, \1, ..., \9 // @@ -65,14 +58,22 @@ LIBBUTL_MODEXPORT namespace butl // C++ locale (which is, unless changed, is the same as C locale and // both default to the POSIX locale aka "C"). // + + // Call specified append() function for non-matched substrings and matched + // substring replacements returning true if search succeeded. The function + // must be callable with the following signature: + // + // void + // append(basic_string<C>::iterator begin, basic_string<C>::iterator end); + // template <typename C, typename F> bool - regex_replace_ex (const std::basic_string<C>&, - const std::basic_regex<C>&, - const std::basic_string<C>& fmt, - F&& append, - std::regex_constants::match_flag_type = - std::regex_constants::match_default); + regex_replace_search (const std::basic_string<C>&, + const std::basic_regex<C>&, + const std::basic_string<C>& fmt, + F&& append, + std::regex_constants::match_flag_type = + std::regex_constants::match_default); // As above but concatenate non-matched substrings and matched substring // replacements into a string returning it as well as whether the search @@ -80,11 +81,19 @@ LIBBUTL_MODEXPORT namespace butl // template <typename C> std::pair<std::basic_string<C>, bool> - regex_replace_ex (const std::basic_string<C>&, - const std::basic_regex<C>&, - const std::basic_string<C>& fmt, - std::regex_constants::match_flag_type = - std::regex_constants::match_default); + regex_replace_search (const std::basic_string<C>&, + const std::basic_regex<C>&, + const std::basic_string<C>& fmt, + std::regex_constants::match_flag_type = + std::regex_constants::match_default); + + // Match the entire string and, if it matches, return the string replacement. + // + template <typename C> + std::pair<std::basic_string<C>, bool> + regex_replace_match (const std::basic_string<C>&, + const std::basic_regex<C>&, + const std::basic_string<C>& fmt); } LIBBUTL_MODEXPORT namespace std diff --git a/libbutl/regex.txx b/libbutl/regex.txx index a7a6c9a..fbe2885 100644 --- a/libbutl/regex.txx +++ b/libbutl/regex.txx @@ -4,13 +4,209 @@ LIBBUTL_MODEXPORT namespace butl //@@ MOD Clang needs this for some reason. { + // Replace the regex match results using the format string. + // + template <typename C> + std::basic_string<C> + regex_replace_match_results ( + const std::match_results<typename std::basic_string<C>::const_iterator>& m, + const std::basic_string<C>& fmt) + { + using namespace std; + + using string_type = basic_string<C>; + using str_it = typename string_type::const_iterator; + + string_type r; + + // Note that we are using char type literals with the assumption that + // being ASCII characters they will be properly "widened" to the + // corresponding literals of the C template parameter type. + // + auto digit = [] (C c) -> int + { + return c >= '0' && c <= '9' ? c - '0' : -1; + }; + + enum class case_conv {none, upper, lower, upper_once, lower_once} + mode (case_conv::none); + + locale cl; // Copy of the global C++ locale. + + auto conv_chr = [&mode, &cl] (C c) -> C + { + switch (mode) + { + case case_conv::upper_once: mode = case_conv::none; // Fall through. + case case_conv::upper: c = toupper (c, cl); break; + case case_conv::lower_once: mode = case_conv::none; // Fall through. + case case_conv::lower: c = tolower (c, cl); break; + case case_conv::none: break; + } + return c; + }; + + auto append_chr = [&r, &conv_chr] (C c) {r.push_back (conv_chr (c));}; + + auto append_str = [&r, &mode, &conv_chr] (str_it b, str_it e) + { + // Optimize for the common case. + // + if (mode == case_conv::none) + r.append (b, e); + else + { + for (str_it i (b); i != e; ++i) + r.push_back (conv_chr (*i)); + } + }; + + size_t n (fmt.size ()); + for (size_t i (0); i < n; ++i) + { + C c (fmt[i]); + + switch (c) + { + case '$': + { + // Check if this is a $-based escape sequence. Interpret it + // accordingly if that's the case, treat '$' as a regular character + // otherwise. + // + c = fmt[++i]; // '\0' if last. + + switch (c) + { + case '$': append_chr (c); break; + case '&': append_str (m[0].first, m[0].second); break; + case '`': + { + append_str (m.prefix ().first, m.prefix ().second); + break; + } + case '\'': + { + append_str (m.suffix ().first, m.suffix ().second); + break; + } + default: + { + // Check if this is a sub-expression 1-based index ($n or $nn). + // Append the matching substring if that's the case. Treat '$' + // as a regular character otherwise. Index greater than the + // sub-expression count is silently ignored. + // + int si (digit (c)); + if (si >= 0) + { + int d; + if ((d = digit (fmt[i + 1])) >= 0) // '\0' if last. + { + si = si * 10 + d; + ++i; + } + } + + if (si > 0) + { + // m[0] refers to the matched substring. Note that we ignore + // unmatched sub-expression references. + // + if (static_cast<size_t> (si) < m.size () && m[si].matched) + append_str (m[si].first, m[si].second); + } + else + { + // Not a $-based escape sequence so treat '$' as a regular + // character. + // + --i; + append_chr ('$'); + } + + break; + } + } + + break; + } + case '\\': + { + c = fmt[++i]; // '\0' if last. + + switch (c) + { + case '\\': append_chr (c); break; + + case 'u': mode = case_conv::upper_once; break; + case 'l': mode = case_conv::lower_once; break; + case 'U': mode = case_conv::upper; break; + case 'L': mode = case_conv::lower; break; + case 'E': mode = case_conv::none; break; + default: + { + // Check if this is a sub-expression 1-based index. Append the + // matching substring if that's the case, Skip '\\' otherwise. + // Index greater than the sub-expression count is silently + // ignored. + // + int si (digit (c)); + if (si > 0) + { + // m[0] refers to the matched substring. Note that we ignore + // unmatched sub-expression references. + // + if (static_cast<size_t> (si) < m.size () && m[si].matched) + append_str (m[si].first, m[si].second); + } + else + --i; + + break; + } + } + + break; + } + default: + { + // Append a regular character. + // + append_chr (c); + break; + } + } + } + + return r; + } + + template <typename C> + std::pair<std::basic_string<C>, bool> + regex_replace_match (const std::basic_string<C>& s, + const std::basic_regex<C>& re, + const std::basic_string<C>& fmt) + { + using namespace std; + + using string_type = basic_string<C>; + using str_it = typename string_type::const_iterator; + + match_results<str_it> m; + bool match (regex_match (s, m, re)); + + return make_pair (match ? regex_replace_match_results (m, fmt) : string (), + match); + } + template <typename C, typename F> bool - regex_replace_ex (const std::basic_string<C>& s, - const std::basic_regex<C>& re, - const std::basic_string<C>& fmt, - F&& append, - std::regex_constants::match_flag_type flags) + regex_replace_search (const std::basic_string<C>& s, + const std::basic_regex<C>& re, + const std::basic_string<C>& fmt, + F&& append, + std::regex_constants::match_flag_type flags) { using namespace std; @@ -18,10 +214,8 @@ LIBBUTL_MODEXPORT namespace butl //@@ MOD Clang needs this for some reason. using str_it = typename string_type::const_iterator; using regex_it = regex_iterator<str_it>; - bool first_only ((flags & std::regex_constants::format_first_only) != 0); - bool no_copy ((flags & std::regex_constants::format_no_copy) != 0); - - locale cl; // Copy of the global C++ locale. + bool first_only ((flags & regex_constants::format_first_only) != 0); + bool no_copy ((flags & regex_constants::format_no_copy) != 0); // Beginning of the last unmatched substring. // @@ -72,169 +266,7 @@ LIBBUTL_MODEXPORT namespace butl //@@ MOD Clang needs this for some reason. // The standard implementation calls m.format() here. We perform our // own formatting. // - // Note that we are using char type literals with the assumption that - // being ASCII characters they will be properly "widened" to the - // corresponding literals of the C template parameter type. - // - auto digit = [] (C c) -> int - { - return c >= '0' && c <= '9' ? c - '0' : -1; - }; - - enum class case_conv {none, upper, lower, upper_once, lower_once} - mode (case_conv::none); - - auto conv_chr = [&mode, &cl] (C c) -> C - { - switch (mode) - { - case case_conv::upper_once: mode = case_conv::none; // Fall through. - case case_conv::upper: c = toupper (c, cl); break; - case case_conv::lower_once: mode = case_conv::none; // Fall through. - case case_conv::lower: c = tolower (c, cl); break; - case case_conv::none: break; - } - return c; - }; - - string_type r; - - auto append_chr = [&r, &conv_chr] (C c) - { - r.push_back (conv_chr (c)); - }; - - auto append_str = [&r, &mode, &conv_chr] (str_it b, str_it e) - { - // Optimize for the common case. - // - if (mode == case_conv::none) - r.append (b, e); - else - { - for (str_it i (b); i != e; ++i) - r.push_back (conv_chr (*i)); - } - }; - - size_t n (fmt.size ()); - for (size_t i (0); i < n; ++i) - { - C c (fmt[i]); - - switch (c) - { - case '$': - { - // Check if this is a $-based escape sequence. Interpret it - // accordingly if that's the case, treat '$' as a regular - // character otherwise. - // - c = fmt[++i]; // '\0' if last. - - switch (c) - { - case '$': append_chr (c); break; - case '&': append_str (m[0].first, m[0].second); break; - case '`': - { - append_str (m.prefix ().first, m.prefix ().second); - break; - } - case '\'': - { - append_str (m.suffix ().first, m.suffix ().second); - break; - } - default: - { - // Check if this is a sub-expression 1-based index ($n or - // $nn). Append the matching substring if that's the case. - // Treat '$' as a regular character otherwise. Index greater - // than the sub-expression count is silently ignored. - // - int si (digit (c)); - if (si >= 0) - { - int d; - if ((d = digit (fmt[i + 1])) >= 0) // '\0' if last. - { - si = si * 10 + d; - ++i; - } - } - - if (si > 0) - { - // m[0] refers to the matched substring. Note that we - // ignore unmatched sub-expression references. - // - if (static_cast<size_t> (si) < m.size () && m[si].matched) - append_str (m[si].first, m[si].second); - } - else - { - // Not a $-based escape sequence so treat '$' as a - // regular character. - // - --i; - append_chr ('$'); - } - - break; - } - } - - break; - } - case '\\': - { - c = fmt[++i]; // '\0' if last. - - switch (c) - { - case '\\': append_chr (c); break; - - case 'u': mode = case_conv::upper_once; break; - case 'l': mode = case_conv::lower_once; break; - case 'U': mode = case_conv::upper; break; - case 'L': mode = case_conv::lower; break; - case 'E': mode = case_conv::none; break; - default: - { - // Check if this is a sub-expression 1-based index. Append - // the matching substring if that's the case, Skip '\\' - // otherwise. Index greater than the sub-expression count is - // silently ignored. - // - int si (digit (c)); - if (si > 0) - { - // m[0] refers to the matched substring. Note that we - // ignore unmatched sub-expression references. - // - if (static_cast<size_t> (si) < m.size () && m[si].matched) - append_str (m[si].first, m[si].second); - } - else - --i; - - break; - } - } - - break; - } - default: - { - // Append a regular character. - // - append_chr (c); - break; - } - } - } - + string_type r (regex_replace_match_results (m, fmt)); append (r.begin (), r.end ()); } } diff --git a/tests/regex/driver.cxx b/tests/regex/driver.cxx index 0f6a385..fb41ba2 100644 --- a/tests/regex/driver.cxx +++ b/tests/regex/driver.cxx @@ -28,10 +28,10 @@ import butl.utility; // operator<<(ostream, exception) using namespace std; using namespace butl; -// Usage: argv[0] [-ffo] [-fnc] <string> <regex> <format> +// Usage: argv[0] [-ffo] [-fnc] [-m] <string> <regex> <format> // // Perform substitution of matched substrings with formatted replacement -// strings using regex_replace_ex() function. If the string matches the regex +// strings using regex_replace_*() functions. If the string matches the regex // then print the replacement to STDOUT and exit with zero code. Exit with // code one if it doesn't match, and with code two on failure (print error // description to STDERR). @@ -42,6 +42,9 @@ using namespace butl; // -fnc // Use format_no_copy replacement flag. // +// -m +// Match the entire string, rather than its sub-strings. +// int main (int argc, const char* argv[]) try @@ -49,6 +52,7 @@ try regex_constants::match_flag_type fl (regex_constants::match_default); int i (1); + bool match (false); for (; i != argc; ++i) { string op (argv[i]); @@ -57,6 +61,8 @@ try fl |= regex_constants::format_first_only; else if (op == "-fnc") fl |= regex_constants::format_no_copy; + else if (op == "-m") + match = true; else break; } @@ -67,7 +73,9 @@ try regex re (argv[i++]); string fmt (argv[i]); - auto r (regex_replace_ex (s, re, fmt, fl)); + auto r (match + ? regex_replace_match (s, re, fmt) + : regex_replace_search (s, re, fmt, fl)); if (r.second) cout << r.first << endl; diff --git a/tests/regex/testscript b/tests/regex/testscript index 4b03e45..d431756 100644 --- a/tests/regex/testscript +++ b/tests/regex/testscript @@ -2,7 +2,7 @@ # copyright : Copyright (c) 2014-2018 Code Synthesis Ltd # license : MIT; see accompanying LICENSE file -: match +: replace-search : { $* abcbd b x >axcxd : all @@ -58,3 +58,12 @@ $* xay a '\lVZ' >xvZy } } + +: replace-match +: +{ + test.options += -m + + $* abc 'a(b)c' 'x\1y' >xby : match + $* abcd 'a(b)c' 'x\1yd' == 1 : no-match +} |