// Format the regex match results according to the format string and return
// the resulting string.
//
// Parameters:
//
//   m   - results of a successful match (m[0] is the matched substring)
//   fmt - format string, interpreted as described below
//
// The following sequences in the format string are recognized (any other
// character is copied as is):
//
//   $$        literal '$'
//   $&        the matched substring (m[0])
//   $`        the match prefix (m.prefix ())
//   $'        the match suffix (m.suffix ())
//   $n, $nn   sub-expression with the 1-based index n or nn
//   \\        literal backslash
//   \n        sub-expression with the 1-based (single digit) index n
//   \u, \l    convert the next appended character to upper/lower case
//   \U, \L    convert all the subsequently appended characters to
//             upper/lower case
//   \E        end the case conversion
//
// References to unmatched sub-expressions as well as indexes greater than
// the sub-expression count are silently ignored. A '$' that does not start
// a valid sequence is treated as a regular character while a backslash that
// does not start a valid sequence is skipped.
//
template <typename C>
std::basic_string<C>
regex_replace_match_results (
  const std::match_results<typename std::basic_string<C>::const_iterator>& m,
  const std::basic_string<C>& fmt)
{
  using namespace std;

  using string_type = basic_string<C>;
  using str_it      = typename string_type::const_iterator;

  string_type r;

  // Note that we are using char type literals with the assumption that
  // being ASCII characters they will be properly "widened" to the
  // corresponding literals of the C template parameter type.
  //
  // Return the decimal value of the character or -1 if it is not a digit.
  //
  auto digit = [] (C c) -> int
  {
    return c >= '0' && c <= '9' ? c - '0' : -1;
  };

  enum class case_conv {none, upper, lower, upper_once, lower_once}
  mode (case_conv::none);

  locale cl; // Copy of the global C++ locale.

  // Convert the character according to the current case conversion mode.
  // The *_once modes are reset to none after converting a single character.
  //
  auto conv_chr = [&mode, &cl] (C c) -> C
  {
    switch (mode)
    {
    case case_conv::upper_once: mode = case_conv::none; // Fall through.
    case case_conv::upper:      c = toupper (c, cl); break;
    case case_conv::lower_once: mode = case_conv::none; // Fall through.
    case case_conv::lower:      c = tolower (c, cl); break;
    case case_conv::none:       break;
    }
    return c;
  };

  auto append_chr = [&r, &conv_chr] (C c) {r.push_back (conv_chr (c));};

  auto append_str = [&r, &mode, &conv_chr] (str_it b, str_it e)
  {
    // Optimize for the common case.
    //
    if (mode == case_conv::none)
      r.append (b, e);
    else
    {
      for (str_it i (b); i != e; ++i)
        r.push_back (conv_chr (*i));
    }
  };

  // Append the sub-expression with the specified 1-based index. Note that
  // m[0] refers to the matched substring. Also note that we ignore
  // out-of-range indexes and unmatched sub-expression references.
  //
  auto append_sub = [&m, &append_str] (int si)
  {
    if (static_cast<size_t> (si) < m.size () && m[si].matched)
      append_str (m[si].first, m[si].second);
  };

  size_t n (fmt.size ());
  for (size_t i (0); i < n; ++i)
  {
    C c (fmt[i]);

    switch (c)
    {
    case '$':
      {
        // Check if this is a $-based escape sequence. Interpret it
        // accordingly if that's the case, treat '$' as a regular character
        // otherwise.
        //
        c = fmt[++i]; // '\0' if last.

        switch (c)
        {
        case '$': append_chr (c); break;
        case '&': append_str (m[0].first, m[0].second); break;
        case '`':
          {
            append_str (m.prefix ().first, m.prefix ().second);
            break;
          }
        case '\'':
          {
            append_str (m.suffix ().first, m.suffix ().second);
            break;
          }
        default:
          {
            // Check if this is a sub-expression 1-based index ($n or $nn).
            // Append the matching substring if that's the case. Treat '$'
            // as a regular character otherwise. Index greater than the
            // sub-expression count is silently ignored.
            //
            int si (digit (c));
            size_t l (i); // Position of the last digit of the index.

            if (si >= 0)
            {
              int d (digit (fmt[i + 1])); // '\0' if last.

              if (d >= 0)
              {
                si = si * 10 + d;
                l = i + 1;
              }
            }

            if (si > 0)
            {
              // Only consume the second digit once we know this is a valid
              // sub-expression reference.
              //
              i = l;
              append_sub (si);
            }
            else
            {
              // Not a $-based escape sequence so treat '$' as a regular
              // character. Rewind to '$' so that everything that follows it
              // (including both digits of "$00") is appended as regular
              // characters.
              //
              --i;
              append_chr ('$');
            }

            break;
          }
        }

        break;
      }
    case '\\':
      {
        c = fmt[++i]; // '\0' if last.

        switch (c)
        {
        case '\\': append_chr (c); break;

        case 'u': mode = case_conv::upper_once; break;
        case 'l': mode = case_conv::lower_once; break;
        case 'U': mode = case_conv::upper;      break;
        case 'L': mode = case_conv::lower;      break;
        case 'E': mode = case_conv::none;       break;
        default:
          {
            // Check if this is a sub-expression 1-based index. Append the
            // matching substring if that's the case. Skip the backslash
            // otherwise (the character that follows it is then handled as a
            // regular one on the next iteration). Index greater than the
            // sub-expression count is silently ignored.
            //
            int si (digit (c));

            if (si > 0)
              append_sub (si);
            else
              --i;

            break;
          }
        }

        break;
      }
    default:
      {
        // Append a regular character.
        //
        append_chr (c);
        break;
      }
    }
  }

  return r;
}

// Try to match the entire string against the regex and, if that succeeds,
// return the result of formatting the match with the format string (see
// regex_replace_match_results() for the format string semantics) and true.
// Otherwise return the empty string and false.
//
template <typename C>
std::pair<std::basic_string<C>, bool>
regex_replace_match (const std::basic_string<C>& s,
                     const std::basic_regex<C>& re,
                     const std::basic_string<C>& fmt)
{
  using namespace std;

  using string_type = basic_string<C>;
  using str_it      = typename string_type::const_iterator;

  match_results<str_it> m;
  bool match (regex_match (s, m, re));

  // Note that both branches of the conditional must have the same type so
  // use string_type rather than std::string for the no-match case (the
  // latter would not compile for character types other than char).
  //
  return make_pair (
    match ? regex_replace_match_results (m, fmt) : string_type (),
    match);
}
// The standard implementation calls m.format() here. We perform our // own formatting. // - // Note that we are using char type literals with the assumption that - // being ASCII characters they will be properly "widened" to the - // corresponding literals of the C template parameter type. - // - auto digit = [] (C c) -> int - { - return c >= '0' && c <= '9' ? c - '0' : -1; - }; - - enum class case_conv {none, upper, lower, upper_once, lower_once} - mode (case_conv::none); - - auto conv_chr = [&mode, &cl] (C c) -> C - { - switch (mode) - { - case case_conv::upper_once: mode = case_conv::none; // Fall through. - case case_conv::upper: c = toupper (c, cl); break; - case case_conv::lower_once: mode = case_conv::none; // Fall through. - case case_conv::lower: c = tolower (c, cl); break; - case case_conv::none: break; - } - return c; - }; - - string_type r; - - auto append_chr = [&r, &conv_chr] (C c) - { - r.push_back (conv_chr (c)); - }; - - auto append_str = [&r, &mode, &conv_chr] (str_it b, str_it e) - { - // Optimize for the common case. - // - if (mode == case_conv::none) - r.append (b, e); - else - { - for (str_it i (b); i != e; ++i) - r.push_back (conv_chr (*i)); - } - }; - - size_t n (fmt.size ()); - for (size_t i (0); i < n; ++i) - { - C c (fmt[i]); - - switch (c) - { - case '$': - { - // Check if this is a $-based escape sequence. Interpret it - // accordingly if that's the case, treat '$' as a regular - // character otherwise. - // - c = fmt[++i]; // '\0' if last. - - switch (c) - { - case '$': append_chr (c); break; - case '&': append_str (m[0].first, m[0].second); break; - case '`': - { - append_str (m.prefix ().first, m.prefix ().second); - break; - } - case '\'': - { - append_str (m.suffix ().first, m.suffix ().second); - break; - } - default: - { - // Check if this is a sub-expression 1-based index ($n or - // $nn). Append the matching substring if that's the case. - // Treat '$' as a regular character otherwise. 
Index greater - // than the sub-expression count is silently ignored. - // - int si (digit (c)); - if (si >= 0) - { - int d; - if ((d = digit (fmt[i + 1])) >= 0) // '\0' if last. - { - si = si * 10 + d; - ++i; - } - } - - if (si > 0) - { - // m[0] refers to the matched substring. Note that we - // ignore unmatched sub-expression references. - // - if (static_cast (si) < m.size () && m[si].matched) - append_str (m[si].first, m[si].second); - } - else - { - // Not a $-based escape sequence so treat '$' as a - // regular character. - // - --i; - append_chr ('$'); - } - - break; - } - } - - break; - } - case '\\': - { - c = fmt[++i]; // '\0' if last. - - switch (c) - { - case '\\': append_chr (c); break; - - case 'u': mode = case_conv::upper_once; break; - case 'l': mode = case_conv::lower_once; break; - case 'U': mode = case_conv::upper; break; - case 'L': mode = case_conv::lower; break; - case 'E': mode = case_conv::none; break; - default: - { - // Check if this is a sub-expression 1-based index. Append - // the matching substring if that's the case, Skip '\\' - // otherwise. Index greater than the sub-expression count is - // silently ignored. - // - int si (digit (c)); - if (si > 0) - { - // m[0] refers to the matched substring. Note that we - // ignore unmatched sub-expression references. - // - if (static_cast (si) < m.size () && m[si].matched) - append_str (m[si].first, m[si].second); - } - else - --i; - - break; - } - } - - break; - } - default: - { - // Append a regular character. - // - append_chr (c); - break; - } - } - } - + string_type r (regex_replace_match_results (m, fmt)); append (r.begin (), r.end ()); } } -- cgit v1.1