From 5d50c0499b30650deafc291a3872a386d08a3200 Mon Sep 17 00:00:00 2001
From: Karen Arutyunov
Date: Wed, 21 Jun 2017 13:03:56 +0300
Subject: Add regex utility functions

---
 libbutl/buildfile |   1 +
 libbutl/regex.cxx |  45 +++++++++++
 libbutl/regex.hxx |  57 ++++++++++++++
 libbutl/regex.txx | 218 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 321 insertions(+)
 create mode 100644 libbutl/regex.cxx
 create mode 100644 libbutl/regex.hxx
 create mode 100644 libbutl/regex.txx

diff --git a/libbutl/buildfile b/libbutl/buildfile
index d5f492b..42fc421 100644
--- a/libbutl/buildfile
+++ b/libbutl/buildfile
@@ -26,6 +26,7 @@ lib{butl}: \
 	{hxx        }{ process-details  } \
 	{hxx        }{ process-io       } \
 	{    txx cxx}{ process-run      } \
+	{hxx txx cxx}{ regex            } \
 	{hxx ixx cxx}{ sendmail         } \
 	{hxx     cxx}{ sha256           } \
 	{hxx        }{ small-vector     } \
diff --git a/libbutl/regex.cxx b/libbutl/regex.cxx
new file mode 100644
index 0000000..4e2e26f
--- /dev/null
+++ b/libbutl/regex.cxx
@@ -0,0 +1,45 @@
+// file      : libbutl/regex.cxx -*- C++ -*-
+// copyright : Copyright (c) 2014-2017 Code Synthesis Ltd
+// license   : MIT; see accompanying LICENSE file
+
+#include <libbutl/regex.hxx>
+
+#if defined(_MSC_VER) && _MSC_VER <= 1910
+#  include <cstring> // strstr()
+#endif
+
+#include <ostream>
+#include <sstream>
+#include <stdexcept> // runtime_error
+
+#include <libbutl/utility.hxx> // operator<<(ostream, exception)
+
+namespace std
+{
+  // Currently libstdc++ just returns the name of the exception (bug #67361).
+  // So we check that the description contains at least one space character.
+  //
+  // While VC's description is meaningful, it has an undesired prefix that
+  // resembles the following: 'regex_error(error_badrepeat): '. So we skip it.
+  //
+  ostream&
+  operator<< (ostream& o, const regex_error& e)
+  {
+    const char* d (e.what ());
+
+#if defined(_MSC_VER) && _MSC_VER <= 1910
+    const char* rd (strstr (d, "): "));
+    if (rd != nullptr)
+      d = rd + 3;
+#endif
+
+    ostringstream os;
+    os << runtime_error (d); // Sanitize the description.
+
+    string s (os.str ());
+    if (s.find (' ') != string::npos)
+      o << ": " << s;
+
+    return o;
+  }
+}
diff --git a/libbutl/regex.hxx b/libbutl/regex.hxx
new file mode 100644
index 0000000..4a93106
--- /dev/null
+++ b/libbutl/regex.hxx
@@ -0,0 +1,57 @@
+// file      : libbutl/regex.hxx -*- C++ -*-
+// copyright : Copyright (c) 2014-2017 Code Synthesis Ltd
+// license   : MIT; see accompanying LICENSE file
+
+#ifndef LIBBUTL_REGEX_HXX
+#define LIBBUTL_REGEX_HXX
+
+#include <regex>
+#include <iosfwd>
+#include <string>  // basic_string
+#include <utility> // pair
+
+#include <libbutl/export.hxx>
+
+namespace butl
+{
+  // Like std::regex_replace() but extends the standard ECMA-262
+  // substitution escape sequences with a subset of Perl sequences:
+  //
+  // \\, \u, \l, \U, \L, \E, \1, ..., \9
+  //
+  // Also return the resulting string as well as whether the search
+  // succeeded.
+  //
+  // Notes and limitations:
+  //
+  // - The only valid regex_constants flags are match_default,
+  //   format_first_only (format_no_copy can easily be supported).
+  //
+  // - If backslash doesn't start any of the listed sequences then it is
+  //   silently dropped and the following character is copied as is.
+  //
+  // - The character case conversion is performed according to the global
+  //   C++ locale (which, unless changed, is the same as the C locale and
+  //   both default to the POSIX locale aka "C").
+  //
+  template <typename C>
+  std::pair<std::basic_string<C>, bool>
+  regex_replace_ex (const std::basic_string<C>&,
+                    const std::basic_regex<C>&,
+                    const std::basic_string<C>& fmt,
+                    std::regex_constants::match_flag_type =
+                      std::regex_constants::match_default);
+}
+
+namespace std
+{
+  // Print regex error description but only if it is meaningful (this is also
+  // why we have to print leading colon).
+  //
+  LIBBUTL_EXPORT ostream&
+  operator<< (ostream&, const regex_error&);
+}
+
+#include <libbutl/regex.txx>
+
+#endif // LIBBUTL_REGEX_HXX
diff --git a/libbutl/regex.txx b/libbutl/regex.txx
new file mode 100644
index 0000000..cb8cfe0
--- /dev/null
+++ b/libbutl/regex.txx
@@ -0,0 +1,218 @@
+// file      : libbutl/regex.txx -*- C++ -*-
+// copyright : Copyright (c) 2014-2017 Code Synthesis Ltd
+// license   : MIT; see accompanying LICENSE file
+
+#include <locale>
+#include <cstddef> // size_t
+
+namespace butl
+{
+  template <typename C>
+  std::pair<std::basic_string<C>, bool>
+  regex_replace_ex (const std::basic_string<C>& s,
+                    const std::basic_regex<C>& re,
+                    const std::basic_string<C>& fmt,
+                    std::regex_constants::match_flag_type flags)
+  {
+    using namespace std;
+
+    using string_type = basic_string<C>;
+    using str_it      = typename string_type::const_iterator;
+    using regex_it    = regex_iterator<str_it>;
+
+    bool first_only ((flags & std::regex_constants::format_first_only) ==
+                     std::regex_constants::format_first_only);
+
+    locale cl; // Copy of the global C++ locale.
+    string_type r;
+
+    // Beginning of the last unmatched substring.
+    //
+    str_it ub (s.begin ());
+
+    regex_it b (s.begin (), s.end (), re, flags);
+    regex_it e;
+    bool match (b != e);
+
+    for (regex_it i (b); i != e; ++i)
+    {
+      const match_results<str_it>& m (*i);
+
+      // Copy the preceding unmatched substring, save the beginning of the
+      // one that follows.
+      //
+      r.append (ub, m.prefix ().second);
+      ub = m.suffix ().first;
+
+      if (first_only && i != b)
+        r.append (m[0].first, m[0].second); // Append matched substring.
+      else
+      {
+        // The standard implementation calls m.format() here. We perform our
+        // own formatting.
+        //
+        // Note that we are using char type literals with the assumption that
+        // being ASCII characters they will be properly "widened" to the
+        // corresponding literals of the C template parameter type.
+        //
+        auto digit = [] (C c) -> int
+        {
+          return c >= '0' && c <= '9' ? c - '0' : -1;
+        };
+
+        enum class case_conv {none, upper, lower, upper_once, lower_once}
+        mode (case_conv::none);
+
+        auto conv_chr = [&mode, &cl] (C c) -> C
+        {
+          switch (mode)
+          {
+          case case_conv::upper_once: mode = case_conv::none; // Fall through.
+          case case_conv::upper:      c = toupper (c, cl); break;
+          case case_conv::lower_once: mode = case_conv::none; // Fall through.
+          case case_conv::lower:      c = tolower (c, cl); break;
+          case case_conv::none:       break;
+          }
+          return c;
+        };
+
+        auto append_chr = [&r, &conv_chr] (C c)
+        {
+          r.push_back (conv_chr (c));
+        };
+
+        auto append_str = [&r, &mode, &conv_chr] (str_it b, str_it e)
+        {
+          // Optimize for the common case.
+          //
+          if (mode == case_conv::none)
+            r.append (b, e);
+          else
+          {
+            for (str_it i (b); i != e; ++i)
+              r.push_back (conv_chr (*i));
+          }
+        };
+
+        size_t n (fmt.size ());
+        for (size_t i (0); i < n; ++i)
+        {
+          C c (fmt[i]);
+
+          switch (c)
+          {
+          case '$':
+            {
+              // Check if this is a $-based escape sequence. Interpret it
+              // accordingly if that's the case, treat '$' as a regular
+              // character otherwise.
+              //
+              c = fmt[++i]; // '\0' if last.
+
+              switch (c)
+              {
+              case '$': append_chr (c); break;
+              case '&': append_str (m[0].first, m[0].second); break;
+              case '`':
+                {
+                  append_str (m.prefix ().first, m.prefix ().second);
+                  break;
+                }
+              case '\'':
+                {
+                  append_str (m.suffix ().first, m.suffix ().second);
+                  break;
+                }
+              default:
+                {
+                  // Check if this is a sub-expression 1-based index ($n or
+                  // $nn). Append the matching substring if that's the case.
+                  // Treat '$' as a regular character otherwise. Index greater
+                  // than the sub-expression count is silently ignored.
+                  //
+                  int si (digit (c));
+                  if (si >= 0)
+                  {
+                    int d (digit (fmt[i + 1])); // '\0' if last.
+                    if (d >= 0 && (si != 0 || d != 0)) // Keep "$00" literal.
+                    {
+                      si = si * 10 + d;
+                      ++i;
+                    }
+                  }
+
+                  if (si > 0)
+                  {
+                    // m[0] refers to the matched substring.
+                    //
+                    if (static_cast<size_t> (si) < m.size ())
+                      append_str (m[si].first, m[si].second);
+                  }
+                  else
+                  {
+                    // Not a $-based escape sequence so treat '$' as a regular
+                    // character and re-scan the character that follows it.
+                    //
+                    --i;
+                    append_chr ('$');
+                  }
+
+                  break;
+                }
+              }
+
+              break;
+            }
+          case '\\':
+            {
+              c = fmt[++i]; // '\0' if last.
+
+              switch (c)
+              {
+              case '\\': append_chr (c); break;
+
+              case 'u': mode = case_conv::upper_once; break;
+              case 'l': mode = case_conv::lower_once; break;
+              case 'U': mode = case_conv::upper;      break;
+              case 'L': mode = case_conv::lower;      break;
+              case 'E': mode = case_conv::none;       break;
+              default:
+                {
+                  // Check if this is a sub-expression 1-based index. Append
+                  // the matching substring if that's the case, skip '\\'
+                  // otherwise. Index greater than the sub-expression count is
+                  // silently ignored.
+                  //
+                  int si (digit (c));
+                  if (si > 0)
+                  {
+                    // m[0] refers to the matched substring.
+                    //
+                    if (static_cast<size_t> (si) < m.size ())
+                      append_str (m[si].first, m[si].second);
+                  }
+                  else
+                    --i;
+
+                  break;
+                }
+              }
+
+              break;
+            }
+          default:
+            {
+              // Append a regular character.
+              //
+              append_chr (c);
+              break;
+            }
+          }
+        }
+      }
+    }
+
+    r.append (ub, s.end ()); // Append the rightmost non-matched substring.
+    return make_pair (move (r), match);
+  }
+}
-- 
cgit v1.1