Add regex_replace_match() and rename regex_replace_ex() to regex_replace_search()

author: Karen Arutyunov <karen@codesynthesis.com> 2018-06-19 15:30:22 +0300
committer: Karen Arutyunov <karen@codesynthesis.com> 2018-06-19 15:30:22 +0300
commit: 06e915be138b0638e30083f84cecda0eb1bfc895 (patch)
tree: 50f7eca40de25033116c6f6f75524ae5801dcc78 /libbutl/regex.txx
parent: 338d8065f1b681da841fa0d79cc9265776ff1e1e (diff)
1 files changed, 204 insertions, 172 deletions
diff --git a/libbutl/regex.txx b/libbutl/regex.txx
index a7a6c9a..fbe2885 100644
--- a/libbutl/regex.txx
+++ b/libbutl/regex.txx
@@ -4,13 +4,209 @@
 
 LIBBUTL_MODEXPORT namespace butl //@@ MOD Clang needs this for some reason.
 {
+  // Expand the format string against the regex match results and return the
+  // resulting string.
+  //
+  // The following escape sequences are recognized in the format string:
+  //
+  //   $$       - literal '$'
+  //   $&       - the whole match (m[0])
+  //   $`       - the match prefix (m.prefix ())
+  //   $'       - the match suffix (m.suffix ())
+  //   $n, $nn  - sub-expression with the 1-based index n (nn)
+  //   \n       - sub-expression with the 1-based single-digit index n
+  //   \\       - literal backslash
+  //   \u, \l   - convert the next appended character to upper/lower case
+  //   \U, \L   - convert all subsequent characters to upper/lower case
+  //   \E       - end the case conversion
+  //
+  // A reference to an out of range or unmatched sub-expression expands to
+  // nothing. A '$' that does not start a valid sequence is copied as is
+  // while a backslash that does not start a valid sequence is dropped. The
+  // case conversion is performed using a copy of the global C++ locale.
+  //
+  template <typename C>
+  std::basic_string<C>
+  regex_replace_match_results (
+    const std::match_results<typename std::basic_string<C>::const_iterator>& m,
+    const std::basic_string<C>& fmt)
+  {
+    using namespace std;
+
+    using string_type = basic_string<C>;
+    using str_it      = typename string_type::const_iterator;
+
+    string_type r;
+
+    // Return the decimal value of a digit character and -1 for anything else.
+    //
+    // Note that we are using char type literals with the assumption that
+    // being ASCII characters they will be properly "widened" to the
+    // corresponding literals of the C template parameter type.
+    //
+    auto digit = [] (C c) -> int
+    {
+      return c >= '0' && c <= '9' ? c - '0' : -1;
+    };
+
+    // Current case conversion mode. The *_once modes apply to a single
+    // character and then revert to none.
+    //
+    enum class case_conv {none, upper, lower, upper_once, lower_once}
+    mode (case_conv::none);
+
+    locale cl; // Copy of the global C++ locale.
+
+    // Convert the character according to the current mode, resetting the
+    // one-shot modes.
+    //
+    auto conv_chr = [&mode, &cl] (C c) -> C
+    {
+      switch (mode)
+      {
+      case case_conv::upper_once: mode = case_conv::none; // Fall through.
+      case case_conv::upper:      c = toupper (c, cl); break;
+      case case_conv::lower_once: mode = case_conv::none; // Fall through.
+      case case_conv::lower:      c = tolower (c, cl); break;
+      case case_conv::none:       break;
+      }
+      return c;
+    };
+
+    // Append a (converted) character to the result.
+    //
+    auto append_chr = [&r, &conv_chr] (C c) {r.push_back (conv_chr (c));};
+
+    // Append a (converted) character range to the result.
+    //
+    auto append_str = [&r, &mode, &conv_chr] (str_it b, str_it e)
+    {
+      // Optimize for the common case.
+      //
+      if (mode == case_conv::none)
+        r.append (b, e);
+      else
+      {
+        for (str_it i (b); i != e; ++i)
+          r.push_back (conv_chr (*i));
+      }
+    };
+
+    size_t n (fmt.size ());
+    for (size_t i (0); i < n; ++i)
+    {
+      C c (fmt[i]);
+
+      switch (c)
+      {
+      case '$':
+        {
+          // Check if this is a $-based escape sequence. Interpret it
+          // accordingly if that's the case, treat '$' as a regular character
+          // otherwise.
+          //
+          // Remember the '$' position so that we can rewind to it regardless
+          // of how many characters we have looked at.
+          //
+          size_t p (i);
+
+          c = fmt[++i]; // '\0' if last.
+
+          switch (c)
+          {
+          case '$': append_chr (c); break;
+          case '&': append_str (m[0].first, m[0].second); break;
+          case '`':
+            {
+              append_str (m.prefix ().first, m.prefix ().second);
+              break;
+            }
+          case '\'':
+            {
+              append_str (m.suffix ().first, m.suffix ().second);
+              break;
+            }
+          default:
+            {
+              // Check if this is a sub-expression 1-based index ($n or $nn).
+              // Append the matching substring if that's the case. Treat '$'
+              // as a regular character otherwise. Index greater than the
+              // sub-expression count is silently ignored.
+              //
+              int si (digit (c));
+              if (si >= 0)
+              {
+                int d;
+                if ((d = digit (fmt[i + 1])) >= 0) // '\0' if last.
+                {
+                  si = si * 10 + d;
+                  ++i;
+                }
+              }
+
+              if (si > 0)
+              {
+                // m[0] refers to the matched substring. Note that we ignore
+                // unmatched sub-expression references.
+                //
+                if (static_cast<size_t> (si) < m.size () && m[si].matched)
+                  append_str (m[si].first, m[si].second);
+              }
+              else
+              {
+                // Not a $-based escape sequence so treat '$' as a regular
+                // character. Rewind to the '$' position rather than just
+                // stepping back by one character: we may have consumed two
+                // digits (as in "$00") and all of them must be re-processed
+                // as regular characters.
+                //
+                i = p;
+                append_chr ('$');
+              }
+
+              break;
+            }
+          }
+
+          break;
+        }
+      case '\\':
+        {
+          c = fmt[++i]; // '\0' if last.
+
+          switch (c)
+          {
+          case '\\': append_chr (c); break;
+
+          case 'u': mode = case_conv::upper_once; break;
+          case 'l': mode = case_conv::lower_once; break;
+          case 'U': mode = case_conv::upper;      break;
+          case 'L': mode = case_conv::lower;      break;
+          case 'E': mode = case_conv::none;       break;
+          default:
+            {
+              // Check if this is a sub-expression 1-based index. Append the
+              // matching substring if that's the case. Otherwise, skip the
+              // backslash and re-process the character that follows it as
+              // a regular one. Index greater than the sub-expression count
+              // is silently ignored.
+              //
+              int si (digit (c));
+              if (si > 0)
+              {
+                // m[0] refers to the matched substring. Note that we ignore
+                // unmatched sub-expression references.
+                //
+                if (static_cast<size_t> (si) < m.size () && m[si].matched)
+                  append_str (m[si].first, m[si].second);
+              }
+              else
+                --i;
+
+              break;
+            }
+          }
+
+          break;
+        }
+      default:
+        {
+          // Append a regular character.
+          //
+          append_chr (c);
+          break;
+        }
+      }
+    }
+
+    return r;
+  }
+
+  // Match the entire string against the regex and, if it matches, expand
+  // the format string against the match results (see
+  // regex_replace_match_results() for the format string syntax).
+  //
+  // Return the expanded string and true if the string matches and an empty
+  // string and false otherwise.
+  //
+  template <typename C>
+  std::pair<std::basic_string<C>, bool>
+  regex_replace_match (const std::basic_string<C>& s,
+                       const std::basic_regex<C>& re,
+                       const std::basic_string<C>& fmt)
+  {
+    using namespace std;
+
+    using string_type = basic_string<C>;
+    using str_it      = typename string_type::const_iterator;
+
+    match_results<str_it> m;
+    bool match (regex_match (s, m, re));
+
+    // Note that both operands of the conditional operator must be of the
+    // same string type, so use string_type rather than std::string for the
+    // no-match case (otherwise this does not compile for a non-char C).
+    //
+    return make_pair (
+      match ? regex_replace_match_results (m, fmt) : string_type (),
+      match);
+  }
+
   template <typename C, typename F>
   bool
-  regex_replace_ex (const std::basic_string<C>& s,
-                    const std::basic_regex<C>& re,
-                    const std::basic_string<C>& fmt,
-                    F&& append,
-                    std::regex_constants::match_flag_type flags)
+  regex_replace_search (const std::basic_string<C>& s,
+                        const std::basic_regex<C>& re,
+                        const std::basic_string<C>& fmt,
+                        F&& append,
+                        std::regex_constants::match_flag_type flags)
   {
     using namespace std;
 
@@ -18,10 +214,8 @@ LIBBUTL_MODEXPORT namespace butl //@@ MOD Clang needs this for some reason.
     using str_it      = typename string_type::const_iterator;
     using regex_it    = regex_iterator<str_it>;
 
-    bool first_only ((flags & std::regex_constants::format_first_only) != 0);
-    bool no_copy ((flags & std::regex_constants::format_no_copy) != 0);
-
-    locale cl; // Copy of the global C++ locale.
+    bool first_only ((flags & regex_constants::format_first_only) != 0);
+    bool no_copy ((flags & regex_constants::format_no_copy) != 0);
 
     // Beginning of the last unmatched substring.
     //
@@ -72,169 +266,7 @@ LIBBUTL_MODEXPORT namespace butl //@@ MOD Clang needs this for some reason.
         // The standard implementation calls m.format() here. We perform our
         // own formatting.
         //
-        // Note that we are using char type literals with the assumption that
-        // being ASCII characters they will be properly "widened" to the
-        // corresponding literals of the C template parameter type.
-        //
-        auto digit = [] (C c) -> int
-        {
-          return c >= '0' && c <= '9' ? c - '0' : -1;
-        };
-
-        enum class case_conv {none, upper, lower, upper_once, lower_once}
-        mode (case_conv::none);
-
-        auto conv_chr = [&mode, &cl] (C c) -> C
-        {
-          switch (mode)
-          {
-          case case_conv::upper_once: mode = case_conv::none; // Fall through.
-          case case_conv::upper:      c = toupper (c, cl); break;
-          case case_conv::lower_once: mode = case_conv::none; // Fall through.
-          case case_conv::lower:      c = tolower (c, cl); break;
-          case case_conv::none:       break;
-          }
-          return c;
-        };
-
-        string_type r;
-
-        auto append_chr = [&r, &conv_chr] (C c)
-        {
-          r.push_back (conv_chr (c));
-        };
-
-        auto append_str = [&r, &mode, &conv_chr] (str_it b, str_it e)
-        {
-          // Optimize for the common case.
-          //
-          if (mode == case_conv::none)
-            r.append (b, e);
-          else
-          {
-            for (str_it i (b); i != e; ++i)
-              r.push_back (conv_chr (*i));
-          }
-        };
-
-        size_t n (fmt.size ());
-        for (size_t i (0); i < n; ++i)
-        {
-          C c (fmt[i]);
-
-          switch (c)
-          {
-          case '$':
-            {
-              // Check if this is a $-based escape sequence. Interpret it
-              // accordingly if that's the case, treat '$' as a regular
-              // character otherwise.
-              //
-              c = fmt[++i]; // '\0' if last.
-
-              switch (c)
-              {
-              case '$': append_chr (c); break;
-              case '&': append_str (m[0].first, m[0].second); break;
-              case '`':
-                {
-                  append_str (m.prefix ().first, m.prefix ().second);
-                  break;
-                }
-              case '\'':
-                {
-                  append_str (m.suffix ().first, m.suffix ().second);
-                  break;
-                }
-              default:
-                {
-                  // Check if this is a sub-expression 1-based index ($n or
-                  // $nn). Append the matching substring if that's the case.
-                  // Treat '$' as a regular character otherwise. Index greater
-                  // than the sub-expression count is silently ignored.
-                  //
-                  int si (digit (c));
-                  if (si >= 0)
-                  {
-                    int d;
-                    if ((d = digit (fmt[i + 1])) >= 0) // '\0' if last.
-                    {
-                      si = si * 10 + d;
-                      ++i;
-                    }
-                  }
-
-                  if (si > 0)
-                  {
-                    // m[0] refers to the matched substring. Note that we
-                    // ignore unmatched sub-expression references.
-                    //
-                    if (static_cast<size_t> (si) < m.size () && m[si].matched)
-                      append_str (m[si].first, m[si].second);
-                  }
-                  else
-                  {
-                    // Not a $-based escape sequence so treat '$' as a
-                    // regular character.
-                    //
-                    --i;
-                    append_chr ('$');
-                  }
-
-                  break;
-                }
-              }
-
-              break;
-            }
-          case '\\':
-            {
-              c = fmt[++i]; // '\0' if last.
-
-              switch (c)
-              {
-              case '\\': append_chr (c); break;
-
-              case 'u': mode = case_conv::upper_once; break;
-              case 'l': mode = case_conv::lower_once; break;
-              case 'U': mode = case_conv::upper;      break;
-              case 'L': mode = case_conv::lower;      break;
-              case 'E': mode = case_conv::none;       break;
-              default:
-                {
-                  // Check if this is a sub-expression 1-based index. Append
-                  // the matching substring if that's the case, Skip '\\'
-                  // otherwise. Index greater than the sub-expression count is
-                  // silently ignored.
-                  //
-                  int si (digit (c));
-                  if (si > 0)
-                  {
-                    // m[0] refers to the matched substring. Note that we
-                    // ignore unmatched sub-expression references.
-                    //
-                    if (static_cast<size_t> (si) < m.size () && m[si].matched)
-                      append_str (m[si].first, m[si].second);
-                  }
-                  else
-                    --i;
-
-                  break;
-                }
-              }
-
-              break;
-            }
-          default:
-            {
-              // Append a regular character.
-              //
-              append_chr (c);
-              break;
-            }
-          }
-        }
-
+        string_type r (regex_replace_match_results (m, fmt));
         append (r.begin (), r.end ());
       }
     }
author	Karen Arutyunov <karen@codesynthesis.com>	2018-06-19 15:30:22 +0300
committer	Karen Arutyunov <karen@codesynthesis.com>	2018-06-19 15:30:22 +0300
commit	06e915be138b0638e30083f84cecda0eb1bfc895 (patch)
tree	50f7eca40de25033116c6f6f75524ae5801dcc78 /libbutl/regex.txx
parent	338d8065f1b681da841fa0d79cc9265776ff1e1e (diff)