aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorKaren Arutyunov <karen@codesynthesis.com>2024-10-25 18:35:43 +0200
committerKaren Arutyunov <karen@codesynthesis.com>2024-10-28 16:59:02 +0200
commit0befab300849be7ac0f77bc4228f8de50a108191 (patch)
treeec68cffd95238b814d879621db191ee78c07b8fc
parent8f892e075668c1ed1d668c59d4d0a7929d0c7f5e (diff)
Make regex_replace_search() not match empty substrings in non-empty strings
-rw-r--r--libbutl/regex.hxx2
-rw-r--r--libbutl/regex.txx19
-rw-r--r--tests/regex/testscript23
3 files changed, 43 insertions, 1 deletions
diff --git a/libbutl/regex.hxx b/libbutl/regex.hxx
index 9b31075..69009c3 100644
--- a/libbutl/regex.hxx
+++ b/libbutl/regex.hxx
@@ -52,7 +52,7 @@ namespace butl
const std::basic_string<C>& fmt,
F&& append,
std::regex_constants::match_flag_type =
- std::regex_constants::match_default);
+ std::regex_constants::match_default);
// As above but concatenate non-matched substrings and matched substring
// replacements into a string returning it as well as whether the search
diff --git a/libbutl/regex.txx b/libbutl/regex.txx
index 214d949..ec9f7af 100644
--- a/libbutl/regex.txx
+++ b/libbutl/regex.txx
@@ -217,6 +217,25 @@ namespace butl
bool first_only ((flags & regex_constants::format_first_only) != 0);
bool no_copy ((flags & regex_constants::format_no_copy) != 0);
+ // Note that by default the std::regex_search() and std::regex_replace()
+ // functions, as well as std::regex_iterator, match empty substrings in
+ // non-empty strings in all the major implementations. For example:
+ //
+ // - regex_search("bb", "a*") call returns true.
+ //
+ // - regex_replace("bb", "a*", "x") call returns "xbxbx".
+ //
+ // - regex_replace("a", ".*", "x") call returns "xx".
+ //
+ // - Iterating using the regex_iterator("a", ".*") object yields two
+ // matches: "a" and "".
+ //
+ // Since such behavior is counter-intuitive, we suppress it using the
+ // match_not_null flag, except when the string itself is empty.
+ //
+ if (!s.empty ())
+ flags |= regex_constants::match_not_null;
+
// Beginning of the last unmatched substring.
//
str_it ub (s.begin ());
diff --git a/tests/regex/testscript b/tests/regex/testscript
index 93ad4b6..137469d 100644
--- a/tests/regex/testscript
+++ b/tests/regex/testscript
@@ -63,6 +63,21 @@
:
$* xay '/a/\lVZ/' >xvZy
}
+
+ : empty-substring
+ :
+ : Note that the regex search-based replacement with the match_not_null flag
+ : is broken in older versions of libstdc++ and libc++ (the former may ignore
+ : match_not_null and the latter may hang for some string/pattern
+ : combinations).
+ :
+ if (($cxx.id != 'gcc' || $cxx.version.major >= 7) && \
+ ($cxx.id != 'clang' || $cxx.version.major >= 6))
+ {
+ $* '' '/.*/x/' >'x' : empty
+ $* a '/a*/x/' >'x' : match
+ $* aa '/b*/x/' == 1 : no-match
+ }
}
: replace-match
@@ -72,6 +87,14 @@
$* abc '/a(b)c/x\1y/' >xby : match
$* abcd '/a(b)c/x\1yd/' == 1 : no-match
+
+ : empty-substring
+ :
+ {
+ $* '' '/.*/x/' >'x' : empty
+ $* a '/a*/x/' >'x' : match
+ $* ab '/a(c*)(b)/\1\2/' >'b' : match-mid
+ }
}
: invalid-regex-fmt