aboutsummaryrefslogtreecommitdiff
path: root/libbuild2/functions-regex.cxx
diff options
context:
space:
mode:
Diffstat (limited to 'libbuild2/functions-regex.cxx')
-rw-r--r--libbuild2/functions-regex.cxx542
1 files changed, 542 insertions, 0 deletions
diff --git a/libbuild2/functions-regex.cxx b/libbuild2/functions-regex.cxx
new file mode 100644
index 0000000..2c478fe
--- /dev/null
+++ b/libbuild2/functions-regex.cxx
@@ -0,0 +1,542 @@
+// file : libbuild2/functions-regex.cxx -*- C++ -*-
+// copyright : Copyright (c) 2014-2019 Code Synthesis Ltd
+// license : MIT; see accompanying LICENSE file
+
+#include <sstream>
+
+#include <libbutl/regex.mxx>
+
+#include <libbuild2/function.hxx>
+#include <libbuild2/variable.hxx>
+
+using namespace std;
+using namespace butl;
+
+namespace build2
+{
+ // Convert value of an arbitrary type to string.
+ //
+ static inline string
+ to_string (value&& v)
+ {
+ // Optimize for the string value type.
+ //
+ if (v.type != &value_traits<string>::value_type)
+ untypify (v);
+
+ return convert<string> (move (v));
+ }
+
+ // Parse a regular expression. Throw invalid_argument if it is not valid.
+ //
+ // Note: also used in functions-process.cxx (thus not static).
+ //
+ regex
+ parse_regex (const string& s, regex::flag_type f)
+ {
+ try
+ {
+ return regex (s, f);
+ }
+ catch (const regex_error& e)
+ {
+ // Print regex_error description if meaningful (no space).
+ //
+ ostringstream os;
+ os << "invalid regex '" << s << "'" << e;
+ throw invalid_argument (os.str ());
+ }
+ }
+
+ // Match value of an arbitrary type against the regular expression. See
+ // match() overloads (below) for details.
+ //
+ static value
+ match (value&& v, const string& re, optional<names>&& flags)
+ {
+ // Parse flags.
+ //
+ regex::flag_type rf (regex::ECMAScript);
+ bool subs (false);
+
+ if (flags)
+ {
+ for (auto& f: *flags)
+ {
+ string s (convert<string> (move (f)));
+
+ if (s == "icase")
+ rf |= regex::icase;
+ else if (s == "return_subs")
+ subs = true;
+ else
+ throw invalid_argument ("invalid flag '" + s + "'");
+ }
+ }
+
+ // Parse regex.
+ //
+ regex rge (parse_regex (re, rf));
+
+ // Match.
+ //
+ string s (to_string (move (v)));
+
+ if (!subs)
+ return value (regex_match (s, rge)); // Return boolean value.
+
+ names r;
+ match_results<string::const_iterator> m;
+
+ if (regex_match (s, m, rge))
+ {
+ assert (!m.empty ());
+
+ for (size_t i (1); i != m.size (); ++i)
+ {
+ if (m[i].matched)
+ r.emplace_back (m.str (i));
+ }
+ }
+
+ return value (move (r));
+ }
+
+ // Determine if there is a match between the regular expression and some
+ // part of a value of an arbitrary type. See search() overloads (below)
+ // for details.
+ //
+ static value
+ search (value&& v, const string& re, optional<names>&& flags)
+ {
+ // Parse flags.
+ //
+ regex::flag_type rf (regex::ECMAScript);
+ bool match (false);
+ bool subs (false);
+
+ if (flags)
+ {
+ for (auto& f: *flags)
+ {
+ string s (convert<string> (move (f)));
+
+ if (s == "icase")
+ rf |= regex::icase;
+ else if (s == "return_match")
+ match = true;
+ else if (s == "return_subs")
+ subs = true;
+ else
+ throw invalid_argument ("invalid flag '" + s + "'");
+ }
+ }
+
+ // Parse regex.
+ //
+ regex rge (parse_regex (re, rf));
+
+ // Search.
+ //
+ string s (to_string (move (v)));
+
+ if (!match && !subs)
+ return value (regex_search (s, rge)); // Return boolean value.
+
+ names r;
+ match_results<string::const_iterator> m;
+
+ if (regex_search (s, m, rge))
+ {
+ assert (!m.empty ());
+
+ if (match)
+ {
+ assert (m[0].matched);
+ r.emplace_back (m.str (0));
+ }
+
+ if (subs)
+ {
+ for (size_t i (1); i != m.size (); ++i)
+ {
+ if (m[i].matched)
+ r.emplace_back (m.str (i));
+ }
+ }
+ }
+
+ return value (move (r));
+ }
+
+ static pair<regex::flag_type, regex_constants::match_flag_type>
+ parse_replacement_flags (optional<names>&& flags, bool first_only = true)
+ {
+ regex::flag_type rf (regex::ECMAScript);
+ regex_constants::match_flag_type mf (regex_constants::match_default);
+
+ if (flags)
+ {
+ for (auto& f: *flags)
+ {
+ string s (convert<string> (move (f)));
+
+ if (s == "icase")
+ rf |= regex::icase;
+ else if (first_only && s == "format_first_only")
+ mf |= regex_constants::format_first_only;
+ else if (s == "format_no_copy")
+ mf |= regex_constants::format_no_copy;
+ else
+ throw invalid_argument ("invalid flag '" + s + "'");
+ }
+ }
+
+ return make_pair (rf, mf);
+ }
+
+ // Replace matched parts in a value of an arbitrary type, using the format
+ // string. See replace() overloads (below) for details.
+ //
+ static names
+ replace (value&& v,
+ const string& re,
+ const string& fmt,
+ optional<names>&& flags)
+ {
+ auto fl (parse_replacement_flags (move (flags)));
+ regex rge (parse_regex (re, fl.first));
+
+ names r;
+
+ try
+ {
+ r.emplace_back (regex_replace_search (to_string (move (v)),
+ rge,
+ fmt,
+ fl.second).first);
+ }
+ catch (const regex_error& e)
+ {
+ fail << "unable to replace" << e;
+ }
+
+ return r;
+ }
+
+ // Split a value of an arbitrary type into a list of unmatched value parts
+ // and replacements of the matched parts. See split() overloads (below) for
+ // details.
+ //
+ static names
+ split (value&& v,
+ const string& re,
+ const string& fmt,
+ optional<names>&& flags)
+ {
+ auto fl (parse_replacement_flags (move (flags), false));
+ regex rge (parse_regex (re, fl.first));
+
+ names r;
+
+ try
+ {
+ regex_replace_search (to_string (move (v)), rge, fmt,
+ [&r] (string::const_iterator b,
+ string::const_iterator e)
+ {
+ if (b != e)
+ r.emplace_back (string (b, e));
+ },
+ fl.second);
+ }
+ catch (const regex_error& e)
+ {
+ fail << "unable to split" << e;
+ }
+
+ return r;
+ }
+
+ // Replace matched parts of list elements using the format string. See
+ // apply() overloads (below) for details.
+ //
+ static names
+ apply (names&& s,
+ const string& re,
+ const string& fmt,
+ optional<names>&& flags)
+ {
+ auto fl (parse_replacement_flags (move (flags)));
+ regex rge (parse_regex (re, fl.first));
+
+ names r;
+
+ try
+ {
+ for (auto& v: s)
+ {
+ string s (regex_replace_search (convert<string> (move (v)),
+ rge,
+ fmt,
+ fl.second).first);
+
+ if (!s.empty ())
+ r.emplace_back (move (s));
+ }
+ }
+ catch (const regex_error& e)
+ {
+ fail << "unable to apply" << e;
+ }
+
+ return r;
+ }
+
+ // Replace matched parts of list elements using the format string and
+ // concatenate the transformed elements. See merge() overloads (below) for
+ // details.
+ //
+ static names
+ merge (names&& s,
+ const string& re,
+ const string& fmt,
+ optional<string>&& delim,
+ optional<names>&& flags)
+ {
+ auto fl (parse_replacement_flags (move (flags)));
+ regex rge (parse_regex (re, fl.first));
+
+ string rs;
+
+ try
+ {
+ for (auto& v: s)
+ {
+ string s (regex_replace_search (convert<string> (move (v)),
+ rge,
+ fmt,
+ fl.second).first);
+
+ if (!s.empty ())
+ {
+ if (!rs.empty () && delim)
+ rs.append (*delim);
+
+ rs.append (s);
+ }
+
+ }
+ }
+ catch (const regex_error& e)
+ {
+ fail << "unable to merge" << e;
+ }
+
+ names r;
+ r.emplace_back (move (rs));
+ return r;
+ }
+
+ void
+ regex_functions ()
+ {
+ function_family f ("regex");
+
+ // $regex.match(<val>, <pat> [, <flags>])
+ //
+ // Match a value of an arbitrary type against the regular expression.
+ // Convert the value to string prior to matching. Return the boolean value
+ // unless return_subs flag is specified (see below), in which case return
+ // names (empty if no match).
+ //
+ // The following flags are supported:
+ //
+ // icase - match ignoring case
+ //
+ // return_subs - return names (rather than boolean), that contain
+ // sub-strings that match the marked sub-expressions
+ //
+ f[".match"] = [](value s, string re, optional<names> flags)
+ {
+ return match (move (s), re, move (flags));
+ };
+
+ f[".match"] = [](value s, names re, optional<names> flags)
+ {
+ return match (move (s), convert<string> (move (re)), move (flags));
+ };
+
+ // $regex.search(<val>, <pat> [, <flags>])
+ //
+ // Determine if there is a match between the regular expression and some
+ // part of a value of an arbitrary type. Convert the value to string prior
+ // to searching. Return the boolean value unless return_match or
+ // return_subs flag is specified (see below) in which case return names
+ // (empty if no match).
+ //
+ // The following flags are supported:
+ //
+ // icase - match ignoring case
+ //
+ // return_match - return names (rather than boolean), that contain a
+ // sub-string that matches the whole regular expression
+ //
+ // return_subs - return names (rather than boolean), that contain
+ // sub-strings that match the marked sub-expressions
+ //
+ // If both return_match and return_subs flags are specified then the
+ // sub-string that matches the whole regular expression comes first.
+ //
+ f[".search"] = [](value s, string re, optional<names> flags)
+ {
+ return search (move (s), re, move (flags));
+ };
+
+ f[".search"] = [](value s, names re, optional<names> flags)
+ {
+ return search (move (s), convert<string> (move (re)), move (flags));
+ };
+
+ // $regex.replace(<val>, <pat>, <fmt> [, <flags>])
+ //
+ // Replace matched parts in a value of an arbitrary type, using the format
+ // string. Convert the value to string prior to matching. The result value
+ // is always untyped, regardless of the argument type.
+ //
+ // Substitution escape sequences are extended with a subset of Perl
+ // sequences (see libbutl/regex.mxx for details).
+ //
+ // The following flags are supported:
+ //
+ // icase - match ignoring case
+ //
+ // format_first_only - only replace the first match
+ //
+ // format_no_copy - do not copy unmatched value parts into the result
+ //
+ // If both format_first_only and format_no_copy flags are specified then
+ // the result will only contain the replacement of the first match.
+ //
+ f[".replace"] = [](value s, string re, string fmt, optional<names> flags)
+ {
+ return replace (move (s), re, fmt, move (flags));
+ };
+
+ f[".replace"] = [](value s, names re, names fmt, optional<names> flags)
+ {
+ return replace (move (s),
+ convert<string> (move (re)),
+ convert<string> (move (fmt)),
+ move (flags));
+ };
+
+ // $regex.split(<val>, <pat>, <fmt> [, <flags>])
+ //
+ // Split a value of an arbitrary type into a list of unmatched value parts
+ // and replacements of the matched parts, omitting empty ones. Convert the
+ // value to string prior to matching.
+ //
+ // Substitution escape sequences are extended with a subset of Perl
+ // sequences (see libbutl/regex.mxx for details).
+ //
+ // The following flags are supported:
+ //
+ // icase - match ignoring case
+ //
+ // format_no_copy - do not copy unmatched value parts into the result
+ //
+ f[".split"] = [](value s, string re, string fmt, optional<names> flags)
+ {
+ return split (move (s), re, fmt, move (flags));
+ };
+
+ f[".split"] = [](value s, names re, names fmt, optional<names> flags)
+ {
+ return split (move (s),
+ convert<string> (move (re)),
+ convert<string> (move (fmt)),
+ move (flags));
+ };
+
+ // $regex.merge(<vals>, <pat>, <fmt> [, <delim> [, <flags>]])
+ //
+ // Replace matched parts in a list of elements using the regex format
+ // string. Convert the elements to string prior to matching. The result
+ // value is untyped and contains concatenation of transformed non-empty
+ // elements optionally separated with a delimiter.
+ //
+ // Substitution escape sequences are extended with a subset of Perl
+ // sequences (see libbutl/regex.mxx for details).
+ //
+ // The following flags are supported:
+ //
+ // icase - match ignoring case
+ //
+ // format_first_only - only replace the first match
+ //
+ // format_no_copy - do not copy unmatched value parts into the result
+ //
+ // If both format_first_only and format_no_copy flags are specified then
+ // the result will be a concatenation of only the first match
+ // replacements.
+ //
+ f[".merge"] = [](names s,
+ string re,
+ string fmt,
+ optional<string> delim,
+ optional<names> flags)
+ {
+ return merge (move (s), re, fmt, move (delim), move (flags));
+ };
+
+ f[".merge"] = [](names s,
+ names re,
+ names fmt,
+ optional<names> delim,
+ optional<names> flags)
+ {
+ return merge (move (s),
+ convert<string> (move (re)),
+ convert<string> (move (fmt)),
+ delim
+ ? convert<string> (move (*delim))
+ : optional<string> (),
+ move (flags));
+ };
+
+ // $regex.apply(<vals>, <pat>, <fmt> [, <flags>])
+ //
+ // Replace matched parts of each element in a list using the regex format
+ // string. Convert the elements to string prior to matching. Return a list
+ // of transformed elements, omitting the empty ones.
+ //
+ // Substitution escape sequences are extended with a subset of Perl
+ // sequences (see libbutl/regex.mxx for details).
+ //
+ // The following flags are supported:
+ //
+ // icase - match ignoring case
+ //
+ // format_first_only - only replace the first match
+ //
+ // format_no_copy - do not copy unmatched value parts into the result
+ //
+ // If both format_first_only and format_no_copy flags are specified then
+ // the result elements will only contain the replacement of the first
+ // match.
+ //
+ f[".apply"] = [](names s, string re, string fmt, optional<names> flags)
+ {
+ return apply (move (s), re, fmt, move (flags));
+ };
+
+ f[".apply"] = [](names s, names re, names fmt, optional<names> flags)
+ {
+ return apply (move (s),
+ convert<string> (move (re)),
+ convert<string> (move (fmt)),
+ move (flags));
+ };
+ }
+}