1 files changed, 436 insertions, 0 deletions
diff --git a/libbuild2/script/regex.cxx b/libbuild2/script/regex.cxx
new file mode 100644
index 0000000..3f796b6
--- /dev/null
+++ b/libbuild2/script/regex.cxx
@@ -0,0 +1,436 @@
+// file      : libbuild2/script/regex.cxx -*- C++ -*-
+// license   : MIT; see accompanying LICENSE file
+
+#include <locale>
+
+#include <libbuild2/script/regex.hxx>
+
+using namespace std;
+
+namespace build2
+{
+  namespace script
+  {
+    namespace regex
+    {
+      // Layout assumptions relied upon when packing values into line_char.
+      //
+      static_assert (sizeof (int16_t) < sizeof (uintptr_t),
+                     "unexpected uintptr_t size");
+      static_assert ((alignof (char_string) & 3) == 0,
+                     "unexpected char_string alignment");
+      static_assert ((alignof (char_regex) & 3) == 0,
+                     "unexpected char_regex alignment");
+
+      const line_char line_char::nul (0);  // Null character.
+      const line_char line_char::eof (-1); // EOF.
+
+      // line_char
+      //
+      // We package the special character into uintptr_t with the following
+      // steps:
+      //
+      // - narrow down int value to int16_t (preserves all the valid values)
+      //
+      // - convert to uint16_t (bitwise representation stays the same, but no
+      //   need to bother with signed value widening, leftmost bits loss on
+      //   left shift, etc)
+      //
+      // - convert to uintptr_t (storage type)
+      //
+      // - shift left by two bits (the operation is fully reversible as
+      //   uintptr_t is wider than uint16_t)
+      //
+      line_char::
+      line_char (int c)
+          : data_ ((static_cast<uintptr_t> (
+                      static_cast<uint16_t> (static_cast<int16_t> (c))) << 2) |
+                   static_cast<uintptr_t> (line_type::special))
+      {
+        // Sanity-check (debug build only) that c is an expected character.
+        //
+        // @@ How can we allow anything for basic_regex but only subset
+        //    for our own code?
+        //
+
+        // libstdc++ look-ahead tokens ('p', 'n') and newline characters.
+        //
+        const char extra[] = "pn\n\r";
+
+        assert (
+          // EOF. Note that it is also passed by msvcrt as the _Meta_eos
+          // enum value.
+          //
+          c == -1 ||
+          c == 0  || // Null character.
+
+          // libstdc++ line/paragraph separators.
+          //
+          c == u'\u2028' || c == u'\u2029' ||
+
+          // Single-byte character: one of the extras or the regex syntax.
+          //
+          (0 < c && c < 256 &&
+           (string::traits_type::find (extra, 4, c) != nullptr ||
+            syntax (c))));
+      }
+
+      // Add a copy of the string to the pool and refer to the pooled one.
+      line_char::line_char (const char_string& s, line_pool& p)
+          : line_char (&*p.strings.emplace (s).first)
+      {
+      }
+
+      // Add the string to the pool, moving it in, and refer to the pooled one.
+      line_char::line_char (char_string&& s, line_pool& p)
+          : line_char (&*p.strings.emplace (move (s)).first)
+      {
+      }
+
+      // Put the regex at the front of the pool and refer to the pooled one
+      // (note: in C++17 can write as p.regexes.emplace_front(move (r))).
+      //
+      line_char::line_char (char_regex r, line_pool& p)
+          : line_char (&*p.regexes.emplace (p.regexes.begin (), move (r)))
+      {
+      }
+
+      bool line_char::
+      syntax (char c)
+      {
+        const char chars[] = "()|.*+?{}\\0123456789,=!"; // 23 characters.
+        return string::traits_type::find (chars, 23, c) != nullptr;
+      }
+
+      bool
+      operator== (const line_char& l, const line_char& r)
+      {
+        const line_type lt (l.type ());
+        const line_type rt (r.type ());
+
+        // Characters of different types are only equal if one is a literal
+        // that matches the other, which is a regex.
+        //
+        if (lt != rt)
+        {
+          if (lt == line_type::literal && rt == line_type::regex)
+            return regex_match (*l.literal (), *r.regex ());
+
+          if (lt == line_type::regex && rt == line_type::literal)
+            return regex_match (*r.literal (), *l.regex ());
+
+          return false;
+        }
+
+        switch (lt)
+        {
+        case line_type::special: return l.special () == r.special ();
+
+          // Note that we use pointers (rather than values) comparison
+          // assuming that the strings must belong to the same pool.
+          //
+        case line_type::literal: return l.literal () == r.literal ();
+        case line_type::regex:   assert (false); break;
+        }
+
+        return true;
+      }
+
+      bool
+      operator< (const line_char& l, const line_char& r)
+      {
+        // Equal (including a literal that matches a regex) is not less.
+        //
+        if (l == r)
+          return false;
+
+        const line_type lt (l.type ());
+        const line_type rt (r.type ());
+
+        if (lt != rt)
+          return lt < rt; // Order by the type.
+
+        switch (lt)
+        {
+        case line_type::literal: return *l.literal () < *r.literal ();
+        case line_type::special: return  l.special () <  r.special ();
+        case line_type::regex:   assert (false); break;
+        }
+
+        return false;
+      }
+
+      // line_char_locale
+      //
+
+      // An exemplar locale with the std::ctype<line_char> facet. It is used
+      // to create the subsequent line char locale objects (see below), which
+      // normally amounts to a shallow copy of a reference-counted object.
+      // Note that the facet is only there once init() has been called.
+      //
+      // Note that creating the line char locales from the exemplar is not
+      // merely an optimization: there is a data race in the libstdc++ (at
+      // least as of GCC 9.1) implementation of the locale(const locale&,
+      // Facet*) constructor (bug #91057).
+      //
+      // Also note that we install the facet in init() rather than during
+      // the object creation to avoid a race with the initialization of the
+      // std::locale-related global variables.
+      //
+      static locale line_char_locale_exemplar;
+
+      void
+      init ()
+      {
+        // Note: std::ctype is qualified since hidden by the ctype bitmask.
+        locale l (locale (), new std::ctype<line_char> ());
+        line_char_locale_exemplar = l;
+      }
+
+      line_char_locale::line_char_locale ()
+          : locale (line_char_locale_exemplar) // Normally a shallow copy.
+      {
+        // The facet is only there if init() has been called, so verify that.
+        //
+        // Note that has_facet() must be qualified since in libc++ it is
+        // hidden by a private function.
+        //
+        assert (std::has_facet<std::ctype<line_char>> (*this));
+      }
+
+      // char_regex
+      //
+      // Transform regex according to the extended flags {idot}. If regex is
+      // malformed then keep transforming, so the resulting string is
+      // malformed the same way. We expect the error to be reported by the
+      // char_regex ctor.
+      //
+      static string
+      transform (const string& s, char_flags f)
+      {
+        assert ((f & char_flags::idot) != char_flags::none);
+
+        string res;
+        bool in_class (false); // Inside the char class brackets.
+        bool pending (false);  // Seen a backslash which is not yet added.
+
+        for (const char c: s)
+        {
+          // Only a dot that is outside of the char class brackets gets its
+          // escaping inverted.
+          //
+          const bool dot (c == '.' && !in_class);
+
+          if (pending)
+          {
+            // Escaped character: drop the delayed backslash for such a dot
+            // and keep it for everything else.
+            //
+            if (!dot)
+              res += '\\';
+
+            res += c;
+            pending = false;
+          }
+          else if (c == '\\')
+            pending = true; // Added later, once we see what it escapes.
+          else
+          {
+            // Keep track of being inside the char class brackets (which
+            // are never inverted) and escape the unescaped dot.
+            //
+            if (c == '[' && !in_class)
+              in_class = true;
+            else if (c == ']' && in_class)
+              in_class = false;
+            else if (dot)
+              res += '\\';
+
+            res += c;
+          }
+        }
+
+        // Trailing backslash: the regex is malformed but that's not our
+        // problem so pass it along.
+        //
+        if (pending)
+          res += '\\';
+
+        return res;
+      }
+
+      static char_regex::flag_type
+      to_std_flags (char_flags f)
+      {
+        // Note: ECMAScript is implied in the absence of a grammar flag.
+        //
+        if ((f & char_flags::icase) != char_flags::none)
+          return char_regex::icase;
+
+        return char_regex::flag_type ();
+      }
+
+      char_regex::
+      char_regex (const char_string& s, char_flags f)
+          : base_type ((f & char_flags::idot) != char_flags::none
+                       ? transform (s, f) // Invert the dot escaping.
+                       : s,               // Use the string as is.
+                       to_std_flags (f))  // Only icase is translated.
+      {
+      }
+    }
+  }
+}
+
+namespace std
+{
+  using namespace build2::script::regex;
+
+  // char_traits<line_char>
+  //
+  line_char* char_traits<line_char>::
+  assign (char_type* s, size_t n, char_type c)
+  {
+    for (char_type* p (s); n != 0; --n) // Fill [s, s + n) with c.
+      *p++ = c;
+    return s;
+  }
+
+  line_char* char_traits<line_char>::
+  move (char_type* d, const char_type* s, size_t n)
+  {
+    if (n == 0 || d == s) // Nothing to move.
+      return d;
+
+    // The ranges may overlap: if the destination starts before the source,
+    // then copy front to back and back to front otherwise.
+    //
+    const char_type* e (s + n);
+    if (d < s)
+      std::copy (s, e, d); // Hidden by char_traits<line_char>::copy().
+    else
+      copy_backward (s, e, d + n);
+
+    return d;
+  }
+
+  line_char* char_traits<line_char>::
+  copy (char_type* d, const char_type* s, size_t n)
+  {
+    std::copy (s, s + n, d); // Qualified since hidden by the member copy().
+    return d;
+  }
+
+  int char_traits<line_char>::
+  compare (const char_type* s1, const char_type* s2, size_t n)
+  {
+    // Lexicographical comparison that only relies on operator<.
+    //
+    for (const char_type* e (s1 + n); s1 != e; ++s1, ++s2)
+    {
+      if (*s1 < *s2) return -1;
+      if (*s2 < *s1) return 1;
+    }
+
+    return 0;
+  }
+
+  size_t char_traits<line_char>::
+  length (const char_type* s)
+  {
+    // Find the terminating nul character and return its distance from s.
+    const char_type* e (s);
+    while (*e != char_type::nul)
+      ++e;
+    return static_cast<size_t> (e - s);
+  }
+
+  const line_char* char_traits<line_char>::
+  find (const char_type* s, size_t n, const char_type& c)
+  {
+    // Return the first character in [s, s + n) that is equal to c, if any.
+    //
+    for (const char_type* e (s + n); s != e; ++s)
+      if (*s == c)
+        return s;
+
+    return nullptr;
+  }
+
+  // ctype<line_char>
+  //
+  locale::id ctype<line_char>::id; // Identifies the facet in a locale.
+
+  const line_char* ctype<line_char>::
+  is (const char_type* b, const char_type* e, mask* m) const
+  {
+    // The only class we recognize is digit: a non-negative special
+    // character for which build2::digit() is true.
+    //
+    for (; b != e; ++b, ++m)
+    {
+      const bool d (b->type () == line_type::special &&
+                    b->special () >= 0               &&
+                    build2::digit (static_cast<char> (b->special ())));
+      *m = d ? digit : 0;
+    }
+    return e;
+  }
+
+  const line_char* ctype<line_char>::
+  scan_is (mask m, const char_type* b, const char_type* e) const
+  {
+    // Return the first character in [b, e) that matches the mask or e if
+    // there is none.
+    //
+    while (b != e && !is (m, *b))
+      ++b;
+
+    return b;
+  }
+
+  const line_char* ctype<line_char>::
+  scan_not (mask m, const char_type* b, const char_type* e) const
+  {
+    // Return the first character in [b, e) that does not match the mask or
+    // e if there is none.
+    //
+    while (b != e && is (m, *b))
+      ++b;
+
+    return b;
+  }
+
+  const char* ctype<line_char>::
+  widen (const char* b, const char* e, char_type* c) const
+  {
+    // Widen each character in [b, e) storing the results starting at c.
+    for (; b != e; ++b, ++c)
+      *c = widen (*b);
+    return e;
+  }
+
+  const line_char* ctype<line_char>::
+  narrow (const char_type* b, const char_type* e, char def, char* c) const
+  {
+    // Narrow each character in [b, e) with def as the default into c.
+    for (; b != e; ++b, ++c)
+      *c = narrow (*b, def);
+    return e;
+  }
+
+  // regex_traits<line_char>
+  //
+  int regex_traits<line_char>::
+  value (char_type c, int radix) const
+  {
+    assert (radix == 8 || radix == 10 || radix == 16);
+
+    // Only a special character can be a digit (of the specified radix).
+    const char ds[] = "0123456789ABCDEF";
+    const char* p (c.type () == line_type::special
+                   ? string::traits_type::find (ds, radix, c.special ())
+                   : nullptr);
+    return p != nullptr ? static_cast<int> (p - ds) : -1;
+  }
+}