1 files changed, 440 insertions, 0 deletions
diff --git a/libbuild2/test/script/regex.cxx b/libbuild2/test/script/regex.cxx
new file mode 100644
index 0000000..20dfaa6
--- /dev/null
+++ b/libbuild2/test/script/regex.cxx
@@ -0,0 +1,440 @@
+// file      : libbuild2/test/script/regex.cxx -*- C++ -*-
+// copyright : Copyright (c) 2014-2019 Code Synthesis Ltd
+// license   : MIT; see accompanying LICENSE file
+
+#include <locale>
+
+#include <libbuild2/test/script/regex.hxx>
+
+using namespace std;
+
+namespace build2
+{
+  namespace test
+  {
+    namespace script
+    {
+      namespace regex
+      {
+        static_assert (alignof (char_string) % 4 == 0,
+                       "unexpected char_string alignment");
+
+        static_assert (alignof (char_regex) % 4 == 0,
+                       "unexpected char_regex alignment");
+
+        static_assert (sizeof (uintptr_t) > sizeof (int16_t),
+                       "unexpected uintptr_t size");
+
+        const line_char line_char::nul (0);
+        const line_char line_char::eof (-1);
+
+        // line_char
+        //
+        // We package the special character into uintptr_t with the following
+        // steps:
+        //
+        // - narrow down int value to int16_t (preserves all the valid values)
+        //
+        // - convert to uint16_t (bitwise representation stays the same, but no
+        //   need to bother with signed value widening, leftmost bits loss on
+        //   left shift, etc)
+        //
+        // - convert to uintptr_t (storage type)
+        //
+        // - shift left by two bits (the operation is fully reversible as
+        //   uintptr_t is wider then uint16_t)
+        //
+        line_char::
+        line_char (int c)
+            : data_ (
+              (static_cast <uintptr_t> (
+                static_cast<uint16_t> (
+                  static_cast<int16_t> (c))) << 2) |
+              static_cast <uintptr_t> (line_type::special))
+        {
+          // @@ How can we allow anything for basic_regex but only subset
+          //    for our own code?
+          //
+          const char ex[] = "pn\n\r";
+
+          assert (c == 0  || // Null character.
+
+                  // EOF. Note that is also passed by msvcrt as _Meta_eos
+                  // enum value.
+                  //
+                  c == -1 ||
+
+                  // libstdc++ line/paragraph separators.
+                  //
+                  c == u'\u2028' || c == u'\u2029' ||
+
+                  (c > 0 && c <= 255 && (
+                    // Supported regex special characters.
+                    //
+                    syntax (c) ||
+
+                    // libstdc++ look-ahead tokens, newline chars.
+                    //
+                    string::traits_type::find (ex, 4, c) != nullptr)));
+        }
+
+        line_char::
+        line_char (const char_string& s, line_pool& p)
+            : line_char (&(*p.strings.emplace (s).first))
+        {
+        }
+
+        line_char::
+        line_char (char_string&& s, line_pool& p)
+            : line_char (&(*p.strings.emplace (move (s)).first))
+        {
+        }
+
+        line_char::
+        line_char (char_regex r, line_pool& p)
+            // Note: in C++17 can write as p.regexes.emplace_front(move (r))
+            //
+            : line_char (&(*p.regexes.emplace (p.regexes.begin (), move (r))))
+        {
+        }
+
+        bool
+        line_char::syntax (char c)
+        {
+          return string::traits_type::find (
+            "()|.*+?{}\\0123456789,=!", 23, c) != nullptr;
+        }
+
+        bool
+        operator== (const line_char& l, const line_char& r)
+        {
+          line_type lt (l.type ());
+          line_type rt (r.type ());
+
+          if (lt == rt)
+          {
+            bool res (true);
+
+            switch (lt)
+            {
+            case line_type::special: res = l.special () == r.special (); break;
+            case line_type::regex:   assert (false); break;
+
+              // Note that we use pointers (rather than vales) comparison
+              // assuming that the strings must belong to the same pool.
+              //
+            case line_type::literal: res = l.literal () == r.literal (); break;
+            }
+
+            return res;
+          }
+
+          // Match literal with regex.
+          //
+          if (lt == line_type::literal && rt == line_type::regex)
+            return regex_match (*l.literal (), *r.regex ());
+          else if (rt == line_type::literal && lt == line_type::regex)
+            return regex_match (*r.literal (), *l.regex ());
+
+          return false;
+        }
+
+        bool
+        operator< (const line_char& l, const line_char& r)
+        {
+          if (l == r)
+            return false;
+
+          line_type lt (l.type ());
+          line_type rt (r.type ());
+
+          if (lt != rt)
+            return lt < rt;
+
+          bool res (false);
+
+          switch (lt)
+          {
+          case line_type::special: res =  l.special () <  r.special (); break;
+          case line_type::literal: res = *l.literal () < *r.literal (); break;
+          case line_type::regex:   assert (false); break;
+          }
+
+          return res;
+        }
+
+        // line_char_locale
+        //
+
+        // An exemplar locale with the std::ctype<line_char> facet. It is used
+        // for the subsequent line char locale objects creation (see below)
+        // which normally ends up with a shallow copy of a reference-counted
+        // object.
+        //
+        // Note that creating the line char locales from the exemplar is not
+        // merely an optimization: there is a data race in the libstdc++ (at
+        // least as of GCC 9.1) implementation of the locale(const locale&,
+        // Facet*) constructor (bug #91057).
+        //
+        // Also note that we install the facet in init() rather than during
+        // the object creation to avoid a race with the std::locale-related
+        // global variables initialization.
+        //
+        static locale line_char_locale_exemplar;
+
+        void
+        init ()
+        {
+          line_char_locale_exemplar =
+            locale (locale (),
+                    new std::ctype<line_char> ()); // Hidden by ctype bitmask.
+        }
+
+        line_char_locale::
+        line_char_locale ()
+            : locale (line_char_locale_exemplar)
+        {
+          // Make sure init() has been called.
+          //
+          // Note: has_facet() is hidden by a private function in libc++.
+          //
+          assert (std::has_facet<std::ctype<line_char>> (*this));
+        }
+
+        // char_regex
+        //
+        // Transform regex according to the extended flags {idot}. If regex is
+        // malformed then keep transforming, so the resulting string is
+        // malformed the same way. We expect the error to be reported by the
+        // char_regex ctor.
+        //
+        static string
+        transform (const string& s, char_flags f)
+        {
+          assert ((f & char_flags::idot) != char_flags::none);
+
+          string r;
+          bool escape (false);
+          bool cclass (false);
+
+          for (char c: s)
+          {
+            // Inverse escaping for a dot which is out of the char class
+            // brackets.
+            //
+            bool inverse (c == '.' && !cclass);
+
+            // Handle the escape case. Note that we delay adding the backslash
+            // since we may have to inverse things.
+            //
+            if (escape)
+            {
+              if (!inverse)
+                r += '\\';
+
+              r += c;
+              escape = false;
+
+              continue;
+            }
+            else if (c == '\\')
+            {
+              escape = true;
+              continue;
+            }
+
+            // Keep track of being inside the char class brackets, escape if
+            // inversion. Note that we never inverse square brackets.
+            //
+            if (c == '[' && !cclass)
+              cclass = true;
+            else if (c == ']' && cclass)
+              cclass = false;
+            else if (inverse)
+              r += '\\';
+
+            r += c;
+          }
+
+          if (escape) // Regex is malformed but that's not our problem.
+            r += '\\';
+
+          return r;
+        }
+
+        static char_regex::flag_type
+        to_std_flags (char_flags f)
+        {
+          // Note that ECMAScript flag is implied in the absense of a grammar
+          // flag.
+          //
+          return (f & char_flags::icase) != char_flags::none
+            ? char_regex::icase
+            : char_regex::flag_type ();
+        }
+
+        char_regex::
+        char_regex (const char_string& s, char_flags f)
+            : base_type ((f & char_flags::idot) != char_flags::none
+                         ? transform (s, f)
+                         : s,
+                         to_std_flags (f))
+        {
+        }
+      }
+    }
+  }
+}
+
+namespace std
+{
+  using namespace build2::test::script::regex;
+
+  // char_traits<line_char>
+  //
+  line_char* char_traits<line_char>::
+  assign (char_type* s, size_t n, char_type c)
+  {
+    for (size_t i (0); i != n; ++i)
+      s[i] = c;
+    return s;
+  }
+
+  line_char* char_traits<line_char>::
+  move (char_type* d, const char_type* s, size_t n)
+  {
+    if (n > 0 && d != s)
+    {
+      // If d < s then it can't be in [s, s + n) range and so using copy() is
+      // safe. Otherwise d + n is out of (s, s + n] range and so using
+      // copy_backward() is safe.
+      //
+      if (d < s)
+        std::copy (s, s + n, d); // Hidden by char_traits<line_char>::copy().
+      else
+        copy_backward (s, s + n, d + n);
+    }
+
+    return d;
+  }
+
+  line_char* char_traits<line_char>::
+  copy (char_type* d, const char_type* s, size_t n)
+  {
+    std::copy (s, s + n, d); // Hidden by char_traits<line_char>::copy().
+    return d;
+  }
+
+  int char_traits<line_char>::
+  compare (const char_type* s1, const char_type* s2, size_t n)
+  {
+    for (size_t i (0); i != n; ++i)
+    {
+      if (s1[i] < s2[i])
+        return -1;
+      else if (s2[i] < s1[i])
+        return 1;
+    }
+
+    return 0;
+  }
+
+  size_t char_traits<line_char>::
+  length (const char_type* s)
+  {
+    size_t i (0);
+    while (s[i] != char_type::nul)
+      ++i;
+
+    return i;
+  }
+
+  const line_char* char_traits<line_char>::
+  find (const char_type* s, size_t n, const char_type& c)
+  {
+    for (size_t i (0); i != n; ++i)
+    {
+      if (s[i] == c)
+        return s + i;
+    }
+
+    return nullptr;
+  }
+
+  // ctype<line_char>
+  //
+  locale::id ctype<line_char>::id;
+
+  const line_char* ctype<line_char>::
+  is (const char_type* b, const char_type* e, mask* m) const
+  {
+    while (b != e)
+    {
+      const char_type& c (*b++);
+
+      *m++ = c.type () == line_type::special && c.special () >= 0 &&
+        build2::digit (static_cast<char> (c.special ()))
+        ? digit
+        : 0;
+    }
+
+    return e;
+  }
+
+  const line_char* ctype<line_char>::
+  scan_is (mask m, const char_type* b, const char_type* e) const
+  {
+    for (; b != e; ++b)
+    {
+      if (is (m, *b))
+        return b;
+    }
+
+    return e;
+  }
+
+  const line_char* ctype<line_char>::
+  scan_not (mask m, const char_type* b, const char_type* e) const
+  {
+    for (; b != e; ++b)
+    {
+      if (!is (m, *b))
+        return b;
+    }
+
+    return e;
+  }
+
+  const char* ctype<line_char>::
+  widen (const char* b, const char* e, char_type* c) const
+  {
+    while (b != e)
+      *c++ = widen (*b++);
+
+    return e;
+  }
+
+  const line_char* ctype<line_char>::
+  narrow (const char_type* b, const char_type* e, char def, char* c) const
+  {
+    while (b != e)
+      *c++ = narrow (*b++, def);
+
+    return e;
+  }
+
+  // regex_traits<line_char>
+  //
+  int regex_traits<line_char>::
+  value (char_type c, int radix) const
+  {
+    assert (radix == 8 || radix == 10 || radix == 16);
+
+    if (c.type () != line_type::special)
+      return -1;
+
+    const char digits[] = "0123456789ABCDEF";
+    const char* d (string::traits_type::find (digits, radix, c.special ()));
+    return d != nullptr ? static_cast<int> (d - digits) : -1;
+  }
+}