diff options
Diffstat (limited to 'libbuild2/script/regex.cxx')
-rw-r--r-- | libbuild2/script/regex.cxx | 436 |
1 files changed, 436 insertions, 0 deletions
diff --git a/libbuild2/script/regex.cxx b/libbuild2/script/regex.cxx new file mode 100644 index 0000000..3f796b6 --- /dev/null +++ b/libbuild2/script/regex.cxx @@ -0,0 +1,436 @@ +// file : libbuild2/script/regex.cxx -*- C++ -*- +// license : MIT; see accompanying LICENSE file + +#include <locale> + +#include <libbuild2/script/regex.hxx> + +using namespace std; + +namespace build2 +{ + namespace script + { + namespace regex + { + static_assert (alignof (char_string) % 4 == 0, + "unexpected char_string alignment"); + + static_assert (alignof (char_regex) % 4 == 0, + "unexpected char_regex alignment"); + + static_assert (sizeof (uintptr_t) > sizeof (int16_t), + "unexpected uintptr_t size"); + + const line_char line_char::nul (0); + const line_char line_char::eof (-1); + + // line_char + // + // We package the special character into uintptr_t with the following + // steps: + // + // - narrow down int value to int16_t (preserves all the valid values) + // + // - convert to uint16_t (bitwise representation stays the same, but no + // need to bother with signed value widening, leftmost bits loss on + // left shift, etc) + // + // - convert to uintptr_t (storage type) + // + // - shift left by two bits (the operation is fully reversible as + // uintptr_t is wider then uint16_t) + // + line_char:: + line_char (int c) + : data_ ( + (static_cast <uintptr_t> ( + static_cast<uint16_t> ( + static_cast<int16_t> (c))) << 2) | + static_cast <uintptr_t> (line_type::special)) + { + // @@ How can we allow anything for basic_regex but only subset + // for our own code? + // + const char ex[] = "pn\n\r"; + + assert (c == 0 || // Null character. + + // EOF. Note that is also passed by msvcrt as _Meta_eos + // enum value. + // + c == -1 || + + // libstdc++ line/paragraph separators. + // + c == u'\u2028' || c == u'\u2029' || + + (c > 0 && c <= 255 && ( + // Supported regex special characters. + // + syntax (c) || + + // libstdc++ look-ahead tokens, newline chars. + // + string::traits_type::find (ex, 4, c) != nullptr))); + } + + line_char:: + line_char (const char_string& s, line_pool& p) + : line_char (&(*p.strings.emplace (s).first)) + { + } + + line_char:: + line_char (char_string&& s, line_pool& p) + : line_char (&(*p.strings.emplace (move (s)).first)) + { + } + + line_char:: + line_char (char_regex r, line_pool& p) + // Note: in C++17 can write as p.regexes.emplace_front(move (r)) + // + : line_char (&(*p.regexes.emplace (p.regexes.begin (), move (r)))) + { + } + + bool + line_char::syntax (char c) + { + return string::traits_type::find ( + "()|.*+?{}\\0123456789,=!", 23, c) != nullptr; + } + + bool + operator== (const line_char& l, const line_char& r) + { + line_type lt (l.type ()); + line_type rt (r.type ()); + + if (lt == rt) + { + bool res (true); + + switch (lt) + { + case line_type::special: res = l.special () == r.special (); break; + case line_type::regex: assert (false); break; + + // Note that we use pointers (rather than vales) comparison + // assuming that the strings must belong to the same pool. + // + case line_type::literal: res = l.literal () == r.literal (); break; + } + + return res; + } + + // Match literal with regex. + // + if (lt == line_type::literal && rt == line_type::regex) + return regex_match (*l.literal (), *r.regex ()); + else if (rt == line_type::literal && lt == line_type::regex) + return regex_match (*r.literal (), *l.regex ()); + + return false; + } + + bool + operator< (const line_char& l, const line_char& r) + { + if (l == r) + return false; + + line_type lt (l.type ()); + line_type rt (r.type ()); + + if (lt != rt) + return lt < rt; + + bool res (false); + + switch (lt) + { + case line_type::special: res = l.special () < r.special (); break; + case line_type::literal: res = *l.literal () < *r.literal (); break; + case line_type::regex: assert (false); break; + } + + return res; + } + + // line_char_locale + // + + // An exemplar locale with the std::ctype<line_char> facet. It is used + // for the subsequent line char locale objects creation (see below) + // which normally ends up with a shallow copy of a reference-counted + // object. + // + // Note that creating the line char locales from the exemplar is not + // merely an optimization: there is a data race in the libstdc++ (at + // least as of GCC 9.1) implementation of the locale(const locale&, + // Facet*) constructor (bug #91057). + // + // Also note that we install the facet in init() rather than during + // the object creation to avoid a race with the std::locale-related + // global variables initialization. + // + static locale line_char_locale_exemplar; + + void + init () + { + line_char_locale_exemplar = + locale (locale (), + new std::ctype<line_char> ()); // Hidden by ctype bitmask. + } + + line_char_locale:: + line_char_locale () + : locale (line_char_locale_exemplar) + { + // Make sure init() has been called. + // + // Note: has_facet() is hidden by a private function in libc++. + // + assert (std::has_facet<std::ctype<line_char>> (*this)); + } + + // char_regex + // + // Transform regex according to the extended flags {idot}. If regex is + // malformed then keep transforming, so the resulting string is + // malformed the same way. We expect the error to be reported by the + // char_regex ctor. + // + static string + transform (const string& s, char_flags f) + { + assert ((f & char_flags::idot) != char_flags::none); + + string r; + bool escape (false); + bool cclass (false); + + for (char c: s) + { + // Inverse escaping for a dot which is out of the char class + // brackets. + // + bool inverse (c == '.' && !cclass); + + // Handle the escape case. Note that we delay adding the backslash + // since we may have to inverse things. + // + if (escape) + { + if (!inverse) + r += '\\'; + + r += c; + escape = false; + + continue; + } + else if (c == '\\') + { + escape = true; + continue; + } + + // Keep track of being inside the char class brackets, escape if + // inversion. Note that we never inverse square brackets. + // + if (c == '[' && !cclass) + cclass = true; + else if (c == ']' && cclass) + cclass = false; + else if (inverse) + r += '\\'; + + r += c; + } + + if (escape) // Regex is malformed but that's not our problem. + r += '\\'; + + return r; + } + + static char_regex::flag_type + to_std_flags (char_flags f) + { + // Note that ECMAScript flag is implied in the absense of a grammar + // flag. + // + return (f & char_flags::icase) != char_flags::none + ? char_regex::icase + : char_regex::flag_type (); + } + + char_regex:: + char_regex (const char_string& s, char_flags f) + : base_type ((f & char_flags::idot) != char_flags::none + ? transform (s, f) + : s, + to_std_flags (f)) + { + } + } + } +} + +namespace std +{ + using namespace build2::script::regex; + + // char_traits<line_char> + // + line_char* char_traits<line_char>:: + assign (char_type* s, size_t n, char_type c) + { + for (size_t i (0); i != n; ++i) + s[i] = c; + return s; + } + + line_char* char_traits<line_char>:: + move (char_type* d, const char_type* s, size_t n) + { + if (n > 0 && d != s) + { + // If d < s then it can't be in [s, s + n) range and so using copy() is + // safe. Otherwise d + n is out of (s, s + n] range and so using + // copy_backward() is safe. + // + if (d < s) + std::copy (s, s + n, d); // Hidden by char_traits<line_char>::copy(). + else + copy_backward (s, s + n, d + n); + } + + return d; + } + + line_char* char_traits<line_char>:: + copy (char_type* d, const char_type* s, size_t n) + { + std::copy (s, s + n, d); // Hidden by char_traits<line_char>::copy(). + return d; + } + + int char_traits<line_char>:: + compare (const char_type* s1, const char_type* s2, size_t n) + { + for (size_t i (0); i != n; ++i) + { + if (s1[i] < s2[i]) + return -1; + else if (s2[i] < s1[i]) + return 1; + } + + return 0; + } + + size_t char_traits<line_char>:: + length (const char_type* s) + { + size_t i (0); + while (s[i] != char_type::nul) + ++i; + + return i; + } + + const line_char* char_traits<line_char>:: + find (const char_type* s, size_t n, const char_type& c) + { + for (size_t i (0); i != n; ++i) + { + if (s[i] == c) + return s + i; + } + + return nullptr; + } + + // ctype<line_char> + // + locale::id ctype<line_char>::id; + + const line_char* ctype<line_char>:: + is (const char_type* b, const char_type* e, mask* m) const + { + while (b != e) + { + const char_type& c (*b++); + + *m++ = c.type () == line_type::special && c.special () >= 0 && + build2::digit (static_cast<char> (c.special ())) + ? digit + : 0; + } + + return e; + } + + const line_char* ctype<line_char>:: + scan_is (mask m, const char_type* b, const char_type* e) const + { + for (; b != e; ++b) + { + if (is (m, *b)) + return b; + } + + return e; + } + + const line_char* ctype<line_char>:: + scan_not (mask m, const char_type* b, const char_type* e) const + { + for (; b != e; ++b) + { + if (!is (m, *b)) + return b; + } + + return e; + } + + const char* ctype<line_char>:: + widen (const char* b, const char* e, char_type* c) const + { + while (b != e) + *c++ = widen (*b++); + + return e; + } + + const line_char* ctype<line_char>:: + narrow (const char_type* b, const char_type* e, char def, char* c) const + { + while (b != e) + *c++ = narrow (*b++, def); + + return e; + } + + // regex_traits<line_char> + // + int regex_traits<line_char>:: + value (char_type c, int radix) const + { + assert (radix == 8 || radix == 10 || radix == 16); + + if (c.type () != line_type::special) + return -1; + + const char digits[] = "0123456789ABCDEF"; + const char* d (string::traits_type::find (digits, radix, c.special ())); + return d != nullptr ? static_cast<int> (d - digits) : -1; + } +} |