diff options
Diffstat (limited to 'libbuild2/script/regex.hxx')
-rw-r--r-- | libbuild2/script/regex.hxx | 678 |
1 files changed, 678 insertions, 0 deletions
diff --git a/libbuild2/script/regex.hxx b/libbuild2/script/regex.hxx new file mode 100644 index 0000000..30d3363 --- /dev/null +++ b/libbuild2/script/regex.hxx @@ -0,0 +1,678 @@ +// file : libbuild2/script/regex.hxx -*- C++ -*- +// license : MIT; see accompanying LICENSE file + +#ifndef LIBBUILD2_SCRIPT_REGEX_HXX +#define LIBBUILD2_SCRIPT_REGEX_HXX + +#include <list> +#include <regex> +#include <locale> +#include <string> // basic_string +#include <type_traits> // make_unsigned, enable_if, is_* +#include <unordered_set> + +#include <libbuild2/types.hxx> +#include <libbuild2/utility.hxx> + +namespace build2 +{ + namespace script + { + namespace regex + { + using char_string = std::basic_string<char>; + + enum class char_flags: uint16_t + { + icase = 0x1, // Case-insensitive match. + idot = 0x2, // Invert '.' escaping. + + none = 0 + }; + + // Restricts valid standard flags to just {icase}, extends with custom + // flags {idot}. + // + class char_regex: public std::basic_regex<char> + { + public: + using base_type = std::basic_regex<char>; + + char_regex (const char_string&, char_flags = char_flags::none); + }; + + // Newlines are line separators and are not part of the line: + // + // line<newline>line<newline> + // + // Specifically, this means that a customary trailing newline creates a + // trailing blank line. + // + // All characters can inter-compare (though there cannot be regex + // characters in the output, only in line_regex). + // + // Note that we assume that line_regex and the input to regex_match() + // use the same pool. + // + struct line_pool + { + // Note that we assume the pool can be moved without invalidating + // pointers to any already pooled entities. + // + std::unordered_set<char_string> strings; + std::list<char_regex> regexes; + }; + + enum class line_type + { + special, + literal, + regex + }; + + struct line_char + { + // Steal last two bits from the pointer to store the type. + // + private: + std::uintptr_t data_; + + public: + line_type + type () const {return static_cast<line_type> (data_ & 0x3);} + + int + special () const + { + // Stored as (shifted) int16_t. Perform steps reversed to those + // that are described in the comment for the corresponding ctor. + // Note that the intermediate cast to uint16_t is required to + // portably preserve the -1 special character. + // + return static_cast<int16_t> (static_cast<uint16_t> (data_ >> 2)); + } + + const char_string* + literal () const + { + // Note that 2 rightmost bits are used for packaging line_char + // type. Read the comment for the corresponding ctor for details. + // + return reinterpret_cast<const char_string*> ( + data_ & ~std::uintptr_t (0x3)); + } + + const char_regex* + regex () const + { + // Note that 2 rightmost bits are used for packaging line_char + // type. Read the comment for the corresponding ctor for details. + // + return reinterpret_cast<const char_regex*> ( + data_ & ~std::uintptr_t (0x3)); + } + + static const line_char nul; + static const line_char eof; + + // Note: creates an uninitialized value. + // + line_char () = default; + + // Create a special character. The argument value must be one of the + // following ones: + // + // 0 (nul character) + // -1 (EOF) + // [()|.*+?{}\0123456789,=!] (excluding []) + // + // Note that the constructor is implicit to allow basic_regex to + // implicitly construct line_chars from special char literals (in + // particular libstdc++ appends them to an internal line_string). + // + // Also note that we extend the valid characters set (see above) with + // 'p', 'n' (used by libstdc++ for positive/negative look-ahead + // tokens representation), and '\n', '\r', u'\u2028', u'\u2029' (used + // by libstdc++ for newline/newparagraph matching). + // + line_char (int); + + // Create a literal character. + // + // Don't copy string if already pooled. + // + explicit + line_char (const char_string&, line_pool&); + + explicit + line_char (char_string&&, line_pool&); + + explicit + line_char (const char_string* s) // Assume already pooled. + // + // Steal two bits from the pointer to package line_char type. + // Assume (and statically assert) that char_string address is a + // multiple of four. + // + : data_ (reinterpret_cast <std::uintptr_t> (s) | + static_cast <std::uintptr_t> (line_type::literal)) {} + + // Create a regex character. + // + explicit + line_char (char_regex, line_pool&); + + explicit + line_char (const char_regex* r) // Assume already pooled. + // + // Steal two bits from the pointer to package line_char type. + // Assume (and statically assert) that char_regex address is a + // multiple of four. + // + : data_ (reinterpret_cast <std::uintptr_t> (r) | + static_cast <std::uintptr_t> (line_type::regex)) {} + + // Provide basic_regex with the ability to use line_char in a context + // where a char value is expected (e.g., as a function argument). + // + // libstdc++ seems to cast special line_chars only (and such a + // conversion is meanigfull). + // + // msvcrt casts line_chars of arbitrary types instead. The only + // reasonable strategy is to return a value that differs from any + // other that can be encountered in a regex expression and so will + // unlikelly be misinterpreted. + // + operator char () const + { + return type () == line_type::special ? special () : '\a'; // BELL. + } + + // Return true if the character is a syntax (special) one. + // + static bool + syntax (char); + + // Provide basic_regex (such as from msvcrt) with the ability to + // explicitly cast line_chars to implementation-specific numeric + // types (enums, msvcrt's _Uelem, etc). + // + template <typename T> + explicit + operator T () const + { + assert (type () == line_type::special); + return static_cast<T> (special ()); + } + }; + + // Perform "deep" characters comparison (for example match literal + // character with a regex character), rather than just compare them + // literally. At least one argument must be of a type other than regex + // as there is no operator==() defined to compare regexes. Characters + // of the literal type must share the same pool (strings are compared + // by pointers not by values). + // + bool + operator== (const line_char&, const line_char&); + + // Return false if arguments are equal (operator==() returns true). + // Otherwise if types are different return the value implying that + // special < literal < regex. If types are special or literal return + // the result of the respective characters or strings comparison. At + // least one argument must be of a type other than regex as there is no + // operator<() defined to compare regexes. + // + // While not very natural operation for the class we have, we have to + // provide some meaningfull semantics for such a comparison as it is + // required by the char_traits<line_char> specialization. While we + // could provide it right in that specialization, let's keep it here + // for basic_regex implementations that potentially can compare + // line_chars as they compare them with expressions of other types (see + // below). + // + bool + operator< (const line_char&, const line_char&); + + inline bool + operator!= (const line_char& l, const line_char& r) + { + return !(l == r); + } + + inline bool + operator<= (const line_char& l, const line_char& r) + { + return l < r || l == r; + } + + // Provide basic_regex (such as from msvcrt) with the ability to + // compare line_char to a value of an integral or + // implementation-specific enum type. In the absense of the following + // template operators, such a comparisons would be ambigious for + // integral types (given that there are implicit conversions + // int->line_char and line_char->char) and impossible for enums. + // + // Note that these == and < operators can succeed only for a line_char + // of the special type. For other types they always return false. That + // in particular leads to the following case: + // + // (lc != c) != (lc < c || c < lc). + // + // Note that we can not assert line_char is of the special type as + // basic_regex (such as from libc++) may need the ability to check if + // arbitrary line_char belongs to some special characters range (like + // ['0', '9']). + // + template <typename T> + struct line_char_cmp + : public std::enable_if<std::is_integral<T>::value || + (std::is_enum<T>::value && + !std::is_same<T, char_flags>::value)> {}; + + template <typename T, typename = typename line_char_cmp<T>::type> + bool + operator== (const line_char& l, const T& r) + { + return l.type () == line_type::special && + static_cast<T> (l.special ()) == r; + } + + template <typename T, typename = typename line_char_cmp<T>::type> + bool + operator== (const T& l, const line_char& r) + { + return r.type () == line_type::special && + static_cast<T> (r.special ()) == l; + } + + template <typename T, typename = typename line_char_cmp<T>::type> + bool + operator!= (const line_char& l, const T& r) + { + return !(l == r); + } + + template <typename T, typename = typename line_char_cmp<T>::type> + bool + operator!= (const T& l, const line_char& r) + { + return !(l == r); + } + + template <typename T, typename = typename line_char_cmp<T>::type> + bool + operator< (const line_char& l, const T& r) + { + return l.type () == line_type::special && + static_cast<T> (l.special ()) < r; + } + + template <typename T, typename = typename line_char_cmp<T>::type> + bool + operator< (const T& l, const line_char& r) + { + return r.type () == line_type::special && + l < static_cast<T> (r.special ()); + } + + template <typename T, typename = typename line_char_cmp<T>::type> + inline bool + operator<= (const line_char& l, const T& r) + { + return l < r || l == r; + } + + template <typename T, typename = typename line_char_cmp<T>::type> + inline bool + operator<= (const T& l, const line_char& r) + { + return l < r || l == r; + } + + using line_string = std::basic_string<line_char>; + + // Locale that has ctype<line_char> facet installed. Used in the + // regex_traits<line_char> specialization (see below). + // + class line_char_locale: public std::locale + { + public: + // Create a copy of the global C++ locale. + // + line_char_locale (); + }; + + // Initialize the script regex global state. Should be called once + // prior to creating objects of types from this namespace. Note: not + // thread-safe. + // + void + init (); + } + } +} + +// Standard template specializations for line_char that are required for the +// basic_regex<line_char> instantiation. +// +namespace std +{ + template <> + class char_traits<build2::script::regex::line_char> + { + public: + using char_type = build2::script::regex::line_char; + using int_type = char_type; + using off_type = char_traits<char>::off_type; + using pos_type = char_traits<char>::pos_type; + using state_type = char_traits<char>::state_type; + + static void + assign (char_type& c1, const char_type& c2) {c1 = c2;} + + static char_type* + assign (char_type*, size_t, char_type); + + // Note that eq() and lt() are not constexpr (as required by C++11) + // because == and < operators for char_type are not constexpr. + // + static bool + eq (const char_type& l, const char_type& r) {return l == r;} + + static bool + lt (const char_type& l, const char_type& r) {return l < r;} + + static char_type* + move (char_type*, const char_type*, size_t); + + static char_type* + copy (char_type*, const char_type*, size_t); + + static int + compare (const char_type*, const char_type*, size_t); + + static size_t + length (const char_type*); + + static const char_type* + find (const char_type*, size_t, const char_type&); + + static constexpr char_type + to_char_type (const int_type& c) {return c;} + + static constexpr int_type + to_int_type (const char_type& c) {return int_type (c);} + + // Note that the following functions are not constexpr (as required by + // C++11) because their return expressions are not constexpr. + // + static bool + eq_int_type (const int_type& l, const int_type& r) {return l == r;} + + static int_type eof () {return char_type::eof;} + + static int_type + not_eof (const int_type& c) + { + return c != char_type::eof ? c : char_type::nul; + } + }; + + // ctype<> must be derived from both ctype_base and locale::facet (the later + // supports ref-counting used by the std::locale implementation internally). + // + // msvcrt for some reason also derives ctype_base from locale::facet which + // produces "already a base-class" warning and effectivelly breaks the + // reference counting. So we derive from ctype_base only in this case. + // + template <> + class ctype<build2::script::regex::line_char>: public ctype_base +#if !defined(_MSC_VER) || _MSC_VER >= 2000 + , public locale::facet +#endif + { + // Used by the implementation only. + // + using line_type = build2::script::regex::line_type; + + public: + using char_type = build2::script::regex::line_char; + + static locale::id id; + +#if !defined(_MSC_VER) || _MSC_VER >= 2000 + explicit + ctype (size_t refs = 0): locale::facet (refs) {} +#else + explicit + ctype (size_t refs = 0): ctype_base (refs) {} +#endif + + // While unnecessary, let's keep for completeness. + // + virtual + ~ctype () override = default; + + // The C++ standard requires the following functions to call their virtual + // (protected) do_*() counterparts that provide the real implementations. + // The only purpose for this indirection is to provide a user with the + // ability to customize existing (standard) ctype facets. As we do not + // provide such an ability, for simplicity we will omit the do_*() + // functions and provide the implementations directly. This should be safe + // as nobody except us could call those protected functions. + // + bool + is (mask m, char_type c) const + { + return m == + (c.type () == line_type::special && c.special () >= 0 && + build2::digit (static_cast<char> (c.special ())) + ? digit + : 0); + } + + const char_type* + is (const char_type*, const char_type*, mask*) const; + + const char_type* + scan_is (mask, const char_type*, const char_type*) const; + + const char_type* + scan_not (mask, const char_type*, const char_type*) const; + + char_type + toupper (char_type c) const {return c;} + + const char_type* + toupper (char_type*, const char_type* e) const {return e;} + + char_type + tolower (char_type c) const {return c;} + + const char_type* + tolower (char_type*, const char_type* e) const {return e;} + + char_type + widen (char c) const {return char_type (c);} + + const char* + widen (const char*, const char*, char_type*) const; + + char + narrow (char_type c, char def) const + { + return c.type () == line_type::special ? c.special () : def; + } + + const char_type* + narrow (const char_type*, const char_type*, char, char*) const; + }; + + // Note: the current application locale must be POSIX. Otherwise the + // behavior is undefined. + // + template <> + class regex_traits<build2::script::regex::line_char> + { + public: + using char_type = build2::script::regex::line_char; + using string_type = build2::script::regex::line_string; + using locale_type = build2::script::regex::line_char_locale; + using char_class_type = regex_traits<char>::char_class_type; + + // Workaround for msvcrt bugs. For some reason it assumes such a members + // to be present in a regex_traits specialization. + // +#if defined(_MSC_VER) && _MSC_VER < 2000 + static const ctype_base::mask _Ch_upper = ctype_base::upper; + static const ctype_base::mask _Ch_alpha = ctype_base::alpha; + + // Unsigned numeric type. msvcrt normally casts characters to this type + // for comparing with some numeric values or for calculating an index in + // some bit array. Luckily that all relates to the character class + // handling that we don't support. + // + using _Uelem = unsigned int; +#endif + + regex_traits () = default; // Unnecessary but let's keep for completeness. + + static size_t + length (const char_type* p) {return string_type::traits_type::length (p);} + + char_type + translate (char_type c) const {return c;} + + // Case-insensitive matching is not supported by line_regex. So there is no + // reason for the function to be called. + // + char_type + translate_nocase (char_type c) const {assert (false); return c;} + + // Return a sort-key - the exact copy of [b, e). + // + template <typename I> + string_type + transform (I b, I e) const {return string_type (b, e);} + + // Return a case-insensitive sort-key. Case-insensitive matching is not + // supported by line_regex. So there is no reason for the function to be + // called. + // + template <typename I> + string_type + transform_primary (I b, I e) const + { + assert (false); + return string_type (b, e); + } + + // POSIX regex grammar and collating elements (e.g., [.tilde.]) in + // particular are not supported. So there is no reason for the function to + // be called. + // + template <typename I> + string_type + lookup_collatename (I, I) const {assert (false); return string_type ();} + + // Character classes (e.g., [:lower:]) are not supported. So there is no + // reason for the function to be called. + // + template <typename I> + char_class_type + lookup_classname (I, I, bool = false) const + { + assert (false); + return char_class_type (); + } + + // Return false as we don't support character classes (e.g., [:lower:]). + // + bool + isctype (char_type, char_class_type) const {return false;} + + int + value (char_type, int) const; + + // Return the locale passed as an argument as we do not expect anything + // other than POSIX locale, that we also assume to be imbued by default. + // + locale_type + imbue (locale_type l) {return l;} + + locale_type + getloc () const {return locale_type ();} + }; + + // We assume line_char to be an unsigned type and express that with the + // following specialization used by basic_regex implementations. + // + // libstdc++ defines unsigned CharT type (regex_traits template parameter) + // to use as an index in some internal cache regardless if the cache is used + // for this specialization (and the cache is used only if CharT is char). + // + template <> + struct make_unsigned<build2::script::regex::line_char> + { + using type = build2::script::regex::line_char; + }; + + // When used with libc++ the linker complains that it can't find + // __match_any_but_newline<line_char>::__exec() function. The problem is + // that the function is only specialized for char and wchar_t + // (LLVM bug #31409). As line_char has no notion of the newline character we + // specialize the class template to behave as the __match_any<line_char> + // instantiation does (that luckily has all the functions in place). + // +#if defined(_LIBCPP_VERSION) && _LIBCPP_VERSION <= 9000 + template <> + class __match_any_but_newline<build2::script::regex::line_char> + : public __match_any<build2::script::regex::line_char> + { + public: + using base = __match_any<build2::script::regex::line_char>; + using base::base; + }; +#endif +} + +namespace build2 +{ + namespace script + { + namespace regex + { + class line_regex: public std::basic_regex<line_char> + { + public: + using base_type = std::basic_regex<line_char>; + + using base_type::base_type; + + line_regex () = default; + + // Move string regex together with the pool used to create it. + // + line_regex (line_string&& s, line_pool&& p) + // No move-string ctor for base_type, so emulate it. + // + : base_type (s), pool (move (p)) {s.clear ();} + + // Move constuctible/assignable-only type. + // + line_regex (line_regex&&) = default; + line_regex (const line_regex&) = delete; + line_regex& operator= (line_regex&&) = default; + line_regex& operator= (const line_regex&) = delete; + + public: + line_pool pool; + }; + } + } +} + +#include <libbuild2/script/regex.ixx> + +#endif // LIBBUILD2_SCRIPT_REGEX_HXX |