// file      : libbuild2/test/script/regex.hxx -*- C++ -*-
// copyright : Copyright (c) 2014-2019 Code Synthesis Ltd
// license   : MIT; see accompanying LICENSE file

#ifndef LIBBUILD2_TEST_SCRIPT_REGEX_HXX
#define LIBBUILD2_TEST_SCRIPT_REGEX_HXX

#include <list>
#include <regex>
#include <locale>
#include <string>        // basic_string
#include <type_traits>   // make_unsigned, enable_if, is_*
#include <unordered_set>

#include <libbuild2/types.hxx>
#include <libbuild2/utility.hxx>

namespace build2
{
  namespace test
  {
    namespace script
    {
      namespace regex
      {
        using char_string = std::basic_string<char>;

        enum class char_flags: uint16_t
        {
          icase = 0x1, // Case-insensitive match.
          idot  = 0x2, // Invert '.' escaping.

          none = 0
        };

        // Restricts valid standard flags to just {icase}, extends with custom
        // flags {idot}.
        //
        class char_regex: public std::basic_regex<char>
        {
        public:
          using base_type = std::basic_regex<char>;

          char_regex (const char_string&, char_flags = char_flags::none);
        };

        // Newlines are line separators and are not part of the line:
        //
        // line<newline>line<newline>
        //
        // Specifically, this means that a customary trailing newline creates a
        // trailing blank line.
        //
        // All characters can inter-compare (though there cannot be regex
        // characters in the output, only in line_regex).
        //
        // Note that we assume that line_regex and the input to regex_match()
        // use the same pool.
        //
        struct line_pool
        {
          // Note that we assume the pool can be moved without invalidating
          // pointers to any already pooled entities.
          //
          std::unordered_set<char_string> strings;
          std::list<char_regex> regexes;
        };

        enum class line_type
        {
          special,
          literal,
          regex
        };

        struct line_char
        {
          // Steal last two bits from the pointer to store the type.
          //
        private:
          std::uintptr_t data_;

        public:
          line_type
          type () const {return static_cast<line_type> (data_ & 0x3);}

          int
          special () const
          {
            // Stored as (shifted) int16_t. Perform steps reversed to those
            // that are described in the comment for the corresponding ctor.
            // Note that the intermediate cast to uint16_t is required to
            // portably preserve the -1 special character.
            //
            return static_cast<int16_t> (static_cast<uint16_t> (data_ >> 2));
          }

          const char_string*
          literal () const
          {
            // Note that 2 rightmost bits are used for packaging line_char
            // type. Read the comment for the corresponding ctor for details.
            //
            return reinterpret_cast<const char_string*> (
              data_ & ~std::uintptr_t (0x3));
          }

          const char_regex*
          regex () const
          {
            // Note that 2 rightmost bits are used for packaging line_char
            // type. Read the comment for the corresponding ctor for details.
            //
            return reinterpret_cast<const char_regex*> (
              data_ & ~std::uintptr_t (0x3));
          }

          static const line_char nul;
          static const line_char eof;

          // Note: creates an uninitialized value.
          //
          line_char () = default;

          // Create a special character. The argument value must be one of the
          // following ones:
          //
          // 0 (nul character)
          // -1 (EOF)
          // [()|.*+?{}\0123456789,=!] (excluding [])
          //
          // Note that the constructor is implicit to allow basic_regex to
          // implicitly construct line_chars from special char literals (in
          // particular libstdc++ appends them to an internal line_string).
          //
          // Also note that we extend the valid characters set (see above) with
          // 'p', 'n' (used by libstdc++ for positive/negative look-ahead
          // tokens representation), and '\n', '\r', u'\u2028', u'\u2029' (used
          // by libstdc++ for newline/newparagraph matching).
          //
          line_char (int);

          // Create a literal character.
          //
          // Don't copy string if already pooled.
          //
          explicit
          line_char (const char_string&, line_pool&);

          explicit
          line_char (char_string&&, line_pool&);

          explicit
          line_char (const char_string* s) // Assume already pooled.
              //
              // Steal two bits from the pointer to package line_char type.
              // Assume (and statically assert) that char_string address is a
              // multiple of four.
              //
              : data_ (reinterpret_cast <std::uintptr_t> (s) |
                       static_cast <std::uintptr_t> (line_type::literal)) {}

          // Create a regex character.
          //
          explicit
          line_char (char_regex, line_pool&);

          explicit
          line_char (const char_regex* r) // Assume already pooled.
              //
              // Steal two bits from the pointer to package line_char type.
              // Assume (and statically assert) that char_regex address is a
              // multiple of four.
              //
              : data_ (reinterpret_cast <std::uintptr_t> (r) |
                       static_cast <std::uintptr_t> (line_type::regex)) {}

          // Provide basic_regex with the ability to use line_char in a context
          // where a char value is expected (e.g., as a function argument).
          //
          // libstdc++ seems to cast special line_chars only (and such a
          // conversion is meanigfull).
          //
          // msvcrt casts line_chars of arbitrary types instead. The only
          // reasonable strategy is to return a value that differs from any
          // other that can be encountered in a regex expression and so will
          // unlikelly be misinterpreted.
          //
          operator char () const
          {
            return type () == line_type::special ? special () : '\a'; // BELL.
          }

          // Return true if the character is a syntax (special) one.
          //
          static bool
          syntax (char);

          // Provide basic_regex (such as from msvcrt) with the ability to
          // explicitly cast line_chars to implementation-specific numeric
          // types (enums, msvcrt's _Uelem, etc).
          //
          template <typename T>
          explicit
          operator T () const
          {
            assert (type () == line_type::special);
            return static_cast<T> (special ());
          }
        };

        // Perform "deep" characters comparison (for example match literal
        // character with a regex character), rather than just compare them
        // literally. At least one argument must be of a type other than regex
        // as there is no operator==() defined to compare regexes. Characters
        // of the literal type must share the same pool (strings are compared
        // by pointers not by values).
        //
        bool
        operator== (const line_char&, const line_char&);

        // Return false if arguments are equal (operator==() returns true).
        // Otherwise if types are different return the value implying that
        // special < literal < regex. If types are special or literal return
        // the result of the respective characters or strings comparison. At
        // least one argument must be of a type other than regex as there is no
        // operator<() defined to compare regexes.
        //
        // While not very natural operation for the class we have, we have to
        // provide some meaningfull semantics for such a comparison as it is
        // required by the char_traits<line_char> specialization. While we
        // could provide it right in that specialization, let's keep it here
        // for basic_regex implementations that potentially can compare
        // line_chars as they compare them with expressions of other types (see
        // below).
        //
        bool
        operator< (const line_char&, const line_char&);

        inline bool
        operator!= (const line_char& l, const line_char& r)
        {
          return !(l == r);
        }

        inline bool
        operator<= (const line_char& l, const line_char& r)
        {
          return l < r || l == r;
        }

        // Provide basic_regex (such as from msvcrt) with the ability to
        // compare line_char to a value of an integral or
        // implementation-specific enum type. In the absense of the following
        // template operators, such a comparisons would be ambigious for
        // integral types (given that there are implicit conversions
        // int->line_char and line_char->char) and impossible for enums.
        //
        // Note that these == and < operators can succeed only for a line_char
        // of the special type. For other types they always return false. That
        // in particular leads to the following case:
        //
        // (lc != c) != (lc < c || c < lc).
        //
        // Note that we can not assert line_char is of the special type as
        // basic_regex (such as from libc++) may need the ability to check if
        // arbitrary line_char belongs to some special characters range (like
        // ['0', '9']).
        //
        template <typename T>
        struct line_char_cmp
          : public std::enable_if<std::is_integral<T>::value ||
                                  (std::is_enum<T>::value &&
                                   !std::is_same<T, char_flags>::value)> {};

        template <typename T, typename = typename line_char_cmp<T>::type>
        bool
        operator== (const line_char& l, const T& r)
        {
          return l.type () == line_type::special &&
            static_cast<T> (l.special ()) == r;
        }

        template <typename T, typename = typename line_char_cmp<T>::type>
        bool
        operator== (const T& l, const line_char& r)
        {
          return r.type () == line_type::special &&
            static_cast<T> (r.special ()) == l;
        }

        template <typename T, typename = typename line_char_cmp<T>::type>
        bool
        operator!= (const line_char& l, const T& r)
        {
          return !(l == r);
        }

        template <typename T, typename = typename line_char_cmp<T>::type>
        bool
        operator!= (const T& l, const line_char& r)
        {
          return !(l == r);
        }

        template <typename T, typename = typename line_char_cmp<T>::type>
        bool
        operator< (const line_char& l, const T& r)
        {
          return l.type () == line_type::special &&
            static_cast<T> (l.special ()) < r;
        }

        template <typename T, typename = typename line_char_cmp<T>::type>
        bool
        operator< (const T& l, const line_char& r)
        {
          return r.type () == line_type::special &&
            l < static_cast<T> (r.special ());
        }

        template <typename T, typename = typename line_char_cmp<T>::type>
        inline bool
        operator<= (const line_char& l, const T& r)
        {
          return l < r || l == r;
        }

        template <typename T, typename = typename line_char_cmp<T>::type>
        inline bool
        operator<= (const T& l, const line_char& r)
        {
          return l < r || l == r;
        }

        using line_string = std::basic_string<line_char>;

        // Locale that has ctype<line_char> facet installed. Used in the
        // regex_traits<line_char> specialization (see below).
        //
        class line_char_locale: public std::locale
        {
        public:
          // Create a copy of the global C++ locale.
          //
          line_char_locale ();
        };

        // Initialize the testscript regex global state. Should be called once
        // prior to creating objects of types from this namespace. Note: not
        // thread-safe.
        //
        void
        init ();
      }
    }
  }
}

// Standard template specializations for line_char that are required for the
// basic_regex<line_char> instantiation.
//
namespace std
{
  template <>
  class char_traits<build2::test::script::regex::line_char>
  {
  public:
    using char_type  = build2::test::script::regex::line_char;
    using int_type   = char_type;
    using off_type   = char_traits<char>::off_type;
    using pos_type   = char_traits<char>::pos_type;
    using state_type = char_traits<char>::state_type;

    static void
    assign (char_type& c1, const char_type& c2) {c1 = c2;}

    static char_type*
    assign (char_type*, size_t, char_type);

    // Note that eq() and lt() are not constexpr (as required by C++11)
    // because == and < operators for char_type are not constexpr.
    //
    static bool
    eq (const char_type& l, const char_type& r) {return l == r;}

    static bool
    lt (const char_type& l, const char_type& r) {return l < r;}

    static char_type*
    move (char_type*, const char_type*, size_t);

    static char_type*
    copy (char_type*, const char_type*, size_t);

    static int
    compare (const char_type*, const char_type*, size_t);

    static size_t
    length (const char_type*);

    static const char_type*
    find (const char_type*, size_t, const char_type&);

    static constexpr char_type
    to_char_type (const int_type& c) {return c;}

    static constexpr int_type
    to_int_type (const char_type& c) {return int_type (c);}

    // Note that the following functions are not constexpr (as required by
    // C++11) because their return expressions are not constexpr.
    //
    static bool
    eq_int_type (const int_type& l, const int_type& r) {return l == r;}

    static int_type eof () {return char_type::eof;}

    static int_type
    not_eof (const int_type& c)
    {
      return c != char_type::eof ? c : char_type::nul;
    }
  };

  // ctype<> must be derived from both ctype_base and locale::facet (the later
  // supports ref-counting used by the std::locale implementation internally).
  //
  // msvcrt for some reason also derives ctype_base from locale::facet which
  // produces "already a base-class" warning and effectivelly breaks the
  // reference counting. So we derive from ctype_base only in this case.
  //
  template <>
  class ctype<build2::test::script::regex::line_char>: public ctype_base
#if !defined(_MSC_VER) || _MSC_VER >= 2000
                                              , public locale::facet
#endif
  {
    // Used by the implementation only.
    //
    using line_type = build2::test::script::regex::line_type;

  public:
    using char_type = build2::test::script::regex::line_char;

    static locale::id id;

#if !defined(_MSC_VER) || _MSC_VER >= 2000
    explicit
    ctype (size_t refs = 0): locale::facet (refs) {}
#else
    explicit
    ctype (size_t refs = 0): ctype_base (refs) {}
#endif

    // While unnecessary, let's keep for completeness.
    //
    virtual
    ~ctype () override = default;

    // The C++ standard requires the following functions to call their virtual
    // (protected) do_*() counterparts that provide the real implementations.
    // The only purpose for this indirection is to provide a user with the
    // ability to customize existing (standard) ctype facets. As we do not
    // provide such an ability, for simplicity we will omit the do_*()
    // functions and provide the implementations directly. This should be safe
    // as nobody except us could call those protected functions.
    //
    bool
    is (mask m, char_type c) const
    {
      return m ==
        (c.type () == line_type::special && c.special () >= 0 &&
         build2::digit (static_cast<char> (c.special ()))
         ? digit
         : 0);
    }

    const char_type*
    is (const char_type*, const char_type*, mask*) const;

    const char_type*
    scan_is (mask, const char_type*, const char_type*) const;

    const char_type*
    scan_not (mask, const char_type*, const char_type*) const;

    char_type
    toupper (char_type c) const {return c;}

    const char_type*
    toupper (char_type*, const char_type* e) const {return e;}

    char_type
    tolower (char_type c) const {return c;}

    const char_type*
    tolower (char_type*, const char_type* e) const {return e;}

    char_type
    widen (char c) const {return char_type (c);}

    const char*
    widen (const char*, const char*, char_type*) const;

    char
    narrow (char_type c, char def) const
    {
      return c.type () == line_type::special ? c.special () : def;
    }

    const char_type*
    narrow (const char_type*, const char_type*, char, char*) const;
  };

  // Note: the current application locale must be the POSIX one. Otherwise the
  // behavior is undefined.
  //
  template <>
  class regex_traits<build2::test::script::regex::line_char>
  {
  public:
    using char_type       = build2::test::script::regex::line_char;
    using string_type     = build2::test::script::regex::line_string;
    using locale_type     = build2::test::script::regex::line_char_locale;
    using char_class_type = regex_traits<char>::char_class_type;

    // Workaround for msvcrt bugs. For some reason it assumes such a members
    // to be present in a regex_traits specialization.
    //
#if defined(_MSC_VER) && _MSC_VER < 2000
    static const ctype_base::mask _Ch_upper = ctype_base::upper;
    static const ctype_base::mask _Ch_alpha = ctype_base::alpha;

    // Unsigned numeric type. msvcrt normally casts characters to this type
    // for comparing with some numeric values or for calculating an index in
    // some bit array. Luckily that all relates to the character class
    // handling that we don't support.
    //
    using _Uelem = unsigned int;
#endif

    regex_traits () = default; // Unnecessary but let's keep for completeness.

    static size_t
    length (const char_type* p) {return string_type::traits_type::length (p);}

    char_type
    translate (char_type c) const {return c;}

    // Case-insensitive matching is not supported by line_regex. So there is no
    // reason for the function to be called.
    //
    char_type
    translate_nocase (char_type c) const {assert (false); return c;}

    // Return a sort-key - the exact copy of [b, e).
    //
    template <typename I>
    string_type
    transform (I b, I e) const {return string_type (b, e);}

    // Return a case-insensitive sort-key. Case-insensitive matching is not
    // supported by line_regex. So there is no reason for the function to be
    // called.
    //
    template <typename I>
    string_type
    transform_primary (I b, I e) const
    {
      assert (false);
      return string_type (b, e);
    }

    // POSIX regex grammar and collating elements (e.g., [.tilde.]) in
    // particular are not supported. So there is no reason for the function to
    // be called.
    //
    template <typename I>
    string_type
    lookup_collatename (I, I) const {assert (false); return string_type ();}

    // Character classes (e.g., [:lower:]) are not supported. So there is no
    // reason for the function to be called.
    //
    template <typename I>
    char_class_type
    lookup_classname (I, I, bool = false) const
    {
      assert (false);
      return char_class_type ();
    }

    // Return false as we don't support character classes (e.g., [:lower:]).
    //
    bool
    isctype (char_type, char_class_type) const {return false;}

    int
    value (char_type, int) const;

    // Return the locale passed as an argument as we do not expect anything
    // other than POSIX locale, that we also assume to be imbued by default.
    //
    locale_type
    imbue (locale_type l) {return l;}

    locale_type
    getloc () const {return locale_type ();}
  };

  // We assume line_char to be an unsigned type and express that with the
  // following specialization used by basic_regex implementations.
  //
  // libstdc++ defines unsigned CharT type (regex_traits template parameter)
  // to use as an index in some internal cache regardless if the cache is used
  // for this specialization (and the cache is used only if CharT is char).
  //
  template <>
  struct make_unsigned<build2::test::script::regex::line_char>
  {
    using type = build2::test::script::regex::line_char;
  };

  // When used with libc++ the linker complains that it can't find
  // __match_any_but_newline<line_char>::__exec() function. The problem is
  // that the function is only specialized for char and wchar_t
  // (LLVM bug #31409). As line_char has no notion of the newline character we
  // specialize the class template to behave as the __match_any<line_char>
  // instantiation does (that luckily has all the functions in place).
  //
#if defined(_LIBCPP_VERSION) && _LIBCPP_VERSION <= 9000
  template <>
  class __match_any_but_newline<build2::test::script::regex::line_char>
    : public __match_any<build2::test::script::regex::line_char>
  {
  public:
    using base = __match_any<build2::test::script::regex::line_char>;
    using base::base;
  };
#endif
}

namespace build2
{
  namespace test
  {
    namespace script
    {
      namespace regex
      {
        class line_regex: public std::basic_regex<line_char>
        {
        public:
          using base_type = std::basic_regex<line_char>;

          using base_type::base_type;

          line_regex () = default;

          // Move string regex together with the pool used to create it.
          //
          line_regex (line_string&& s, line_pool&& p)
              // No move-string ctor for base_type, so emulate it.
              //
              : base_type (s), pool (move (p)) {s.clear ();}

          // Move constuctible/assignable-only type.
          //
          line_regex (line_regex&&) = default;
          line_regex (const line_regex&) = delete;
          line_regex& operator= (line_regex&&) = default;
          line_regex& operator= (const line_regex&) = delete;

        public:
          line_pool pool;
        };
      }
    }
  }
}

#include <libbuild2/test/script/regex.ixx>

#endif // LIBBUILD2_TEST_SCRIPT_REGEX_HXX