1 files changed, 552 insertions, 0 deletions
diff --git a/libbutl/url.hxx b/libbutl/url.hxx
new file mode 100644
index 0000000..5721cfd
--- /dev/null
+++ b/libbutl/url.hxx
@@ -0,0 +1,552 @@
+// file      : libbutl/url.hxx -*- C++ -*-
+// license   : MIT; see accompanying LICENSE file
+
+#pragma once
+
+#include <string>
+#include <cassert>
+#include <cstddef>  // size_t
+#include <cstdint>  // uint*_t
+#include <utility>  // move()
+#include <ostream>
+#include <iterator> // back_inserter
+
+#include <libbutl/path.hxx>
+#include <libbutl/utility.hxx>
+#include <libbutl/optional.hxx>
+
+#include <libbutl/export.hxx>
+
+namespace butl
+{
+  // RFC3986 Uniform Resource Locator (URL).
+  //
+  // <url> = <scheme>:[//[<authority>]][/<path>][?<query>][#<fragment>] |
+  //         <scheme>:[<path>][?<query>][#<fragment>]
+  //
+  // <authority> = [<user>@]<host>[:<port>]
+  //
+  // Some examples of equivalent URLs to meditate upon:
+  //
+  // file://localhost/tmp     (localhost authority)
+  // file:///tmp              (empty     authority)
+  // file:/tmp                (absent    authority)
+  //
+  // file://localhost/c:/tmp
+  // file:///c:/tmp
+  // file:/c:/tmp
+  //
+  // We think of the slash between <authority> and <path> as a separator but
+  // with the path always interpreted as starting from the "root" of the
+  // authority. Thus:
+  //
+  // file://localhost/tmp     ->  'file'://'localhost'/'tmp'    ->  /tmp
+  // file://localhost/c:/tmp  ->  'file'://'localhost'/'c:/tmp' ->  c:/tmp
+  //
+  // This means that the <path> component is represented as a relative path
+  // and, in the general case, we cannot use our path type for its storage
+  // since it assumes the path is for the host platform. In other words, the
+  // interpretation of the path has to take into account the platform of the
+  // authority host. Note, however, that a custom url_traits implementation
+  // can choose to use the path type if local paths are to be interpreted as
+  // relative to the host.
+  //
+  // For authority-less schemes the <path> component is also represented as a
+  // relative path. Some examples of such URLs (let's call them rootless
+  // rather than authority-less not to confuse with a case where authority is
+  // empty/implied):
+  //
+  // pkcs11:token=sign;object=SIGN%20key
+  // pkcs11:id=%02%38%01?pin-value=12345
+  // pkcs11:
+  //
+  // Note that a scheme can theoretically allow both rootless and "rootfull"
+  // representations.
+  //
+  // Note also that we currently forbid one character schemes to support
+  // scheme-less (Windows) paths which can be done by
+  // url_traits::translate_scheme() (see below). (A Windows path that uses
+  // forward slashes would be parsed as a valid authority-less URL).
+
+  // URL host component can be an IPv4 address (if matches its dotted-decimal
+  // notation), an IPv6 address (if enclosed in [square brackets]) or
+  // otherwise a name.
+  //
+  // Note that non-ASCII host names are allowed in URLs. They must be
+  // UTF8-encoded and URL-encoded afterwards. Curently we store the parsed
+  // host name UTF8-encoded without regards to the template argument string
+  // type. Later we may add support for more appropriate encodings for
+  // multi-byte character types.
+  //
+  enum class url_host_kind {ipv4, ipv6, name};
+
+  template <typename S>
+  struct basic_url_host
+  {
+    using string_type = S;
+    using kind_type   = url_host_kind;
+
+    string_type value;
+    kind_type   kind;
+
+    // Can be treated as const string_type&.
+    //
+    operator const string_type& () const noexcept {return value;}
+
+    // Create an empty host.
+    //
+    basic_url_host (): kind (kind_type::name) {}
+
+    // Create the host object from its string representation as it appears in
+    // a URL, throwing std::invalid_argument if invalid. Remove the enclosing
+    // square brackets for IPv6 addresses, and URL-decode host names.
+    //
+    // Note that the 'x:x:x:x:x:x:d.d.d.d' IPv6 address mixed notation is not
+    // supported.
+    //
+    explicit
+    basic_url_host (string_type);
+
+    basic_url_host (string_type v, kind_type k)
+        : value (std::move (v)), kind (k) {}
+
+    bool
+    empty () const
+    {
+      assert (kind == kind_type::name || !value.empty ());
+      return value.empty ();
+    }
+
+    // Return string representation of the host as it would appear in a URL.
+    //
+    string_type
+    string () const;
+
+    // Normalize the host value in accordance with its type:
+    //
+    // Name - convert to the lower case. Note: only ASCII names are currently
+    //        supported.
+    //
+    // IPv4 - strip the leading zeros in its octets.
+    //
+    // IPv6 - strip the leading zeros in its groups (hextets), squash the
+    //        longest zero-only hextet sequence, and convert to the lower case
+    //        (as per RFC5952).
+    //
+    // Assume that the host value is valid.
+    //
+    void
+    normalize ();
+  };
+
+  template <typename S>
+  struct basic_url_authority
+  {
+    using string_type = S;
+    using host_type   = basic_url_host<string_type>;
+
+    string_type   user;  // Empty if not specified.
+    host_type     host;
+    std::uint16_t port;  // Zero if not specified.
+
+    bool
+    empty () const
+    {
+      assert (!host.empty () || (user.empty () && port == 0));
+      return host.empty ();
+    }
+
+    // Return a string representation of the URL authority. String
+    // representation of an empty instance is the empty string.
+    //
+    string_type
+    string () const;
+  };
+
+  template <typename H, typename S = H, typename P = S>
+  struct url_traits
+  {
+    using scheme_type = H;
+    using string_type = S;
+    using path_type   = P;
+
+    using authority_type = basic_url_authority<string_type>;
+
+    // Translate the scheme string representation to its type. May throw
+    // std::invalid_argument. May change the URL components. Should not return
+    // nullopt if called with a non-empty scheme.
+    //
+    // This function is called with an empty scheme if the URL has no scheme,
+    // the scheme is invalid, or it could not be parsed into components
+    // according to the URL syntax. In this case all the passed components
+    // reference empty/absent/false values. If nullopt is returned, the URL is
+    // considered invalid and the std::invalid_argument exception with an
+    // appropriate description is thrown by the URL object constructor. This
+    // can be used to support scheme-less URLs, local paths, etc.
+    //
+    static optional<scheme_type>
+    translate_scheme (const string_type&         /*url*/,
+                      string_type&&              scheme,
+                      optional<authority_type>&  /*authority*/,
+                      optional<path_type>&       /*path*/,
+                      optional<string_type>&     /*query*/,
+                      optional<string_type>&     /*fragment*/,
+                      bool&                      /*rootless*/)
+    {
+      return !scheme.empty ()
+        ? optional<scheme_type> (std::move (scheme))
+        : nullopt; // Leave the URL object constructor to throw.
+    }
+
+    // Translate scheme type back to its string representation.
+    //
+    // Similar to the above the function is called with an empty string
+    // representation. If on return this value is no longer empty, then it is
+    // assume the URL has been translated in a custom manner (in which case
+    // the returned scheme value is ignored).
+    //
+    static string_type
+    translate_scheme (string_type&,                    /*url*/
+                      const scheme_type&               scheme,
+                      const optional<authority_type>&  /*authority*/,
+                      const optional<path_type>&       /*path*/,
+                      const optional<string_type>&     /*query*/,
+                      const optional<string_type>&     /*fragment*/,
+                      bool                             /*rootless*/)
+    {
+      return string_type (scheme);
+    }
+
+    // Translate the URL-encoded path string representation to its type.
+    //
+    // Note that encoding for non-ASCII paths is not specified (in contrast
+    // to the host name), and presumably is local to the referenced authority.
+    // Furthermore, for some schemes, the path component can contain encoded
+    // binary data, for example for pkcs11.
+    //
+    static path_type
+    translate_path (string_type&&);
+
+    // Translate path type back to its URL-encoded string representation.
+    //
+    static string_type
+    translate_path (const path_type&);
+
+    // Check whether a string looks like a non-rootless URL by searching for
+    // the first ':' (unless its position is specified with the second
+    // argument) and then making sure it's both followed by '/' (e.g., http://
+    // or file:/) and preceded by a valid scheme at least 2 characters long
+    // (so we don't confuse it with an absolute Windows path, e.g., c:/).
+    //
+    // Return the start of the URL substring or string_type::npos.
+    //
+    static std::size_t
+    find (const string_type&, std::size_t pos = string_type::npos);
+  };
+
+  template <typename H, // scheme
+            typename T = url_traits<H>>
+  class basic_url
+  {
+  public:
+    using traits_type = T;
+
+    using string_type = typename traits_type::string_type;
+    using char_type   = typename string_type::value_type;
+    using path_type   = typename traits_type::path_type;
+
+    using scheme_type    = typename traits_type::scheme_type;
+    using authority_type = typename traits_type::authority_type;
+    using host_type      = typename authority_type::host_type;
+
+    scheme_type              scheme;
+    optional<authority_type> authority;
+    optional<path_type>      path;
+    optional<string_type>    query;
+    optional<string_type>    fragment;
+    bool                     rootless = false;
+
+    // Create an empty URL object.
+    //
+    basic_url (): scheme (), empty_ (true) {}
+
+    // Create the URL object from its string representation. Verify that the
+    // string is compliant to the generic URL syntax. URL-decode and validate
+    // components with common for all schemes syntax (scheme, host, port).
+    // Throw std::invalid_argument if the passed string is not a valid URL
+    // representation.
+    //
+    // Validation and URL-decoding of the scheme-specific components can be
+    // provided by a custom url_traits::translate_scheme() implementation.
+    //
+    explicit
+    basic_url (const string_type&);
+
+    // Create the URL object from individual components. Performs no
+    // components URL-decoding or verification.
+    //
+    basic_url (scheme_type,
+               optional<authority_type>,
+               optional<path_type> path,
+               optional<string_type> query = nullopt,
+               optional<string_type> fragment = nullopt);
+
+    basic_url (scheme_type,
+               host_type host,
+               optional<path_type> path,
+               optional<string_type> query = nullopt,
+               optional<string_type> fragment = nullopt);
+
+    basic_url (scheme_type,
+               host_type host,
+               std::uint16_t port,
+               optional<path_type> path,
+               optional<string_type> query = nullopt,
+               optional<string_type> fragment = nullopt);
+
+    basic_url (scheme_type,
+               string_type host,
+               optional<path_type> path,
+               optional<string_type> query = nullopt,
+               optional<string_type> fragment = nullopt);
+
+    basic_url (scheme_type,
+               string_type host,
+               std::uint16_t port,
+               optional<path_type> path,
+               optional<string_type> query = nullopt,
+               optional<string_type> fragment = nullopt);
+
+    // Create a rootless URL.
+    //
+    basic_url (scheme_type,
+               optional<path_type> path,
+               optional<string_type> query = nullopt,
+               optional<string_type> fragment = nullopt);
+
+    bool
+    empty () const noexcept {return empty_;}
+
+    // Return a string representation of the URL. Note that while this is not
+    // necessarily syntactically the same string as what was used to
+    // initialize this instance, it should be semantically equivalent. String
+    // representation of an empty instance is the empty string.
+    //
+    string_type
+    string () const;
+
+    // Normalize the URL host, if present.
+    //
+    void
+    normalize ();
+
+    // The following predicates can be used to classify URL characters while
+    // parsing, validating or encoding scheme-specific components. For the
+    // semantics of character classes see RFC3986.
+    //
+    static bool
+    gen_delim (char_type c)
+    {
+      return c == ':' || c == '/' || c == '?' || c == '#' || c == '[' ||
+             c == ']' || c == '@';
+    }
+
+    static bool
+    sub_delim (char_type c)
+    {
+      return c == '!' || c == '$' || c == '&' || c == '=' || c == '(' ||
+             c == ')' || c == '*' || c == '+' || c == ',' || c == ';' ||
+             c == '\'';
+    }
+
+    static bool
+    reserved (char_type c) {return sub_delim (c) || gen_delim (c);}
+
+    static bool
+    unreserved (char_type c)
+    {
+      return alnum (c) || c == '-' ||  c == '.' || c =='_' || c == '~';
+    }
+
+    static bool
+    path_char (char_type c)
+    {
+      return c == '/' || c == ':' || unreserved (c) || c == '@' ||
+             sub_delim (c);
+    }
+
+    // URL-encode a character sequence.
+    //
+    // Note that the set of characters that should be encoded may differ for
+    // different URL components. The optional callback function must return
+    // true for characters that should be percent-encoded. The function may
+    // encode the passed character in it's own way with another character (but
+    // never with '%'), and return false. By default all characters other than
+    // unreserved are percent-encoded.
+    //
+    // Also note that the characters are interpreted as bytes. In other words,
+    // each character may result in a single encoding triplet.
+    //
+    template <typename I, typename O, typename F>
+    static void
+    encode (I begin, I end, O output, F&& efunc);
+
+    template <typename I, typename O>
+    static void
+    encode (I b, I e, O o)
+    {
+      encode (b, e, o, [] (char_type& c) {return !unreserved (c);});
+    }
+
+    template <typename F>
+    static string_type
+    encode (const string_type& s, F&& f)
+    {
+      string_type r;
+      encode (s.begin (), s.end (), std::back_inserter (r), f);
+      return r;
+    }
+
+    static string_type
+    encode (const string_type& s)
+    {
+      return encode (s, [] (char_type& c) {return !unreserved (c);});
+    }
+
+    template <typename F>
+    static string_type
+    encode (const char_type* s, F&& f)
+    {
+      string_type r;
+      encode (s, s + string_type::traits_type::length (s),
+              std::back_inserter (r),
+              f);
+      return r;
+    }
+
+    static string_type
+    encode (const char_type* s)
+    {
+      return encode (s, [] (char_type& c) {return !unreserved (c);});
+    }
+
+    // URL-decode a character sequence. Throw std::invalid_argument if an
+    // invalid encoding sequence is encountered.
+    //
+    // If some characters in the sequence are encoded with another characters
+    // (rather than percent-encoded), then one must provide the callback
+    // function to decode them.
+    //
+    template <typename I, typename O, typename F>
+    static void
+    decode (I begin, I end, O output, F&& dfunc);
+
+    template <typename I, typename O>
+    static void
+    decode (I b, I e, O o)
+    {
+      decode (b, e, o, [] (char_type&) {});
+    }
+
+    template <typename F>
+    static string_type
+    decode (const string_type& s, F&& f)
+    {
+      string_type r;
+      decode (s.begin (), s.end (), std::back_inserter (r), f);
+      return r;
+    }
+
+    static string_type
+    decode (const string_type& s)
+    {
+      return decode (s, [] (char_type&) {});
+    }
+
+    template <typename F>
+    static string_type
+    decode (const char_type* s, F&& f)
+    {
+      string_type r;
+      decode (s, s + string_type::traits_type::length (s),
+              std::back_inserter (r),
+              f);
+      return r;
+    }
+
+    static string_type
+    decode (const char_type* s)
+    {
+      return decode (s, [] (char_type&) {});
+    }
+
+  private:
+    bool empty_ = false;
+  };
+
+  using url_authority = basic_url_authority<std::string>;
+  using url           = basic_url          <std::string>;
+
+  template <typename S>
+  inline bool
+  operator== (const basic_url_host<S>& x, const basic_url_host<S>& y) noexcept
+  {
+    return x.value == y.value && x.kind == y.kind;
+  }
+
+  template <typename S>
+  inline bool
+  operator!= (const basic_url_host<S>& x, const basic_url_host<S>& y) noexcept
+  {
+    return !(x == y);
+  }
+
+  template <typename S>
+  inline bool
+  operator== (const basic_url_authority<S>& x,
+              const basic_url_authority<S>& y) noexcept
+  {
+    return x.user == y.user && x.host == y.host && x.port == y.port;
+  }
+
+  template <typename S>
+  inline bool
+  operator!= (const basic_url_authority<S>& x,
+              const basic_url_authority<S>& y) noexcept
+  {
+    return !(x == y);
+  }
+
+  template <typename S, typename T>
+  inline bool
+  operator== (const basic_url<S, T>& x, const basic_url<S, T>& y) noexcept
+  {
+    if (x.empty () || y.empty ())
+      return x.empty () == y.empty ();
+
+    return x.scheme    == y.scheme    &&
+           x.authority == y.authority &&
+           x.path      == y.path      &&
+           x.query     == y.query     &&
+           x.fragment  == y.fragment  &&
+           x.rootless  == y.rootless;
+  }
+
+  template <typename S, typename T>
+  inline bool
+  operator!= (const basic_url<S, T>& x, const basic_url<S, T>& y) noexcept
+  {
+    return !(x == y);
+  }
+
+  template <typename S, typename T>
+  inline auto
+  operator<< (std::basic_ostream<typename T::string_type::value_type>& o,
+              const basic_url<S, T>& u) -> decltype (o)
+  {
+    return o << u.string ();
+  }
+}
+
+#include <libbutl/url.ixx>
+#include <libbutl/url.txx>