// file : libbutl/url.hxx -*- C++ -*- // license : MIT; see accompanying LICENSE file #pragma once #include #include #include // size_t #include // uint*_t #include // move() #include #include // back_inserter #include #include #include #include namespace butl { // RFC3986 Uniform Resource Locator (URL). // // = :[//[]][/][?][#] | // :[][?][#] // // = [@][:] // // Some examples of equivalent URLs to meditate upon: // // file://localhost/tmp (localhost authority) // file:///tmp (empty authority) // file:/tmp (absent authority) // // file://localhost/c:/tmp // file:///c:/tmp // file:/c:/tmp // // We think of the slash between and as a separator but // with the path always interpreted as starting from the "root" of the // authority. Thus: // // file://localhost/tmp -> 'file'://'localhost'/'tmp' -> /tmp // file://localhost/c:/tmp -> 'file'://'localhost'/'c:/tmp' -> c:/tmp // // This means that the component is represented as a relative path // and, in the general case, we cannot use our path type for its storage // since it assumes the path is for the host platform. In other words, the // interpretation of the path has to take into account the platform of the // authority host. Note, however, that a custom url_traits implementation // can choose to use the path type if local paths are to be interpreted as // relative to the host. // // For authority-less schemes the component is also represented as a // relative path. Some examples of such URLs (let's call them rootless // rather than authority-less not to confuse with a case where authority is // empty/implied): // // pkcs11:token=sign;object=SIGN%20key // pkcs11:id=%02%38%01?pin-value=12345 // pkcs11: // // Note that a scheme can theoretically allow both rootless and "rootfull" // representations. // // Note also that we currently forbid one character schemes to support // scheme-less (Windows) paths which can be done by // url_traits::translate_scheme() (see below). (A Windows path that uses // forward slashes would be parsed as a valid authority-less URL). // URL host component can be an IPv4 address (if matches its dotted-decimal // notation), an IPv6 address (if enclosed in [square brackets]) or // otherwise a name. // // Note that non-ASCII host names are allowed in URLs. They must be // UTF8-encoded and URL-encoded afterwards. Curently we store the parsed // host name UTF8-encoded without regards to the template argument string // type. Later we may add support for more appropriate encodings for // multi-byte character types. // enum class url_host_kind {ipv4, ipv6, name}; template struct basic_url_host { using string_type = S; using kind_type = url_host_kind; string_type value; kind_type kind; // Can be treated as const string_type&. // operator const string_type& () const noexcept {return value;} // Create an empty host. // basic_url_host (): kind (kind_type::name) {} // Create the host object from its string representation as it appears in // a URL, throwing std::invalid_argument if invalid. Remove the enclosing // square brackets for IPv6 addresses, and URL-decode host names. // // Note that the 'x:x:x:x:x:x:d.d.d.d' IPv6 address mixed notation is not // supported. // explicit basic_url_host (string_type); basic_url_host (string_type v, kind_type k) : value (std::move (v)), kind (k) {} bool empty () const { assert (kind == kind_type::name || !value.empty ()); return value.empty (); } // Return string representation of the host as it would appear in a URL. // string_type string () const; // Normalize the host value in accordance with its type: // // Name - convert to the lower case. Note: only ASCII names are currently // supported. // // IPv4 - strip the leading zeros in its octets. // // IPv6 - strip the leading zeros in its groups (hextets), squash the // longest zero-only hextet sequence, and convert to the lower case // (as per RFC5952). // // Assume that the host value is valid. // void normalize (); }; template struct basic_url_authority { using string_type = S; using host_type = basic_url_host; string_type user; // Empty if not specified. host_type host; std::uint16_t port; // Zero if not specified. bool empty () const { assert (!host.empty () || (user.empty () && port == 0)); return host.empty (); } // Return a string representation of the URL authority. String // representation of an empty instance is the empty string. // string_type string () const; }; template struct url_traits { using scheme_type = H; using string_type = S; using path_type = P; using authority_type = basic_url_authority; // Translate the scheme string representation to its type. May throw // std::invalid_argument. May change the URL components. Should not return // nullopt if called with a non-empty scheme. // // This function is called with an empty scheme if the URL has no scheme, // the scheme is invalid, or it could not be parsed into components // according to the URL syntax. In this case all the passed components // reference empty/absent/false values. If nullopt is returned, the URL is // considered invalid and the std::invalid_argument exception with an // appropriate description is thrown by the URL object constructor. This // can be used to support scheme-less URLs, local paths, etc. // static optional translate_scheme (const string_type& /*url*/, string_type&& scheme, optional& /*authority*/, optional& /*path*/, optional& /*query*/, optional& /*fragment*/, bool& /*rootless*/) { return !scheme.empty () ? optional (std::move (scheme)) : nullopt; // Leave the URL object constructor to throw. } // Translate scheme type back to its string representation. // // Similar to the above the function is called with an empty string // representation. If on return this value is no longer empty, then it is // assume the URL has been translated in a custom manner (in which case // the returned scheme value is ignored). // static string_type translate_scheme (string_type&, /*url*/ const scheme_type& scheme, const optional& /*authority*/, const optional& /*path*/, const optional& /*query*/, const optional& /*fragment*/, bool /*rootless*/) { return string_type (scheme); } // Translate the URL-encoded path string representation to its type. // // Note that encoding for non-ASCII paths is not specified (in contrast // to the host name), and presumably is local to the referenced authority. // Furthermore, for some schemes, the path component can contain encoded // binary data, for example for pkcs11. // static path_type translate_path (string_type&&); // Translate path type back to its URL-encoded string representation. // static string_type translate_path (const path_type&); // Check whether a string looks like a non-rootless URL by searching for // the first ':' (unless its position is specified with the second // argument) and then making sure it's both followed by '/' (e.g., http:// // or file:/) and preceded by a valid scheme at least 2 characters long // (so we don't confuse it with an absolute Windows path, e.g., c:/). // // Return the start of the URL substring or string_type::npos. // static std::size_t find (const string_type&, std::size_t pos = string_type::npos); }; template > class basic_url { public: using traits_type = T; using string_type = typename traits_type::string_type; using char_type = typename string_type::value_type; using path_type = typename traits_type::path_type; using scheme_type = typename traits_type::scheme_type; using authority_type = typename traits_type::authority_type; using host_type = typename authority_type::host_type; scheme_type scheme; optional authority; optional path; optional query; optional fragment; bool rootless = false; // Create an empty URL object. // basic_url (): scheme (), empty_ (true) {} // Create the URL object from its string representation. Verify that the // string is compliant to the generic URL syntax. URL-decode and validate // components with common for all schemes syntax (scheme, host, port). // Throw std::invalid_argument if the passed string is not a valid URL // representation. // // Validation and URL-decoding of the scheme-specific components can be // provided by a custom url_traits::translate_scheme() implementation. // explicit basic_url (const string_type&); // Create the URL object from individual components. Performs no // components URL-decoding or verification. // basic_url (scheme_type, optional, optional path, optional query = nullopt, optional fragment = nullopt); basic_url (scheme_type, host_type host, optional path, optional query = nullopt, optional fragment = nullopt); basic_url (scheme_type, host_type host, std::uint16_t port, optional path, optional query = nullopt, optional fragment = nullopt); basic_url (scheme_type, string_type host, optional path, optional query = nullopt, optional fragment = nullopt); basic_url (scheme_type, string_type host, std::uint16_t port, optional path, optional query = nullopt, optional fragment = nullopt); // Create a rootless URL. // basic_url (scheme_type, optional path, optional query = nullopt, optional fragment = nullopt); bool empty () const noexcept {return empty_;} // Return a string representation of the URL. Note that while this is not // necessarily syntactically the same string as what was used to // initialize this instance, it should be semantically equivalent. String // representation of an empty instance is the empty string. // string_type string () const; // Normalize the URL host, if present. // void normalize (); // The following predicates can be used to classify URL characters while // parsing, validating or encoding scheme-specific components. For the // semantics of character classes see RFC3986. // static bool gen_delim (char_type c) { return c == ':' || c == '/' || c == '?' || c == '#' || c == '[' || c == ']' || c == '@'; } static bool sub_delim (char_type c) { return c == '!' || c == '$' || c == '&' || c == '=' || c == '(' || c == ')' || c == '*' || c == '+' || c == ',' || c == ';' || c == '\''; } static bool reserved (char_type c) {return sub_delim (c) || gen_delim (c);} static bool unreserved (char_type c) { return alnum (c) || c == '-' || c == '.' || c =='_' || c == '~'; } static bool path_char (char_type c) { return c == '/' || c == ':' || unreserved (c) || c == '@' || sub_delim (c); } // URL-encode a character sequence. // // Note that the set of characters that should be encoded may differ for // different URL components. The optional callback function must return // true for characters that should be percent-encoded. The function may // encode the passed character in it's own way with another character (but // never with '%'), and return false. By default all characters other than // unreserved are percent-encoded. // // Also note that the characters are interpreted as bytes. In other words, // each character may result in a single encoding triplet. // template static void encode (I begin, I end, O output, F&& efunc); template static void encode (I b, I e, O o) { encode (b, e, o, [] (char_type& c) {return !unreserved (c);}); } template static string_type encode (const string_type& s, F&& f) { string_type r; encode (s.begin (), s.end (), std::back_inserter (r), f); return r; } static string_type encode (const string_type& s) { return encode (s, [] (char_type& c) {return !unreserved (c);}); } template static string_type encode (const char_type* s, F&& f) { string_type r; encode (s, s + string_type::traits_type::length (s), std::back_inserter (r), f); return r; } static string_type encode (const char_type* s) { return encode (s, [] (char_type& c) {return !unreserved (c);}); } // URL-decode a character sequence. Throw std::invalid_argument if an // invalid encoding sequence is encountered. // // If some characters in the sequence are encoded with another characters // (rather than percent-encoded), then one must provide the callback // function to decode them. // template static void decode (I begin, I end, O output, F&& dfunc); template static void decode (I b, I e, O o) { decode (b, e, o, [] (char_type&) {}); } template static string_type decode (const string_type& s, F&& f) { string_type r; decode (s.begin (), s.end (), std::back_inserter (r), f); return r; } static string_type decode (const string_type& s) { return decode (s, [] (char_type&) {}); } template static string_type decode (const char_type* s, F&& f) { string_type r; decode (s, s + string_type::traits_type::length (s), std::back_inserter (r), f); return r; } static string_type decode (const char_type* s) { return decode (s, [] (char_type&) {}); } private: bool empty_ = false; }; using url_authority = basic_url_authority; using url = basic_url ; template inline bool operator== (const basic_url_host& x, const basic_url_host& y) noexcept { return x.value == y.value && x.kind == y.kind; } template inline bool operator!= (const basic_url_host& x, const basic_url_host& y) noexcept { return !(x == y); } template inline bool operator== (const basic_url_authority& x, const basic_url_authority& y) noexcept { return x.user == y.user && x.host == y.host && x.port == y.port; } template inline bool operator!= (const basic_url_authority& x, const basic_url_authority& y) noexcept { return !(x == y); } template inline bool operator== (const basic_url& x, const basic_url& y) noexcept { if (x.empty () || y.empty ()) return x.empty () == y.empty (); return x.scheme == y.scheme && x.authority == y.authority && x.path == y.path && x.query == y.query && x.fragment == y.fragment && x.rootless == y.rootless; } template inline bool operator!= (const basic_url& x, const basic_url& y) noexcept { return !(x == y); } template inline auto operator<< (std::basic_ostream& o, const basic_url& u) -> decltype (o) { return o << u.string (); } } #include #include