// file : libbutl/url.mxx -*- C++ -*- // copyright : Copyright (c) 2014-2018 Code Synthesis Ltd // license : MIT; see accompanying LICENSE file #ifndef __cpp_modules #pragma once #endif // C includes. #include #ifndef __cpp_lib_modules #include #include // uint*_t #include // move() #include #include // back_inserter #include // size_t #include // invalid_argument #endif // Other includes. #ifdef __cpp_modules export module butl.url; #ifdef __cpp_lib_modules import std.core; import std.io; #endif import butl.path; import butl.utility; import butl.optional; #else #include #include #include #endif #include LIBBUTL_MODEXPORT namespace butl { // RFC3986 Uniform Resource Locator (URL). // // = :[//[]][/][?][#] // = [@][:] // // Some examples of equivalent URLs to meditate upon: // // file://localhost/tmp (localhost authority) // file:///tmp (empty authority) // file:/tmp (absent authority) // // file://localhost/c:/tmp // file:///c:/tmp // file:/c:/tmp // // We think of the slash between and as a separator but // with the path always interpreted as starting from the "root" of the // authority. Thus: // // file://localhost/tmp -> 'file'://'localhost'/'tmp' -> /tmp // file://localhost/c:/tmp -> 'file'://'localhost'/'c:/tmp' -> c:/tmp // // This means that the component is represented as a relative path // and, in the general case, we cannot use our path type for its storage // since it assumes the path is for the host platform. In other words, the // interpretation of the path has to take into account the platform of the // authority host. Note, however, that a custom url_traits implementation // can choose to use the path type if local paths are to be interpreted as // relative to the host. // // Note that we currently forbid one character schemes to support scheme- // less (Windows) paths which can be done by url_traits::translate_scheme() // (see below). (A Windows path that uses forward slashes would be parsed as // a valid authority-less URL). // URL host component can be an IPv4 address (if matches its dotted-decimal // notation), an IPv6 address (if enclosed in [square brackets]) or // otherwise a name. // // Note that non-ASCII host names are allowed in URLs. They must be // UTF8-encoded and URL-encoded afterwards. Curently we store the parsed // host name UTF8-encoded without regards to the template argument string // type. Later we may add support for more appropriate encodings for // multi-byte character types. // enum class url_host_kind {ipv4, ipv6, name}; template struct basic_url_host { using string_type = S; using kind_type = url_host_kind; string_type value; kind_type kind; // Can be treated as const string_type&. // operator const string_type& () const noexcept {return value;} // Create an empty host. // basic_url_host (): kind (kind_type::name) {} // Create the host object from its string representation as it appears in // a URL, throwing std::invalid_argument if invalid. Remove the enclosing // square brackets for IPv6 addresses, and URL-decode host names. // // Note that currently we don't validate IPv6 addresses. // explicit basic_url_host (string_type); basic_url_host (string_type v, kind_type k) : value (std::move (v)), kind (k) {} bool empty () const { assert (kind == kind_type::name || !value.empty ()); return value.empty (); } // Return string representation of the host as it would appear in a URL. // string_type string () const; }; template struct basic_url_authority { using string_type = S; using host_type = basic_url_host; string_type user; // Empty if not specified. host_type host; std::uint16_t port; // Zero if not specified. bool empty () const { assert (!host.empty () || (user.empty () && port == 0)); return host.empty (); } // Return a string representation of the URL authority. String // representation of an empty instance is the empty string. // string_type string () const; }; template struct url_traits { using scheme_type = H; using string_type = S; using path_type = P; using authority_type = basic_url_authority; // Translate the scheme string representation to its type. May throw // std::invalid_argument. May change the URL components. // // This function is called with an empty scheme if the URL has no scheme, // the scheme is invalid, or it could not be parsed into components // according to the URL syntax. In this case all the passed components // reference empty/absent values and if they remain unchanged on return, // the URL is considered invalid and the std::invalid_argument exception // with an appropriate description is thrown by the URL object constructor. // This can be used to support scheme-less URLs, local paths, etc. // static scheme_type translate_scheme (const string_type& /*url*/, string_type&& scheme, optional& /*authority*/, optional& /*path*/, optional& /*query*/, optional& /*fragment*/) { return scheme_type (std::move (scheme)); } // Translate scheme type back to its string representation. // // Similar to the above the function is called with an empty string // representation. If on return this value is no longer empty, then it is // assume the URL has been translated in a custom manner (in which case // the returned scheme value is ignored). // static string_type translate_scheme (string_type&, /*url*/ const scheme_type& scheme, const optional& /*authority*/, const optional& /*path*/, const optional& /*query*/, const optional& /*fragment*/) { return string_type (scheme); } // Translate the path string representation to its type. // static path_type translate_path (string_type&& path) { return path_type (std::move (path)); } // Translate path type back to its string representation. // static string_type translate_path (const path_type& path) {return string_type (path);} // Check whether a string looks like a URL by searching for the first ':' // (unless its position is specified with the second argument) and then // making sure it's followed by '/' (e.g., http:// or file:/) and preceded // by the scheme at least 2 characters long (so we don't confuse it with // an absolute Windows path, e.g., c:/). // // Return the start of the URL substring or string_type::npos. // static std::size_t find (const string_type&, std::size_t pos = string_type::npos); }; template > class basic_url { public: using traits = T; using string_type = typename traits::string_type; using char_type = typename string_type::value_type; using path_type = typename traits::path_type; using scheme_type = typename traits::scheme_type; using authority_type = typename traits::authority_type; using host_type = typename authority_type::host_type; scheme_type scheme; optional authority; optional path; optional query; optional fragment; // Create an empty URL object. // basic_url () = default; // Create the URL object from its string representation. Verify that the // string is compliant to the generic URL syntax. URL-decode and validate // components with common for all schemes syntax (scheme, host, port, // path). Throw std::invalid_argument if the passed string is not a valid // URL representation. // // Validation and URL-decoding of the scheme-specific components can be // provided by a custom url_traits::translate_scheme() implementation. // explicit basic_url (const string_type&); // Create the URL object from individual components. Performs no // components URL-decoding or verification. // basic_url (scheme_type, optional, optional path, optional query = nullopt, optional fragment = nullopt); basic_url (scheme_type, host_type host, optional path, optional query = nullopt, optional fragment = nullopt); basic_url (scheme_type, host_type host, std::uint16_t port, optional path, optional query = nullopt, optional fragment = nullopt); basic_url (scheme_type, string_type host, optional path, optional query = nullopt, optional fragment = nullopt); basic_url (scheme_type, string_type host, std::uint16_t port, optional path, optional query = nullopt, optional fragment = nullopt); bool empty () const noexcept { assert (authority || path || query || !fragment); return !authority && !path && !query; } // Return a string representation of the URL. Note that while this is not // necessarily syntactically the same string as what was used to // initialize this instance, it should be semantically equivalent. String // representation of an empty instance is the empty string. // string_type string () const; // The following predicates can be used to classify URL characters while // parsing, validating or encoding scheme-specific components. For the // semantics of character classes see RFC3986. // static bool gen_delim (char_type c) { return c == ':' || c == '/' || c == '?' || c == '#' || c == '[' || c == ']' || c == '@'; } static bool sub_delim (char_type c) { return c == '!' || c == '$' || c == '&' || c == '=' || c == '(' || c == ')' || c == '*' || c == '+' || c == ',' || c == ';' || c == '\''; } static bool reserved (char_type c) {return sub_delim (c) || gen_delim (c);} static bool unreserved (char_type c) { return alnum (c) || c == '-' || c == '.' || c =='_' || c == '~'; } // URL-encode a character sequence. // // Note that the set of characters that should be encoded may differ for // different URL components. The optional callback function must return // true for characters that should be percent-encoded. The function may // encode the passed character in it's own way with another character (but // never with '%'), and return false. By default all characters other than // unreserved are percent-encoded. // // Also note that the characters are interpreted as bytes. In other words, // each character may result in a single encoding triplet. // template static void encode (I b, I e, O o, // VC (as of 15u3) doesn't see unreserved() unless qualified. // F&& f = [] (char_type& c) {return !basic_url::unreserved (c);}); template static string_type encode (const string_type& s, F&& f = [] (char_type& c) {return !basic_url::unreserved (c);}) { string_type r; encode (s.begin (), s.end (), std::back_inserter (r), f); return r; } template static string_type encode (const char_type* s, F&& f = [] (char_type& c) {return !basic_url::unreserved (c);}) { string_type r; encode (s, s + string_type::traits_type::length (s), std::back_inserter (r), f); return r; } // URL-decode a character sequence. Throw std::invalid_argument if an // invalid encoding sequence is encountered. // // If some characters in the sequence are encoded with another characters // (rather than percent-encoded), then one must provide the callback // function to decode them. // template static void decode (I b, I e, O o, F&& f = [] (char_type&) {}); template static string_type decode (const string_type& s, F&& f = [] (char_type&) {}) { string_type r; decode (s.begin (), s.end (), std::back_inserter (r), f); return r; } template static string_type decode (const char_type* s, F&& f = [] (char_type&) {}) { string_type r; decode (s, s + string_type::traits_type::length (s), std::back_inserter (r), f); return r; } }; using url_authority = basic_url_authority; using url = basic_url ; template inline bool operator== (const basic_url_host& x, const basic_url_host& y) noexcept { return x.value == y.value && x.kind == y.kind; } template inline bool operator!= (const basic_url_host& x, const basic_url_host& y) noexcept { return !(x == y); } template inline bool operator== (const basic_url_authority& x, const basic_url_authority& y) noexcept { return x.user == y.user && x.host == y.host && x.port == y.port; } template inline bool operator!= (const basic_url_authority& x, const basic_url_authority& y) noexcept { return !(x == y); } template inline bool operator== (const basic_url& x, const basic_url& y) noexcept { if (!(x.authority == y.authority && x.path == y.path && x.query == y.query && x.fragment == y.fragment)) return false; assert (x.empty () == y.empty ()); if (x.empty ()) return true; return x.scheme == y.scheme; // None is empty, so schemes are valid. } template inline bool operator!= (const basic_url& x, const basic_url& y) noexcept { return !(x == y); } template inline auto operator<< (std::basic_ostream& o, const basic_url& u) -> decltype (o) { return o << u.string (); } } #include #include