From df1ef68cd8e8582724ce1192bfc202e0b9aeaf0c Mon Sep 17 00:00:00 2001 From: Karen Arutyunov Date: Tue, 28 Sep 2021 19:24:31 +0300 Subject: Get rid of C++ modules related code and rename *.mxx files to *.hxx --- libbutl/url.hxx | 552 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 552 insertions(+) create mode 100644 libbutl/url.hxx (limited to 'libbutl/url.hxx') diff --git a/libbutl/url.hxx b/libbutl/url.hxx new file mode 100644 index 0000000..5721cfd --- /dev/null +++ b/libbutl/url.hxx @@ -0,0 +1,552 @@ +// file : libbutl/url.hxx -*- C++ -*- +// license : MIT; see accompanying LICENSE file + +#pragma once + +#include +#include +#include // size_t +#include // uint*_t +#include // move() +#include +#include // back_inserter + +#include +#include +#include + +#include + +namespace butl +{ + // RFC3986 Uniform Resource Locator (URL). + // + // = :[//[]][/][?][#] | + // :[][?][#] + // + // = [@][:] + // + // Some examples of equivalent URLs to meditate upon: + // + // file://localhost/tmp (localhost authority) + // file:///tmp (empty authority) + // file:/tmp (absent authority) + // + // file://localhost/c:/tmp + // file:///c:/tmp + // file:/c:/tmp + // + // We think of the slash between and as a separator but + // with the path always interpreted as starting from the "root" of the + // authority. Thus: + // + // file://localhost/tmp -> 'file'://'localhost'/'tmp' -> /tmp + // file://localhost/c:/tmp -> 'file'://'localhost'/'c:/tmp' -> c:/tmp + // + // This means that the component is represented as a relative path + // and, in the general case, we cannot use our path type for its storage + // since it assumes the path is for the host platform. In other words, the + // interpretation of the path has to take into account the platform of the + // authority host. Note, however, that a custom url_traits implementation + // can choose to use the path type if local paths are to be interpreted as + // relative to the host. + // + // For authority-less schemes the component is also represented as a + // relative path. Some examples of such URLs (let's call them rootless + // rather than authority-less not to confuse with a case where authority is + // empty/implied): + // + // pkcs11:token=sign;object=SIGN%20key + // pkcs11:id=%02%38%01?pin-value=12345 + // pkcs11: + // + // Note that a scheme can theoretically allow both rootless and "rootfull" + // representations. + // + // Note also that we currently forbid one character schemes to support + // scheme-less (Windows) paths which can be done by + // url_traits::translate_scheme() (see below). (A Windows path that uses + // forward slashes would be parsed as a valid authority-less URL). + + // URL host component can be an IPv4 address (if matches its dotted-decimal + // notation), an IPv6 address (if enclosed in [square brackets]) or + // otherwise a name. + // + // Note that non-ASCII host names are allowed in URLs. They must be + // UTF8-encoded and URL-encoded afterwards. Curently we store the parsed + // host name UTF8-encoded without regards to the template argument string + // type. Later we may add support for more appropriate encodings for + // multi-byte character types. + // + enum class url_host_kind {ipv4, ipv6, name}; + + template + struct basic_url_host + { + using string_type = S; + using kind_type = url_host_kind; + + string_type value; + kind_type kind; + + // Can be treated as const string_type&. + // + operator const string_type& () const noexcept {return value;} + + // Create an empty host. + // + basic_url_host (): kind (kind_type::name) {} + + // Create the host object from its string representation as it appears in + // a URL, throwing std::invalid_argument if invalid. Remove the enclosing + // square brackets for IPv6 addresses, and URL-decode host names. + // + // Note that the 'x:x:x:x:x:x:d.d.d.d' IPv6 address mixed notation is not + // supported. + // + explicit + basic_url_host (string_type); + + basic_url_host (string_type v, kind_type k) + : value (std::move (v)), kind (k) {} + + bool + empty () const + { + assert (kind == kind_type::name || !value.empty ()); + return value.empty (); + } + + // Return string representation of the host as it would appear in a URL. + // + string_type + string () const; + + // Normalize the host value in accordance with its type: + // + // Name - convert to the lower case. Note: only ASCII names are currently + // supported. + // + // IPv4 - strip the leading zeros in its octets. + // + // IPv6 - strip the leading zeros in its groups (hextets), squash the + // longest zero-only hextet sequence, and convert to the lower case + // (as per RFC5952). + // + // Assume that the host value is valid. + // + void + normalize (); + }; + + template + struct basic_url_authority + { + using string_type = S; + using host_type = basic_url_host; + + string_type user; // Empty if not specified. + host_type host; + std::uint16_t port; // Zero if not specified. + + bool + empty () const + { + assert (!host.empty () || (user.empty () && port == 0)); + return host.empty (); + } + + // Return a string representation of the URL authority. String + // representation of an empty instance is the empty string. + // + string_type + string () const; + }; + + template + struct url_traits + { + using scheme_type = H; + using string_type = S; + using path_type = P; + + using authority_type = basic_url_authority; + + // Translate the scheme string representation to its type. May throw + // std::invalid_argument. May change the URL components. Should not return + // nullopt if called with a non-empty scheme. + // + // This function is called with an empty scheme if the URL has no scheme, + // the scheme is invalid, or it could not be parsed into components + // according to the URL syntax. In this case all the passed components + // reference empty/absent/false values. If nullopt is returned, the URL is + // considered invalid and the std::invalid_argument exception with an + // appropriate description is thrown by the URL object constructor. This + // can be used to support scheme-less URLs, local paths, etc. + // + static optional + translate_scheme (const string_type& /*url*/, + string_type&& scheme, + optional& /*authority*/, + optional& /*path*/, + optional& /*query*/, + optional& /*fragment*/, + bool& /*rootless*/) + { + return !scheme.empty () + ? optional (std::move (scheme)) + : nullopt; // Leave the URL object constructor to throw. + } + + // Translate scheme type back to its string representation. + // + // Similar to the above the function is called with an empty string + // representation. If on return this value is no longer empty, then it is + // assume the URL has been translated in a custom manner (in which case + // the returned scheme value is ignored). + // + static string_type + translate_scheme (string_type&, /*url*/ + const scheme_type& scheme, + const optional& /*authority*/, + const optional& /*path*/, + const optional& /*query*/, + const optional& /*fragment*/, + bool /*rootless*/) + { + return string_type (scheme); + } + + // Translate the URL-encoded path string representation to its type. + // + // Note that encoding for non-ASCII paths is not specified (in contrast + // to the host name), and presumably is local to the referenced authority. + // Furthermore, for some schemes, the path component can contain encoded + // binary data, for example for pkcs11. + // + static path_type + translate_path (string_type&&); + + // Translate path type back to its URL-encoded string representation. + // + static string_type + translate_path (const path_type&); + + // Check whether a string looks like a non-rootless URL by searching for + // the first ':' (unless its position is specified with the second + // argument) and then making sure it's both followed by '/' (e.g., http:// + // or file:/) and preceded by a valid scheme at least 2 characters long + // (so we don't confuse it with an absolute Windows path, e.g., c:/). + // + // Return the start of the URL substring or string_type::npos. + // + static std::size_t + find (const string_type&, std::size_t pos = string_type::npos); + }; + + template > + class basic_url + { + public: + using traits_type = T; + + using string_type = typename traits_type::string_type; + using char_type = typename string_type::value_type; + using path_type = typename traits_type::path_type; + + using scheme_type = typename traits_type::scheme_type; + using authority_type = typename traits_type::authority_type; + using host_type = typename authority_type::host_type; + + scheme_type scheme; + optional authority; + optional path; + optional query; + optional fragment; + bool rootless = false; + + // Create an empty URL object. + // + basic_url (): scheme (), empty_ (true) {} + + // Create the URL object from its string representation. Verify that the + // string is compliant to the generic URL syntax. URL-decode and validate + // components with common for all schemes syntax (scheme, host, port). + // Throw std::invalid_argument if the passed string is not a valid URL + // representation. + // + // Validation and URL-decoding of the scheme-specific components can be + // provided by a custom url_traits::translate_scheme() implementation. + // + explicit + basic_url (const string_type&); + + // Create the URL object from individual components. Performs no + // components URL-decoding or verification. + // + basic_url (scheme_type, + optional, + optional path, + optional query = nullopt, + optional fragment = nullopt); + + basic_url (scheme_type, + host_type host, + optional path, + optional query = nullopt, + optional fragment = nullopt); + + basic_url (scheme_type, + host_type host, + std::uint16_t port, + optional path, + optional query = nullopt, + optional fragment = nullopt); + + basic_url (scheme_type, + string_type host, + optional path, + optional query = nullopt, + optional fragment = nullopt); + + basic_url (scheme_type, + string_type host, + std::uint16_t port, + optional path, + optional query = nullopt, + optional fragment = nullopt); + + // Create a rootless URL. + // + basic_url (scheme_type, + optional path, + optional query = nullopt, + optional fragment = nullopt); + + bool + empty () const noexcept {return empty_;} + + // Return a string representation of the URL. Note that while this is not + // necessarily syntactically the same string as what was used to + // initialize this instance, it should be semantically equivalent. String + // representation of an empty instance is the empty string. + // + string_type + string () const; + + // Normalize the URL host, if present. + // + void + normalize (); + + // The following predicates can be used to classify URL characters while + // parsing, validating or encoding scheme-specific components. For the + // semantics of character classes see RFC3986. + // + static bool + gen_delim (char_type c) + { + return c == ':' || c == '/' || c == '?' || c == '#' || c == '[' || + c == ']' || c == '@'; + } + + static bool + sub_delim (char_type c) + { + return c == '!' || c == '$' || c == '&' || c == '=' || c == '(' || + c == ')' || c == '*' || c == '+' || c == ',' || c == ';' || + c == '\''; + } + + static bool + reserved (char_type c) {return sub_delim (c) || gen_delim (c);} + + static bool + unreserved (char_type c) + { + return alnum (c) || c == '-' || c == '.' || c =='_' || c == '~'; + } + + static bool + path_char (char_type c) + { + return c == '/' || c == ':' || unreserved (c) || c == '@' || + sub_delim (c); + } + + // URL-encode a character sequence. + // + // Note that the set of characters that should be encoded may differ for + // different URL components. The optional callback function must return + // true for characters that should be percent-encoded. The function may + // encode the passed character in it's own way with another character (but + // never with '%'), and return false. By default all characters other than + // unreserved are percent-encoded. + // + // Also note that the characters are interpreted as bytes. In other words, + // each character may result in a single encoding triplet. + // + template + static void + encode (I begin, I end, O output, F&& efunc); + + template + static void + encode (I b, I e, O o) + { + encode (b, e, o, [] (char_type& c) {return !unreserved (c);}); + } + + template + static string_type + encode (const string_type& s, F&& f) + { + string_type r; + encode (s.begin (), s.end (), std::back_inserter (r), f); + return r; + } + + static string_type + encode (const string_type& s) + { + return encode (s, [] (char_type& c) {return !unreserved (c);}); + } + + template + static string_type + encode (const char_type* s, F&& f) + { + string_type r; + encode (s, s + string_type::traits_type::length (s), + std::back_inserter (r), + f); + return r; + } + + static string_type + encode (const char_type* s) + { + return encode (s, [] (char_type& c) {return !unreserved (c);}); + } + + // URL-decode a character sequence. Throw std::invalid_argument if an + // invalid encoding sequence is encountered. + // + // If some characters in the sequence are encoded with another characters + // (rather than percent-encoded), then one must provide the callback + // function to decode them. + // + template + static void + decode (I begin, I end, O output, F&& dfunc); + + template + static void + decode (I b, I e, O o) + { + decode (b, e, o, [] (char_type&) {}); + } + + template + static string_type + decode (const string_type& s, F&& f) + { + string_type r; + decode (s.begin (), s.end (), std::back_inserter (r), f); + return r; + } + + static string_type + decode (const string_type& s) + { + return decode (s, [] (char_type&) {}); + } + + template + static string_type + decode (const char_type* s, F&& f) + { + string_type r; + decode (s, s + string_type::traits_type::length (s), + std::back_inserter (r), + f); + return r; + } + + static string_type + decode (const char_type* s) + { + return decode (s, [] (char_type&) {}); + } + + private: + bool empty_ = false; + }; + + using url_authority = basic_url_authority; + using url = basic_url ; + + template + inline bool + operator== (const basic_url_host& x, const basic_url_host& y) noexcept + { + return x.value == y.value && x.kind == y.kind; + } + + template + inline bool + operator!= (const basic_url_host& x, const basic_url_host& y) noexcept + { + return !(x == y); + } + + template + inline bool + operator== (const basic_url_authority& x, + const basic_url_authority& y) noexcept + { + return x.user == y.user && x.host == y.host && x.port == y.port; + } + + template + inline bool + operator!= (const basic_url_authority& x, + const basic_url_authority& y) noexcept + { + return !(x == y); + } + + template + inline bool + operator== (const basic_url& x, const basic_url& y) noexcept + { + if (x.empty () || y.empty ()) + return x.empty () == y.empty (); + + return x.scheme == y.scheme && + x.authority == y.authority && + x.path == y.path && + x.query == y.query && + x.fragment == y.fragment && + x.rootless == y.rootless; + } + + template + inline bool + operator!= (const basic_url& x, const basic_url& y) noexcept + { + return !(x == y); + } + + template + inline auto + operator<< (std::basic_ostream& o, + const basic_url& u) -> decltype (o) + { + return o << u.string (); + } +} + +#include +#include -- cgit v1.1