aboutsummaryrefslogtreecommitdiff
path: root/libbutl
diff options
context:
space:
mode:
authorKaren Arutyunov <karen@codesynthesis.com>2017-12-10 10:02:19 +0300
committerKaren Arutyunov <karen@codesynthesis.com>2017-12-26 13:25:37 +0300
commite5bfd17637bf297c3cfe509d51027916864092d5 (patch)
tree5dab56d6a5aee0a38da6f597c52b12838b2836b5 /libbutl
parentb1cd193f1bd28837a00cbe6629f9a562f99d961f (diff)
Add basic_url<H,T> class template
Diffstat (limited to 'libbutl')
-rw-r--r--libbutl/url.ixx84
-rw-r--r--libbutl/url.mxx476
-rw-r--r--libbutl/url.txx509
-rw-r--r--libbutl/utility.ixx30
-rw-r--r--libbutl/utility.mxx18
5 files changed, 1109 insertions, 8 deletions
diff --git a/libbutl/url.ixx b/libbutl/url.ixx
new file mode 100644
index 0000000..4ff7a06
--- /dev/null
+++ b/libbutl/url.ixx
@@ -0,0 +1,84 @@
+// file : libbutl/url.ixx -*- C++ -*-
+// copyright : Copyright (c) 2014-2017 Code Synthesis Ltd
+// license : MIT; see accompanying LICENSE file
+
+LIBBUTL_MODEXPORT namespace butl //@@ MOD Clang needs this for some reason.
+{
+ template <typename S, typename T>
+ inline basic_url<S, T>::
+ basic_url (scheme_type s,
+ optional<authority_type> a,
+ optional<path_type> p,
+ optional<string_type> q,
+ optional<string_type> f)
+ : scheme (std::move (s)),
+ authority (std::move (a)),
+ path (std::move (p)),
+ query (std::move (q)),
+ fragment (std::move (f))
+ {
+ }
+
+ template <typename S, typename T>
+ inline basic_url<S, T>::
+ basic_url (scheme_type s,
+ host_type h,
+ optional<path_type> p,
+ optional<string_type> q,
+ optional<string_type> f)
+ : basic_url (std::move (s),
+ authority_type {string_type (), std::move (h), 0},
+ std::move (p),
+ std::move (q),
+ std::move (f))
+ {
+ }
+
+ template <typename S, typename T>
+ inline basic_url<S, T>::
+ basic_url (scheme_type s,
+ host_type h,
+ std::uint16_t o,
+ optional<path_type> p,
+ optional<string_type> q,
+ optional<string_type> f)
+ : basic_url (std::move (s),
+ authority_type {string_type (), std::move (h), o},
+ std::move (p),
+ std::move (q),
+ std::move (f))
+ {
+ }
+
+ template <typename S, typename T>
+ inline basic_url<S, T>::
+ basic_url (scheme_type s,
+ string_type h,
+ optional<path_type> p,
+ optional<string_type> q,
+ optional<string_type> f)
+ : basic_url (std::move (s),
+ host_type (std::move (h)),
+ std::move (p),
+ std::move (q),
+ std::move (f))
+ {
+ }
+
+ template <typename S, typename T>
+ inline basic_url<S, T>::
+ basic_url (scheme_type s,
+ string_type h,
+ std::uint16_t o,
+ optional<path_type> p,
+ optional<string_type> q,
+ optional<string_type> f)
+ : basic_url (std::move (s),
+ host_type (std::move (h)),
+ o,
+ std::move (p),
+ std::move (q),
+ std::move (f))
+ {
+ }
+}
diff --git a/libbutl/url.mxx b/libbutl/url.mxx
new file mode 100644
index 0000000..fe091f1
--- /dev/null
+++ b/libbutl/url.mxx
@@ -0,0 +1,476 @@
+// file : libbutl/url.mxx -*- C++ -*-
+// copyright : Copyright (c) 2014-2017 Code Synthesis Ltd
+// license : MIT; see accompanying LICENSE file
+
+#ifndef __cpp_modules
+#pragma once
+#endif
+
+// C includes.
+
+#include <cassert>
+
+#ifndef __cpp_lib_modules
+#include <string>
+#include <cstdint> // uint*_t
+#include <utility> // move()
+#include <ostream>
+#include <iterator> // back_inserter
+
+#include <cstddef> // size_t
+#include <stdexcept> // invalid_argument
+#endif
+
+// Other includes.
+
+#ifdef __cpp_modules
+export module butl.url;
+#ifdef __cpp_lib_modules
+import std.core;
+import std.io;
+#endif
+import butl.path;
+import butl.utility;
+import butl.optional;
+#else
+#include <libbutl/path.mxx>
+#include <libbutl/utility.mxx>
+#include <libbutl/optional.mxx>
+#endif
+
+#include <libbutl/export.hxx>
+
+LIBBUTL_MODEXPORT namespace butl
+{
+ // RFC3986 Uniform Resource Locator (URL).
+ //
+ // <url> = <scheme>:[//[<authority>]][/<path>][?<query>][#<fragment>]
+ // <authority> = [<user>@]<host>[:<port>]
+ //
+ // Some examples of equivalent URLs to meditate upon:
+ //
+ // file://localhost/tmp (localhost authority)
+ // file:///tmp (empty authority)
+ // file:/tmp (absent authority)
+ //
+ // file://localhost/c:/tmp
+ // file:///c:/tmp
+ // file:/c:/tmp
+ //
+ // We think of the slash between <authority> and <path> as a separator but
+ // with the path always interpreted as starting from the "root" of the
+ // authority. Thus:
+ //
+ // file://localhost/tmp -> 'file'://'localhost'/'tmp' -> /tmp
+ // file://localhost/c:/tmp -> 'file'://'localhost'/'c:/tmp' -> c:/tmp
+ //
+ // This means that the <path> component is represented as a relative path
+ // and, in the general case, we cannot use our path type for its storage
+ // since it assumes the path is for the host platform. In other words, the
+ // interpretation of the path has to take into account the platform of the
+ // authority host. Note, however, that a custom url_traits implementation
+ // can choose to use the path type if local paths are to be interpreted as
+ // relative to the host.
+ //
+ // Note that we currently forbid one character schemes to support scheme-
+ // less (Windows) paths which can be done by url_traits::translate_scheme()
+ // (see below). (A Windows path that uses forward slashes would be parsed as
+ // a valid authority-less URL).
+
+ // URL host component can be an IPv4 address (if matches its dotted-decimal
+ // notation), an IPv6 address (if enclosed in [square brackets]) or
+ // otherwise a name.
+ //
+ // Note that non-ASCII host names are allowed in URLs. They must be
+ // UTF8-encoded and URL-encoded afterwards. Curently we store the parsed
+ // host name UTF8-encoded without regards to the template argument string
+ // type. Later we may add support for more appropriate encodings for
+ // multi-byte character types.
+ //
+ enum class url_host_kind {ipv4, ipv6, name};
+
+ template <typename S>
+ struct basic_url_host
+ {
+ using string_type = S;
+ using kind_type = url_host_kind;
+
+ string_type value;
+ kind_type kind;
+
+ // Can be treated as const string_type&.
+ //
+ operator const string_type& () const noexcept {return value;}
+
+ // Create an empty host.
+ //
+ basic_url_host (): kind (kind_type::name) {}
+
+ // Create the host object from its string representation as it appears in
+ // a URL, throwing std::invalid_argument if invalid. Remove the enclosing
+ // square brackets for IPv6 addresses, and URL-decode host names.
+ //
+ // Note that currently we don't validate IPv6 addresses.
+ //
+ explicit
+ basic_url_host (string_type);
+
+ basic_url_host (string_type v, kind_type k)
+ : value (std::move (v)), kind (k) {}
+
+ bool
+ empty () const
+ {
+ assert (kind == kind_type::name || !value.empty ());
+ return value.empty ();
+ }
+
+ // Return string representation of the host as it would appear in a URL.
+ //
+ string_type
+ string () const;
+ };
+
+ template <typename S>
+ struct basic_url_authority
+ {
+ using string_type = S;
+ using host_type = basic_url_host<string_type>;
+
+ string_type user; // Empty if not specified.
+ host_type host;
+ std::uint16_t port; // Zero if not specified.
+
+ bool
+ empty () const
+ {
+ assert (!host.empty () || (user.empty () && port == 0));
+ return host.empty ();
+ }
+
+ // Return a string representation of the URL authority. String
+ // representation of an empty instance is the empty string.
+ //
+ string_type
+ string () const;
+ };
+
+ template <typename H, typename S = H, typename P = S>
+ struct url_traits
+ {
+ using scheme_type = H;
+ using string_type = S;
+ using path_type = P;
+
+ using authority_type = basic_url_authority<string_type>;
+
+ // Translate the scheme string representation to its type. May throw
+ // std::invalid_argument. May change the URL components.
+ //
+ // This function is called with an empty scheme if the URL has no scheme,
+ // the scheme is invalid, or it could not be parsed into components
+ // according to the URL syntax. In this case all the passed components
+ // reference empty/absent values and if they remain unchanged on return,
+ // the URL is considered invalid and the std::invalid_argument exception
+ // with an appropriate description is thrown by the URL object constructor.
+ // This can be used to support scheme-less URLs, local paths, etc.
+ //
+ static scheme_type
+ translate_scheme (const string_type& /*url*/,
+ string_type&& scheme,
+ optional<authority_type>& /*authority*/,
+ optional<path_type>& /*path*/,
+ optional<string_type>& /*query*/,
+ optional<string_type>& /*fragment*/)
+ {
+ return scheme_type (std::move (scheme));
+ }
+
+ // Translate scheme type back to its string representation.
+ //
+ // Similar to the above the function is called with an empty string
+ // representation. If on return this value is no longer empty, then it is
+ // assume the URL has been translated in a custom manner (in which case
+ // the returned scheme value is ignored).
+ //
+ static string_type
+ translate_scheme (string_type&, /*url*/
+ const scheme_type& scheme,
+ const optional<authority_type>& /*authority*/,
+ const optional<path_type>& /*path*/,
+ const optional<string_type>& /*query*/,
+ const optional<string_type>& /*fragment*/)
+ {
+ return string_type (scheme);
+ }
+
+ // Translate the path string representation to its type.
+ //
+ static path_type
+ translate_path (string_type&& path)
+ {
+ return path_type (std::move (path));
+ }
+
+ // Translate path type back to its string representation.
+ //
+ static string_type
+ translate_path (const path_type& path) {return string_type (path);}
+ };
+
+ template <typename H, // scheme
+ typename T = url_traits<H>>
+ class basic_url
+ {
+ public:
+ using traits = T;
+
+ using string_type = typename traits::string_type;
+ using char_type = typename string_type::value_type;
+ using path_type = typename traits::path_type;
+
+ using scheme_type = typename traits::scheme_type;
+ using authority_type = typename traits::authority_type;
+ using host_type = typename authority_type::host_type;
+
+ scheme_type scheme;
+ optional<authority_type> authority;
+ optional<path_type> path;
+ optional<string_type> query;
+ optional<string_type> fragment;
+
+ // Create an empty URL object.
+ //
+ basic_url () = default;
+
+ // Create the URL object from its string representation. If the argument is
+ // empty, then create an empty object. Otherwise verify that the string is
+ // compliant to the generic URL syntax. URL-decode and validate components
+ // with common for all schemes syntax (scheme, host, port, path).
+ // Throw std::invalid_argument if the passed string is not a valid URL
+ // representation.
+ //
+ // Validation and URL-decoding of the scheme-specific components can be
+ // provided by a custom url_traits::translate_scheme() implementation.
+ //
+ explicit
+ basic_url (const string_type&);
+
+ // Create the URL object from individual components. Performs no
+ // components URL-decoding or verification.
+ //
+ basic_url (scheme_type,
+ optional<authority_type>,
+ optional<path_type> path,
+ optional<string_type> query = nullopt,
+ optional<string_type> fragment = nullopt);
+
+ basic_url (scheme_type,
+ host_type host,
+ optional<path_type> path,
+ optional<string_type> query = nullopt,
+ optional<string_type> fragment = nullopt);
+
+ basic_url (scheme_type,
+ host_type host,
+ std::uint16_t port,
+ optional<path_type> path,
+ optional<string_type> query = nullopt,
+ optional<string_type> fragment = nullopt);
+
+ basic_url (scheme_type,
+ string_type host,
+ optional<path_type> path,
+ optional<string_type> query = nullopt,
+ optional<string_type> fragment = nullopt);
+
+ basic_url (scheme_type,
+ string_type host,
+ std::uint16_t port,
+ optional<path_type> path,
+ optional<string_type> query = nullopt,
+ optional<string_type> fragment = nullopt);
+
+ bool
+ empty () const noexcept
+ {
+ assert (authority || path || query || !fragment);
+ return !authority && !path && !query;
+ }
+
+ // Return a string representation of the URL. Note that while this is not
+ // necessarily syntactically the same string as what was used to
+ // initialize this instance, it should be semantically equivalent. String
+ // representation of an empty instance is the empty string.
+ //
+ string_type
+ string () const;
+
+ // The following predicates can be used to classify URL characters while
+ // parsing, validating or encoding scheme-specific components. For the
+ // semantics of character classes see RFC3986.
+ //
+ static bool
+ gen_delim (char_type c)
+ {
+ return c == ':' || c == '/' || c == '?' || c == '#' || c == '[' ||
+ c == ']' || c == '@';
+ }
+
+ static bool
+ sub_delim (char_type c)
+ {
+ return c == '!' || c == '$' || c == '&' || c == '=' || c == '(' ||
+ c == ')' || c == '*' || c == '+' || c == ',' || c == ';' ||
+ c == '\'';
+ }
+
+ static bool
+ reserved (char_type c) {return sub_delim (c) || gen_delim (c);}
+
+ static bool
+ unreserved (char_type c)
+ {
+ return alnum (c) || c == '-' || c == '.' || c =='_' || c == '~';
+ }
+
+ // URL-encode a character sequence.
+ //
+ // Note that the set of characters that should be encoded may differ for
+ // different URL components. The optional callback function must return
+ // true for characters that should be percent-encoded. The function may
+ // encode the passed character in it's own way with another character (but
+ // never with '%'), and return false. By default all characters other than
+ // unreserved are percent-encoded.
+ //
+ // Also note that the characters are interpreted as bytes. In other words,
+ // each character may result in a single encoding triplet.
+ //
+ template <typename I, typename O, typename F = bool (*) (char_type&)>
+ static void
+ encode (I b, I e,
+ O o,
+
+ // VC (as of 15u3) doesn't see unreserved() unless qualified.
+ //
+ F&& f = [] (char_type& c) {return !basic_url<H,T>::unreserved (c);});
+
+ template <typename F = bool (*) (char_type&)>
+ static string_type
+ encode (const string_type& s,
+ F&& f = [] (char_type& c) {return !basic_url<H,T>::unreserved (c);})
+ {
+ string_type r;
+ encode (s.begin (), s.end (), std::back_inserter (r), f);
+ return r;
+ }
+
+ template <typename F = bool (*) (char_type&)>
+ static string_type
+ encode (const char_type* s,
+ F&& f = [] (char_type& c) {return !basic_url<H,T>::unreserved (c);})
+ {
+ string_type r;
+ encode (s, s + string_type::traits_type::length (s),
+ std::back_inserter (r), f);
+ return r;
+ }
+
+ // URL-decode a character sequence. Throw std::invalid_argument if an
+ // invalid encoding sequence is encountered.
+ //
+ // If some characters in the sequence are encoded with another characters
+ // (rather than percent-encoded), then one must provide the callback
+ // function to decode them.
+ //
+ template <typename I, typename O, typename F = void (*) (char_type&)>
+ static void
+ decode (I b, I e, O o, F&& f = [] (char_type&) {});
+
+ template <typename F = void (*) (char_type&)>
+ static string_type
+ decode (const string_type& s, F&& f = [] (char_type&) {})
+ {
+ string_type r;
+ decode (s.begin (), s.end (), std::back_inserter (r), f);
+ return r;
+ }
+
+ template <typename F = void (*) (char_type&)>
+ static string_type
+ decode (const char_type* s, F&& f = [] (char_type&) {})
+ {
+ string_type r;
+ decode (s, s + string_type::traits_type::length (s),
+ std::back_inserter (r), f);
+ return r;
+ }
+ };
+
+ using url_authority = basic_url_authority<std::string>;
+ using url = basic_url <std::string>;
+
+ template <typename S>
+ inline bool
+ operator== (const basic_url_host<S>& x, const basic_url_host<S>& y) noexcept
+ {
+ return x.value == y.value && x.kind == y.kind;
+ }
+
+ template <typename S>
+ inline bool
+ operator!= (const basic_url_host<S>& x, const basic_url_host<S>& y) noexcept
+ {
+ return !(x == y);
+ }
+
+ template <typename S>
+ inline bool
+ operator== (const basic_url_authority<S>& x,
+ const basic_url_authority<S>& y) noexcept
+ {
+ return x.user == y.user && x.host == y.host && x.port == y.port;
+ }
+
+ template <typename S>
+ inline bool
+ operator!= (const basic_url_authority<S>& x,
+ const basic_url_authority<S>& y) noexcept
+ {
+ return !(x == y);
+ }
+
+ // Compare two URLs component-wise. The schemes are only compared if the
+ // URLs are not empty, since the scheme of an empty URL may be undefined.
+ //
+ template <typename S, typename T>
+ inline bool
+ operator== (const basic_url<S, T>& x, const basic_url<S, T>& y) noexcept
+ {
+   const bool same (x.authority == y.authority &&
+                    x.path == y.path &&
+                    x.query == y.query &&
+                    x.fragment == y.fragment);
+
+   if (!same)
+     return false;
+
+   // Equal components imply equal emptiness.
+   //
+   assert (x.empty () == y.empty ());
+
+   // If none is empty, then the schemes are valid and can be compared.
+   //
+   return x.empty () || x.scheme == y.scheme;
+ }
+
+ template <typename S, typename T>
+ inline bool
+ operator!= (const basic_url<S, T>& x, const basic_url<S, T>& y) noexcept
+ {
+ return !(x == y);
+ }
+
+ template <typename S, typename T>
+ inline auto
+ operator<< (std::basic_ostream<typename T::string_type::value_type>& o,
+ const basic_url<S, T>& u) -> decltype (o)
+ {
+ return o << u.string ();
+ }
+}
+
+#include <libbutl/url.ixx>
+#include <libbutl/url.txx>
diff --git a/libbutl/url.txx b/libbutl/url.txx
new file mode 100644
index 0000000..addfe88
--- /dev/null
+++ b/libbutl/url.txx
@@ -0,0 +1,509 @@
+// file : libbutl/url.txx -*- C++ -*-
+// copyright : Copyright (c) 2014-2017 Code Synthesis Ltd
+// license : MIT; see accompanying LICENSE file
+
+LIBBUTL_MODEXPORT namespace butl //@@ MOD Clang needs this for some reason.
+{
+ // Convenience functions.
+ //
+ // Return true if the character is allowed to appear in the URL path
+ // component as is: the '/', ':' and '@' characters as well as the
+ // characters of the unreserved and sub-delims classes.
+ //
+ template <typename C>
+ inline bool
+ url_path_char (C c)
+ {
+   using url_type = basic_url<std::basic_string<C>>;
+
+   if (c == '/' || c == ':')
+     return true;
+
+   if (url_type::unreserved (c))
+     return true;
+
+   return c == '@' || url_type::sub_delim (c);
+ }
+
+ // basic_url_host
+ //
+ // Create the host object from its string representation as it appears in
+ // a URL. The host is treated as an IPv6 address if it starts with '[', as
+ // an IPv4 address if it matches the dotted-decimal notation, and as a name
+ // otherwise. The enclosing square brackets are removed for IPv6 addresses
+ // and host names are URL-decoded.
+ //
+ // Throw std::invalid_argument if the representation is invalid.
+ //
+ template <typename S>
+ basic_url_host<S>::
+ basic_url_host (string_type v)
+ {
+   using std::invalid_argument;
+
+   using url = basic_url<string_type>;
+   using char_type = typename string_type::value_type;
+
+   kind = !v.empty () && v[0] == '[' ? kind_type::ipv6 : kind_type::name;
+
+   if (kind == url_host_kind::ipv6)
+   {
+     // Besides the enclosing brackets there must be at least one address
+     // character. Otherwise we would end up with an empty value for a
+     // non-name host, which empty() asserts never happens.
+     //
+     if (v.size () < 3 || v.back () != ']')
+       throw invalid_argument ("invalid IPv6 address");
+
+     value.assign (v, 1, v.size () - 2);
+   }
+   else
+   {
+     // Detect the IPv4 address host type.
+     //
+     {
+       std::size_t n (0); // Number of octets parsed so far.
+       string_type oct;   // Digits of the octet being parsed.
+
+       // Verify the accumulated octet and prepare for parsing the next one.
+       // Note that the digits count is checked before calling stoul(), so
+       // the conversion cannot overflow.
+       //
+       auto ipv4_oct = [&oct, &n] () -> bool
+       {
+         if (n == 4 || oct.empty () || oct.size () > 3 || stoul (oct) > 255)
+           return false;
+
+         ++n;
+         oct.clear ();
+         return true;
+       };
+
+       auto i (v.cbegin ());
+       auto e (v.cend ());
+
+       for (; i != e; ++i)
+       {
+         char_type c (*i);
+
+         if (digit (c))
+           oct += c;
+         else if (c != '.' || !ipv4_oct ())
+           break;
+       }
+
+       // The whole string must be consumed and the trailing octet must be
+       // the fourth one.
+       //
+       if (i == e && ipv4_oct () && n == 4)
+         kind = url_host_kind::ipv4;
+     }
+
+     // Verify and decode the host name.
+     //
+     bool dec (false);
+     if (kind == url_host_kind::name)
+     {
+       for (auto c: v)
+       {
+         if (!(url::unreserved (c) || url::sub_delim (c) || c == '%'))
+           throw invalid_argument ("invalid host name");
+
+         if (c == '%')
+           dec = true;
+       }
+     }
+
+     // Only decode if there is something to decode.
+     //
+     value = dec ? url::decode (v) : std::move (v);
+   }
+ }
+
+ // Return the host string representation as it would appear in a URL: the
+ // value as is for an IPv4 address, the value enclosed in square brackets
+ // for an IPv6 address, and the URL-encoded value for a name. Return the
+ // empty string for an empty host.
+ //
+ template <typename S>
+ S basic_url_host<S>::
+ string () const
+ {
+   using url = basic_url<string_type>;
+   using char_type = typename string_type::value_type;
+
+   if (empty ())
+     return string_type ();
+
+   if (kind == url_host_kind::ipv4)
+     return value;
+
+   if (kind == url_host_kind::ipv6)
+   {
+     string_type res;
+     res += '[';
+     res += value;
+     res += ']';
+     return res;
+   }
+
+   if (kind == url_host_kind::name)
+   {
+     // We don't encode all characters that are disallowed for the host
+     // part as RFC3986 requests:
+     //
+     // URI producing applications must not use percent-encoding in host
+     // unless it is used to represent a UTF-8 character sequence.
+     //
+     // The callback requests to encode characters outside the ASCII
+     // character set.
+     //
+     auto non_ascii = [] (char_type& c)
+     {
+       // Convert to the unsigned numeric type, that is long enough to hold
+       // any character type.
+       //
+       return static_cast<unsigned long> (c) >= 0x80;
+     };
+
+     return url::encode (value, non_ascii);
+   }
+
+   assert (false); // Can't be here.
+   return string_type ();
+ }
+
+ // basic_url_authority
+ //
+ template <typename S>
+ S
+ port_string (std::uint16_t p);
+
+ template <>
+ inline std::string
+ port_string (std::uint16_t p)
+ {
+ return std::to_string (p);
+ }
+
+ template <>
+ inline std::wstring
+ port_string (std::uint16_t p)
+ {
+ return std::to_wstring (p);
+ }
+
+ // Return the authority string representation in the
+ // [<user>@]<host>[:<port>] form. The representation of an empty instance
+ // is the empty string.
+ //
+ template <typename S>
+ S basic_url_authority<S>::
+ string () const
+ {
+   string_type res;
+
+   if (!empty ())
+   {
+     if (!user.empty ())
+     {
+       res += user;
+       res += '@';
+     }
+
+     res += host.string ();
+
+     // Zero means the port is not specified.
+     //
+     if (port != 0)
+     {
+       res += ':';
+       res += port_string<string_type> (port);
+     }
+   }
+
+   return res;
+ }
+
+ // basic_url
+ //
+ // Create the URL object from its string representation. Create an empty
+ // object for the empty string. Otherwise parse the string into the scheme,
+ // authority, path, query, and fragment components, URL-decoding the host
+ // and the path.
+ //
+ // If the string cannot be parsed, then call traits::translate_scheme() with
+ // an empty scheme and all the components absent, giving it a chance to
+ // interpret the string in a custom manner. If it leaves the components
+ // absent, then throw std::invalid_argument.
+ //
+ template <typename S, typename T>
+ basic_url<S, T>::
+ basic_url (const string_type& u)
+ {
+   using namespace std;
+
+   using iterator = typename string_type::const_iterator;
+
+   // Create an empty URL object for the empty argument. Note that the scheme
+   // is default-constructed, and so may stay undefined in this case.
+   //
+   if (u.empty ())
+     return;
+
+   try
+   {
+     // At the end of a component parsing 'i' points to the next component
+     // start, and 'b' stays unchanged.
+     //
+     iterator b (u.cbegin ());
+     iterator i (b);
+     iterator e (u.cend ());
+
+     // Extract scheme.
+     //
+     for(char_type c; i != e && (c = *i) != ':'; ++i)
+     {
+       if (!(i == b
+             ? alpha (c)
+             : (alnum (c) || c == '+' || c == '-' || c == '.')))
+         throw invalid_argument ("invalid scheme");
+     }
+
+     if (i == b || i == e || i == b + 1) // Forbids one letter length schemes.
+       throw invalid_argument ("no scheme");
+
+     string_type sc (b, i++); // Skip ':'.
+
+     // Parse authority.
+     //
+     if (i != e && i + 1 != e && *i == '/' && *(i + 1) == '/')
+     {
+       i += 2; // Skip '//'.
+
+       // Find the authority end.
+       //
+       size_t p (u.find_first_of (string_type ({'/', '?', '#'}), i - b));
+       iterator ae (p != string_type::npos ? b + p : e);
+
+       string_type auth (i, ae);
+       i = ae;
+
+       // Extract user information.
+       //
+       string_type user;
+       p = auth.find ('@');
+       if (p != string_type::npos)
+       {
+         // Don't URL-decode the user information (scheme-specific).
+         //
+         user = string_type (auth, 0, p);
+         auth = string_type (auth, p + 1);
+       }
+
+       // Extract host.
+       //
+       string_type host;
+       p = auth.find_last_of({']', ':'}); // Note: ':' can belong to IPv6.
+
+       if (p != string_type::npos && auth[p] == ']') // There is no port.
+         p = string_type::npos;
+
+       if (p != string_type::npos)
+       {
+         host = string_type (auth, 0, p);
+         auth = string_type (auth, p + 1);
+       }
+       else
+       {
+         host = move (auth);
+         auth = string_type ();
+       }
+
+       // Extract port. At this point auth contains the port string, if any.
+       //
+       uint16_t port (0);
+       if (!auth.empty ())
+       {
+         auto bad_port = [] () {throw invalid_argument ("invalid port");};
+
+         for (auto c: auth)
+         {
+           if (!digit (c))
+             bad_port ();
+         }
+
+         // Note that stoull() throws std::out_of_range if the digit sequence
+         // is too long to be represented. Translate it to the invalid port
+         // error so that such a URL is handled like any other invalid one
+         // rather than letting an unexpected exception type escape.
+         //
+         unsigned long long n (0);
+         try
+         {
+           n = stoull (auth);
+         }
+         catch (const out_of_range&)
+         {
+           bad_port ();
+         }
+
+         if (n == 0 || n > UINT16_MAX)
+           bad_port ();
+
+         port = static_cast<uint16_t> (n);
+       }
+
+       // User information and port are only meaningful if the host part is
+       // present.
+       //
+       if (host.empty () && (!user.empty () || port != 0))
+         throw invalid_argument ("no host");
+
+       authority = {move (user), host_type (move (host)), port};
+     }
+
+     // Extract path.
+     //
+     if (i != e && *i == '/')
+     {
+       ++i; // Skip '/'.
+
+       // Verify and URL-decode the path.
+       //
+       iterator j (i);
+       for(char_type c; j != e && (c = *j) != '?' && c != '#'; ++j)
+       {
+         if (!(url_path_char (c) || c == '%'))
+           throw invalid_argument ("invalid path");
+       }
+
+       // Note that encoding for non-ASCII path is not specified (in contrast
+       // to the host name), and presumably is local to the referenced
+       // authority.
+       //
+       string_type s;
+       decode (i, j, back_inserter (s));
+       path = traits::translate_path (move (s));
+       i = j;
+     }
+
+     // Extract query.
+     //
+     if (i != e && *i == '?')
+     {
+       ++i; // Skip '?'.
+
+       // Find the query component end.
+       //
+       size_t p (u.find ('#', i - b));
+       iterator qe (p != string_type::npos ? b + p : e);
+
+       // Don't URL-decode the query (scheme-specific).
+       //
+       query = string_type (i, qe);
+       i = qe;
+     }
+
+     // We are not supposed to end up with an empty URL.
+     //
+     if (empty ())
+       throw invalid_argument ("no authority, path or query");
+
+     // Parse fragment.
+     //
+     if (i != e)
+     {
+       ++i; // Skip '#'.
+
+       // Don't URL-decode the fragment (media type-specific).
+       //
+       fragment = string_type (i, e);
+       i = e;
+     }
+
+     assert (i == e);
+
+     // Translate the scheme string representation to its type.
+     //
+     scheme = traits::translate_scheme (u,
+                                        move (sc),
+                                        authority,
+                                        path,
+                                        query,
+                                        fragment);
+   }
+   // If we fail to parse the URL, then delegate this job to
+   // traits::translate_scheme(). If it also fails, leaving the components
+   // absent, then we re-throw.
+   //
+   catch (const invalid_argument&)
+   {
+     // Drop whatever has been parsed before the failure.
+     //
+     authority = nullopt;
+     path = nullopt;
+     query = nullopt;
+     fragment = nullopt;
+
+     scheme = traits::translate_scheme (u,
+                                        string_type () /* scheme */,
+                                        authority,
+                                        path,
+                                        query,
+                                        fragment);
+
+     if (!authority && !path && !query && !fragment)
+       throw;
+   }
+ }
+
+ // Return the URL string representation. The representation of an empty
+ // instance is the empty string. Unless traits::translate_scheme() provides
+ // a custom representation, the result is assembled from the scheme string
+ // followed by the present components, with the path being URL-encoded.
+ //
+ template <typename S, typename T>
+ typename basic_url<S, T>::string_type basic_url<S, T>::
+ string () const
+ {
+   if (empty ())
+     return string_type ();
+
+   // The traits implementation may fill in the custom representation, in
+   // which case the returned scheme string is ignored.
+   //
+   string_type custom;
+   string_type res (traits::translate_scheme (custom,
+                                              scheme,
+                                              authority,
+                                              path,
+                                              query,
+                                              fragment));
+
+   // Return the custom URL object representation if provided.
+   //
+   if (!custom.empty ())
+     return custom;
+
+   res += ':';
+
+   if (authority)
+   {
+     res += '/';
+     res += '/';
+     res += authority->string ();
+   }
+
+   if (path)
+   {
+     // Percent-encode the characters that are not allowed in the path.
+     //
+     auto encode_char = [] (char_type& c) {return !url_path_char (c);};
+
+     res += '/';
+     res += encode (traits::translate_path (*path), encode_char);
+   }
+
+   // The query and fragment are stored as they appear in the URL.
+   //
+   if (query)
+   {
+     res += '?';
+     res += *query;
+   }
+
+   if (fragment)
+   {
+     res += '#';
+     res += *fragment;
+   }
+
+   return res;
+ }
+
+ // URL-encode the [b, e) character sequence writing the result to o. The
+ // '%' character and the characters for which the callback returns true are
+ // replaced with the '%' character followed by two upper case hexadecimal
+ // digits. All other characters are written as they are left by the
+ // callback.
+ //
+ template <typename S, typename T>
+ template <typename I, typename O, typename F>
+ void basic_url<S, T>::
+ encode (I b, I e, O o, F&& f)
+ {
+   const char_type hex[] = {'0', '1', '2', '3', '4', '5', '6', '7', '8',
+                            '9', 'A', 'B', 'C', 'D', 'E', 'F'};
+
+   while (b != e)
+   {
+     char_type c (*b);
+
+     // Note that the callback is not called for '%' and may alter the
+     // character otherwise.
+     //
+     const bool percent (c == '%' || f (c));
+
+     if (!percent)
+     {
+       assert (c != '%'); // Otherwise decoding will be ambiguous.
+       *o++ = c;
+     }
+     else
+     {
+       assert (c == *b); // Must not be custom-encoded.
+
+       *o++ = '%';
+       *o++ = hex[(c >> 4) & 0xF];
+       *o++ = hex[c & 0xF];
+     }
+
+     ++b;
+   }
+ }
+
+ // URL-decode the [b, e) character sequence writing the result to o. Each
+ // '%' character followed by two hexadecimal digits is replaced with the
+ // character having the corresponding code. All other characters are passed
+ // through the callback, which may alter them, before being written.
+ //
+ // Throw std::invalid_argument if '%' is not followed by two hexadecimal
+ // digits.
+ //
+ template <typename S, typename T>
+ template <typename I, typename O, typename F>
+ void basic_url<S, T>::
+ decode (I b, I e, O o, F&& f)
+ {
+   using namespace std;
+
+   for (; b != e; ++b)
+   {
+     char_type c (*b);
+
+     if (c != '%')
+     {
+       // Not percent-encoded, but may be custom-encoded.
+       //
+       f (c);
+     }
+     else
+     {
+       // Note that stoul() throws if no conversion could be performed, so we
+       // explicitly check that two xdigits follow. On success 'b' is left
+       // pointing to the first one.
+       //
+       if (++b == e || !xdigit (*b) || b + 1 == e || !xdigit (*(b + 1)))
+         throw invalid_argument ("invalid URL-encoding");
+
+       // Note that we can't use (potentially more efficient) strtoul() here
+       // as it doesn't have an overload for the wide character string.
+       // However, the code below shouldn't be inefficient, given that the
+       // string is short, and so is (probably) stack-allocated.
+       //
+       string_type code (b, b + 2);
+       c = static_cast<char_type> (stoul (code, nullptr, 16));
+
+       ++b; // Position to the second xdigit.
+     }
+
+     *o++ = c;
+   }
+ }
+}
diff --git a/libbutl/utility.ixx b/libbutl/utility.ixx
index d703211..fcb8789 100644
--- a/libbutl/utility.ixx
+++ b/libbutl/utility.ixx
@@ -137,4 +137,34 @@ namespace butl
{
return std::isalnum (c);
}
+
+ // Return true if the character is a hexadecimal digit.
+ //
+ // Note that the character is converted to unsigned char first since
+ // passing a negative value (which a char holding a non-ASCII byte may be)
+ // to std::isxdigit() is undefined behavior.
+ //
+ inline bool
+ xdigit (char c)
+ {
+   return std::isxdigit (static_cast<unsigned char> (c)) != 0;
+ }
+
+ inline bool
+ alpha (wchar_t c)
+ {
+ return std::iswalpha (c);
+ }
+
+ inline bool
+ digit (wchar_t c)
+ {
+ return std::iswdigit (c);
+ }
+
+ inline bool
+ alnum (wchar_t c)
+ {
+ return std::iswalnum (c);
+ }
+
+ inline bool
+ xdigit (wchar_t c)
+ {
+ return std::iswxdigit (c);
+ }
}
diff --git a/libbutl/utility.mxx b/libbutl/utility.mxx
index 3f23581..988ca22 100644
--- a/libbutl/utility.mxx
+++ b/libbutl/utility.mxx
@@ -22,7 +22,8 @@
#include <exception> // exception, uncaught_exception[s]()
//#include <functional> // hash
-#include <cctype> // toupper(), tolower(), isalpha(), isdigit(), isalnum()
+#include <cctype> // toupper(), tolower(), is*()
+#include <cwctype> // isw*()
#endif
#include <libbutl/ft/lang.hxx> // thread_local
@@ -138,14 +139,15 @@ LIBBUTL_MODEXPORT namespace butl
}
};
- bool
- alpha (char);
-
- bool
- digit (char);
+ bool alpha (char);
+ bool digit (char);
+ bool alnum (char);
+ bool xdigit (char);
- bool
- alnum (char);
+ bool alpha (wchar_t);
+ bool digit (wchar_t);
+ bool alnum (wchar_t);
+ bool xdigit (wchar_t);
// Key comparators (i.e., to be used in sets, maps, etc).
//