aboutsummaryrefslogtreecommitdiff
path: root/libbutl/url.txx
diff options
context:
space:
mode:
authorKaren Arutyunov <karen@codesynthesis.com>2017-12-10 10:02:19 +0300
committerKaren Arutyunov <karen@codesynthesis.com>2017-12-26 13:25:37 +0300
commite5bfd17637bf297c3cfe509d51027916864092d5 (patch)
tree5dab56d6a5aee0a38da6f597c52b12838b2836b5 /libbutl/url.txx
parentb1cd193f1bd28837a00cbe6629f9a562f99d961f (diff)
Add basic_url<H,T> class template
Diffstat (limited to 'libbutl/url.txx')
-rw-r--r--libbutl/url.txx509
1 files changed, 509 insertions, 0 deletions
diff --git a/libbutl/url.txx b/libbutl/url.txx
new file mode 100644
index 0000000..addfe88
--- /dev/null
+++ b/libbutl/url.txx
@@ -0,0 +1,509 @@
+// file : libbutl/url.txx -*- C++ -*-
+// copyright : Copyright (c) 2014-2017 Code Synthesis Ltd
+// license : MIT; see accompanying LICENSE file
+
+LIBBUTL_MODEXPORT namespace butl //@@ MOD Clang needs this for some reason.
+{
+ // Convenience functions.
+ //
+ template <typename C>
+ inline bool
+ url_path_char (C c)
+ {
+ using url = basic_url<std::basic_string<C>>;
+
+ return c == '/' || c == ':' || url::unreserved (c) ||
+ c == '@' || url::sub_delim (c);
+ }
+
+ // basic_url_host
+ //
+ template <typename S>
+ basic_url_host<S>::
+ basic_url_host (string_type v)
+ {
+ using std::invalid_argument;
+
+ using url = basic_url<string_type>;
+ using char_type = typename string_type::value_type;
+
+ kind = v[0] == '[' ? kind_type::ipv6 : kind_type::name;
+
+ if (kind == url_host_kind::ipv6)
+ {
+ if (v.back () != ']')
+ throw invalid_argument ("invalid IPv6 address");
+
+ value.assign (v, 1, v.size () - 2);
+ }
+ else
+ {
+ // Detect the IPv4 address host type.
+ //
+ {
+ size_t n (0);
+ string_type oct;
+
+ auto ipv4_oct = [&oct, &n] () -> bool
+ {
+ if (n == 4 || oct.empty () || oct.size () > 3 || stoul (oct) > 255)
+ return false;
+
+ ++n;
+ oct.clear ();
+ return true;
+ };
+
+ auto i (v.cbegin ());
+ auto e (v.cend ());
+
+ for (; i != e; ++i)
+ {
+ char_type c (*i);
+
+ if (digit (c))
+ oct += c;
+ else if (c != '.' || !ipv4_oct ())
+ break;
+ }
+
+ if (i == e && ipv4_oct () && n == 4)
+ kind = url_host_kind::ipv4;
+ }
+
+ // Verify and decode the host name.
+ //
+ bool dec (false);
+ if (kind == url_host_kind::name)
+ {
+ for (auto c: v)
+ {
+ if (!(url::unreserved (c) || url::sub_delim (c) || c == '%'))
+ throw invalid_argument ("invalid host name");
+
+ if (c == '%')
+ dec = true;
+ }
+ }
+
+ value = dec ? url::decode (v) : move (v);
+ }
+ }
+
+ template <typename S>
+ S basic_url_host<S>::
+ string () const
+ {
+ using url = basic_url<string_type>;
+ using char_type = typename string_type::value_type;
+
+ if (empty ())
+ return string_type ();
+
+ switch (kind)
+ {
+ case url_host_kind::ipv4: return value;
+ case url_host_kind::ipv6:
+ {
+ string_type r;
+ r += '[';
+ r += value;
+ r += ']';
+ return r;
+ }
+ case url_host_kind::name:
+ {
+ // We don't encode all characters that are disallowed for the host
+ // part as RFC3986 requests:
+ //
+ // URI producing applications must not use percent-encoding in host
+ // unless it is used to represent a UTF-8 character sequence.
+ //
+ // The callback requests to encode characters outside the ASCII
+ // character set.
+ //
+ return url::encode (value,
+ [] (char_type& c)
+ {
+ // Convert to the unsigned numeric type, that is
+ // long enough to hold any character type.
+ //
+ return static_cast<unsigned long> (c) >= 0x80;
+ });
+ }
+ }
+
+ assert (false); // Can't be here.
+ return string_type ();
+ }
+
+ // basic_url_authority
+ //
+ template <typename S>
+ S
+ port_string (std::uint16_t p);
+
+ template <>
+ inline std::string
+ port_string (std::uint16_t p)
+ {
+ return std::to_string (p);
+ }
+
+ template <>
+ inline std::wstring
+ port_string (std::uint16_t p)
+ {
+ return std::to_wstring (p);
+ }
+
+ template <typename S>
+ S basic_url_authority<S>::
+ string () const
+ {
+ if (empty ())
+ return string_type ();
+
+ string_type r;
+ if (!user.empty ())
+ {
+ r += user;
+ r += '@';
+ }
+
+ r += host.string ();
+
+ if (port != 0)
+ {
+ r += ':';
+ r += port_string<string_type> (port);
+ }
+
+ return r;
+ }
+
+ // basic_url
+ //
+ template <typename S, typename T>
+ basic_url<S, T>::
+ basic_url (const string_type& u)
+ {
+ using namespace std;
+
+ using iterator = typename string_type::const_iterator;
+
+ // Create an empty URL object for the empty argument. Note that the scheme
+ // is default-constructed, and so may stay undefined in this case.
+ //
+ if (u.empty ())
+ return;
+
+ try
+ {
+ // At the end of a component parsing 'i' points to the next component
+ // start, and 'b' stays unchanged.
+ //
+ iterator b (u.cbegin ());
+ iterator i (b);
+ iterator e (u.cend ());
+
+ // Extract scheme.
+ //
+ for(char_type c; i != e && (c = *i) != ':'; ++i)
+ {
+ if (!(i == b
+ ? alpha (c)
+ : (alnum (c) || c == '+' || c == '-' || c == '.')))
+ throw invalid_argument ("invalid scheme");
+ }
+
+ if (i == b || i == e || i == b + 1) // Forbids one letter length schemes.
+ throw invalid_argument ("no scheme");
+
+ string_type sc (b, i++); // Skip ':'.
+
+ // Parse authority.
+ //
+ if (i != e && i + 1 != e && *i == '/' && *(i + 1) == '/')
+ {
+ i += 2; // Skip '//'.
+
+ // Find the authority end.
+ //
+ size_t p (u.find_first_of (string_type ({'/', '?', '#'}), i - b));
+ iterator ae (p != string_type::npos ? b + p : e);
+
+ string_type auth (i, ae);
+ i = ae;
+
+ // Extract user information.
+ //
+ string_type user;
+ p = auth.find ('@');
+ if (p != string_type::npos)
+ {
+ // Don't URL-decode the user information (scheme-specific).
+ //
+ user = string_type (auth, 0, p);
+ auth = string_type (auth, p + 1);
+ }
+
+ // Extract host.
+ //
+ string_type host;
+ p = auth.find_last_of({']', ':'}); // Note: ':' can belong to IPv6.
+
+ if (p != string_type::npos && auth[p] == ']') // There is no port.
+ p = string_type::npos;
+
+ if (p != string_type::npos)
+ {
+ host = string_type (auth, 0, p);
+ auth = string_type (auth, p + 1);
+ }
+ else
+ {
+ host = move (auth);
+ auth = string_type ();
+ }
+
+ // Extract port.
+ //
+ uint16_t port (0);
+ if (!auth.empty ())
+ {
+ auto bad_port = [] () {throw invalid_argument ("invalid port");};
+
+ for (auto c: auth)
+ {
+ if (!digit (c))
+ bad_port ();
+ }
+
+ unsigned long long n (stoull (auth));
+ if (n == 0 || n > UINT16_MAX)
+ bad_port ();
+
+ port = static_cast<uint16_t> (n);
+ }
+
+ // User information and port are only meaningful if the host part is
+ // present.
+ //
+ if (host.empty () && (!user.empty () || port != 0))
+ throw invalid_argument ("no host");
+
+ authority = {move (user), host_type (move (host)), port};
+ }
+
+ // Extract path.
+ //
+ if (i != e && *i == '/')
+ {
+ ++i; // Skip '/'.
+
+ // Verify and URL-decode the path.
+ //
+ iterator j (i);
+ for(char_type c; j != e && (c = *j) != '?' && c != '#'; ++j)
+ {
+ if (!(url_path_char (c) || c == '%'))
+ throw invalid_argument ("invalid path");
+ }
+
+ // Note that encoding for non-ASCII path is not specified (in contrast
+ // to the host name), and presumably is local to the referenced
+ // authority.
+ //
+ string_type s;
+ decode (i, j, back_inserter (s));
+ path = traits::translate_path (move (s));
+ i = j;
+ }
+
+ // Extract query.
+ //
+ if (i != e && *i == '?')
+ {
+ ++i; // Skip '?'.
+
+ // Find the query component end.
+ //
+ size_t p (u.find ('#', i - b));
+ iterator qe (p != string_type::npos ? b + p : e);
+
+ // Don't URL-decode the query (scheme-specific).
+ //
+ query = string_type (i, qe);
+ i = qe;
+ }
+
+ // We don't suppose to end up with an empty URL.
+ //
+ if (empty ())
+ throw invalid_argument ("no authority, path or query");
+
+ // Parse fragment.
+ //
+ if (i != e)
+ {
+ ++i; // Skip '#'.
+
+ // Don't URL-decode the fragment (media type-specific).
+ //
+ fragment = string_type (i, e);
+ i = e;
+ }
+
+ assert (i == e);
+
+ // Translate the scheme string representation to its type.
+ //
+ scheme = traits::translate_scheme (u,
+ move (sc),
+ authority,
+ path,
+ query,
+ fragment);
+ }
+ // If we fail to parse the URL, then delegate this job to
+ // traits::translate_scheme(). If it also fails, leaving the components
+ // absent, then we re-throw.
+ //
+ catch (const invalid_argument&)
+ {
+ authority = nullopt;
+ path = nullopt;
+ query = nullopt;
+ fragment = nullopt;
+
+ scheme = traits::translate_scheme (u,
+ string_type () /* scheme */,
+ authority,
+ path,
+ query,
+ fragment);
+
+ if (!authority && !path && !query && !fragment)
+ throw;
+ }
+ }
+
+ template <typename S, typename T>
+ typename basic_url<S, T>::string_type basic_url<S, T>::
+ string () const
+ {
+ if (empty ())
+ return string_type ();
+
+ string_type u;
+ string_type r (traits::translate_scheme (u,
+ scheme,
+ authority,
+ path,
+ query,
+ fragment));
+
+ // Return the custom URL pbject representation if provided.
+ //
+ if (!u.empty ())
+ return u;
+
+ r += ':';
+
+ if (authority)
+ {
+ r += '/';
+ r += '/';
+ r += authority->string ();
+ }
+
+ if (path)
+ {
+ r += '/';
+ r += encode (traits::translate_path (*path),
+ [] (char_type& c) {return !url_path_char (c);});
+ }
+
+ if (query)
+ {
+ r += '?';
+ r += *query;
+ }
+
+ if (fragment)
+ {
+ r += '#';
+ r += *fragment;
+ }
+
+ return r;
+ }
+
+ template <typename S, typename T>
+ template <typename I, typename O, typename F>
+ void basic_url<S, T>::
+ encode (I b, I e, O o, F&& f)
+ {
+ const char_type digits[] = {'0', '1', '2', '3', '4', '5', '6', '7', '8',
+ '9', 'A', 'B', 'C', 'D', 'E', 'F'};
+
+ for (; b != e; ++b)
+ {
+ char_type c (*b);
+
+ if (c == '%' || f (c))
+ {
+ assert (c == *b); // Must not be custom-encoded.
+
+ *o++ = '%';
+ *o++ = digits[(c >> 4) & 0xF];
+ *o++ = digits[c & 0xF];
+ }
+ else
+ {
+ assert (c != '%'); // Otherwise decoding will be ambiguous.
+ *o++ = c;
+ }
+ }
+ }
+
+ template <typename S, typename T>
+ template <typename I, typename O, typename F>
+ void basic_url<S, T>::
+ decode (I b, I e, O o, F&& f)
+ {
+ using namespace std;
+
+ for (; b != e; ++b)
+ {
+ char_type c (*b);
+
+ // URL-decode the character.
+ //
+ if (c == '%')
+ {
+ // Note that we can't use (potentially more efficient) strtoul() here
+ // as it doesn't have an overload for the wide character string.
+ // However, the code below shouldn't be inefficient, given that the
+ // string is short, and so is (probably) stack-allocated.
+ //
+ // Note that stoul() throws if no conversion could be performed, so we
+ // explicitly check for xdigits.
+ //
+ if (++b != e && xdigit (*b) && b + 1 != e && xdigit (*(b + 1)))
+ c = static_cast<char_type> (stoul (string_type (b, b + 2),
+ nullptr,
+ 16));
+ else
+ throw invalid_argument ("invalid URL-encoding");
+
+ ++b; // Position to the second xdigit.
+ }
+ else
+ f (c);
+
+ *o++ = c;
+ }
+ }
+}