// file : libbutl/url.txx -*- C++ -*- // license : MIT; see accompanying LICENSE file LIBBUTL_MODEXPORT namespace butl //@@ MOD Clang needs this for some reason. { // Convenience functions. // // basic_url_host // template basic_url_host:: basic_url_host (string_type v) { using std::invalid_argument; using url = basic_url; using char_type = typename string_type::value_type; kind = v[0] == '[' ? kind_type::ipv6 : kind_type::name; if (kind == url_host_kind::ipv6) { if (v.back () != ']') throw invalid_argument ("invalid IPv6 address"); value.assign (v, 1, v.size () - 2); } else { // Detect the IPv4 address host type. // { size_t n (0); string_type oct; auto ipv4_oct = [&oct, &n] () -> bool { if (n == 4 || oct.empty () || oct.size () > 3 || stoul (oct) > 255) return false; ++n; oct.clear (); return true; }; auto i (v.cbegin ()); auto e (v.cend ()); for (; i != e; ++i) { char_type c (*i); if (digit (c)) oct += c; else if (c != '.' || !ipv4_oct ()) break; } if (i == e && ipv4_oct () && n == 4) kind = url_host_kind::ipv4; } // Verify and decode the host name. // bool dec (false); if (kind == url_host_kind::name) { for (auto c: v) { if (!(url::unreserved (c) || url::sub_delim (c) || c == '%')) throw invalid_argument ("invalid host name"); if (c == '%') dec = true; } } value = dec ? url::decode (v) : move (v); } } template S basic_url_host:: string () const { using url = basic_url; using char_type = typename string_type::value_type; if (empty ()) return string_type (); switch (kind) { case url_host_kind::ipv4: return value; case url_host_kind::ipv6: { string_type r; r += '['; r += value; r += ']'; return r; } case url_host_kind::name: { // We don't encode all characters that are disallowed for the host // part as RFC3986 requests: // // URI producing applications must not use percent-encoding in host // unless it is used to represent a UTF-8 character sequence. // // The callback requests to encode characters outside the ASCII // character set. // return url::encode (value, [] (char_type& c) { // Convert to the unsigned numeric type, that is // long enough to hold any character type. // return static_cast (c) >= 0x80; }); } } assert (false); // Can't be here. return string_type (); } // basic_url_authority // template S port_string (std::uint16_t p); template <> inline std::string port_string (std::uint16_t p) { return std::to_string (p); } template <> inline std::wstring port_string (std::uint16_t p) { return std::to_wstring (p); } template S basic_url_authority:: string () const { if (empty ()) return string_type (); string_type r; if (!user.empty ()) { r += user; r += '@'; } r += host.string (); if (port != 0) { r += ':'; r += port_string (port); } return r; } // url_traits // template std::size_t url_traits:: find (const string_type& s, std::size_t p) { if (p == string_type::npos) p = s.find (':'); if (p == string_type::npos || p < 2 || p + 1 == s.size () || s[p + 1] != '/') return string_type::npos; // Scan backwards for as long as it is a valid scheme. // std::size_t i (p); for (; i != 0; --i) { auto c (s[i - 1]); if (!(alnum (c) || c == '+' || c == '-' || c == '.')) break; } if (i != p && !alpha (s[i])) // First must be alpha. ++i; if (p - i < 2) return string_type::npos; return i; } // basic_url // template basic_url:: basic_url (const string_type& u) { using namespace std; using iterator = typename string_type::const_iterator; try { if (u.empty ()) throw invalid_argument ("empty URL"); // At the end of a component parsing 'i' points to the next component // start, and 'b' stays unchanged. // iterator b (u.cbegin ()); iterator i (b); iterator e (u.cend ()); // Extract scheme. // for (char_type c; i != e && (c = *i) != ':'; ++i) { if (!(i == b ? alpha (c) : (alnum (c) || c == '+' || c == '-' || c == '.'))) throw invalid_argument ("invalid scheme"); } if (i == b || i == e || i == b + 1) // Forbids one letter length schemes. throw invalid_argument ("no scheme"); string_type sc (b, i++); // Skip ':'. // Parse authority. // if (i != e && i + 1 != e && *i == '/' && *(i + 1) == '/') { i += 2; // Skip '//'. // Find the authority end. // size_t p (u.find_first_of (string_type ({'/', '?', '#'}), i - b)); iterator ae (p != string_type::npos ? b + p : e); string_type auth (i, ae); i = ae; // Extract user information. // string_type user; p = auth.find ('@'); if (p != string_type::npos) { // Don't URL-decode the user information (scheme-specific). // user = string_type (auth, 0, p); auth = string_type (auth, p + 1); } // Extract host. // string_type host; p = auth.find_last_of({']', ':'}); // Note: ':' can belong to IPv6. if (p != string_type::npos && auth[p] == ']') // There is no port. p = string_type::npos; if (p != string_type::npos) { host = string_type (auth, 0, p); auth = string_type (auth, p + 1); } else { host = move (auth); auth = string_type (); } // Extract port. // uint16_t port (0); if (!auth.empty ()) { auto bad_port = [] () {throw invalid_argument ("invalid port");}; for (auto c: auth) { if (!digit (c)) bad_port (); } unsigned long long n (stoull (auth)); if (n == 0 || n > UINT16_MAX) bad_port (); port = static_cast (n); } // User information and port are only meaningful if the host part is // present. // if (host.empty () && (!user.empty () || port != 0)) throw invalid_argument ("no host"); authority = {move (user), host_type (move (host)), port}; } // Extract path. // if (i != e && *i != '?' && *i != '#') { rootless = *i != '/'; if (!rootless) ++i; // Verify and translate the path. // iterator j (i); for(char_type c; j != e && (c = *j) != '?' && c != '#'; ++j) { if (!(path_char (c) || c == '%')) throw invalid_argument ("invalid path"); } path = traits_type::translate_path (string_type (i, j)); i = j; } // Extract query. // if (i != e && *i == '?') { ++i; // Skip '?'. // Find the query component end. // size_t p (u.find ('#', i - b)); iterator qe (p != string_type::npos ? b + p : e); // Don't URL-decode the query (scheme-specific). // query = string_type (i, qe); i = qe; } // Parse fragment. // if (i != e) { ++i; // Skip '#'. // Don't URL-decode the fragment (media type-specific). // fragment = string_type (i, e); i = e; } assert (i == e); // Translate the scheme string representation to its type. // optional s (traits_type::translate_scheme (u, move (sc), authority, path, query, fragment, rootless)); assert (s); scheme = *s; } // If we fail to parse the URL, then delegate this job to // traits::translate_scheme(). If it also fails, returning nullopt, then // we re-throw. // catch (const invalid_argument&) { authority = nullopt; path = nullopt; query = nullopt; fragment = nullopt; rootless = false; optional s ( traits_type::translate_scheme (u, string_type () /* scheme */, authority, path, query, fragment, rootless)); if (!s) throw; scheme = *s; } } template typename basic_url::string_type basic_url:: string () const { if (empty ()) return string_type (); string_type u; string_type r (traits_type::translate_scheme (u, scheme, authority, path, query, fragment, rootless)); // Return the custom URL pbject representation if provided. // if (!u.empty ()) return u; if (!r.empty ()) r += ':'; if (authority) { // We can't append '//' string literal, so appending characters. // if (!r.empty ()) { r += '/'; r += '/'; } r += authority->string (); } if (path) { if (!rootless) r += '/'; r += traits_type::translate_path (*path); } if (query) { r += '?'; r += *query; } if (fragment) { r += '#'; r += *fragment; } return r; } template template void basic_url:: encode (I b, I e, O o, F&& f) { const char_type digits[] = {'0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'A', 'B', 'C', 'D', 'E', 'F'}; for (; b != e; ++b) { char_type c (*b); if (c == '%' || f (c)) { assert (c == *b); // Must not be custom-encoded. *o++ = '%'; *o++ = digits[(c >> 4) & 0xF]; *o++ = digits[c & 0xF]; } else { assert (c != '%'); // Otherwise decoding will be ambiguous. *o++ = c; } } } template template void basic_url:: decode (I b, I e, O o, F&& f) { using namespace std; for (; b != e; ++b) { char_type c (*b); // URL-decode the character. // if (c == '%') { // Note that we can't use (potentially more efficient) strtoul() here // as it doesn't have an overload for the wide character string. // However, the code below shouldn't be inefficient, given that the // string is short, and so is (probably) stack-allocated. // // Note that stoul() throws if no conversion could be performed, so we // explicitly check for xdigits. // if (++b != e && xdigit (*b) && b + 1 != e && xdigit (*(b + 1))) c = static_cast (stoul (string_type (b, b + 2), nullptr, 16)); else throw invalid_argument ("invalid URL-encoding"); ++b; // Position to the second xdigit. } else f (c); *o++ = c; } } }