From ec9c6f1bbdfd3d86fba493ea56473c0aaf9acad1 Mon Sep 17 00:00:00 2001 From: Karen Arutyunov Date: Sat, 29 Sep 2018 00:12:26 +0300 Subject: Add support for rootless URLs --- libbutl/url.ixx | 36 +++++++++++++++ libbutl/url.mxx | 122 ++++++++++++++++++++++++++++++++------------------- libbutl/url.txx | 87 ++++++++++++++++-------------------- tests/url/driver.cxx | 86 ++++++++++++++++++++++++++---------- tests/url/testscript | 21 ++++++++- 5 files changed, 234 insertions(+), 118 deletions(-) diff --git a/libbutl/url.ixx b/libbutl/url.ixx index 2feb347..7a6d86f 100644 --- a/libbutl/url.ixx +++ b/libbutl/url.ixx @@ -4,6 +4,28 @@ LIBBUTL_MODEXPORT namespace butl //@@ MOD Clang needs this for some reason. { + // url_traits + // + template + inline typename url_traits::path_type url_traits:: + translate_path (string_type&& path) + { + return path_type (basic_url::decode (path)); + } + + template + inline typename url_traits::string_type url_traits:: + translate_path (const path_type& path) + { + using url = basic_url; + + return url::encode ( + string_type (path), + [] (typename url::char_type& c) {return !url::path_char (c);}); + } + + // basic_url + // template inline basic_url:: basic_url (scheme_type s, @@ -81,4 +103,18 @@ LIBBUTL_MODEXPORT namespace butl //@@ MOD Clang needs this for some reason. std::move (f)) { } + + template + inline basic_url:: + basic_url (scheme_type s, + optional p, + optional q, + optional f) + : scheme (std::move (s)), + path (std::move (p)), + query (std::move (q)), + fragment (std::move (f)), + rootless (true) + { + } } diff --git a/libbutl/url.mxx b/libbutl/url.mxx index c696eaa..6e7acde 100644 --- a/libbutl/url.mxx +++ b/libbutl/url.mxx @@ -44,7 +44,9 @@ LIBBUTL_MODEXPORT namespace butl { // RFC3986 Uniform Resource Locator (URL). 
// - // <url> = <scheme>:[//[<authority>]][/<path>][?<query>][#<fragment>] + // <url> = <scheme>:[//[<authority>]][/<path>][?<query>][#<fragment>] | + // <scheme>:[<path>][?<query>][#<fragment>] + // // <authority> = [<user>@]<host>[:<port>] // // Some examples of equivalent URLs to meditate upon: @@ -72,10 +74,22 @@ LIBBUTL_MODEXPORT namespace butl // can choose to use the path type if local paths are to be interpreted as // relative to the host. // - // Note that we currently forbid one character schemes to support scheme- - // less (Windows) paths which can be done by url_traits::translate_scheme() - // (see below). (A Windows path that uses forward slashes would be parsed as - // a valid authority-less URL). + // For authority-less schemes the <path> component is also represented as a + // relative path. Some examples of such URLs (let's call them rootless + // rather than authority-less not to confuse with a case where authority is + // empty/implied): + // + // pkcs11:token=sign;object=SIGN%20key + // pkcs11:id=%02%38%01?pin-value=12345 + // pkcs11: + // + // Note that a scheme can theoretically allow both rootless and "rootfull" + // representations. + // + // Note also that we currently forbid one character schemes to support + // scheme-less (Windows) paths which can be done by + // url_traits::translate_scheme() (see below). (A Windows path that uses + // forward slashes would be parsed as a valid authority-less URL). // URL host component can be an IPv4 address (if matches its dotted-decimal // notation), an IPv6 address (if enclosed in [square brackets]) or @@ -165,23 +179,25 @@ LIBBUTL_MODEXPORT namespace butl using authority_type = basic_url_authority; // Translate the scheme string representation to its type. May throw - // std::invalid_argument. May change the URL components. + // std::invalid_argument. May change the URL components. Should not return + // nullopt if called with a non-empty scheme. // // This function is called with an empty scheme if the URL has no scheme, // the scheme is invalid, or it could not be parsed into components // according to the URL syntax. 
In this case all the passed components - // reference empty/absent values and if they remain unchanged on return, - // the URL is considered invalid and the std::invalid_argument exception - // with an appropriate description is thrown by the URL object constructor. - // This can be used to support scheme-less URLs, local paths, etc. + // reference empty/absent/false values. If nullopt is returned, the URL is + // considered invalid and the std::invalid_argument exception with an + // appropriate description is thrown by the URL object constructor. This + // can be used to support scheme-less URLs, local paths, etc. // - static scheme_type + static optional translate_scheme (const string_type& /*url*/, string_type&& scheme, optional& /*authority*/, optional& /*path*/, optional& /*query*/, - optional& /*fragment*/) + optional& /*fragment*/, + bool& /*rootless*/) { return scheme_type (std::move (scheme)); } @@ -195,33 +211,36 @@ LIBBUTL_MODEXPORT namespace butl // static string_type translate_scheme (string_type&, /*url*/ - const scheme_type& scheme, + const scheme_type& scheme, const optional& /*authority*/, const optional& /*path*/, const optional& /*query*/, - const optional& /*fragment*/) + const optional& /*fragment*/, + bool /*rootless*/) { return string_type (scheme); } - // Translate the path string representation to its type. + // Translate the URL-encoded path string representation to its type. + // + // Note that encoding for non-ASCII paths is not specified (in contrast + // to the host name), and presumably is local to the referenced authority. + // Furthermore, for some schemes, the path component can contain encoded + // binary data, for example for pkcs11. // static path_type - translate_path (string_type&& path) - { - return path_type (std::move (path)); - } + translate_path (string_type&&); - // Translate path type back to its string representation. + // Translate path type back to its URL-encoded string representation. 
// static string_type - translate_path (const path_type& path) {return string_type (path);} + translate_path (const path_type&); - // Check whether a string looks like a URL by searching for the first ':' - // (unless its position is specified with the second argument) and then - // making sure it's followed by '/' (e.g., http:// or file:/) and preceded - // by the scheme at least 2 characters long (so we don't confuse it with - // an absolute Windows path, e.g., c:/). + // Check whether a string looks like a non-rootless URL by searching for + // the first ':' (unless its position is specified with the second + // argument) and then making sure it's both followed by '/' (e.g., http:// + // or file:/) and preceded by a valid scheme at least 2 characters long + // (so we don't confuse it with an absolute Windows path, e.g., c:/). // // Return the start of the URL substring or string_type::npos. // @@ -249,16 +268,17 @@ LIBBUTL_MODEXPORT namespace butl optional path; optional query; optional fragment; + bool rootless = false; // Create an empty URL object. // - basic_url () = default; + basic_url (): empty_ (true) {} // Create the URL object from its string representation. Verify that the // string is compliant to the generic URL syntax. URL-decode and validate - // components with common for all schemes syntax (scheme, host, port, - // path). Throw std::invalid_argument if the passed string is not a valid - // URL representation. + // components with common for all schemes syntax (scheme, host, port). + // Throw std::invalid_argument if the passed string is not a valid URL + // representation. // // Validation and URL-decoding of the scheme-specific components can be // provided by a custom url_traits::translate_scheme() implementation. @@ -301,12 +321,15 @@ LIBBUTL_MODEXPORT namespace butl optional query = nullopt, optional fragment = nullopt); + // Create a rootless URL. 
+ // + basic_url (scheme_type, + optional path, + optional query = nullopt, + optional fragment = nullopt); + bool - empty () const noexcept - { - assert (authority || path || query || !fragment); - return !authority && !path && !query; - } + empty () const noexcept {return empty_;} // Return a string representation of the URL. Note that while this is not // necessarily syntactically the same string as what was used to @@ -344,6 +367,13 @@ LIBBUTL_MODEXPORT namespace butl return alnum (c) || c == '-' || c == '.' || c =='_' || c == '~'; } + static bool + path_char (char_type c) + { + return c == '/' || c == ':' || unreserved (c) || c == '@' || + sub_delim (c); + } + // URL-encode a character sequence. // // Note that the set of characters that should be encoded may differ for @@ -415,6 +445,9 @@ LIBBUTL_MODEXPORT namespace butl std::back_inserter (r), f); return r; } + + private: + bool empty_ = false; }; using url_authority = basic_url_authority; @@ -454,16 +487,15 @@ LIBBUTL_MODEXPORT namespace butl inline bool operator== (const basic_url& x, const basic_url& y) noexcept { - if (!(x.authority == y.authority && x.path == y.path && - x.query == y.query && x.fragment == y.fragment)) - return false; - - assert (x.empty () == y.empty ()); - - if (x.empty ()) - return true; - - return x.scheme == y.scheme; // None is empty, so schemes are valid. + if (x.empty () || y.empty ()) + return x.empty () == y.empty (); + + return x.scheme == y.scheme && + x.authority == y.authority && + x.path == y.path && + x.query == y.query && + x.fragment == y.fragment && + x.rootless == y.rootless; } template diff --git a/libbutl/url.txx b/libbutl/url.txx index b520509..2a2a215 100644 --- a/libbutl/url.txx +++ b/libbutl/url.txx @@ -6,16 +6,6 @@ LIBBUTL_MODEXPORT namespace butl //@@ MOD Clang needs this for some reason. { // Convenience functions. 
// - template - inline bool - url_path_char (C c) - { - using url = basic_url>; - - return c == '/' || c == ':' || url::unreserved (c) || - c == '@' || url::sub_delim (c); - } - // basic_url_host // template @@ -224,7 +214,7 @@ LIBBUTL_MODEXPORT namespace butl //@@ MOD Clang needs this for some reason. { using namespace std; - using iterator = typename string_type::const_iterator; + using iterator = typename string_type::const_iterator; try { @@ -329,26 +319,23 @@ LIBBUTL_MODEXPORT namespace butl //@@ MOD Clang needs this for some reason. // Extract path. // - if (i != e && *i == '/') + if (i != e && *i != '?' && *i != '#') { - ++i; // Skip '/'. + rootless = *i != '/'; - // Verify and URL-decode the path. + if (!rootless) + ++i; + + // Verify and translate the path. // iterator j (i); for(char_type c; j != e && (c = *j) != '?' && c != '#'; ++j) { - if (!(url_path_char (c) || c == '%')) + if (!(path_char (c) || c == '%')) throw invalid_argument ("invalid path"); } - // Note that encoding for non-ASCII path is not specified (in contrast - // to the host name), and presumably is local to the referenced - // authority. - // - string_type s; - decode (i, j, back_inserter (s)); - path = traits::translate_path (move (s)); + path = traits::translate_path (string_type (i, j)); i = j; } @@ -369,11 +356,6 @@ LIBBUTL_MODEXPORT namespace butl //@@ MOD Clang needs this for some reason. i = qe; } - // We don't suppose to end up with an empty URL. - // - if (empty ()) - throw invalid_argument ("no authority, path or query"); - // Parse fragment. // if (i != e) @@ -390,16 +372,19 @@ LIBBUTL_MODEXPORT namespace butl //@@ MOD Clang needs this for some reason. // Translate the scheme string representation to its type. 
// - scheme = traits::translate_scheme (u, - move (sc), - authority, - path, - query, - fragment); + optional s (traits::translate_scheme (u, + move (sc), + authority, + path, + query, + fragment, + rootless)); + assert (s); + scheme = *s; } // If we fail to parse the URL, then delegate this job to - // traits::translate_scheme(). If it also fails, leaving the components - // absent, then we re-throw. + // traits::translate_scheme(). If it also fails, returning nullopt, then + // we re-throw. // catch (const invalid_argument&) { @@ -407,16 +392,20 @@ LIBBUTL_MODEXPORT namespace butl //@@ MOD Clang needs this for some reason. path = nullopt; query = nullopt; fragment = nullopt; - - scheme = traits::translate_scheme (u, - string_type () /* scheme */, - authority, - path, - query, - fragment); - - if (!authority && !path && !query && !fragment) + rootless = false; + + optional s ( + traits::translate_scheme (u, + string_type () /* scheme */, + authority, + path, + query, + fragment, + rootless)); + if (!s) throw; + + scheme = *s; } } @@ -433,7 +422,8 @@ LIBBUTL_MODEXPORT namespace butl //@@ MOD Clang needs this for some reason. authority, path, query, - fragment)); + fragment, + rootless)); // Return the custom URL pbject representation if provided. // @@ -454,9 +444,10 @@ LIBBUTL_MODEXPORT namespace butl //@@ MOD Clang needs this for some reason. 
if (path) { - r += '/'; - r += encode (traits::translate_path (*path), - [] (char_type& c) {return !url_path_char (c);}); + if (!rootless) + r += '/'; + + r += traits::translate_path (*path); } if (query) diff --git a/tests/url/driver.cxx b/tests/url/driver.cxx index 64bc2b5..dd124cd 100644 --- a/tests/url/driver.cxx +++ b/tests/url/driver.cxx @@ -33,7 +33,8 @@ enum class scheme { http, https, - file + file, + pkcs11 }; namespace butl @@ -47,37 +48,63 @@ namespace butl using scheme_type = scheme; using authority_type = basic_url_authority; - static scheme_type + static optional translate_scheme (const string_type& url, string_type&& scheme, - optional& /*authority*/, + optional& authority, optional& path, - optional& /*query*/, - optional& /*fragment*/) + optional& query, + optional& /*fragment*/, + bool& rootless) { - // Note that we must compare case-insensitive in the real program. - // - if (scheme == L"http") - return scheme_type::http; - else if (scheme == L"https") - return scheme_type::https; - else if (scheme == L"file") - return scheme_type::file; - else if (scheme.empty ()) + if (scheme.empty ()) { // If the URL looks like an absolute filesystem path, then translate it - // to the file URL. If it is not, then leave all the components absent - // to fail with a proper exception description. + // to the file URL. If it is not, then return nullopt to fail with a + // proper exception description. // wchar_t c; if ((c = url[0]) == '/' || (url.size () > 2 && alpha (c) && url[1] == ':' && url[2] == '/')) + { path = url; + rootless = false; + return scheme_type::file; + } - return scheme_type::file; + return nullopt; } + + scheme_type t; + + // Note that we must compare case-insensitive in the real program. 
+ // + if (scheme == L"http") + t = scheme_type::http; + else if (scheme == L"https") + t = scheme_type::https; + else if (scheme == L"file") + t = scheme_type::file; + else if (scheme == L"pkcs11") + t = scheme_type::pkcs11; else throw invalid_argument ("unknown scheme"); + + if (t != scheme_type::pkcs11 && !authority && !path && !query) + throw invalid_argument ("no authority, path or query"); + + if (path) + { + if (t == scheme_type::pkcs11) + { + if (!rootless || path->find (L'/') != string_type::npos) + throw invalid_argument ("unexpected slash"); + } + else if (rootless) + throw invalid_argument ("rootless path"); + } + + return t; } // Translate scheme type back to its string representation. @@ -88,13 +115,15 @@ namespace butl const optional& /*authority*/, const optional& /*path*/, const optional& /*query*/, - const optional& /*fragment*/) + const optional& /*fragment*/, + bool /*rootless*/) { switch (scheme) { - case scheme_type::http: return L"http"; - case scheme_type::https: return L"https"; - case scheme_type::file: return L"file"; + case scheme_type::http: return L"http"; + case scheme_type::https: return L"https"; + case scheme_type::file: return L"file"; + case scheme_type::pkcs11: return L"pkcs11"; } assert (false); // Can't be here. @@ -104,11 +133,19 @@ namespace butl static path_type translate_path (string_type&& path) { - return path_type (move (path)); + // Note that a real pkcs11-supporting URL most likely would keep the + // path URL-encoded as its components can contain binary data. Or, it + // would split the path into components before decoding them. 
+ // + return path_type (basic_url::decode (path)); } static string_type - translate_path (const path_type& path) {return string_type (path);} + translate_path (const path_type& path) + { + using url = basic_url; + return url::encode (path, [] (wchar_t& c) {return !url::path_char (c);}); + } }; } @@ -317,7 +354,8 @@ try nullopt, nullopt, nullopt, - nullopt) << endl; + nullopt, + false) << endl; } else wcout << L"[null]" << endl; diff --git a/tests/url/testscript b/tests/url/testscript index 4166007..05cc528 100644 --- a/tests/url/testscript +++ b/tests/url/testscript @@ -33,7 +33,6 @@ $* : { $* 'file:#f' 2>'no authority, path or query' != 0 : fragment - $* 'file:aaa' 2>'no authority, path or query' != 0 : junk $* 'file:' 2>'no authority, path or query' != 0 : none } @@ -324,6 +323,24 @@ $* [null] [null] EOO + + $* 'http:a/b/c' 2>'rootless path' != 0 : rootless-path + $* 'pkcs11:/abc' 2>'unexpected slash' != 0 : unexpected-slash1 + $* 'pkcs11:a/bc' 2>'unexpected slash' != 0 : unexpected-slash2 + } + + : rootless + : + { + : non-empty + : + $* 'pkcs11:token=sign;object=SIGN%20key' >>EOO + pkcs11 + [null] + token=sign;object=SIGN key + [null] + [null] + EOO } : query @@ -388,6 +405,8 @@ $* $* 'file:/b%7C2' >'file:/b%7C2' : path $* 'http://a?q=' >'http://a?q=' : query $* 'http://a#f' >'http://a#f' : fragment + + $* 'pkcs11:object=SIGN%20key' >'pkcs11:object=SIGN%20key' : rootless } : wstring -- cgit v1.1