aboutsummaryrefslogtreecommitdiff
path: root/libbutl
diff options
context:
space:
mode:
Diffstat (limited to 'libbutl')
-rw-r--r-- libbutl/url.ixx 8
-rw-r--r-- libbutl/url.mxx 29
-rw-r--r-- libbutl/url.txx 269
3 files changed, 299 insertions, 7 deletions
diff --git a/libbutl/url.ixx b/libbutl/url.ixx
index 9ff3653..b823ee7 100644
--- a/libbutl/url.ixx
+++ b/libbutl/url.ixx
@@ -116,4 +116,12 @@ LIBBUTL_MODEXPORT namespace butl //@@ MOD Clang needs this for some reason.
rootless (true)
{
}
+
+ template <typename S, typename T>
+ inline void basic_url<S, T>::
+ normalize () // Normalize the URL host, if present.
+ {
+ if (authority) // Without the authority component there is no host.
+ authority->host.normalize ();
+ }
}
diff --git a/libbutl/url.mxx b/libbutl/url.mxx
index 3ced734..713bc3e 100644
--- a/libbutl/url.mxx
+++ b/libbutl/url.mxx
@@ -18,6 +18,7 @@
#include <cstddef> // size_t
#include <stdexcept> // invalid_argument
+#include <algorithm> // find(), find_if()
#endif
// Other includes.
@@ -31,10 +32,14 @@ import std.io;
import butl.path;
import butl.utility;
import butl.optional;
+
+import butl.small_vector;
#else
#include <libbutl/path.mxx>
#include <libbutl/utility.mxx>
#include <libbutl/optional.mxx>
+
+#include <libbutl/small-vector.mxx>
#endif
#include <libbutl/export.hxx>
@@ -123,7 +128,8 @@ LIBBUTL_MODEXPORT namespace butl
// a URL, throwing std::invalid_argument if invalid. Remove the enclosing
// square brackets for IPv6 addresses, and URL-decode host names.
//
- // Note that currently we don't validate IPv6 addresses.
+ // Note that the 'x:x:x:x:x:x:d.d.d.d' IPv6 address mixed notation is not
+ // supported.
//
explicit
basic_url_host (string_type);
@@ -142,6 +148,22 @@ LIBBUTL_MODEXPORT namespace butl
//
string_type
string () const;
+
+ // Normalize the host value in accordance with its type:
+ //
+ // Name - convert to lower case. Note: only ASCII names are currently
+ // supported.
+ //
+ // IPv4 - strip the leading zeros in its octets.
+ //
+ // IPv6 - strip the leading zeros in its groups (hextets), squash the
+ // longest zero-only hextet sequence (of at least two hextets), and
+ // convert to lower case (as per RFC5952).
+ //
+ // Assume that the host value is valid.
+ //
+ void
+ normalize ();
};
template <typename S>
@@ -340,6 +362,11 @@ LIBBUTL_MODEXPORT namespace butl
string_type
string () const;
+ // Normalize the URL host, if present.
+ //
+ void
+ normalize ();
+
// The following predicates can be used to classify URL characters while
// parsing, validating or encoding scheme-specific components. For the
// semantics of character classes see RFC3986.
diff --git a/libbutl/url.txx b/libbutl/url.txx
index 546ebd1..0951e80 100644
--- a/libbutl/url.txx
+++ b/libbutl/url.txx
@@ -11,28 +11,99 @@ LIBBUTL_MODEXPORT namespace butl //@@ MOD Clang needs this for some reason.
basic_url_host<S>::
basic_url_host (string_type v)
{
- using std::invalid_argument;
+ using namespace std;
using url = basic_url<string_type>;
using char_type = typename string_type::value_type;
kind = v[0] == '[' ? kind_type::ipv6 : kind_type::name;
+ // Note that an IPv6 address is represented as eight colon-separated
+ // groups (hextets) of four or less hexadecimal digits. One or more
+ // consecutive zero hextets can be represented by double-colon (squashed),
+ // but only once, for example: 1::2:0:0:3.
+ //
if (kind == url_host_kind::ipv6)
{
+ auto bad_ip = [] () {throw invalid_argument ("invalid IPv6 address");};
+
if (v.back () != ']')
- throw invalid_argument ("invalid IPv6 address");
+ bad_ip ();
+
+ // Validate the IPv6 address.
+ //
+ // If the address doesn't contain the double-colon, then we will verify
+ // that it is composed of eight valid hextets. Otherwise, we will split
+ // the address by the double-colon into two hextet sequences, validate
+ // their hextets, and verify that their cumulative length is less than
+ // eight.
+ //
+ using iter = typename string_type::const_iterator;
+
+ // Validate a hextet sequence and return its length.
+ //
+ auto len = [&bad_ip] (iter b, iter e)
+ {
+ size_t r (0);
+
+ if (b == e)
+ return r;
+
+ size_t n (0); // Current hextet length.
+
+ // Fail if the current hextet is of an invalid length and increment
+ // the sequence length counter otherwise.
+ //
+ auto validate = [&r, &n, &bad_ip] ()
+ {
+ if (n == 0 || n > 4)
+ bad_ip ();
- value.assign (v, 1, v.size () - 2);
+ ++r;
+ n = 0;
+ };
+
+ for (iter i (b); i != e; ++i)
+ {
+ char_type c (*i);
+
+ if (xdigit (c))
+ ++n;
+ else if (c == ':')
+ validate ();
+ else
+ bad_ip ();
+ }
+
+ validate (); // Validate the trailing hextet.
+ return r;
+ };
+
+ size_t p (v.find (string_type (2, ':'), 1));
+
+ size_t n1 (p != string_type::npos
+ ? len (v.begin () + 1, v.begin () + p)
+ : len (v.begin () + 1, v.end () - 1));
+
+ size_t n2 (p != string_type::npos
+ ? len (v.begin () + p + 2, v.end () - 1)
+ : 0);
+
+ if (p != string_type::npos ? (n1 + n2 < 8) : (n1 == 8))
+ value.assign (v, 1, v.size () - 2);
+ else
+ bad_ip ();
}
- else
+ else // IPV4 or name.
{
// Detect the IPv4 address host type.
//
{
- size_t n (0);
- string_type oct;
+ size_t n (0); // Number of octets.
+ string_type oct; // Current octet.
+ // Return true if the current octet is valid.
+ //
auto ipv4_oct = [&oct, &n] () -> bool
{
if (n == 4 || oct.empty () || oct.size () > 3 || stoul (oct) > 255)
@@ -126,6 +197,192 @@ LIBBUTL_MODEXPORT namespace butl //@@ MOD Clang needs this for some reason.
return string_type ();
}
+ template <typename S>
+ void basic_url_host<S>::
+ normalize () // Rewrite value in place according to the host kind.
+ {
+ using namespace std;
+
+ using char_type = typename string_type::value_type;
+
+ switch (kind)
+ {
+ case url_host_kind::name:
+ {
+ for (char_type& c: value) // Lower-case the name in place.
+ c = lcase (static_cast<char> (c)); // NOTE(review): narrows to char, presumably fine for ASCII-only names - confirm.
+
+ break;
+ }
+ case url_host_kind::ipv4:
+ {
+ // Strip the leading zeros in the octets, rebuilding the address in v.
+ //
+ string_type v; // Normalized address.
+ size_t n (0); // Size of v right after the last dot (current octet start).
+
+ for (char_type c: value)
+ {
+ if (c == '.')
+ {
+ // If no digits were added since the last octet was processed,
+ // then the current octet is zero and so we add it.
+ //
+ if (n == v.size ())
+ v += '0';
+
+ v += '.';
+ n = v.size ();
+ }
+ else if (c != '0' || n != v.size ()) // Not a leading zero?
+ v += c;
+ }
+
+ // Handle the trailing zero octet (all its digits were skipped above).
+ //
+ if (n == v.size ())
+ v += '0';
+
+ value = move (v);
+ break;
+ }
+ case url_host_kind::ipv6:
+ {
+ // The overall plan is to (1) normalize the address hextets by
+ // converting them into lower case and stripping the leading zeros,
+ // (2) expand the potentially present double-colon into the zero
+ // hextet sequence, and (3) then squash the longest zero hextet
+ // sequence into double-colon. For example:
+ //
+ // 0ABC::1:0:0:0:0 -> abc:0:0:1::
+
+ // Parse the address into an array of normalized hextets.
+ //
+ // Note that if we meet the double-colon, we cannot expand it into the
+ // zero hextet sequence right away, since its length is unknown at
+ // this stage. Instead, we will save its index and expand it later.
+ //
+ small_vector<string_type, 8> v; // Normalized address.
+ string_type hex; // Current hextet (leading zeros omitted).
+ optional<size_t> dci; // Double-colon index in v, if present.
+ const string_type z (1, '0'); // Zero hextet.
+
+ // True if any leading zeros are stripped for the current hextet.
+ //
+ bool stripped (false);
+
+ auto add_hex = [&v, &hex, &stripped, &dci, &z] () // Called on each ':'.
+ {
+ if (!hex.empty ())
+ {
+ v.emplace_back (move (hex));
+ hex.clear (); // Make the moved-from string reusable.
+ }
+ else
+ {
+ if (!stripped) // No digits at all before ':', so double-colon.
+ dci = v.size (); // Note: can be set twice to 0 (think of ::1).
+ else
+ v.push_back (z); // Only zeros were seen: the zero hextet.
+ }
+
+ stripped = false;
+ };
+
+ for (char_type c: value)
+ {
+ if (c == ':')
+ add_hex ();
+ else if (c == '0' && hex.empty ()) // Leading zero?
+ stripped = true;
+ else
+ hex += lcase (static_cast<char> (c));
+ }
+
+ // Handle the trailing hextet (the one not followed by a colon).
+ //
+ if (!hex.empty ())
+ v.emplace_back (move (hex));
+ else if (stripped)
+ v.push_back (z);
+ //
+ // Else this is the trailing (already handled) double-colon.
+
+ // Expand double-colon, if present, to pad the address to 8 hextets.
+ //
+ if (dci)
+ {
+ if (v.size () < 8)
+ v.insert (v.begin () + *dci, 8 - v.size (), z);
+ else
+ assert (false); // Too long address (the value is assumed valid).
+ }
+
+ // Find the longest zero hextet sequence.
+ //
+ // Note that the first sequence of zeros is chosen between the two of
+ // the same length.
+ //
+ // Also note that we don't squash the single zero.
+ //
+ using iter = typename small_vector<string_type, 8>::const_iterator;
+
+ iter e (v.end ());
+ iter mxi (e); // Longest sequence start (e if there is none).
+ iter mxe; // Longest sequence end (only set together with mxi).
+ size_t mxn (1); // Longest sequence length (must be > 1).
+
+ for (iter i (v.begin ()); i != e; )
+ {
+ i = find (i, e, z); // Start of the next zero run, if any.
+
+ if (i != e)
+ {
+ iter ze (find_if (i + 1, e,
+ [&z] (const string_type& h) {return h != z;})); // Run end.
+
+ size_t n (ze - i);
+
+ if (mxn < n) // Strictly longer, so the first of equal runs wins.
+ {
+ mxn = n;
+ mxi = i;
+ mxe = ze;
+ }
+
+ i = ze;
+ }
+ }
+
+ // Compose the IPv6 string, squashing the longest zero hextet
+ // sequence, if present.
+ //
+ value.clear ();
+
+ for (iter i (v.begin ()); i != e; )
+ {
+ if (i != mxi)
+ {
+ // Add ':', unless the hextet is the first or follows double-
+ // colon.
+ //
+ if (!value.empty () && value.back () != ':')
+ value += ':';
+
+ value += *i++;
+ }
+ else
+ {
+ value.append (2, ':');
+ i = mxe; // Skip the squashed run.
+ }
+ }
+
+ break;
+ }
+ }
+ }
+
// basic_url_authority
//
template <typename S>