aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorKaren Arutyunov <karen@codesynthesis.com>2020-02-26 17:16:45 +0300
committerKaren Arutyunov <karen@codesynthesis.com>2020-02-26 17:17:49 +0300
commit5ae9686adac1508873f2d980e84becd3496244c2 (patch)
treed7c88e678b29ed6bb7ae30b74bd01aa2b5d2e9a8
parentafb726d2d59b3715960a8647738860f40e37cf4f (diff)
Add notion of validator to char_scanner and make sure manifest is UTF-8
This involves implementing utf8_validator and UTF-8 utility functions and using them during the manifest parsing, serialization, and rewriting.
-rw-r--r--libbutl/char-scanner.cxx126
-rw-r--r--libbutl/char-scanner.ixx56
-rw-r--r--libbutl/char-scanner.mxx90
-rw-r--r--libbutl/char-scanner.txx146
-rw-r--r--libbutl/manifest-parser.cxx24
-rw-r--r--libbutl/manifest-parser.ixx31
-rw-r--r--libbutl/manifest-parser.mxx33
-rw-r--r--libbutl/manifest-rewriter.cxx22
-rw-r--r--libbutl/manifest-serializer.cxx84
-rw-r--r--libbutl/manifest-serializer.mxx16
-rw-r--r--libbutl/standard-version.cxx2
-rw-r--r--libbutl/unicode.cxx165
-rw-r--r--libbutl/unicode.ixx72
-rw-r--r--libbutl/unicode.mxx82
-rw-r--r--libbutl/utf8.cxx342
-rw-r--r--libbutl/utf8.ixx305
-rw-r--r--libbutl/utf8.mxx130
-rw-r--r--libbutl/utility.cxx132
-rw-r--r--libbutl/utility.ixx95
-rw-r--r--libbutl/utility.mxx70
-rw-r--r--tests/manifest-parser/driver.cxx40
-rw-r--r--tests/manifest-rewriter/driver.cxx4
-rw-r--r--tests/manifest-serializer/driver.cxx18
-rw-r--r--tests/utf8/driver.cxx207
24 files changed, 1696 insertions, 596 deletions
diff --git a/libbutl/char-scanner.cxx b/libbutl/char-scanner.cxx
deleted file mode 100644
index 85416e5..0000000
--- a/libbutl/char-scanner.cxx
+++ /dev/null
@@ -1,126 +0,0 @@
-// file : libbutl/char-scanner.cxx -*- C++ -*-
-// license : MIT; see accompanying LICENSE file
-
-#ifndef __cpp_modules_ts
-#include <libbutl/char-scanner.mxx>
-#endif
-
-// C includes.
-
-#ifndef __cpp_lib_modules_ts
-#include <string> // char_traits
-#include <cstdint> // uint64_t
-#include <istream>
-#endif
-
-// Other includes.
-
-#ifdef __cpp_modules_ts
-module butl.char_scanner;
-
-// Only imports additional to interface.
-#ifdef __clang__
-#ifdef __cpp_lib_modules_ts
-import std.core;
-import std.io;
-#endif
-import butl.fdstream;
-#endif
-
-#endif
-
-using namespace std;
-
-namespace butl
-{
- char_scanner::
- char_scanner (istream& is, bool crlf, uint64_t l, uint64_t p)
- : line (l),
- column (1),
- position (p),
- is_ (is),
- buf_ (dynamic_cast<fdbuf*> (is.rdbuf ())),
- gptr_ (nullptr),
- egptr_ (nullptr),
- crlf_ (crlf)
- {
- }
-
- auto char_scanner::
- peek () -> xchar
- {
- if (unget_)
- return ungetc_;
-
- if (unpeek_)
- return unpeekc_;
-
- if (eos_)
- return xchar (xchar::traits_type::eof (), line, column, position);
-
- int_type v (peek_ ());
-
- if (v == xchar::traits_type::eof ())
- eos_ = true;
- else if (crlf_ && v == '\r')
- {
- int_type v1;
- do
- {
- get_ ();
- v1 = peek_ ();
- }
- while (v1 == '\r');
-
- if (v1 != '\n')
- {
- // We need to make sure subsequent calls to peek() return newline.
- //
- unpeek_ = true;
- unpeekc_ = xchar ('\n', line, column, position);
-
- if (v1 == xchar::traits_type::eof ())
- eos_ = true;
- }
-
- v = '\n';
- }
-
- return xchar (v, line, column, position);
- }
-
- void char_scanner::
- get (const xchar& c)
- {
- if (unget_)
- unget_ = false;
- else
- {
- if (unpeek_)
- {
- unpeek_ = false;
- }
- // When is_.get () returns eof, the failbit is also set (stupid,
- // isn't?) which may trigger an exception. To work around this
- // we will call peek() first and only call get() if it is not
- // eof. But we can only call peek() on eof once; any subsequent
- // calls will spoil the failbit (even more stupid).
- //
- else if (!eos (c))
- get_ ();
-
- if (!eos (c))
- {
- if (c == '\n')
- {
- line++;
- column = 1;
- }
- else
- column++;
-
- position = pos_ ();
- }
- }
- }
-}
diff --git a/libbutl/char-scanner.ixx b/libbutl/char-scanner.ixx
index 36cc93d..7e9c4b0 100644
--- a/libbutl/char-scanner.ixx
+++ b/libbutl/char-scanner.ixx
@@ -3,8 +3,30 @@
namespace butl
{
- inline auto char_scanner::
- get () -> xchar
+ template <typename V>
+ inline char_scanner<V>::
+ char_scanner (std::istream& is, bool crlf, std::uint64_t l, std::uint64_t p)
+ : char_scanner (is, validator_type (), crlf, l, p)
+ {
+ }
+
+ template <typename V>
+ inline auto char_scanner<V>::
+ peek (std::string& what) -> xchar
+ {
+ return peek (&what);
+ }
+
+ template <typename V>
+ inline auto char_scanner<V>::
+ peek () -> xchar
+ {
+ return peek (nullptr /* what */);
+ }
+
+ template <typename V>
+ inline auto char_scanner<V>::
+ get (std::string* what) -> xchar
{
if (unget_)
{
@@ -13,13 +35,28 @@ namespace butl
}
else
{
- xchar c (peek ());
+ xchar c (peek (what));
get (c);
return c;
}
}
- inline void char_scanner::
+ template <typename V>
+ inline auto char_scanner<V>::
+ get (std::string& what) -> xchar
+ {
+ return get (&what);
+ }
+
+ template <typename V>
+ inline auto char_scanner<V>::
+ get () -> xchar
+ {
+ return get (nullptr /* what */);
+ }
+
+ template <typename V>
+ inline void char_scanner<V>::
unget (const xchar& c)
{
// Because iostream::unget cannot work once eos is reached, we have to
@@ -29,7 +66,8 @@ namespace butl
ungetc_ = c;
}
- inline auto char_scanner::
+ template <typename V>
+ inline auto char_scanner<V>::
peek_ () -> int_type
{
if (gptr_ != egptr_)
@@ -48,7 +86,8 @@ namespace butl
return r;
}
- inline void char_scanner::
+ template <typename V>
+ inline void char_scanner<V>::
get_ ()
{
int_type c;
@@ -61,11 +100,14 @@ namespace butl
else
c = is_.get (); // About as fast as ignore() and way faster than tellg().
+ validated_ = false;
+
if (save_ != nullptr && c != xchar::traits_type::eof ())
save_->push_back (static_cast<char_type> (c));
}
- inline std::uint64_t char_scanner::
+ template <typename V>
+ inline std::uint64_t char_scanner<V>::
pos_ () const
{
return buf_ != nullptr ? buf_->tellg () : 0;
diff --git a/libbutl/char-scanner.mxx b/libbutl/char-scanner.mxx
index 5ad3d61..e57245b 100644
--- a/libbutl/char-scanner.mxx
+++ b/libbutl/char-scanner.mxx
@@ -10,6 +10,8 @@
#ifndef __cpp_lib_modules_ts
#include <string> // char_traits
#include <cstdint> // uint64_t
+#include <climits> // INT_*
+#include <utility> // pair, make_pair()
#include <istream>
#endif
@@ -30,12 +32,26 @@ import butl.fdstream;
LIBBUTL_MODEXPORT namespace butl
{
+ // Refer to utf8_validator for details.
+ //
+ struct noop_validator
+ {
+ std::pair<bool, bool>
+ validate (char) {return std::make_pair (true, true);}
+
+ std::pair<bool, bool>
+ validate (char c, std::string&) {return validate (c);}
+ };
+
// Low-level character stream scanner. Normally used as a base for
// higher-level lexers.
//
- class LIBBUTL_SYMEXPORT char_scanner
+ template <typename V = noop_validator>
+ class char_scanner
{
public:
+ using validator_type = V;
+
// If the crlf argument is true, then recognize Windows newlines (0x0D
// 0x0A) and convert them to just '\n' (0x0A). Note that a standalone
// 0x0D is treated "as if" it was followed by 0x0A and multiple 0x0D
@@ -49,7 +65,13 @@ LIBBUTL_MODEXPORT namespace butl
// and position in the stream (useful when re-scanning data saved with the
// save_* facility).
//
- char_scanner (std::istream& is,
+ char_scanner (std::istream&,
+ bool crlf = true,
+ std::uint64_t line = 1,
+ std::uint64_t position = 0);
+
+ char_scanner (std::istream&,
+ validator_type,
bool crlf = true,
std::uint64_t line = 1,
std::uint64_t position = 0);
@@ -62,10 +84,10 @@ LIBBUTL_MODEXPORT namespace butl
public:
// Extended character. It includes line/column/position information and is
- // capable of representing EOF.
+ // capable of representing EOF and invalid characters.
//
- // Note that implicit conversion of EOF to char_type results in NUL
- // character (which means in most cases it is safe to compare xchar to
+ // Note that implicit conversion of EOF/invalid to char_type results in
+ // NUL character (which means in most cases it is safe to compare xchar to
// char without checking for EOF).
//
class xchar
@@ -76,6 +98,9 @@ LIBBUTL_MODEXPORT namespace butl
using char_type = traits_type::char_type;
int_type value;
+
+ // Note that the column is of the codepoint this byte belongs to.
+ //
std::uint64_t line;
std::uint64_t column;
@@ -84,9 +109,12 @@ LIBBUTL_MODEXPORT namespace butl
//
std::uint64_t position;
+ static int_type
+ invalid () {return traits_type::eof () != INT_MIN ? INT_MIN : INT_MAX;}
+
operator char_type () const
{
- return value != traits_type::eof ()
+ return value != traits_type::eof () && value != invalid ()
? static_cast<char_type> (value)
: char_type (0);
}
@@ -98,27 +126,44 @@ LIBBUTL_MODEXPORT namespace butl
: value (v), line (l), column (c), position (p) {}
};
+ // Note that if any of the get() or peek() functions return an invalid
+ // character, then the scanning has failed and none of them should be
+ // called again.
+
xchar
get ();
+ // As above but in case of an invalid character also return the
+ // description of why it is invalid.
+ //
+ xchar
+ get (std::string& what);
+
void
get (const xchar& peeked); // Get previously peeked character (faster).
void
unget (const xchar&);
- // Note that if there is an "ungot" character, peek() will return
- // that.
+ // Note that if there is an "ungot" character, peek() will return that.
//
xchar
peek ();
- // Tests. In the future we can add tests line alpha(), alnum(),
- // etc.
+ // As above but in case of an invalid character also return the
+ // description of why it is invalid.
+ //
+ xchar
+ peek (std::string& what);
+
+ // Tests. In the future we can add tests like alpha(), alnum(), etc.
//
static bool
eos (const xchar& c) {return c.value == xchar::traits_type::eof ();}
+ static bool
+ invalid (const xchar& c) {return c.value == xchar::invalid ();}
+
// Line, column and position of the next character to be extracted from
// the stream by peek() or get().
//
@@ -159,8 +204,8 @@ LIBBUTL_MODEXPORT namespace butl
};
protected:
- using int_type = xchar::int_type;
- using char_type = xchar::char_type;
+ using int_type = typename xchar::int_type;
+ using char_type = typename xchar::char_type;
int_type
peek_ ();
@@ -171,11 +216,27 @@ LIBBUTL_MODEXPORT namespace butl
std::uint64_t
pos_ () const;
+ xchar
+ get (std::string* what);
+
+ xchar
+ peek (std::string* what);
+
protected:
std::istream& is_;
- // Note that if you are reading from the buffer directly, then it is
- // also your responsibility to save the data.
+ validator_type val_;
+ bool decoded_ = true; // The peeked character is last byte of sequence.
+ bool validated_ = false; // The peeked character has been validated.
+
+ // Note that if you are reading from the buffer directly, then it is also
+ // your responsibility to call the validator and save the data (see
+ // save_*()).
+ //
+ // Besides that, make sure that the peek() call preceding the scan is
+ // followed by the get() call (see validated_, decoded_, and unpeek_ for
+ // the hairy details; realistically, you would probably only direct-scan
+ // ASCII fragments).
//
fdbuf* buf_; // NULL if not ifdstream.
const char_type* gptr_;
@@ -195,3 +256,4 @@ LIBBUTL_MODEXPORT namespace butl
}
#include <libbutl/char-scanner.ixx>
+#include <libbutl/char-scanner.txx>
diff --git a/libbutl/char-scanner.txx b/libbutl/char-scanner.txx
new file mode 100644
index 0000000..d4e2082
--- /dev/null
+++ b/libbutl/char-scanner.txx
@@ -0,0 +1,146 @@
+// file : libbutl/char-scanner.txx -*- C++ -*-
+// license : MIT; see accompanying LICENSE file
+
+#ifndef __cpp_lib_modules_ts
+#include <utility> // move
+#endif
+
+namespace butl
+{
+ template <typename V>
+ char_scanner<V>::
+ char_scanner (std::istream& is,
+ validator_type v,
+ bool crlf,
+ std::uint64_t l,
+ std::uint64_t p)
+ : line (l),
+ column (1),
+ position (p),
+ is_ (is),
+ val_ (std::move (v)),
+ buf_ (dynamic_cast<fdbuf*> (is.rdbuf ())),
+ gptr_ (nullptr),
+ egptr_ (nullptr),
+ crlf_ (crlf)
+ {
+ }
+
+ template <typename V>
+ auto char_scanner<V>::
+ peek (std::string* what) -> xchar
+ {
+ if (unget_)
+ return ungetc_;
+
+ if (unpeek_)
+ return unpeekc_;
+
+ if (eos_)
+ return xchar (xchar::traits_type::eof (), line, column, position);
+
+ int_type v (peek_ ());
+
+ if (v == xchar::traits_type::eof ())
+ {
+ if (!decoded_)
+ {
+ if (what != nullptr)
+ *what = "unexpected end of stream";
+
+ v = xchar::invalid ();
+ }
+
+ eos_ = true;
+ }
+ else
+ {
+ auto valid = [what, this] (int_type v)
+ {
+ if (validated_)
+ return true;
+
+ char c (xchar::traits_type::to_char_type (v));
+ std::pair<bool, bool> r (what != nullptr
+ ? val_.validate (c, *what)
+ : val_.validate (c));
+
+ decoded_ = r.second;
+ validated_ = true;
+ return r.first;
+ };
+
+ if (!valid (v))
+ v = xchar::invalid ();
+ else if (crlf_ && v == '\r')
+ {
+ // Note that '\r' is a valid character (otherwise we won't be here),
+ // so we don't validate it again below. We also postpone the
+ // validation of the next non-'\r' character (except EOF) until the
+ // next peek() call.
+ //
+ int_type v1;
+ do
+ {
+ get_ (); // Sets validated_ to false.
+ v1 = peek_ ();
+ }
+ while (v1 == '\r');
+
+ if (v1 != '\n')
+ {
+ // We need to make sure subsequent calls to peek() return newline.
+ //
+ unpeek_ = true;
+ unpeekc_ = xchar ('\n', line, column, position);
+
+ // Note that the previous character is decoded ('\r') and so EOF is
+ // legitimate.
+ //
+ if (v1 == xchar::traits_type::eof ())
+ eos_ = true;
+ }
+
+ v = '\n';
+ }
+ }
+
+ return xchar (v, line, column, position);
+ }
+
+ template <typename V>
+ void char_scanner<V>::
+ get (const xchar& c)
+ {
+ if (unget_)
+ unget_ = false;
+ else
+ {
+ if (unpeek_)
+ {
+ unpeek_ = false;
+ }
+ // When is_.get () returns eof, the failbit is also set (stupid,
+ // isn't it?) which may trigger an exception. To work around this
+ // we will call peek() first and only call get() if it is not
+ // eof. But we can only call peek() on eof once; any subsequent
+ // calls will spoil the failbit (even more stupid).
+ //
+ else if (!eos (c))
+ get_ ();
+
+ if (!eos (c))
+ {
+ if (c == '\n')
+ {
+ line++;
+ column = 1;
+ }
+ else if (decoded_) // The character is the last in a sequence?
+ column++;
+
+ position = pos_ ();
+ }
+ }
+ }
+}
diff --git a/libbutl/manifest-parser.cxx b/libbutl/manifest-parser.cxx
index 4de59b7..9514bbd 100644
--- a/libbutl/manifest-parser.cxx
+++ b/libbutl/manifest-parser.cxx
@@ -89,7 +89,7 @@ namespace butl
parse_name (r);
skip_spaces ();
- c = get ();
+ c = get ("manifest");
if (eos (c))
{
@@ -117,7 +117,7 @@ namespace butl
skip_spaces ();
parse_value (r);
- c = peek ();
+ c = peek ("manifest");
// The character after the value should be either a newline or eos.
//
@@ -126,7 +126,7 @@ namespace butl
r.end_pos = c.position;
if (c == '\n')
- get ();
+ get (c);
// Now figure out whether what we've got makes sense, depending
// on the state we are in.
@@ -217,6 +217,8 @@ namespace butl
void manifest_parser::
parse_name (name_value& r)
{
+ auto peek = [this] () {return manifest_parser::peek ("manifest name");};
+
xchar c (peek ());
r.name_line = c.line;
@@ -228,13 +230,19 @@ namespace butl
break;
r.name += c;
- get ();
+ get (c);
}
}
void manifest_parser::
parse_value (name_value& r)
{
+ auto peek = [this] () {return manifest_parser::peek ("manifest value");};
+
+ // Here we don't always track the last peeked character.
+ //
+ auto get = [this] () {manifest_parser::get ("manifest value");};
+
xchar c (peek ());
r.value_line = c.line;
@@ -408,6 +416,8 @@ namespace butl
pair<manifest_parser::xchar, uint64_t> manifest_parser::
skip_spaces ()
{
+ auto peek = [this] () {return manifest_parser::peek ("manifest");};
+
xchar c (peek ());
bool start (c.column == 1);
uint64_t lp (c.position);
@@ -437,12 +447,12 @@ namespace butl
if (!start)
return make_pair (c, lp);
- get ();
+ get (c);
// Read until newline or eos.
//
for (c = peek (); !eos (c) && c != '\n'; c = peek ())
- get ();
+ get (c);
continue;
}
@@ -450,7 +460,7 @@ namespace butl
return make_pair (c, lp); // Not a space.
}
- get ();
+ get (c);
}
return make_pair (c, lp);
diff --git a/libbutl/manifest-parser.ixx b/libbutl/manifest-parser.ixx
index e616ad9..bc5246c 100644
--- a/libbutl/manifest-parser.ixx
+++ b/libbutl/manifest-parser.ixx
@@ -3,6 +3,37 @@
namespace butl
{
+
+ inline auto manifest_parser::
+ get (const char* what) -> xchar
+ {
+ xchar c (base::get (ebuf_));
+
+ if (invalid (c))
+ throw manifest_parsing (name_,
+ c.line, c.column,
+ std::string ("invalid ") + what + ": " + ebuf_);
+ return c;
+ }
+
+ inline void manifest_parser::
+ get (const xchar& peeked)
+ {
+ base::get (peeked);
+ }
+
+ inline auto manifest_parser::
+ peek (const char* what) -> xchar
+ {
+ xchar c (base::peek (ebuf_));
+
+ if (invalid (c))
+ throw manifest_parsing (name_,
+ c.line, c.column,
+ std::string ("invalid ") + what + ": " + ebuf_);
+ return c;
+ }
+
inline manifest_name_value manifest_parser::
next ()
{
diff --git a/libbutl/manifest-parser.mxx b/libbutl/manifest-parser.mxx
index adf6181..77addff 100644
--- a/libbutl/manifest-parser.mxx
+++ b/libbutl/manifest-parser.mxx
@@ -25,10 +25,12 @@ export module butl.manifest_parser;
import std.core;
import std.io;
#endif
+import butl.utf8;
import butl.optional;
import butl.char_scanner;
import butl.manifest_types;
#else
+#include <libbutl/utf8.mxx>
#include <libbutl/optional.mxx>
#include <libbutl/char-scanner.mxx>
#include <libbutl/manifest-types.mxx>
@@ -54,7 +56,8 @@ LIBBUTL_MODEXPORT namespace butl
std::string description;
};
- class LIBBUTL_SYMEXPORT manifest_parser: protected butl::char_scanner
+ class LIBBUTL_SYMEXPORT manifest_parser:
+ protected char_scanner<utf8_validator>
{
public:
// The filter, if specified, is called by next() prior to returning the
@@ -69,7 +72,10 @@ LIBBUTL_MODEXPORT namespace butl
manifest_parser (std::istream& is,
const std::string& name,
std::function<filter_function> filter = {})
- : char_scanner (is), name_ (name), filter_ (std::move (filter)) {}
+ : char_scanner (is,
+ utf8_validator (codepoint_types::graphic, U"\n\r\t")),
+ name_ (name),
+ filter_ (std::move (filter)) {}
const std::string&
name () const {return name_;}
@@ -97,6 +103,8 @@ LIBBUTL_MODEXPORT namespace butl
split_comment (const std::string&);
private:
+ using base = char_scanner<utf8_validator>;
+
void
parse_next (manifest_name_value&);
@@ -114,12 +122,33 @@ LIBBUTL_MODEXPORT namespace butl
std::pair<xchar, std::uint64_t>
skip_spaces ();
+ // As base::get() but in case of an invalid character throws
+ // manifest_parsing.
+ //
+ xchar
+ get (const char* what);
+
+ // Get previously peeked character (faster).
+ //
+ void
+ get (const xchar&);
+
+ // As base::peek() but in case of an invalid character throws
+ // manifest_parsing.
+ //
+ xchar
+ peek (const char* what);
+
private:
const std::string name_;
const std::function<filter_function> filter_;
enum {start, body, end} s_ = start;
std::string version_; // Current format version.
+
+ // Buffer for a get()/peek() potential error.
+ //
+ std::string ebuf_;
};
// Parse and return a single manifest. Throw manifest_parsing in case of an
diff --git a/libbutl/manifest-rewriter.cxx b/libbutl/manifest-rewriter.cxx
index ba0c866..e38d5f4 100644
--- a/libbutl/manifest-rewriter.cxx
+++ b/libbutl/manifest-rewriter.cxx
@@ -30,8 +30,10 @@ import butl.fdstream;
import butl.manifest_types;
#endif
+import butl.utility; // utf8_length()
import butl.manifest_serializer;
#else
+#include <libbutl/utility.mxx>
#include <libbutl/manifest-serializer.mxx>
#endif
@@ -101,8 +103,16 @@ namespace butl
manifest_serializer s (os, path_.string (), long_lines_);
+ // Note that the name can be surrounded with the ASCII whitespace
+ // characters and the start_pos refers to the first character in the
+ // line.
+ //
+ // Also note that we assume the already serialized name to be a valid
+ // UTF-8 byte string and so utf8_length() may not throw.
+ //
s.write_value (nv.value,
- static_cast<size_t> (nv.colon_pos - nv.start_pos + 2));
+ static_cast<size_t> (nv.colon_pos - nv.start_pos) -
+ (nv.name.size () - utf8_length (nv.name)) + 2);
}
os << suffix;
@@ -128,15 +138,21 @@ namespace butl
os << '\n';
manifest_serializer s (os, path_.string (), long_lines_);
- s.write_name (nv.name);
+ size_t n (s.write_name (nv.name));
os << ':';
if (!nv.value.empty ())
{
os << ' ';
+
+ // Note that the name can be surrounded with the ASCII whitespace
+ // characters and the start_pos refers to the first character in the
+ // line.
+ //
s.write_value (nv.value,
- static_cast<size_t> (nv.colon_pos - nv.start_pos + 2));
+ static_cast<size_t> (nv.colon_pos - nv.start_pos) -
+ (nv.name.size () - n) + 2);
}
os << suffix;
diff --git a/libbutl/manifest-serializer.cxx b/libbutl/manifest-serializer.cxx
index 0a81478..6a26a15 100644
--- a/libbutl/manifest-serializer.cxx
+++ b/libbutl/manifest-serializer.cxx
@@ -30,6 +30,11 @@ import std.io;
import butl.manifest_types;
#endif
+import butl.utf8;
+import butl.utility;
+#else
+#include <libbutl/utf8.mxx>
+#include <libbutl/utility.mxx>
#endif
using namespace std;
@@ -86,13 +91,13 @@ namespace butl
break;
}
- write_name (n);
+ size_t l (write_name (n));
os_ << ':';
if (!v.empty ())
{
os_ << ' ';
- write_value (v, n.size () + 2);
+ write_value (v, l + 2);
}
os_ << endl;
@@ -111,6 +116,10 @@ namespace butl
if (s_ == end)
throw serialization (name_, "serialization after eos");
+ string what;
+ if (!utf8 (t, what, codepoint_types::graphic, U"\n\r\t"))
+ throw serialization (name_, "invalid comment: " + what);
+
os_ << '#';
if (!t.empty ())
@@ -144,7 +153,7 @@ namespace butl
return r;
}
- void manifest_serializer::
+ size_t manifest_serializer::
write_name (const string& n)
{
if (n.empty ())
@@ -153,43 +162,76 @@ namespace butl
if (n[0] == '#')
throw serialization (name_, "name starts with '#'");
+ size_t r (0);
+ pair<bool, bool> v;
+ utf8_validator val (codepoint_types::graphic, U"\n\r\t");
+
+ string what;
for (char c: n)
{
- switch (c)
+ v = val.validate (c, what);
+
+ if (!v.first)
+ throw serialization (name_, "invalid name: " + what);
+
+ if (v.second) // Sequence last byte?
{
- case ' ':
- case '\t':
- case '\r':
- case '\n': throw serialization (name_, "name contains whitespace");
- case ':': throw serialization (name_, "name contains ':'");
- default: break;
+ // Note: ASCII characters may not be a part of a multi-byte sequence.
+ //
+ switch (c)
+ {
+ case ' ':
+ case '\t':
+ case '\r':
+ case '\n': throw serialization (name_, "name contains whitespace");
+ case ':': throw serialization (name_, "name contains ':'");
+ default: break;
+ }
+
+ ++r;
}
}
+ // Make sure that the last UTF-8 sequence is complete.
+ //
+ if (!v.second)
+ throw serialization (name_, "invalid name: incomplete UTF-8 sequence");
+
os_ << n;
+ return r;
}
void manifest_serializer::
write_value (const char* s, size_t n, size_t cl)
{
+ utf8_validator val (codepoint_types::graphic, U"\n\r\t");
+
char c ('\0');
+ bool b (true); // Begin of UTF-8 byte sequence.
- // The idea is to break on the 77th character (i.e., write it
- // on the next line) which means we have written 76 characters
+ // The idea is to break on the 77th codepoint (i.e., write it
+ // on the next line) which means we have written 76 codepoints
// on this line plus 2 for '\' and '\n', which gives us 78.
//
- for (const char* e (s + n); s != e; s++, cl++)
+ string what;
+ for (const char* e (s + n); s != e; s++)
{
char pc (c);
c = *s;
+ pair<bool, bool> v (val.validate (c, what));
+
+ if (!v.first)
+ throw serialization (name_, "invalid value: " + what);
+
// Note that even the "hard" break (see below) is not that hard when it
// comes to breaking the line right after the backslash. Doing so would
// inject the redundant newline character, as the line-terminating
// backslash would be escaped. So we delay breaking till the next
- // non-backslash character.
+ // non-backslash character. We also delay until the beginning of a UTF-8
+ // sequence.
//
- if (pc != '\\' && !long_lines_)
+ if (pc != '\\' && b && !long_lines_)
{
bool br (false); // Break the line.
@@ -237,8 +279,18 @@ namespace butl
}
os_ << c;
+
+ b = v.second;
+
+ if (b)
+ ++cl;
}
+ // Make sure that the last UTF-8 sequence is complete.
+ //
+ if (!b)
+ throw serialization (name_, "invalid value: incomplete UTF-8 sequence");
+
// What comes next is always a newline. If the last character that
// we have written is a backslash, escape it.
//
@@ -256,7 +308,7 @@ namespace butl
// Use the multi-line mode in any of the following cases:
//
- // - column offset is too large (say greater than 39 (78/2) characters; we
+ // - column offset is too large (say greater than 39 (78/2) codepoints; we
// cannot start on the next line since that would start the multi-line
// mode)
// - value contains newlines
diff --git a/libbutl/manifest-serializer.mxx b/libbutl/manifest-serializer.mxx
index f114ffb..b73c255 100644
--- a/libbutl/manifest-serializer.mxx
+++ b/libbutl/manifest-serializer.mxx
@@ -60,7 +60,7 @@ LIBBUTL_MODEXPORT namespace butl
const std::string& value);
// Unless long_lines is true, break lines in values (including multi-line)
- // so that their length does not exceed 78 characters (including '\n').
+ // so that their length does not exceed 78 codepoints (including '\n').
//
manifest_serializer (std::ostream& os,
const std::string& name,
@@ -108,23 +108,23 @@ LIBBUTL_MODEXPORT namespace butl
void
write_next (const std::string& name, const std::string& value);
- // Validate and write a name.
+ // Validate and write a name and return its length in codepoints.
//
- void
+ size_t
write_name (const std::string&);
// Write a value assuming the current line already has the specified
- // offset. If the resulting line length would be too large then the
- // multi-line representation will be used. It is assumed that the name,
- // followed by the colon, is already written.
+ // codepoint offset. If the resulting line length would be too large then
+ // the multi-line representation will be used. It is assumed that the
+ // name, followed by the colon, is already written.
//
void
write_value (const std::string&, std::size_t offset);
// Write the specified number of characters from the specified string
// (assuming there are no newlines) split into multiple lines at or near
- // the 78 characters boundary. Assume the current line already has the
- // specified offset.
+ // the 78 codepoints boundary. Assume the current line already has the
+ // specified codepoint offset.
//
void
write_value (const char* s, std::size_t n, std::size_t offset);
diff --git a/libbutl/standard-version.cxx b/libbutl/standard-version.cxx
index c27b064..a9f5eb8 100644
--- a/libbutl/standard-version.cxx
+++ b/libbutl/standard-version.cxx
@@ -41,6 +41,8 @@ using namespace std;
namespace butl
{
+ using std::to_string;
+
// Parse uint64_t from the specified string starting at the specified
// position and check the min/max constraints. If successful, save the
// result, update the position to point to the next character, and return
diff --git a/libbutl/unicode.cxx b/libbutl/unicode.cxx
new file mode 100644
index 0000000..4219846
--- /dev/null
+++ b/libbutl/unicode.cxx
@@ -0,0 +1,165 @@
+// file : libbutl/unicode.cxx -*- C++ -*-
+// license : MIT; see accompanying LICENSE file
+
+#ifndef __cpp_modules_ts
+#include <libbutl/unicode.mxx>
+#endif
+
+#ifndef __cpp_lib_modules_ts
+#include <string>
+#include <ostream>
+#include <cstdint>
+
+#include <cstddef> // size_t
+#include <utility> // pair
+#include <algorithm> // lower_bound()
+#endif
+
+#ifdef __cpp_modules_ts
+module butl.unicode;
+
+// Only imports additional to interface.
+#ifdef __clang__
+#ifdef __cpp_lib_modules_ts
+import std.core;
+import std.io;
+#endif
+#endif
+
+#endif
+
+using namespace std;
+
+namespace butl
+{
+ // Sorted arrays of the Unicode codepoint ranges corresponding to the
+ // codepoint types (see the Types of Code Points table in the Unicode 12.0
+ // Standard for details). Note that code type range lists (but not ranges
+ // themselves) may overlap.
+ //
+ // Also note that the graphic type codepoints are numerous and scattered.
+ // Thus, we will consider a codepoint to be of the graphic type if it is not
+ // of any other type.
+ //
+ using codepoint_range = pair<char32_t, char32_t>;
+
+ static const codepoint_range cn_rs[] = // Control.
+ {
+ {0x00, 0x1F},
+ {0x7F, 0x9F}
+ };
+
+ static const codepoint_range fr_rs[] = // Format.
+ {
+ {0x000AD, 0x000AD},
+ {0x00600, 0x00605},
+ {0x0061C, 0x0061C},
+ {0x006DD, 0x006DD},
+ {0x0070F, 0x0070F},
+ {0x008E2, 0x008E2},
+ {0x0180E, 0x0180E},
+ {0x0200B, 0x0200F},
+ {0x0202A, 0x0202E},
+ {0x02060, 0x02064},
+ {0x02066, 0x0206F},
+ {0x0FEFF, 0x0FEFF},
+ {0x0FFF9, 0x0FFFB},
+ {0x110BD, 0x110BD},
+ {0x110CD, 0x110CD},
+ {0x13430, 0x13438},
+ {0x1BCA0, 0x1BCA3},
+ {0x1D173, 0x1D17A},
+ {0xE0001, 0xE0001},
+ {0xE0020, 0xE007F}
+ };
+
+ static const codepoint_range pr_rs[] = // Private-use.
+ {
+ {0x00E000, 0x00F8FF},
+ {0x0F0000, 0x10FFFF}
+ };
+
+ static const codepoint_range nc_rs[] = // Non-character.
+ {
+ {0xFDD0, 0xFDEF}
+ };
+
+ static const codepoint_range rs_rs[] = // Reserved.
+ {
+ {0x30000, 0xE0000},
+ {0xE0002, 0xE001F},
+ {0xE0080, 0xE00FF},
+ {0xE01F0, 0xEFFFF}
+ };
+
+ struct codepoint_type_ranges
+ {
+ codepoint_types type;
+ const codepoint_range* begin;
+ const codepoint_range* end;
+ };
+
+ static const codepoint_type_ranges ct_ranges[] =
+ {
+ {
+ codepoint_types::control,
+ cn_rs,
+ cn_rs + sizeof (cn_rs) / sizeof (*cn_rs)
+ },
+ {
+ codepoint_types::format,
+ fr_rs,
+ fr_rs + sizeof (fr_rs) / sizeof (*fr_rs)
+ },
+ {
+ codepoint_types::private_use,
+ pr_rs,
+ pr_rs + sizeof (pr_rs) / sizeof (*pr_rs)
+ },
+ {
+ codepoint_types::non_character,
+ nc_rs,
+ nc_rs + sizeof (nc_rs) / sizeof (*nc_rs)
+ },
+ {
+ codepoint_types::reserved,
+ rs_rs,
+ rs_rs + sizeof (rs_rs) / sizeof (*rs_rs)
+ }
+ };
+
+ // Return the codepoint type of a range if the codepoint value falls into
+ // one and the graphic type otherwise.
+ //
+ // Note that this is a type detection fallback (see codepoint_type() for
+ // details).
+ //
+ codepoint_types
+ codepoint_type_lookup (char32_t c)
+ {
+ // Note that the codepoint type range lists may overlap. Thus, we iterate
+ // over all of them until there is a match.
+ //
+ for (size_t i (0); i != sizeof (ct_ranges) / sizeof (*ct_ranges); ++i)
+ {
+ const codepoint_type_ranges& rs (ct_ranges[i]);
+
+ // Find the range that either contains the codepoint or lies to the
+ // right of it. Note that here we assume a range to be less than a
+ // codepoint value if it lies to the left of the codepoint.
+ //
+ const codepoint_range* r (
+ lower_bound (rs.begin, rs.end,
+ c,
+ [] (const codepoint_range& r, char32_t c)
+ {
+ return r.second < c;
+ }));
+
+ if (r != rs.end && r->first <= c) // Contains the codepoint?
+ return rs.type;
+ }
+
+ return codepoint_types::graphic;
+ }
+}
diff --git a/libbutl/unicode.ixx b/libbutl/unicode.ixx
new file mode 100644
index 0000000..cba4fd2
--- /dev/null
+++ b/libbutl/unicode.ixx
@@ -0,0 +1,72 @@
+// file : libbutl/unicode.ixx -*- C++ -*-
+// license : MIT; see accompanying LICENSE file
+
+namespace butl
+{
+ inline codepoint_types
+ operator&= (codepoint_types& x, codepoint_types y)
+ {
+ return x = static_cast<codepoint_types> (
+ static_cast<std::uint16_t> (x) &
+ static_cast<std::uint16_t> (y));
+ }
+
+ inline codepoint_types
+ operator|= (codepoint_types& x, codepoint_types y)
+ {
+ return x = static_cast<codepoint_types> (
+ static_cast<std::uint16_t> (x) |
+ static_cast<std::uint16_t> (y));
+ }
+
+ inline codepoint_types
+ operator& (codepoint_types x, codepoint_types y)
+ {
+ return x &= y;
+ }
+
+ inline codepoint_types
+ operator| (codepoint_types x, codepoint_types y)
+ {
+ return x |= y;
+ }
+
+ LIBBUTL_SYMEXPORT codepoint_types
+ codepoint_type_lookup (char32_t);
+
+ inline codepoint_types
+ codepoint_type (char32_t c)
+ {
+ // Optimize for the common case (printable ASCII characters).
+ //
+ if (c >= 0x20 && c <= 0x7E) // Printable ASCII?
+ return codepoint_types::graphic;
+ else if (c > 0x10FFFF || (c >= 0xD800 && c <= 0xDFFF)) // Invalid?
+ return codepoint_types::none;
+ else if ((c & 0xFFFF) >= 0xFFFE) // Non-range based?
+ return codepoint_types::non_character;
+ else
+ return codepoint_type_lookup (c);
+ }
+
+ inline std::string
+ to_string (codepoint_types t)
+ {
+ // Note that we use the terms from the Unicode standard ("private-use"
+ // rather than "private use", "noncharacter" rather than "non-character").
+ //
+ switch (t)
+ {
+ case codepoint_types::graphic: return "graphic";
+ case codepoint_types::format: return "format";
+ case codepoint_types::control: return "control";
+ case codepoint_types::private_use: return "private-use";
+ case codepoint_types::non_character: return "noncharacter"; // No dash.
+ case codepoint_types::reserved: return "reserved";
+ case codepoint_types::none:
+ case codepoint_types::any: return "";
+ }
+
+ return ""; // Types combination.
+ }
+}
diff --git a/libbutl/unicode.mxx b/libbutl/unicode.mxx
new file mode 100644
index 0000000..b846476
--- /dev/null
+++ b/libbutl/unicode.mxx
@@ -0,0 +1,82 @@
+// file : libbutl/unicode.mxx -*- C++ -*-
+// license : MIT; see accompanying LICENSE file
+
+#ifndef __cpp_modules_ts
+#pragma once
+#endif
+
+// C includes.
+
+#ifndef __cpp_lib_modules_ts
+#include <string>
+#include <ostream>
+#include <cstdint> // uint16_t
+#endif
+
+// Other includes.
+
+#ifdef __cpp_modules_ts
+export module butl.unicode;
+#ifdef __cpp_lib_modules_ts
+import std.core;
+import std.io;
+#endif
+#endif
+
+#include <libbutl/export.hxx>
+
+LIBBUTL_MODEXPORT namespace butl
+{
+ // Note that the Unicode Standard requires the surrogates ([D800 DFFF]) to
+ // only be used in the context of the UTF-16 character encoding form. Thus,
+ // we omit the surrogate codepoint type and treat surrogates as invalid
+ // codepoints.
+ //
+ enum class codepoint_types: std::uint16_t
+ {
+ // Useful to denote invalid codepoints or when building the type set
+ // incrementally.
+ //
+ none = 0x00,
+
+ graphic = 0x01, // L(etter), M(ark), N(umber), P(unctuation),
+ // S(ymbol), Zs(separator, space)
+ format = 0x02,
+ control = 0x04,
+ private_use = 0x08,
+ non_character = 0x10,
+ reserved = 0x20,
+
+ any = 0x3f // All the above type bits combined.
+ };
+
+ codepoint_types operator& (codepoint_types, codepoint_types); // Intersection.
+ codepoint_types operator| (codepoint_types, codepoint_types); // Union.
+ codepoint_types operator&= (codepoint_types&, codepoint_types);
+ codepoint_types operator|= (codepoint_types&, codepoint_types);
+
+ // Return the codepoint type for a valid codepoint value and none otherwise.
+ //
+ // Note that the valid codepoint ranges are [0 D800) and (DFFF 10FFFF].
+ //
+ codepoint_types
+ codepoint_type (char32_t);
+
+ // Return the type name for a single codepoint type and empty string for
+ // `none`, `any`, and any other combination of types.
+ //
+ // Potential future improvements:
+ // - add the none value name parameter ("invalid" by default)
+ // - produce names for type masks ("graphic, format", "any", etc)
+ //
+ std::string
+ to_string (codepoint_types);
+
+ inline std::ostream&
+ operator<< (std::ostream& os, codepoint_types ts) // Prints to_string (ts).
+ {
+ return os << to_string (ts);
+ }
+}
+
+#include <libbutl/unicode.ixx>
diff --git a/libbutl/utf8.cxx b/libbutl/utf8.cxx
deleted file mode 100644
index 0f24559..0000000
--- a/libbutl/utf8.cxx
+++ /dev/null
@@ -1,342 +0,0 @@
-// file : libbutl/utf8.cxx -*- C++ -*-
-// license : MIT; see accompanying LICENSE file
-
-#ifndef __cpp_modules_ts
-#include <libbutl/utility.mxx>
-#endif
-
-#ifndef __cpp_lib_modules_ts
-#include <string>
-#include <cstddef>
-
-#include <algorithm> // lower_bound()
-#endif
-
-#ifdef __cpp_modules_ts
-module butl.utility;
-
-// Only imports additional to interface.
-#ifdef __clang__
-#ifdef __cpp_lib_modules_ts
-import std.core;
-import std.io;
-#endif
-#endif
-
-#endif
-
-namespace butl
-{
- using namespace std;
-
- // Sorted arrays of the Unicode codepoint ranges corresponding to the
- // codepoint types. Note that code type range lists (but not ranges
- // themselves) may overlap.
- //
- // Note that the graphic type codepoints are numerous and scattered. Thus,
- // we will consider a codepoint to be of the graphic type if it is not of
- // any other type.
- //
- using codepoint_range = pair<char32_t, char32_t>;
-
- static const codepoint_range cn_rs[] = // Control.
- {
- {0x00, 0x1F},
- {0x7F, 0x9F}
- };
-
- static const codepoint_range fr_rs[] = // Format.
- {
- {0x000AD, 0x000AD},
- {0x00600, 0x00605},
- {0x0061C, 0x0061C},
- {0x006DD, 0x006DD},
- {0x0070F, 0x0070F},
- {0x008E2, 0x008E2},
- {0x0180E, 0x0180E},
- {0x0200B, 0x0200F},
- {0x0202A, 0x0202E},
- {0x02060, 0x02064},
- {0x02066, 0x0206F},
- {0x0FEFF, 0x0FEFF},
- {0x0FFF9, 0x0FFFB},
- {0x110BD, 0x110BD},
- {0x110CD, 0x110CD},
- {0x13430, 0x13438},
- {0x1BCA0, 0x1BCA3},
- {0x1D173, 0x1D17A},
- {0xE0001, 0xE0001},
- {0xE0020, 0xE007F}
- };
-
- static const codepoint_range pr_rs[] = // Private-use.
- {
- {0x00E000, 0x00F8FF},
- {0x0F0000, 0x10FFFF}
- };
-
- static const codepoint_range nc_rs[] = // Non-character.
- {
- {0xFDD0, 0xFDEF}
- };
-
- static const codepoint_range rs_rs[] = // Reserved.
- {
- {0x30000, 0xE0000},
- {0xE0002, 0xE001F},
- {0xE0080, 0xE00FF},
- {0xE01F0, 0xEFFFF}
- };
-
- struct codepoint_type_ranges
- {
- codepoint_types type;
- const codepoint_range* begin;
- const codepoint_range* end;
- };
-
- static const codepoint_type_ranges ct_ranges[] =
- {
- {
- codepoint_types::control,
- cn_rs,
- cn_rs + sizeof (cn_rs) / sizeof (*cn_rs)
- },
- {
- codepoint_types::format,
- fr_rs,
- fr_rs + sizeof (fr_rs) / sizeof (*fr_rs)
- },
- {
- codepoint_types::private_use,
- pr_rs,
- pr_rs + sizeof (pr_rs) / sizeof (*pr_rs)
- },
- {
- codepoint_types::non_character,
- nc_rs,
- nc_rs + sizeof (nc_rs) / sizeof (*nc_rs)
- },
- {
- codepoint_types::reserved,
- rs_rs,
- rs_rs + sizeof (rs_rs) / sizeof (*rs_rs)
- }
- };
-
- bool
- utf8 (const string& s, codepoint_types ts, const char32_t* wl)
- {
- // A UCS-4 character is encoded as the UTF-8 byte sequence as follows,
- // depending on the value range it falls into:
- //
- // 0x00000000 - 0x0000007F:
- // 0xxxxxxx
- //
- // 0x00000080 - 0x000007FF:
- // 110xxxxx 10xxxxxx
- //
- // 0x00000800 - 0x0000FFFF:
- // 1110xxxx 10xxxxxx 10xxxxxx
- //
- // 0x00010000 - 0x001FFFFF:
- // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
- //
- // 0x00200000 - 0x03FFFFFF:
- // 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
- //
- // 0x04000000 - 0x7FFFFFFF:
- // 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
- //
- // Also note that the Unicode Standard (as of 12.1) specifies no
- // codepoints above 0x10FFFF, so we consider 5- and 6-byte length UTF-8
- // sequences as invalid (we could have added `unspecified` codepoint type
- // except that there are no UTF-8 validation tables defined for these
- // sequences).
- //
- size_t n (s.size ());
-
- for (size_t i (0); i != n; )
- {
- // Detect the UTF-8 byte sequence length based on its first byte. While
- // at it, start calculating the Unicode codepoint value.
- //
- size_t sn;
- char32_t c;
- unsigned char b1 (s[i]);
-
- if (b1 < 0x80)
- {
- sn = 1;
- c = b1;
- }
- else if (b1 < 0xE0)
- {
- sn = 2;
- c = b1 & 0x1F; // Takes 5 rightmost bits of the first sequence byte.
- }
- else if (b1 < 0xF0)
- {
- sn = 3;
- c = b1 & 0xF; // Takes 4 rightmost bits of the first sequence byte.
- }
- else if (b1 < 0xF8)
- {
- sn = 4;
- c = b1 & 0x7; // Takes 3 rightmost bits of the first sequence byte.
- }
- else
- return false; // The byte starts 5- or 6-byte length sequence.
-
- // Bail out if the string doesn't contain all the requred codepoint
- // encoding bytes.
- //
- if (sn > n - i)
- return false;
-
- // Note that while a codepoint may potentially be encoded with byte
- // sequences of different lengths, only the shortest encoding sequence
- // is considered well-formed. Also a well-formed sequence may not be
- // decoded into a UTF-16 surrogate value ([D800 DFFF]) or a value that
- // is greater than the max codepoint value (0x10FFFF). We will check all
- // that using the Well-Formed UTF-8 Byte Sequences table (provided by
- // the Unicode 12.0 Standard) which also takes care of the missing UTF-8
- // sequence bytes.
- //
- // Return true if a byte value belongs to the specified range.
- //
- auto belongs = [] (unsigned char c, unsigned char l, unsigned char r)
- {
- return c >= l && c <= r;
- };
-
- switch (sn)
- {
- case 1: break; // Always well-formed by the definition (see above).
- case 2:
- {
- // [000080 0007FF]: [C2 DF] [80 BF]
- //
- // Check the first/second bytes combinations:
- //
- if (!(belongs (b1, 0xC2, 0xDF) && belongs (s[i + 1], 0x80, 0xBF)))
- return false;
-
- break;
- }
- case 3:
- {
- // [000800 000FFF]: E0 [A0 BF] [80 BF]
- // [001000 00CFFF]: [E1 EC] [80 BF] [80 BF]
- // [00D000 00D7FF]: ED [80 9F] [80 BF] ; Excludes surrogates.
- // [00E000 00FFFF]: [EE EF] [80 BF] [80 BF]
- //
- unsigned char b2 (s[i + 1]);
-
- if (!((b1 == 0xE0 && belongs (b2, 0xA0, 0xBF)) ||
- (belongs (b1, 0xE1, 0xEC) && belongs (b2, 0x80, 0xBF)) ||
- (b1 == 0xED && belongs (b2, 0x80, 0x9F)) ||
- (belongs (b1, 0xEE, 0xEF) && belongs (b2, 0x80, 0xBF))) ||
- !belongs (s[i + 2], 0x80, 0xBF))
- return false;
-
- break;
- }
- case 4:
- {
- // [010000 03FFFF]: F0 [90 BF] [80 BF] [80 BF]
- // [040000 0FFFFF]: [F1 F3] [80 BF] [80 BF] [80 BF]
- // [100000 10FFFF]: F4 [80 8F] [80 BF] [80 BF]
- //
- unsigned char b2 (s[i + 1]);
-
- if (!((b1 == 0xF0 && belongs (b2, 0x90, 0xBF)) ||
- (belongs (b1, 0xF1, 0xF3) && belongs (b2, 0x80, 0xBF)) ||
- (b1 == 0xF4 && belongs (b2, 0x80, 0x8F))) ||
- !belongs (s[i + 2], 0x80, 0xBF) ||
- !belongs (s[i + 3], 0x80, 0xBF))
- return false;
-
- break;
- }
- }
-
- // For the remaining sequence bytes, "append" their 6 rightmost bits to
- // the resulting codepoint value.
- //
- --sn;
- ++i;
-
- for (size_t n (i + sn); i != n; ++i)
- c = (c << 6) | (s[i] & 0x3F);
-
- // Check the decoded codepoint, unless any codepoint type is allowed.
- //
- if (ts == codepoint_types::any)
- continue;
-
- using traits = u32string::traits_type;
-
- // Check if the decoded codepoint is whitelisted.
- //
- if (wl != nullptr &&
- traits::find (wl, traits::length (wl), c) != nullptr)
- continue;
-
- // Match the decoded codepoint type against the specified type set.
- //
- // Detect the codepoint type (see the Types of Code Points table in the
- // Unicode 12.0 Standard for details).
- //
- codepoint_types ct;
-
- // Optimize for the common case (printable ASCII characters).
- //
- if (c >= 0x20 && c <= 0x7E)
- ct = codepoint_types::graphic;
- else if ((c & 0xFFFF) >= 0xFFFE) // Non-range based detection.
- ct = codepoint_types::non_character;
- else
- {
- // Note that we consider a codepoint to be of the graphic type if it
- // is not of any other type (see above).
- //
- ct = codepoint_types::graphic;
-
- // Note that the codepoint type range lists may overlap. Thus, we
- // iterate over all of them until there is a match.
- //
- for (size_t i (0); i != sizeof (ct_ranges) / sizeof (*ct_ranges); ++i)
- {
- const codepoint_type_ranges& rs (ct_ranges[i]);
-
- // Find the range that either contains the codepoint or lays to the
- // right of it. Note that here we assume a range to be less than a
- // codepoint if it lays to the left of the codepoint.
- //
- const codepoint_range* r (
- lower_bound (rs.begin, rs.end,
- c,
- [] (const codepoint_range& r, char32_t c)
- {
- return r.second < c;
- }));
-
- if (r != rs.end && r->first <= c) // Contains the codepoint?
- {
- ct = rs.type;
- break;
- }
- }
- }
-
- // Now check if the codepoint type matches the specified set. Note: also
- // covers the `ts == codepoint_types::none` case.
- //
- if ((ct & ts) == codepoint_types::none)
- return false;
- }
-
- return true;
- }
-}
diff --git a/libbutl/utf8.ixx b/libbutl/utf8.ixx
new file mode 100644
index 0000000..3d2e092
--- /dev/null
+++ b/libbutl/utf8.ixx
@@ -0,0 +1,305 @@
+// file : libbutl/utf8.ixx -*- C++ -*-
+// license : MIT; see accompanying LICENSE file
+
+namespace butl
+{
+ inline utf8_validator::
+ utf8_validator (codepoint_types ts, const char32_t* wl)
+ : types_ (ts), // Allowed codepoint types.
+ whitelist_ (wl) // The pointer is stored as is (may be NULL).
+ {
+ }
+
+ inline std::pair<bool, bool> utf8_validator::
+ validate (char c) // As the pointer overload but with no description.
+ {
+ return validate (c, nullptr /* what */);
+ }
+
+ inline std::pair<bool, bool> utf8_validator::
+ validate (char c, std::string& what)
+ {
+ return validate (c, &what); // Forward to the pointer overload.
+ }
+
+ inline std::pair<bool, bool> utf8_validator::
+ validate (char c, std::string* what) // what may be NULL.
+ {
+ using namespace std;
+
+ // A UCS-4 character is encoded as the UTF-8 byte sequence as follows,
+ // depending on the value range it falls into:
+ //
+ // 0x00000000 - 0x0000007F:
+ // 0xxxxxxx
+ //
+ // 0x00000080 - 0x000007FF:
+ // 110xxxxx 10xxxxxx
+ //
+ // 0x00000800 - 0x0000FFFF:
+ // 1110xxxx 10xxxxxx 10xxxxxx
+ //
+ // 0x00010000 - 0x001FFFFF:
+ // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+ //
+ // 0x00200000 - 0x03FFFFFF:
+ // 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
+ //
+ // 0x04000000 - 0x7FFFFFFF:
+ // 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
+ //
+ // Also note that the Unicode Standard (as of 12.1) specifies no
+ // codepoints above 0x10FFFF, so we consider 5- and 6-byte length UTF-8
+ // sequences as invalid (we could have added `unspecified` codepoint type
+ // except that there are no UTF-8 validation tables defined for these
+ // sequences).
+ //
+ unsigned char b (c); // Byte value, regardless of the char signedness.
+
+ // Compose the detailed "invalid UTF-8 sequence byte" error.
+ //
+ auto byte_error = [c, b, this] ()
+ {
+ string s ("invalid UTF-8 sequence ");
+
+ const char* names[] = {"first", "second", "third", "forth"};
+ s += names[seq_index_]; // NOTE(review): "forth" [sic] is emitted text.
+ s += " byte (0x";
+
+ const char digits[] = "0123456789ABCDEF";
+ s += digits[(b >> 4) & 0xF];
+ s += digits[b & 0xF];
+
+ // If the byte happens to be a printable ASCII character then let's
+ // print it as a character as well. This can help a bit with grepping
+ // through text while troubleshooting.
+ //
+ if (b >= 0x20 && b <= 0x7E)
+ {
+ s += " '";
+ s += c;
+ s += "'";
+ }
+
+ s += ")";
+ return s;
+ };
+
+ // Detect the byte sequence length based on its first byte. While at it,
+ // start calculating the resulting Unicode codepoint value.
+ //
+ if (seq_index_ == 0)
+ {
+ if (b < 0x80)
+ {
+ seq_size_ = 1;
+ codepoint_ = b;
+ }
+ else if (b < 0xE0)
+ {
+ seq_size_ = 2;
+ codepoint_ = b & 0x1F; // Takes 5 rightmost bits.
+ }
+ else if (b < 0xF0)
+ {
+ seq_size_ = 3;
+ codepoint_ = b & 0xF; // Takes 4 rightmost bits.
+ }
+ else if (b < 0xF8)
+ {
+ seq_size_ = 4;
+ codepoint_ = b & 0x7; // Takes 3 rightmost bits.
+ }
+ else
+ {
+ if (what != nullptr)
+ {
+ if (b < 0xFE) // [F8 FD]: 5- or 6-byte sequence leading byte.
+ {
+ *what = b < 0xFC ? "5" : "6";
+ *what += "-byte length UTF-8 sequence";
+ }
+ else
+ *what = byte_error ();
+ }
+
+ return make_pair (false, false); // Invalid byte.
+ }
+ }
+
+ // Note that while a codepoint may potentially be encoded with byte
+ // sequences of different lengths, only the shortest encoding sequence is
+ // considered well-formed. Also a well-formed sequence may not be decoded
+ // into invalid codepoint value (see codepoint_type() for details). We
+ // will check all that using the Well-Formed UTF-8 Byte Sequences table
+ // (provided by the Unicode 12.0 Standard) which also takes care of the
+ // missing UTF-8 sequence bytes.
+ //
+ bool valid (false);
+
+ // Return true if a byte value belongs to the specified range.
+ //
+ auto belongs = [] (unsigned char c, unsigned char l, unsigned char r)
+ {
+ return c >= l && c <= r;
+ };
+
+ switch (seq_size_)
+ {
+ case 1: valid = true; break; // Well-formed by the definition (see above).
+ case 2:
+ {
+ // [000080 0007FF]: [C2 DF] [80 BF]
+ //
+ // Check the first byte and set the second byte range.
+ //
+ if (seq_index_ == 0)
+ {
+ if ((valid = belongs (b, 0xC2, 0xDF)))
+ byte2_range_ = make_pair (0x80, 0xBF);
+ }
+ else // Check the second byte.
+ valid = belongs (b, byte2_range_.first, byte2_range_.second);
+
+ break;
+ }
+ case 3:
+ {
+ // [000800 000FFF]: E0 [A0 BF] [80 BF]
+ // [001000 00CFFF]: [E1 EC] [80 BF] [80 BF]
+ // [00D000 00D7FF]: ED [80 9F] [80 BF] ; Excludes surrogates.
+ // [00E000 00FFFF]: [EE EF] [80 BF] [80 BF]
+ //
+ // Check the first byte and set the second byte range.
+ //
+ if (seq_index_ == 0)
+ {
+ if ((valid = (b == 0xE0)))
+ byte2_range_ = make_pair (0xA0, 0xBF);
+ else if ((valid = belongs (b, 0xE1, 0xEC)))
+ byte2_range_ = make_pair (0x80, 0xBF);
+ else if ((valid = (b == 0xED)))
+ byte2_range_ = make_pair (0x80, 0x9F);
+ else if ((valid = belongs (b, 0xEE, 0xEF)))
+ byte2_range_ = make_pair (0x80, 0xBF);
+ }
+ else if (seq_index_ == 1) // Check the second byte.
+ valid = belongs (b, byte2_range_.first, byte2_range_.second);
+ else // Check the third byte.
+ valid = belongs (b, 0x80, 0xBF);
+
+ break;
+ }
+ case 4:
+ {
+ // [010000 03FFFF]: F0 [90 BF] [80 BF] [80 BF]
+ // [040000 0FFFFF]: [F1 F3] [80 BF] [80 BF] [80 BF]
+ // [100000 10FFFF]: F4 [80 8F] [80 BF] [80 BF]
+ //
+ // Check the first byte and set the second byte range.
+ //
+ if (seq_index_ == 0)
+ {
+ if ((valid = (b == 0xF0)))
+ byte2_range_ = make_pair (0x90, 0xBF);
+ else if ((valid = belongs (b, 0xF1, 0xF3)))
+ byte2_range_ = make_pair (0x80, 0xBF);
+ else if ((valid = (b == 0xF4)))
+ byte2_range_ = make_pair (0x80, 0x8F);
+ }
+ else if (seq_index_ == 1) // Check the second byte.
+ valid = belongs (b, byte2_range_.first, byte2_range_.second);
+ else // Check the third and fourth bytes.
+ valid = belongs (b, 0x80, 0xBF);
+
+ break;
+ }
+ }
+
+ // Bail out if the current UTF-8 sequence byte is invalid.
+ //
+ if (!valid)
+ {
+ // We could probably distinguish "surrogate" and "exceed max value" from
+ // other ill-formedness cases (amend the well-formedness table, keep
+ // decoding the sequence, and test the codepoint in the end) and produce
+ // more specific error messages, but this doesn't seem worth the
+ // trouble.
+ //
+ if (what != nullptr)
+ *what = byte_error ();
+
+ return make_pair (false, false); // Invalid byte.
+ }
+
+ // "Append" the sequence byte's 6 rightmost bits to the resulting
+ // codepoint value, unless this is the first byte (whose value is already
+ // taken into account; see above).
+ //
+ if (seq_index_ != 0)
+ codepoint_ = (codepoint_ << 6) | (b & 0x3F);
+
+ // If we didn't get to the end of the UTF-8 sequence, then we are done
+ // with this byte.
+ //
+ if (++seq_index_ != seq_size_)
+ return make_pair (true, false); // Valid byte.
+
+ // Prepare for the next UTF-8 sequence validation, regardless of the
+ // decoded codepoint validity.
+ //
+ seq_index_ = 0;
+
+ // Check the decoded codepoint, unless any codepoint type is allowed.
+ //
+ // Note that the well-formedness sequence check guarantees that we decoded
+ // a valid Unicode codepoint (see above).
+ //
+ if (types_ == codepoint_types::any)
+ return make_pair (true, true); // Valid codepoint.
+
+ // Check if the decoded codepoint is whitelisted.
+ //
+ using traits = u32string::traits_type;
+
+ if (whitelist_ != nullptr &&
+ traits::find (whitelist_, traits::length (whitelist_), codepoint_) !=
+ nullptr)
+ return make_pair (true, true); // Valid codepoint.
+
+ // Now check if the codepoint type matches the specified set. Note: also
+ // covers the `types_ == codepoint_types::none` case.
+ //
+ codepoint_types t (codepoint_type (codepoint_));
+
+ if ((t & types_) != codepoint_types::none)
+ return make_pair (true, true); // Valid codepoint.
+
+ if (what != nullptr)
+ *what = "invalid Unicode codepoint (" + to_string (t) + ")";
+
+ return make_pair (false, true); // Invalid codepoint.
+ }
+
+ inline std::pair<bool, bool> utf8_validator::
+ recover (char c)
+ {
+ // We are recovered if the character can be interpreted as a sequence
+ // leading byte.
+ //
+ // As an optimization, bail out if the byte is a sequence trailing byte
+ // (10xxxxxx).
+ //
+ if ((c & 0xC0) == 0x80)
+ return std::make_pair (false, false); // Invalid byte.
+
+ seq_index_ = 0; // Restart: validate c as a sequence first byte.
+ return validate (c);
+ }
+
+ inline char32_t utf8_validator::
+ codepoint () const
+ {
+ return codepoint_; // Built incrementally by validate(); no checks here.
+ }
+}
diff --git a/libbutl/utf8.mxx b/libbutl/utf8.mxx
new file mode 100644
index 0000000..15e8ded
--- /dev/null
+++ b/libbutl/utf8.mxx
@@ -0,0 +1,130 @@
+// file : libbutl/utf8.mxx -*- C++ -*-
+// license : MIT; see accompanying LICENSE file
+
+#ifndef __cpp_modules_ts
+#pragma once
+#endif
+
+// C includes.
+
+#ifndef __cpp_lib_modules_ts
+#include <string>
+#include <cstdint> // uint8_t
+#include <utility> // pair
+#endif
+
+// Other includes.
+
+#ifdef __cpp_modules_ts
+export module butl.utf8;
+#ifdef __cpp_lib_modules_ts
+import std.core;
+#endif
+import butl.unicode;
+#else
+#include <libbutl/unicode.mxx>
+#endif
+
+#include <libbutl/export.hxx>
+
+LIBBUTL_MODEXPORT namespace butl
+{
+ // Here and below we will refer to bytes that encode a single Unicode
+ // codepoint as "UTF-8 byte sequence" ("UTF-8 sequence" or "byte sequence"
+ // for short) and a sequence of such sequences as "UTF-8 encoded byte
+ // string" ("byte string" for short).
+ //
+
+ // Validate a UTF-8 encoded byte string one byte at a time. Optionally, also
+ // validate that its decoded codepoints belong to the specified types or
+ // codepoint whitelist.
+ //
+ class utf8_validator
+ {
+ public:
+ // Note: use whitelist via shallow copy.
+ //
+ explicit
+ utf8_validator (codepoint_types = codepoint_types::any,
+ const char32_t* whitelist = nullptr);
+
+ // Validate the next byte returning true if it is valid (first) and
+ // whether it is the last byte of a codepoint (second). The {false, true}
+ // result indicates a byte sequence decoded into a codepoint of undesired
+ // type rather than an invalid byte that happens to be the last in the
+ // sequence (and may well be a valid starting byte of the next sequence).
+ //
+ // Note that in case the byte is invalid, calling this function again
+ // without recovery is illegal.
+ //
+ std::pair<bool, bool>
+ validate (char);
+
+ // As above but in case of an invalid byte also return the description of
+ // why it is invalid.
+ //
+ // Note that the description only contains the reason why the specified
+ // byte is not part of a valid UTF-8 sequence or the desired codepoint
+ // type, for example:
+ //
+ // "invalid UTF-8 sequence first byte (0xB0)"
+ // "invalid Unicode codepoint (reserved)"
+ //
+ // It can be used to form complete diagnostics along these lines:
+ //
+ // cerr << "invalid manifest value " << name << ": " << what << endl;
+ //
+ std::pair<bool, bool>
+ validate (char, std::string& what);
+
+ // As above but decide whether the description is needed at runtime (what
+ // may be NULL).
+ //
+ std::pair<bool, bool>
+ validate (char, std::string* what);
+
+ // Recover from an invalid byte.
+ //
+ // This function must be called with the first invalid and then subsequent
+ // bytes until it signals that the specified byte is valid. Note that it
+ // shall not be called if the sequence is decoded into a codepoint of an
+ // undesired type.
+ //
+ // Note also that a byte being invalid in the middle of a UTF-8 sequence
+ // may be valid as a first byte of the next sequence.
+ //
+ std::pair<bool, bool>
+ recover (char);
+
+ // Return the codepoint of the last byte sequence.
+ //
+ // This function can only be legally called after validate() or recover()
+ // signal that the preceding byte is valid and last.
+ //
+ char32_t
+ codepoint () const;
+
+ private:
+ codepoint_types types_;
+ const char32_t* whitelist_;
+
+ // State machine. Note: the types are std-qualified since <cstdint> is
+ // only guaranteed to declare them in namespace std.
+ std::uint8_t seq_size_; // [1 4]; calculated at the first byte validation.
+ std::uint8_t seq_index_ = 0; // [0 3]
+
+ // Last byte sequence decoded codepoint (built incrementally).
+ //
+ char32_t codepoint_;
+
+ // The byte range a valid UTF-8 sequence second byte must belong to as
+ // calculated during the first byte validation.
+ //
+ // Note that the subsequent (third and fourth) bytes must belong to the
+ // [80 BF] range regardless of the previous bytes.
+ //
+ std::pair<unsigned char, unsigned char> byte2_range_;
+ };
+}
+
+#include <libbutl/utf8.ixx>
diff --git a/libbutl/utility.cxx b/libbutl/utility.cxx
index ce78295..d6a21c6 100644
--- a/libbutl/utility.cxx
+++ b/libbutl/utility.cxx
@@ -35,6 +35,9 @@ import std.io;
#endif
#endif
+import butl.utf8;
+#else
+#include <libbutl/utf8.mxx>
#endif
namespace butl
@@ -191,6 +194,135 @@ namespace butl
}
void
+ to_utf8 (string& s, char repl, codepoint_types ts, const char32_t* wl)
+ {
+ using iterator = string::iterator;
+
+ utf8_validator val (ts, wl);
+
+ iterator i (s.begin ()); // Source current position.
+ iterator e (s.end ()); // Source end position.
+ iterator d (i); // Destination current position.
+ iterator b (d); // Begin of the current destination sequence.
+
+ // Replace the current byte and prepare for the next sequence.
+ //
+ auto replace_byte = [&d, &b, repl] ()
+ {
+ *d++ = repl;
+ b = d;
+ };
+
+ // Replace bytes of the current sequence excluding the current byte and
+ // prepare for the next sequence.
+ //
+ auto replace_sequence = [&d, &b, repl] ()
+ {
+ for (; b != d; ++b)
+ *b = repl; // On the loop exit b == d.
+ };
+
+ // Replace sequence bytes with a single replacement byte and prepare for
+ // the next sequence.
+ //
+ auto replace_codepoint = [&d, &b, &replace_byte] ()
+ {
+ d = b; // Rewind to the beginning of the sequence.
+ replace_byte ();
+ };
+
+ // Iterate over the byte string appending valid bytes, replacing invalid
+ // bytes/codepoints, and recovering after invalid bytes.
+ //
+ for (; i != e; ++i)
+ {
+ char c (*i);
+ pair<bool, bool> v (val.validate (c)); // {valid, last byte in sequence}
+
+ // Append a valid byte and prepare for the next sequence if the sequence
+ // end is reached.
+ //
+ auto append_byte = [&d, &b, &v, &c] ()
+ {
+ *d++ = c;
+
+ if (v.second) // Sequence last byte?
+ b = d;
+ };
+
+ // If this is a valid byte/codepoint, then append the byte and proceed
+ // to the next string byte.
+ //
+ if (v.first)
+ {
+ append_byte ();
+ continue;
+ }
+
+ // If this is an invalid codepoint, then replace the sequence with a
+ // single replacement character and proceed to the next byte sequence
+ // (no recovery is necessary).
+ //
+ if (v.second)
+ {
+ replace_codepoint ();
+ continue;
+ }
+
+ // Now, given this is an invalid byte, replace the current sequence
+ // bytes and recover.
+ //
+ replace_sequence ();
+
+ // Stay in the recovery cycle until a valid byte is encountered. Note
+ // that we start from where we left off, not from the next byte (see
+ // utf8_validator::recover() for details).
+ //
+ for (; i != e; ++i)
+ {
+ c = *i;
+ v = val.recover (c);
+
+ // End the recovery cycle for a valid byte.
+ //
+ if (v.first)
+ {
+ append_byte ();
+ break;
+ }
+
+ // End the recovery cycle for a decoded but invalid (ASCII-range)
+ // codepoint.
+ //
+ if (v.second)
+ {
+ replace_codepoint ();
+ break;
+ }
+
+ replace_byte ();
+ }
+
+ // Bail out if we reached the end of the byte string. Note that while we
+ // failed to recover (otherwise i != e), all the bytes are already
+ // replaced.
+ //
+ if (i == e)
+ break;
+ }
+
+ // If the last byte sequence is incomplete, then replace its bytes.
+ //
+ if (b != d)
+ replace_sequence ();
+
+ // Shrink the byte string if we replaced any invalid codepoints.
+ //
+ if (d != e)
+ s.resize (d - s.begin ()); // Keep only the bytes written so far.
+ }
+
+ void
setenv (const string& name, const string& value)
{
#ifndef _WIN32
diff --git a/libbutl/utility.ixx b/libbutl/utility.ixx
index c5fdbac..27ef7fb 100644
--- a/libbutl/utility.ixx
+++ b/libbutl/utility.ixx
@@ -2,8 +2,11 @@
// license : MIT; see accompanying LICENSE file
#ifndef __cpp_lib_modules_ts
-#include <cstdlib> // getenv()
-#include <algorithm>
+#include <cctype> // toupper(), tolower(), is*()
+#include <cwctype> // isw*()
+#include <cstdlib> // getenv()
+#include <algorithm> // for_each()
+#include <stdexcept> // invalid_argument
#endif
namespace butl
@@ -216,44 +219,84 @@ namespace butl
return sanitize_identifier (std::string (s));
}
- inline codepoint_types
- operator&= (codepoint_types& x, codepoint_types y)
+ inline bool
+ eof (std::istream& is)
{
- return x = static_cast<codepoint_types> (
- static_cast<std::uint16_t> (x) &
- static_cast<std::uint16_t> (y));
+ if (!is.fail ())
+ return false;
+
+ if (is.eof ())
+ return true;
+
+ throw std::istream::failure ("");
}
- inline codepoint_types
- operator|= (codepoint_types& x, codepoint_types y)
+ inline optional<std::size_t>
+ utf8_length_impl (const std::string& s,
+ std::string* what,
+ codepoint_types ts,
+ const char32_t* wl)
{
- return x = static_cast<codepoint_types> (
- static_cast<std::uint16_t> (x) |
- static_cast<std::uint16_t> (y));
+ using namespace std;
+
+ // Optimize for an empty string.
+ //
+ if (s.empty ())
+ return 0;
+
+ size_t r (0);
+ pair<bool, bool> v;
+ utf8_validator val (ts, wl);
+
+ for (char c: s)
+ {
+ v = val.validate (c, what);
+
+ if (!v.first) // Invalid byte?
+ return nullopt;
+
+ if (v.second) // Last byte in the sequence?
+ ++r;
+ }
+
+ // Make sure that the last UTF-8 sequence is complete.
+ //
+ if (!v.second)
+ {
+ if (what != nullptr)
+ *what = "incomplete UTF-8 sequence";
+
+ return nullopt;
+ }
+
+ return r;
}
- inline codepoint_types
- operator& (codepoint_types x, codepoint_types y)
+ inline std::size_t
+ utf8_length (const std::string& s, codepoint_types ts, const char32_t* wl)
{
- return x &= y;
+ using namespace std;
+
+ string what;
+ if (optional<size_t> r = utf8_length_impl (s, &what, ts, wl))
+ return *r;
+
+ throw invalid_argument (what);
}
- inline codepoint_types
- operator| (codepoint_types x, codepoint_types y)
+ inline bool
+ utf8 (const std::string& s,
+ std::string& what,
+ codepoint_types ts,
+ const char32_t* wl)
{
- return x |= y;
+ return utf8_length_impl (s, &what, ts, wl).has_value ();
}
inline bool
- eof (std::istream& is)
+ utf8 (const std::string& s, codepoint_types ts, const char32_t* wl)
{
- if (!is.fail ())
- return false;
-
- if (is.eof ())
- return true;
-
- throw std::istream::failure ("");
+ return utf8_length_impl (s, nullptr, ts, wl).has_value ();
}
inline optional<std::string>
diff --git a/libbutl/utility.mxx b/libbutl/utility.mxx
index 71c2860..b84e731 100644
--- a/libbutl/utility.mxx
+++ b/libbutl/utility.mxx
@@ -16,13 +16,10 @@
#include <iosfwd> // ostream
#include <istream>
#include <cstddef> // size_t
-#include <utility> // move(), forward()
+#include <utility> // move(), forward(), pair
#include <cstring> // strcmp(), strlen()
#include <exception> // exception, uncaught_exception[s]()
//#include <functional> // hash
-
-#include <cctype> // toupper(), tolower(), is*()
-#include <cwctype> // isw*()
#endif
#include <libbutl/ft/lang.hxx> // thread_local
@@ -34,8 +31,12 @@ export module butl.utility;
import std.core;
import std.io;
#endif
+import butl.utf8;
+import butl.unicode;
import butl.optional;
#else
+#include <libbutl/utf8.mxx>
+#include <libbutl/unicode.mxx>
#include <libbutl/optional.mxx>
#endif
@@ -194,41 +195,44 @@ LIBBUTL_MODEXPORT namespace butl
std::string sanitize_identifier (std::string&&);
std::string sanitize_identifier (const std::string&);
- // Return true if the string is a valid UTF-8 encoded byte sequence and,
- // optionally, its decoded codepoints belong to the specified types or to
- // the codepoint whitelist.
- //
- // Note that the Unicode Standard considers a UTF-8 byte sequence decoded
- // into a codepoint of the surrogate type as invalid. Thus, the surrogate
- // type may not be specified.
+ // Return true if the string is a valid UTF-8 encoded byte string and,
+ // optionally, its decoded codepoints belong to the specified types or
+ // codepoint whitelist.
//
- enum class codepoint_types: std::uint16_t
- {
- // Useful to only allow the whitelisted codepoints or when building the
- // type set incrementally.
- //
- none = 0x00,
-
- graphic = 0x01, // L(etter), M(ark), N(number), P(uncturation),
- // S(symbol), Zs(separator, space)
- format = 0x02,
- control = 0x04,
- private_use = 0x08,
- non_character = 0x10,
- reserved = 0x20,
-
- any = 0x3f
- };
+ bool
+ utf8 (const std::string&,
+ codepoint_types = codepoint_types::any,
+ const char32_t* whitelist = nullptr);
- LIBBUTL_SYMEXPORT bool
+ // As above but in case of an invalid sequence also return the description
+ // of why it is invalid.
+ //
+ bool
utf8 (const std::string&,
+ std::string& what,
codepoint_types = codepoint_types::any,
const char32_t* whitelist = nullptr);
- codepoint_types operator& (codepoint_types, codepoint_types);
- codepoint_types operator| (codepoint_types, codepoint_types);
- codepoint_types operator&= (codepoint_types&, codepoint_types);
- codepoint_types operator|= (codepoint_types&, codepoint_types);
+ // Return UTF-8 byte string length in codepoints. Throw
+ // std::invalid_argument if this is not a valid UTF-8.
+ //
+ std::size_t
+ utf8_length (const std::string&,
+ codepoint_types = codepoint_types::any,
+ const char32_t* whitelist = nullptr);
+
+ // Fixup the specified string (in place) to be valid UTF-8 replacing invalid
+ // bytes and codepoints with the specified character, for example, '?'.
+ //
+ // Potential future improvements:
+ // - char32_t replacement (will need UTF-8 encoding)
+ // - different replacement for bytes and codepoints
+ //
+ LIBBUTL_SYMEXPORT void
+ to_utf8 (std::string&,
+ char replacement,
+ codepoint_types = codepoint_types::any,
+ const char32_t* whitelist = nullptr);
// If an input stream is in a failed state, then return true if this is
// because of the eof and throw istream::failure otherwise. If the stream
diff --git a/tests/manifest-parser/driver.cxx b/tests/manifest-parser/driver.cxx
index 57674cb..a34f2b7 100644
--- a/tests/manifest-parser/driver.cxx
+++ b/tests/manifest-parser/driver.cxx
@@ -40,6 +40,9 @@ namespace butl
static bool
equal (const optional<pairs>& actual, const optional<pairs>& expected);
+ static pairs
+ parse (const char* m, manifest_parser::filter_function f = {});
+
// Test manifest as it is represented in the stream, including format
// version and end-of-manifest values.
//
@@ -188,6 +191,41 @@ namespace butl
assert (p.first == "" && p.second == "comment");
}
+ // UTF-8.
+ //
+ assert (test (":1\n#\xD0\xB0\n\xD0\xB0y\xD0\xB0:\xD0\xB0z\xD0\xB0",
+ {{"","1"},
+ {"\xD0\xB0y\xD0\xB0", "\xD0\xB0z\xD0\xB0"},
+ {"",""},
+ {"",""}}));
+
+ assert (fail (":1\n#\xD0\n\xD0\xB0y\xD0\xB0:\xD0\xB0z\xD0\xB0"));
+ assert (fail (":1\n#\xD0\xB0\n\xB0y\xD0\xB0:\xD0\xB0z\xD0\xB0"));
+ assert (fail (":1\n#\xD0\xB0\n\xD0y\xD0\xB0:\xD0\xB0z\xD0\xB0"));
+ assert (fail (":1\n#\xD0\xB0\n\xD0\xB0y\xD0:\xD0\xB0z\xD0\xB0"));
+ assert (fail (":1\n#\xD0\xB0\n\xD0\xB0y\xD0\xB0:\xD0z\xD0\xB0"));
+ assert (fail (":1\n#\xD0\xB0\n\xD0\xB0y\xD0\xB0:\xD0\xB0z\xD0"));
+ assert (fail (":1\r\r\xB0#\n\xD0\xB0y\xD0\xB0:\xD0\xB0z\xD0\xB0"));
+ assert (fail (":1\r\xD0#\n\xD0\xB0y\xD0\xB0:\xD0\xB0z\xD0\xB0"));
+ assert (fail (":1\n#\xD0\xB0\n\xD0\xB0y\xD0\xB0:\xD0\xB0z\xD0\xB0\r\xD0"));
+
+ // Test parsing failure for a manifest with multi-byte UTF-8 sequences
+ // (the column is properly reported, etc.).
+ //
+ try
+ {
+ parse (":1\na\xD0\xB0\xD0\xB0\xFE");
+ assert (false);
+ }
+ catch (const manifest_parsing& e)
+ {
+ assert (e.line == 2 &&
+ e.column == 4 &&
+ e.description ==
+ "invalid manifest name: "
+ "invalid UTF-8 sequence first byte (0xFE)");
+ }
+
// Filtering.
//
assert (test (":1\na: abc\nb: bca\nc: cab",
@@ -281,7 +319,7 @@ namespace butl
}
static pairs
- parse (const char* m, manifest_parser::filter_function f = {})
+ parse (const char* m, manifest_parser::filter_function f)
{
istringstream is (m);
is.exceptions (istream::failbit | istream::badbit);
diff --git a/tests/manifest-rewriter/driver.cxx b/tests/manifest-rewriter/driver.cxx
index fd76929..ec73d81 100644
--- a/tests/manifest-rewriter/driver.cxx
+++ b/tests/manifest-rewriter/driver.cxx
@@ -90,6 +90,10 @@ namespace butl
{{"abc", "xyz"}}) ==
":1\n abc: \\\nxyz\n\\");
+ assert (edit (":1\n a\xD0\xB0g : b",
+ {{"a\xD0\xB0g", "xyz"}}) ==
+ ":1\n a\xD0\xB0g : \\\nxyz\n\\");
+
// Test editing of manifests that contains CR characters.
//
assert (edit (":1\r\na: b\r\r\n", {{"a", "xyz"}}) == ":1\r\na: xyz\r\r\n");
diff --git a/tests/manifest-serializer/driver.cxx b/tests/manifest-serializer/driver.cxx
index 148a281..c818b4a 100644
--- a/tests/manifest-serializer/driver.cxx
+++ b/tests/manifest-serializer/driver.cxx
@@ -46,6 +46,7 @@ main ()
assert (test ({{"#", "x"}}, "# x\n"));
assert (test ({{"#", "x"},{"#", "y"},{"#", ""}}, "# x\n# y\n#\n"));
assert (fail ({{"",""},{"#", "x"}})); // serialization after eos
+ assert (fail ({{"#", "\xB0"}})); // invalid UTF-8 sequence
// Empty manifest stream.
//
@@ -89,6 +90,12 @@ main ()
assert (fail ({{"","1"},{"a b",""}}));
assert (fail ({{"","1"},{"a\tb",""}}));
assert (fail ({{"","1"},{"a\n",""}}));
+ assert (fail ({{"","1"},{"a\xB0",""}})); // invalid UTF-8 sequence
+
+ // Invalid value.
+ //
+ assert (fail ({{"","1"},{"a","\xB0"}})); // invalid UTF-8 sequence
+ assert (fail ({{"","1"},{"a","\xD0"}})); // incomplete UTF-8 sequence
// Simple value.
//
@@ -172,11 +179,22 @@ main ()
"xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx\\Y\\\n"
"yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy");
+ // Hard break after the UTF-8/delayed hard break.
+ //
+ string l6 ("xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
+ "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx\xF0\x90\x8C\x82"
+ "\xF0\x90\x8C\x82yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy");
+
+ string e6 ("xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
+ "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx\xF0\x90\x8C\x82\\\n"
+ "\xF0\x90\x8C\x82yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy");
+
assert (test ({{"","1"},{"a",l1},{"",""},{"",""}}, ": 1\na: " + e1 + "\n"));
assert (test ({{"","1"},{"a",l2},{"",""},{"",""}}, ": 1\na: " + e2 + "\n"));
assert (test ({{"","1"},{"a",l3},{"",""},{"",""}}, ": 1\na: " + e3 + "\n"));
assert (test ({{"","1"},{"a",l4},{"",""},{"",""}}, ": 1\na: " + e4 + "\n"));
assert (test ({{"","1"},{"a",l5},{"",""},{"",""}}, ": 1\na: " + e5 + "\n"));
+ assert (test ({{"","1"},{"a",l6},{"",""},{"",""}}, ": 1\na: " + e6 + "\n"));
// Multi-line value.
//
diff --git a/tests/utf8/driver.cxx b/tests/utf8/driver.cxx
index 8480dec..f35e65e 100644
--- a/tests/utf8/driver.cxx
+++ b/tests/utf8/driver.cxx
@@ -13,8 +13,10 @@
#ifdef __cpp_lib_modules_ts
import std.core;
#endif
+import butl.utf8;
import butl.utility;
#else
+#include <libbutl/utf8.mxx>
#include <libbutl/utility.mxx>
#endif
@@ -24,6 +26,17 @@ using namespace butl;
int
main ()
{
+ // utf8() tests.
+ //
+ auto utf8_error = [] (const string& s,
+ codepoint_types ts = codepoint_types::any,
+ const char32_t* wl = nullptr)
+ {
+ string error;
+ assert (!utf8 (s, error, ts, wl));
+ return error;
+ };
+
// Valid sequences.
//
// Empty.
@@ -43,18 +56,36 @@ main ()
// Ill-formed sequences.
//
+ // Long sequences.
+ //
+ assert (!utf8 ("\xF8")); // 5-byte sequence.
+ assert (!utf8 ("\xFC")); // 6-byte sequence.
+
+ assert (utf8_error ("\xF8") == "5-byte length UTF-8 sequence");
+ assert (utf8_error ("\xFC") == "6-byte length UTF-8 sequence");
+ assert (utf8_error ("\xFE") == "invalid UTF-8 sequence first byte (0xFE)");
+
// 2-byte sequences.
//
assert (!utf8 ("\xC1\x80")); // Invalid first byte.
assert (!utf8 ("\xD0y")); // Invalid second byte.
+ assert (utf8_error ("\xC1\x80") ==
+ "invalid UTF-8 sequence first byte (0xC1)");
+
+ assert (utf8_error ("\xD0y") ==
+ "invalid UTF-8 sequence second byte (0x79 'y')");
+
// 3-byte sequences.
//
assert (!utf8 ("\xE2\x70\x80")); // Invalid second byte.
assert (!utf8 ("\xE2\x80\x70")); // Invalid third byte.
- assert (!utf8 ("\xED\xA0\x80")); // Min UTF-16 surrogate value.
- assert (!utf8 ("\xED\xBF\xBF")); // Max UTF-16 surrogate value.
+ assert (!utf8 ("\xED\xA0\x80")); // Min UTF-16 surrogate.
+ assert (!utf8 ("\xED\xBF\xBF")); // Max UTF-16 surrogate.
+
+ assert (utf8_error ("\xE2\x80\x70") ==
+ "invalid UTF-8 sequence third byte (0x70 'p')");
// 4-byte sequences.
//
@@ -63,9 +94,8 @@ main ()
assert (!utf8 ("\xF0\x90\x70\x80")); // Invalid third byte.
assert (!utf8 ("\xF1\x80\x80\xC0")); // Invalid forth byte.
- // Out of the codepoint range (0x10ffff + 1).
- //
- assert (!utf8 ("\xF4\x90\x80\x80"));
+ assert (utf8_error ("\xF1\x80\x80\xC0") ==
+ "invalid UTF-8 sequence forth byte (0xC0)");
// Incomplete sequences.
//
@@ -73,14 +103,25 @@ main ()
assert (!utf8 ("\xE4\xBA")); // 3-byte sequence.
assert (!utf8 ("\xF0\x90\x8C")); // 4-byte sequence.
+ assert (utf8_error ("\xD0") == "incomplete UTF-8 sequence");
+
// Missing sequence leading bytes.
//
- assert (!utf8 ("\xB0xyz")); // 2-byte sequence.
- assert (!utf8 ("\xBA\x8Cxyz")); // 3-byte sequence.
- assert (!utf8 ("\x8Cxyz")); // 3-byte sequence.
- assert (!utf8 ("\x90\x8C\x82xyz")); // 4-byte sequence.
- assert (!utf8 ("\x8C\x82xyz")); // 4-byte sequence.
- assert (!utf8 ("\x82xyz")); // 4-byte sequence.
+ assert (!utf8 ("\xB0xyz")); // 2-byte sequence.
+ assert (!utf8 ("\xBA\x8C\xD0\xB0yz")); // 3-byte sequence.
+ assert (!utf8 ("\x8Cxyz")); // 3-byte sequence.
+ assert (!utf8 ("\x90\x8C\x82xyz")); // 4-byte sequence.
+ assert (!utf8 ("\x8C\x82xyz")); // 4-byte sequence.
+ assert (!utf8 ("\x82xyz")); // 4-byte sequence.
+
+ assert (utf8_error ("\xB0") == "invalid UTF-8 sequence first byte (0xB0)");
+
+ // Above the valid codepoint range (0x10ffff + 1).
+ //
+ assert (!utf8 ("\xF4\x90\x80\x80"));
+
+ assert (utf8_error ("\xF4\x90\x80\x80") ==
+ "invalid UTF-8 sequence second byte (0x90)");
// Whitelisting.
//
@@ -145,6 +186,9 @@ main ()
assert (!utf8 ("\xF3\xA1\x80\x80", codepoint_types::graphic)); // Reserved.
assert (!utf8 ("\xF3\xA0\x81\xBF", codepoint_types::graphic)); // Format.
+ assert (utf8_error ("\xF3\xA0\x81\xBF", codepoint_types::graphic) ==
+ "invalid Unicode codepoint (format)");
+
assert (!utf8 ("\xC2\xAC", codepoint_types::format)); // Graphic.
// Private-use & Graphic.
@@ -153,4 +197,145 @@ main ()
codepoint_types::format));
assert (!utf8 ("a", codepoint_types::none)); // None.
+
+ assert (utf8_error ("a", codepoint_types::none) ==
+ "invalid Unicode codepoint (graphic)");
+
+ // UTF-8 string length.
+ //
+ auto invalid_utf8 = [] (string s, codepoint_types ts = codepoint_types::any)
+ {
+ try
+ {
+ utf8_length (s, ts);
+ return false;
+ }
+ catch (const invalid_argument&)
+ {
+ return true;
+ }
+ };
+
+ assert (utf8_length ("") == 0);
+ assert (utf8_length ("x\xD0\xB0\xE4\xBA\x8C\xF0\x90\x8C\x82y") == 5);
+
+ assert (invalid_utf8 ("\xFE")); // Invalid byte.
+ assert (invalid_utf8 ("\xD0")); // Incomplete.
+ assert (invalid_utf8 ("\n", codepoint_types::graphic)); // Invalid codepoint.
+
+ // to_utf8() tests.
+ //
+ auto roundtrip = [] (const char* s)
+ {
+ string r (s);
+ to_utf8 (r, '?');
+ return r == s;
+ };
+
+ auto sanitize = [] (string s, codepoint_types ts = codepoint_types::any)
+ {
+ to_utf8 (s, '?', ts);
+ return s;
+ };
+
+ // Empty.
+ //
+ assert (roundtrip (""));
+
+ // 1 code point.
+ //
+ assert (roundtrip ("a")); // 1 byte.
+ assert (roundtrip ("\xD0\xB0")); // 2 bytes.
+ assert (roundtrip ("\xE4\xBA\x8C")); // 3 bytes.
+ assert (roundtrip ("\xF0\x90\x8C\x82")); // 4 bytes.
+
+ // Multiple code points.
+ //
+ assert (roundtrip ("a\xD0\xB0\xE4\xBA\x8C\xF0\x90\x8C\x82"));
+
+ // Ill-formed sequences.
+ //
+ // Long sequence.
+ //
+ assert (sanitize ("\xF8") == "?"); // 5-byte sequence.
+
+ // Invalid first byte followed by a second byte which ...
+ //
+ assert (sanitize ("\xC1\x80") == "??"); // is a trailing byte.
+ assert (sanitize ("\xC1y") == "?y"); // starts 1-byte sequence.
+ assert (sanitize ("\xC1\xD0\xB0") == "?\xD0\xB0"); // starts 2-byte sequence.
+ assert (sanitize ("\xC1\xFE") == "??"); // is not UTF-8.
+
+ // Invalid second byte which ...
+ //
+ assert (sanitize ("\xD0y") == "?y"); // starts 1-byte sequence.
+ assert (sanitize ("\xD0\xD0\xB0") == "?\xD0\xB0"); // starts 2-byte sequence.
+ assert (sanitize ("\xD0\xFE") == "??"); // is not UTF-8.
+
+ // Incomplete sequences.
+ //
+ assert (sanitize ("\xD0") == "?"); // 2-byte sequence.
+ assert (sanitize ("y\xD0") == "y?"); // 2-byte sequence.
+ assert (sanitize ("\xE4\xBA") == "??"); // 3-byte sequence.
+ assert (sanitize ("\xD0\xD0") == "??"); // 2-byte sequence.
+
+ // Incomplete recovery.
+ //
+ assert (sanitize ("\xD0\xFE") == "??"); // 2-byte sequence.
+ assert (sanitize ("\xD0\xFE\xFE") == "???"); // 2-byte sequence.
+
+ assert (sanitize ("\xF4\x90\x80\x80") == "????"); // Above the range.
+ assert (sanitize ("\xED\xA0\x80") == "???"); // Min UTF-16 surrogate.
+ assert (sanitize ("\xED\xBF\xBF") == "???"); // Max UTF-16 surrogate.
+
+ // Invalid codepoints.
+ //
+ auto sanitize_g = [&sanitize] (string s)
+ {
+ return sanitize (move (s), codepoint_types::graphic);
+ };
+
+ assert (sanitize_g ("\xEF\xB7\x90") == "?");
+ assert (sanitize_g ("y\xEF\xB7\x90") == "y?");
+ assert (sanitize_g ("\xEF\xB7\x90y") == "?y");
+
+ // Invalid during recovery.
+ //
+ assert (sanitize_g ("\xD0\n") == "??");
+ assert (sanitize_g ("\xD0\ny") == "??y");
+ assert (sanitize_g ("\xD0\xFE\n") == "???");
+
+ assert (sanitize_g ("\xD0\xEF\xB7\x90") == "??");
+
+ // utf8_validator::codepoint() tests.
+ //
+ {
+ u32string r;
+ size_t invalid_codepoints (0);
+
+ string s ("a"
+ "\xD0\xB0"
+ "\n" // Control.
+ "\xE4\xBA\x8C"
+ "\xEE\x80\x80" // Private-use.
+ "\xF0\x90\x8C\x82");
+
+ utf8_validator val (codepoint_types::graphic);
+
+ for (char c: s)
+ {
+ pair<bool, bool> v (val.validate (c));
+
+ if (v.first)
+ {
+ if (v.second)
+ r.push_back (val.codepoint ());
+ }
+ else
+ ++invalid_codepoints;
+ }
+
+ assert (r == U"a\x430\x4E8C\x10302");
+ assert (invalid_codepoints == 2);
+ }
}