aboutsummaryrefslogtreecommitdiff
path: root/libbutl/utility.cxx
diff options
context:
space:
mode:
authorKaren Arutyunov <karen@codesynthesis.com>2020-02-26 17:16:45 +0300
committerKaren Arutyunov <karen@codesynthesis.com>2020-02-26 17:17:49 +0300
commit5ae9686adac1508873f2d980e84becd3496244c2 (patch)
treed7c88e678b29ed6bb7ae30b74bd01aa2b5d2e9a8 /libbutl/utility.cxx
parentafb726d2d59b3715960a8647738860f40e37cf4f (diff)
Add notion of validator to char_scanner and make sure manifest is UTF-8
This involves implementing utf8_validator and UTF-8 utility functions and using them during the manifest parsing, serialization, and rewriting.
Diffstat (limited to 'libbutl/utility.cxx')
-rw-r--r--libbutl/utility.cxx132
1 files changed, 132 insertions, 0 deletions
diff --git a/libbutl/utility.cxx b/libbutl/utility.cxx
index ce78295..d6a21c6 100644
--- a/libbutl/utility.cxx
+++ b/libbutl/utility.cxx
@@ -35,6 +35,9 @@ import std.io;
#endif
#endif
+import butl.utf8;
+#else
+#include <libbutl/utf8.mxx>
#endif
namespace butl
@@ -191,6 +194,135 @@ namespace butl
}
// Sanitize the byte string s in place, validating it byte by byte with
// utf8_validator (constructed from ts and wl; see that class for their
// meaning). Bytes that the validator accepts are kept. A sequence that is
// reported as an invalid codepoint is collapsed into a single repl character.
// When an invalid byte is encountered, the bytes of the current sequence kept
// so far are each overwritten with repl and the validator's recovery mode is
// used until a byte is accepted again; bytes rejected during recovery are
// each replaced with repl as well. The bytes of an incomplete trailing
// sequence are also replaced with repl. Finally, the string is shortened if
// the result is shorter than the input.
//
// Note that the result is written over the input: each loop iteration
// consumes one source byte and writes at most one destination byte, so the
// destination position (d) never gets ahead of the source position (i).
//
void
+ to_utf8 (string& s, char repl, codepoint_types ts, const char32_t* wl)
+ {
+ using iterator = string::iterator;
+
+ utf8_validator val (ts, wl);
+
+ iterator i (s.begin ()); // Source current position.
+ iterator e (s.end ()); // Source end position.
+ iterator d (i); // Destination current position.
+ iterator b (d); // Begin of the current destination sequence.
+
+ // Replace the current byte and prepare for the next sequence.
+ //
+ auto replace_byte = [&d, &b, repl] ()
+ {
+ *d++ = repl;
+ b = d;
+ };
+
+ // Replace bytes of the current sequence excluding the current byte and
+ // prepare for the next sequence.
+ //
+ auto replace_sequence = [&d, &b, repl] ()
+ {
+ for (; b != d; ++b) // Overwrites [b, d) and leaves b == d.
+ *b = repl;
+ };
+
+ // Replace sequence bytes with a single replacement byte and prepare for
+ // the next sequence.
+ //
+ auto replace_codepoint = [&d, &b, &replace_byte] ()
+ {
+ d = b; // Rewind to the beginning of the sequence.
+ replace_byte ();
+ };
+
+ // Iterate over the byte string appending valid bytes, replacing invalid
+ // bytes/codepoints, and recovering after invalid bytes.
+ //
+ for (; i != e; ++i)
+ {
+ char c (*i);
+ pair<bool, bool> v (val.validate (c)); // (valid, end of sequence); see the checks below.
+
+ // Append a valid byte and prepare for the next sequence if the sequence
+ // end is reached. Note that v and c are captured by reference and so the
+ // recovery cycle below can reuse this lambda after reassigning them.
+ //
+ auto append_byte = [&d, &b, &v, &c] ()
+ {
+ *d++ = c;
+
+ if (v.second) // Sequence last byte?
+ b = d;
+ };
+
+ // If this is a valid byte/codepoint, then append the byte and proceed
+ // to the next string byte.
+ //
+ if (v.first)
+ {
+ append_byte ();
+ continue;
+ }
+
+ // If this is an invalid codepoint, then replace the sequence with a
+ // single replacement character and proceed to the next byte sequence
+ // (no recovery is necessary).
+ //
+ if (v.second)
+ {
+ replace_codepoint ();
+ continue;
+ }
+
+ // Now, given this is an invalid byte, replace the current sequence
+ // bytes and recover.
+ //
+ replace_sequence ();
+
+ // Stay in the recovery cycle until a valid byte is encountered. Note
+ // that we start from where we left off, not from the next byte (see
+ // utf8_validator::recover() for details).
+ //
+ for (; i != e; ++i)
+ {
+ c = *i;
+ v = val.recover (c);
+
+ // End the recovery cycle for a valid byte.
+ //
+ if (v.first)
+ {
+ append_byte ();
+ break;
+ }
+
+ // End the recovery cycle for a decoded but invalid (ASCII-range)
+ // codepoint.
+ //
+ if (v.second)
+ {
+ replace_codepoint ();
+ break;
+ }
+
+ replace_byte ();
+ }
+
+ // Bail out if we reached the end of the byte string. Note that while we
+ // failed to recover (otherwise i != e), all the bytes are already
+ // replaced.
+ //
+ if (i == e) // Also keeps the outer loop's ++i from stepping past e.
+ break;
+ }
+
+ // If the last byte sequence is incomplete, then replace its bytes.
+ //
+ if (b != d)
+ replace_sequence ();
+
+ // Shrink the byte string if we replaced any invalid codepoints.
+ //
+ if (d != e) // The destination ended up shorter than the source.
+ s.resize (d - s.begin ());
+ }
+
+ void
setenv (const string& name, const string& value)
{
#ifndef _WIN32