aboutsummaryrefslogtreecommitdiff
path: root/libbutl/unicode.cxx
diff options
context:
space:
mode:
author Karen Arutyunov <karen@codesynthesis.com> 2020-02-26 17:16:45 +0300
committer Karen Arutyunov <karen@codesynthesis.com> 2020-02-26 17:17:49 +0300
commit 5ae9686adac1508873f2d980e84becd3496244c2 (patch)
tree d7c88e678b29ed6bb7ae30b74bd01aa2b5d2e9a8 /libbutl/unicode.cxx
parent afb726d2d59b3715960a8647738860f40e37cf4f (diff)
Add notion of validator to char_scanner and make sure manifest is UTF-8
This involves implementing utf8_validator and UTF-8 utility functions and using them during the manifest parsing, serialization, and rewriting.
Diffstat (limited to 'libbutl/unicode.cxx')
-rw-r--r--libbutl/unicode.cxx165
1 files changed, 165 insertions, 0 deletions
diff --git a/libbutl/unicode.cxx b/libbutl/unicode.cxx
new file mode 100644
index 0000000..4219846
--- /dev/null
+++ b/libbutl/unicode.cxx
@@ -0,0 +1,165 @@
+// file : libbutl/unicode.cxx -*- C++ -*-
+// license : MIT; see accompanying LICENSE file
+
+#ifndef __cpp_modules_ts
+#include <libbutl/unicode.mxx>
+#endif
+
+#ifndef __cpp_lib_modules_ts
+#include <string>
+#include <ostream>
+#include <cstdint>
+
+#include <cstddef> // size_t
+#include <utility> // pair
+#include <algorithm> // lower_bound()
+#endif
+
+#ifdef __cpp_modules_ts
+module butl.unicode;
+
+// Only imports additional to interface.
+#ifdef __clang__
+#ifdef __cpp_lib_modules_ts
+import std.core;
+import std.io;
+#endif
+#endif
+
+#endif
+
+using namespace std;
+
+namespace butl
+{
+ // Sorted arrays of the Unicode codepoint ranges corresponding to the
+ // codepoint types (see the Types of Code Points table in the Unicode 12.0
+ // Standard for details). Note that the codepoint type range lists (but not
+ // the ranges themselves) may overlap.
+ //
+ // Also note that the graphic type codepoints are numerous and scattered.
+ // Thus, we will consider a codepoint to be of the graphic type if it is not
+ // of any other type.
+ //
+ using codepoint_range = pair<char32_t, char32_t>; // Inclusive [first, last].
+
+ static const codepoint_range cn_rs[] = // Control (Cc).
+ {
+ {0x00, 0x1F}, // C0 control codes.
+ {0x7F, 0x9F} // DEL and the C1 control codes.
+ };
+
+ static const codepoint_range fr_rs[] = // Format (Cf); sorted, inclusive bounds.
+ {
+ {0x000AD, 0x000AD},
+ {0x00600, 0x00605},
+ {0x0061C, 0x0061C},
+ {0x006DD, 0x006DD},
+ {0x0070F, 0x0070F},
+ {0x008E2, 0x008E2},
+ {0x0180E, 0x0180E},
+ {0x0200B, 0x0200F},
+ {0x0202A, 0x0202E},
+ {0x02060, 0x02064},
+ {0x02066, 0x0206F},
+ {0x0FEFF, 0x0FEFF},
+ {0x0FFF9, 0x0FFFB},
+ {0x110BD, 0x110BD},
+ {0x110CD, 0x110CD},
+ {0x13430, 0x13438},
+ {0x1BCA0, 0x1BCA3},
+ {0x1D173, 0x1D17A},
+ {0xE0001, 0xE0001},
+ {0xE0020, 0xE007F}
+ };
+
+ static const codepoint_range pr_rs[] = // Private-use (Co).
+ {
+ {0x00E000, 0x00F8FF}, // Range within plane 0 (the BMP).
+ {0x0F0000, 0x10FFFF} // Planes 15 and 16 in full (incl. U+nFFFE/U+nFFFF).
+ };
+
+ static const codepoint_range nc_rs[] = // Non-character (contiguous range only).
+ {
+ {0xFDD0, 0xFDEF} // U+nFFFE/U+nFFFF presumably handled by the caller; verify.
+ };
+
+ static const codepoint_range rs_rs[] = // Reserved (all ranges are in planes 3-14).
+ {
+ {0x30000, 0xE0000}, // Planes 3-13 plus the first codepoint of plane 14.
+ {0xE0002, 0xE001F},
+ {0xE0080, 0xE00FF},
+ {0xE01F0, 0xEFFFF} // Through the end of plane 14.
+ };
+
+ struct codepoint_type_ranges
+ {
+ codepoint_types type; // Type reported for codepoints in the ranges.
+ const codepoint_range* begin; // First range of the sorted array.
+ const codepoint_range* end; // One past the last range.
+ };
+
+  static const codepoint_type_ranges ct_ranges[] = // Probed in this order.
+  {
+    {
+      codepoint_types::control,
+      std::begin (cn_rs),
+      std::end (cn_rs)
+    },
+    {
+      codepoint_types::format,
+      std::begin (fr_rs),
+      std::end (fr_rs)
+    },
+    {
+      codepoint_types::private_use,
+      std::begin (pr_rs),
+      std::end (pr_rs)
+    },
+    {
+      codepoint_types::non_character,
+      std::begin (nc_rs),
+      std::end (nc_rs)
+    },
+    {
+      codepoint_types::reserved,
+      std::begin (rs_rs),
+      std::end (rs_rs)
+    }
+  };
+
+  // Classify a codepoint using the range tables above: return the type of
+  // the first table with a range containing the codepoint and the graphic
+  // type if there is none.
+  //
+  // Note that this is only a type detection fallback (see codepoint_type()).
+  //
+  codepoint_types
+  codepoint_type_lookup (char32_t c)
+  {
+    // A range is ordered before a codepoint if it ends before the codepoint
+    // (that is, it lies entirely to the left of it).
+    //
+    auto ends_before = [] (const codepoint_range& range, char32_t cp)
+    {
+      return range.second < cp;
+    };
+
+    // Note that the range lists of different types may overlap. Thus, we
+    // probe the lists in order and stop at the first match.
+    //
+    for (const codepoint_type_ranges& trs: ct_ranges)
+    {
+      // Binary-search for the first range that does not end before the
+      // codepoint: it either contains the codepoint or lies to the right.
+      //
+      const codepoint_range* hit (
+        lower_bound (trs.begin, trs.end, c, ends_before));
+
+      if (hit != trs.end && hit->first <= c) // Contains the codepoint?
+        return trs.type;
+    }
+
+    return codepoint_types::graphic;
+  }
+}