From 5ae9686adac1508873f2d980e84becd3496244c2 Mon Sep 17 00:00:00 2001 From: Karen Arutyunov Date: Wed, 26 Feb 2020 17:16:45 +0300 Subject: Add notion of validator to char_scanner and make sure manifest is UTF-8 This involves implementing utf8_validator and UTF-8 utility functions and using them during the manifest parsing, serialization, and rewriting. --- libbutl/unicode.cxx | 165 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 165 insertions(+) create mode 100644 libbutl/unicode.cxx (limited to 'libbutl/unicode.cxx') diff --git a/libbutl/unicode.cxx b/libbutl/unicode.cxx new file mode 100644 index 0000000..4219846 --- /dev/null +++ b/libbutl/unicode.cxx @@ -0,0 +1,165 @@ +// file : libbutl/unicode.cxx -*- C++ -*- +// license : MIT; see accompanying LICENSE file + +#ifndef __cpp_modules_ts +#include +#endif + +#ifndef __cpp_lib_modules_ts +#include +#include +#include + +#include // size_t +#include // pair +#include // lower_bound() +#endif + +#ifdef __cpp_modules_ts +module butl.unicode; + +// Only imports additional to interface. +#ifdef __clang__ +#ifdef __cpp_lib_modules_ts +import std.core; +import std.io; +#endif +#endif + +#endif + +using namespace std; + +namespace butl +{ + // Sorted arrays of the Unicode codepoint ranges corresponding to the + // codepoint types (see the Types of Code Points table in the Unicode 12.0 + // Standard for details). Note that code type range lists (but not ranges + // themselves) may overlap. + // + // Also note that the graphic type codepoints are numerous and scattered. + // Thus, we will consider a codepoint to be of the graphic type if it is not + // of any other type. + // + using codepoint_range = pair; + + static const codepoint_range cn_rs[] = // Control. + { + {0x00, 0x1F}, + {0x7F, 0x9F} + }; + + static const codepoint_range fr_rs[] = // Format. + { + {0x000AD, 0x000AD}, + {0x00600, 0x00605}, + {0x0061C, 0x0061C}, + {0x006DD, 0x006DD}, + {0x0070F, 0x0070F}, + {0x008E2, 0x008E2}, + {0x0180E, 0x0180E}, + {0x0200B, 0x0200F}, + {0x0202A, 0x0202E}, + {0x02060, 0x02064}, + {0x02066, 0x0206F}, + {0x0FEFF, 0x0FEFF}, + {0x0FFF9, 0x0FFFB}, + {0x110BD, 0x110BD}, + {0x110CD, 0x110CD}, + {0x13430, 0x13438}, + {0x1BCA0, 0x1BCA3}, + {0x1D173, 0x1D17A}, + {0xE0001, 0xE0001}, + {0xE0020, 0xE007F} + }; + + static const codepoint_range pr_rs[] = // Private-use. + { + {0x00E000, 0x00F8FF}, + {0x0F0000, 0x10FFFF} + }; + + static const codepoint_range nc_rs[] = // Non-character. + { + {0xFDD0, 0xFDEF} + }; + + static const codepoint_range rs_rs[] = // Reserved. + { + {0x30000, 0xE0000}, + {0xE0002, 0xE001F}, + {0xE0080, 0xE00FF}, + {0xE01F0, 0xEFFFF} + }; + + struct codepoint_type_ranges + { + codepoint_types type; + const codepoint_range* begin; + const codepoint_range* end; + }; + + static const codepoint_type_ranges ct_ranges[] = + { + { + codepoint_types::control, + cn_rs, + cn_rs + sizeof (cn_rs) / sizeof (*cn_rs) + }, + { + codepoint_types::format, + fr_rs, + fr_rs + sizeof (fr_rs) / sizeof (*fr_rs) + }, + { + codepoint_types::private_use, + pr_rs, + pr_rs + sizeof (pr_rs) / sizeof (*pr_rs) + }, + { + codepoint_types::non_character, + nc_rs, + nc_rs + sizeof (nc_rs) / sizeof (*nc_rs) + }, + { + codepoint_types::reserved, + rs_rs, + rs_rs + sizeof (rs_rs) / sizeof (*rs_rs) + } + }; + + // Return the codepoint type of a range if the codepoint value falls into + // one and the graphic type otherwise. + // + // Note that this is a type detection fallback (see codepoint_type() for + // details). + // + codepoint_types + codepoint_type_lookup (char32_t c) + { + // Note that the codepoint type range lists may overlap. Thus, we iterate + // over all of them until there is a match. + // + for (size_t i (0); i != sizeof (ct_ranges) / sizeof (*ct_ranges); ++i) + { + const codepoint_type_ranges& rs (ct_ranges[i]); + + // Find the range that either contains the codepoint or lays to the + // right of it. Note that here we assume a range to be less than a + // codepoint value if it lays to the left of the codepoint. + // + const codepoint_range* r ( + lower_bound (rs.begin, rs.end, + c, + [] (const codepoint_range& r, char32_t c) + { + return r.second < c; + })); + + if (r != rs.end && r->first <= c) // Contains the codepoint? + return rs.type; + } + + return codepoint_types::graphic; + } +} -- cgit v1.1