From 5ae9686adac1508873f2d980e84becd3496244c2 Mon Sep 17 00:00:00 2001 From: Karen Arutyunov Date: Wed, 26 Feb 2020 17:16:45 +0300 Subject: Add notion of validator to char_scanner and make sure manifest is UTF-8 This involves implementing utf8_validator and UTF-8 utility functions and using them during the manifest parsing, serialization, and rewriting. --- libbutl/char-scanner.txx | 146 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 146 insertions(+) create mode 100644 libbutl/char-scanner.txx (limited to 'libbutl/char-scanner.txx') diff --git a/libbutl/char-scanner.txx b/libbutl/char-scanner.txx new file mode 100644 index 0000000..d4e2082 --- /dev/null +++ b/libbutl/char-scanner.txx @@ -0,0 +1,146 @@ +// file : libbutl/char-scanner.txx -*- C++ -*- +// license : MIT; see accompanying LICENSE file + +#ifndef __cpp_lib_modules_ts +#include // move +#endif + +namespace butl +{ + template + char_scanner:: + char_scanner (std::istream& is, + validator_type v, + bool crlf, + std::uint64_t l, + std::uint64_t p) + : line (l), + column (1), + position (p), + is_ (is), + val_ (std::move (v)), + buf_ (dynamic_cast (is.rdbuf ())), + gptr_ (nullptr), + egptr_ (nullptr), + crlf_ (crlf) + { + } + + template + auto char_scanner:: + peek (std::string* what) -> xchar + { + if (unget_) + return ungetc_; + + if (unpeek_) + return unpeekc_; + + if (eos_) + return xchar (xchar::traits_type::eof (), line, column, position); + + int_type v (peek_ ()); + + if (v == xchar::traits_type::eof ()) + { + if (!decoded_) + { + if (what != nullptr) + *what = "unexpected end of stream"; + + v = xchar::invalid (); + } + + eos_ = true; + } + else + { + auto valid = [what, this] (int_type v) + { + if (validated_) + return true; + + char c (xchar::traits_type::to_char_type (v)); + std::pair r (what != nullptr + ? val_.validate (c, *what) + : val_.validate (c)); + + decoded_ = r.second; + validated_ = true; + return r.first; + }; + + if (!valid (v)) + v = xchar::invalid (); + else if (crlf_ && v == '\r') + { + // Note that '\r' is a valid character (otherwise we won't be here), + // so we don't validate it again below. We also postpone the + // validation of the next non-'\r' character (except EOF) until the + // next peek() call. + // + int_type v1; + do + { + get_ (); // Sets validated_ to false. + v1 = peek_ (); + } + while (v1 == '\r'); + + if (v1 != '\n') + { + // We need to make sure subsequent calls to peek() return newline. + // + unpeek_ = true; + unpeekc_ = xchar ('\n', line, column, position); + + // Note that the previous character is decoded ('\r') and so EOF is + // legitimate. + // + if (v1 == xchar::traits_type::eof ()) + eos_ = true; + } + + v = '\n'; + } + } + + return xchar (v, line, column, position); + } + + template + void char_scanner:: + get (const xchar& c) + { + if (unget_) + unget_ = false; + else + { + if (unpeek_) + { + unpeek_ = false; + } + // When is_.get () returns eof, the failbit is also set (stupid, + // isn't?) which may trigger an exception. To work around this + // we will call peek() first and only call get() if it is not + // eof. But we can only call peek() on eof once; any subsequent + // calls will spoil the failbit (even more stupid). + // + else if (!eos (c)) + get_ (); + + if (!eos (c)) + { + if (c == '\n') + { + line++; + column = 1; + } + else if (decoded_) // The character is the last in a sequence? + column++; + + position = pos_ (); + } + } + } +} -- cgit v1.1