aboutsummaryrefslogtreecommitdiff
path: root/libbutl/char-scanner.mxx
blob: af4dad9abfbd3c420556237bbc235e675720c631 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
// file      : libbutl/char-scanner.mxx -*- C++ -*-
// copyright : Copyright (c) 2014-2017 Code Synthesis Ltd
// license   : MIT; see accompanying LICENSE file

#ifndef __cpp_modules
#pragma once
#endif

// C includes.

#ifndef __cpp_lib_modules
#include <string>  // char_traits
#include <cstdint> // uint64_t
#include <istream>
#endif

// Other includes.

#ifdef __cpp_modules
export module butl.char_scanner;
#ifdef __cpp_lib_modules
import std.core;
import std.io;
#endif
import butl.fdstream;
#else
#include <libbutl/fdstream.mxx>
#endif

#include <libbutl/export.hxx>

LIBBUTL_MODEXPORT namespace butl
{
  // Low-level character stream scanner. Normally used as a base for
  // higher-level lexers.
  //
  class LIBBUTL_SYMEXPORT char_scanner
  {
  public:
    // If the crlf argument is true, then recognize Windows newlines (0x0D
    // 0x0A) and convert them to just '\n' (0x0A). Note that a standalone
    // 0x0D is treated "as if" it was followed by 0x0A.
    //
    // Note also that if the stream happens to be ifdstream, then it includes
    // a number of optimizations that assume nobody else is messing with the
    // stream.
    //
    char_scanner (std::istream& is, bool crlf = true);

    char_scanner (const char_scanner&) = delete;
    char_scanner& operator= (const char_scanner&) = delete;

    // Scanner interface.
    //
  public:

    // Extended character. It includes line/column information and is capable
    // of representing EOF.
    //
    // Note that implicit conversion of EOF to char_type results in NUL
    // character (which means in most cases it is safe to compare xchar to
    // char without checking for EOF).
    //
    class xchar
    {
    public:
      using traits_type = std::char_traits<char>;
      using int_type = traits_type::int_type;
      using char_type = traits_type::char_type;

      int_type value;
      std::uint64_t line;
      std::uint64_t column;

      operator char_type () const
      {
        return value != traits_type::eof ()
          ? static_cast<char_type> (value)
          : char_type (0);
      }

      xchar (int_type v, std::uint64_t l = 0, std::uint64_t c = 0)
          : value (v), line (l), column (c) {}
    };

    xchar
    get ();

    void
    get (const xchar& peeked); // Get previously peeked character (faster).

    void
    unget (const xchar&);

    // Note that if there is an "ungot" character, peek() will return
    // that.
    //
    xchar
    peek ();

    // Tests. In the future we can add tests line alpha(), alnum(),
    // etc.
    //
    static bool
    eos (const xchar& c) {return c.value == xchar::traits_type::eof ();}

    // Line and column of the next character to be extracted from the stream
    // by peek() or get().
    //
    std::uint64_t line = 1;
    std::uint64_t column = 1;

  protected:
    using int_type = xchar::int_type;
    using char_type = xchar::char_type;

    int_type
    peek_ ();

    void
    get_ ();

  protected:
    std::istream& is_;

    fdbuf* buf_; // NULL if not ifdstream.
    const char_type* gptr_;
    const char_type* egptr_;

    bool crlf_;
    bool eos_ = false;

    bool unget_ = false;
    bool unpeek_ = false;

    xchar ungetc_ = '\0';
    xchar unpeekc_ = '\0';
  };
}

#include <libbutl/char-scanner.ixx>