diff options
author | Karen Arutyunov <karen@codesynthesis.com> | 2019-08-24 17:41:30 +0300 |
---|---|---|
committer | Karen Arutyunov <karen@codesynthesis.com> | 2019-08-28 15:01:48 +0300 |
commit | 4bdf53837e010073de802070d4e6087410662d3e (patch) | |
tree | 2820d3964877d1a7d498833da325aa3d3a699353 /libbuild2/cc/lexer.hxx | |
parent | ea24f530048cbce0c5335ca3fd3632c8ce34315a (diff) |
Move cc build system module to separate library
Diffstat (limited to 'libbuild2/cc/lexer.hxx')
-rw-r--r-- | libbuild2/cc/lexer.hxx | 190 |
1 files changed, 190 insertions, 0 deletions
diff --git a/libbuild2/cc/lexer.hxx b/libbuild2/cc/lexer.hxx new file mode 100644 index 0000000..cb2b3a5 --- /dev/null +++ b/libbuild2/cc/lexer.hxx @@ -0,0 +1,190 @@ +// file : libbuild2/cc/lexer.hxx -*- C++ -*- +// copyright : Copyright (c) 2014-2019 Code Synthesis Ltd +// license : MIT; see accompanying LICENSE file + +#ifndef LIBBUILD2_CC_LEXER_HXX +#define LIBBUILD2_CC_LEXER_HXX + +#include <libbutl/sha256.mxx> +#include <libbutl/char-scanner.mxx> + +#include <libbuild2/types.hxx> +#include <libbuild2/utility.hxx> + +#include <libbuild2/diagnostics.hxx> + +namespace build2 +{ + namespace cc + { + // Preprocessor-level tokenization of C/C++ source. In other words, the + // sequence of tokens returned is similar to what a real C/C++ compiler + // would see from its preprocessor. + // + // The input is a (partially-)preprocessed translation unit that may still + // contain comments, line continuations, and preprocessor directives such + // as #line, #pragma, but not #include (which is diagnosed). Currently, + // all preprocessor directives except #line are ignored and no values are + // saved from literals. The #line directive (and its shorthand notation) + // is recognized to provide the logical token location. + // + // While at it we also calculate the checksum of the input ignoring + // comments, whitespaces, etc. This is used to detect changes that do not + // alter the resulting token stream. + // + enum class token_type + { + // NOTE: remember to update operator<<() if changing anything here! + // + eos, + + dot, // . + semi, // ; + less, // < + greater, // > + lcbrace, // { + rcbrace, // } + + punctuation, // Other punctuation. + + identifier, + + number, // Number literal. + character, // Char literal. + string, // String literal. + + other // Other token. + }; + + struct token + { + token_type type = token_type::eos; + string value; + + // Logical position. + // + path file; + uint64_t line = 0; + uint64_t column = 0; + + // Physical position in the stream, currently only for identifiers. + // + uint64_t position = 0; + }; + + // Output the token value in a format suitable for diagnostics. + // + ostream& + operator<< (ostream&, const token&); + + class lexer: protected butl::char_scanner + { + public: + lexer (ifdstream& is, const path& name) + : char_scanner (is, false), + name_ (name), + fail ("error", &name_), + log_file_ (name) {} + + const path& + name () const {return name_;} + + string + checksum () const {return cs_.string ();} + + // Note that it is ok to call next() again after getting eos. + // + token + next () + { + token t; + next (t, skip_spaces (), true); + return t; + } + + // As above but reuse the token to avoid a (potential) memory + // allocation. Typical usage: + // + // for (token t; l.next (t) != token_type::eos; ) + // ... + // + token_type + next (token& t) + { + next (t, skip_spaces (), true); + return t.type; + } + + private: + void + next (token&, xchar, bool); + + void + number_literal (token&, xchar); + + void + char_literal (token&, xchar); + + void + string_literal (token&, xchar); + + void + raw_string_literal (token&, xchar); + + void + literal_suffix (xchar); + + void + line_directive (token&, xchar); + + xchar + skip_spaces (bool newline = true); + + // The char_scanner adaptation for newline escape sequence processing. + // Enabled by default and is only disabled in the raw string literals. + // + private: + using base = char_scanner; + + xchar + peek (bool escape = true); + + xchar + get (bool escape = true); + + void + get (const xchar& peeked); + + // Hashing versions. + // + xchar + geth (bool escape = true); + + void + geth (const xchar& peeked); + + private: + const path name_; + const fail_mark fail; + + // Logical file and line as set by the #line directives. Note that the + // lexer diagnostics still uses the physical file/lines. + // + path log_file_; + optional<uint64_t> log_line_; + + string tmp_file_; + sha256 cs_; + }; + + // Diagnostics plumbing. + // + inline location + get_location (const token& t, const void* = nullptr) + { + return location (&t.file, t.line, t.column); + } + } +} + +#endif // LIBBUILD2_CC_LEXER_HXX |