From e6d92a1fb21232ab09886431d39ccb8a95c7c68d Mon Sep 17 00:00:00 2001 From: Boris Kolpackov Date: Thu, 11 Dec 2014 13:57:42 +0200 Subject: Initial lexer implementation for buildfiles --- build/bd.cxx | 62 ++++++++++++++++ build/buildfile | 1 + build/lexer | 98 +++++++++++++++++++++++++ build/lexer.cxx | 220 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ build/target | 3 +- build/token | 55 ++++++++++++++ 6 files changed, 438 insertions(+), 1 deletion(-) create mode 100644 build/buildfile create mode 100644 build/lexer create mode 100644 build/lexer.cxx create mode 100644 build/token diff --git a/build/bd.cxx b/build/bd.cxx index c592d64..33ee02f 100644 --- a/build/bd.cxx +++ b/build/bd.cxx @@ -6,6 +6,7 @@ #include #include +#include #include #include #include @@ -15,6 +16,9 @@ #include #include +#include +#include + using namespace std; namespace build @@ -132,6 +136,64 @@ main (int argc, char* argv[]) // tzset (); + // Parse buildfile. + // + path bf ("buildfile"); + + ifstream ifs (bf.string ().c_str ()); + if (!ifs.is_open ()) + { + cerr << "error: unable to open " << bf << " in read mode" << endl; + return 1; + } + + ifs.exceptions (ifstream::failbit | ifstream::badbit); + lexer l (ifs, bf.string ()); + + try + { + for (token t (l.next ());; t = l.next ()) + { + cout << t.line () << ':' << t.column () << ": "; + + switch (t.type ()) + { + case token_type::eos: cout << ""; break; + case token_type::punctuation: + { + switch (t.punctuation ()) + { + case token_punctuation::newline: cout << "\\n"; break; + case token_punctuation::colon: cout << ':'; break; + case token_punctuation::lcbrace: cout << '{'; break; + case token_punctuation::rcbrace: cout << '}'; break; + } + break; + } + case token_type::name: cout << '\'' << t.name () << '\''; break; + } + + cout << endl; + + if (t.type () == token_type::eos) + break; + } + } + catch (const lexer_error&) + { + return 1; // The lexer has already issued the diagnostics.
+ } + catch (const std::ios_base::failure&) + { + cerr << "error: failed to read from " << bf << endl; + return 1; + } + + return 0; + + + // Register rules. + // cxx::link cxx_link; rules.emplace (typeid (exe), cxx_link); diff --git a/build/buildfile b/build/buildfile new file mode 100644 index 0000000..ceed236 --- /dev/null +++ b/build/buildfile @@ -0,0 +1 @@ +exe{bd}: obj{bd target} diff --git a/build/lexer b/build/lexer new file mode 100644 index 0000000..987efab --- /dev/null +++ b/build/lexer @@ -0,0 +1,98 @@ +// file : build/lexer -*- C++ -*- +// copyright : Copyright (c) 2014-2015 Code Synthesis Tools CC +// license : MIT; see accompanying LICENSE file + +#ifndef BUILD_LEXER +#define BUILD_LEXER + +#include +#include +#include // uint64_t +#include + +#include + +namespace build +{ + // Handlers must assume the diagnostics have already been issued. + // + struct lexer_error: std::exception {}; + + class lexer + { + public: + lexer (std::istream& is, const std::string& name) + : is_ (is), name_ (name) {} + + token + next (); + + // Character interface. + // + private: + class xchar + { + public: + typedef std::char_traits traits_type; + typedef traits_type::int_type int_type; + typedef traits_type::char_type char_type; + + xchar (int_type v, std::uint64_t l, std::uint64_t c) + : v_ (v), l_ (l), c_ (c) {} + + operator char_type () const {return static_cast (v_);} + + int_type + value () const {return v_;} + + std::uint64_t line () const {return l_;} + std::uint64_t column () const {return c_;} + + private: + int_type v_; + std::uint64_t l_; + std::uint64_t c_; + }; + + xchar + peek (); + + xchar + get (); + + void + unget (const xchar&); + + // Tests.
+ // + bool + is_eos (const xchar& c) const + { + return c.value () == xchar::traits_type::eof (); + } + + private: + xchar + escape (); + + void + skip_spaces (); + + token + name (xchar); + + private: + std::istream& is_; + std::string name_; + + std::uint64_t l_ {1}; + std::uint64_t c_ {1}; + + bool eos_ {false}; + + bool unget_ {false}; + xchar buf_ {0, 0, 0}; + }; +} + +#endif // BUILD_LEXER diff --git a/build/lexer.cxx b/build/lexer.cxx new file mode 100644 index 0000000..101227e --- /dev/null +++ b/build/lexer.cxx @@ -0,0 +1,220 @@ +// file : build/lexer.cxx -*- C++ -*- +// copyright : Copyright (c) 2014-2015 Code Synthesis Tools CC +// license : MIT; see accompanying LICENSE file + +#include + +#include + +using namespace std; + +namespace build +{ + token lexer:: + next () + { + skip_spaces (); + + xchar c (get ()); + uint64_t ln (c.line ()), cn (c.column ()); + + if (is_eos (c)) + return token (ln, cn); + + switch (c) + { + // NOTE: remember to update name() if adding new punctuation. + // + case '\n': + { + return token (token_punctuation::newline, ln, cn); + } + case ':': + { + return token (token_punctuation::colon, ln, cn); + } + case '{': + { + return token (token_punctuation::lcbrace, ln, cn); + } + case '}': + { + return token (token_punctuation::rcbrace, ln, cn); + } + } + + // Otherwise it is a name. + // + return name (c); + } + + lexer::xchar lexer:: + escape () + { + xchar c (get ()); + + if (!is_eos (c)) + return c; + + cerr << name_ << ':' << c.line () << ':' << c.column () << ": error: " << + "unterminated escape sequence" << endl; + throw lexer_error (); + } + + void lexer:: + skip_spaces () + { + xchar c (peek ()); + bool start (c.column () == 1); + + for (; !is_eos (c); c = peek ()) + { + switch (c) + { + case ' ': + case '\t': + break; + case '\n': + { + // Skip empty lines. + // + if (start) + break; + + return; + } + case '#': + { + get (); + + // Read until newline or eos.
+ // + for (c = peek (); !is_eos (c) && c != '\n'; c = peek ()) + get (); + continue; + } + case '\\': + { + get (); + + if (peek () == '\n') + break; + + unget (c); + // Fall through. + } + default: + return; // Not a space. + } + + get (); + } + } + + token lexer:: + name (xchar c) + { + uint64_t ln (c.line ()), cn (c.column ()); + string lexeme; + lexeme += (c != '\\' ? c : escape ()); + + for (c = peek (); !is_eos (c); c = peek ()) + { + switch (c) + { + case ' ': + case '\t': + case '\n': + case ':': + case '{': + case '}': + case '#': + { + break; + } + case '\\': + { + get (); + lexeme += escape (); + continue; + } + default: + { + get (); + lexeme += c; + continue; + } + } + + break; + } + + return token (lexeme, ln, cn); + } + + lexer::xchar lexer:: + peek () + { + if (unget_) + return buf_; + else + { + if (eos_) + return xchar (xchar::traits_type::eof (), l_, c_); + else + { + xchar::int_type v (is_.peek ()); + + if (v == xchar::traits_type::eof ()) + eos_ = true; + + return xchar (v, l_, c_); + } + } + } + + lexer::xchar lexer:: + get () + { + if (unget_) + { + unget_ = false; + return buf_; + } + else + { + // When is_.get () returns eof, the failbit is also set (stupid, + // isn't it?) which may trigger an exception. To work around this + // we will call peek() first and only call get() if it is not + // eof. But we can only call peek() on eof once; any subsequent + // calls will spoil the failbit (even more stupid). + // + xchar c (peek ()); + + if (!is_eos (c)) + { + is_.get (); + + if (c == '\n') + { + l_++; + c_ = 1; + } + else + c_++; + } + + return c; + } + } + + void lexer:: + unget (const xchar& c) + { + // Because iostream::unget cannot work once eos is reached, + // we have to provide our own implementation.
+ // + buf_ = c; + unget_ = true; + } +} diff --git a/build/target b/build/target index 3ef3192..01cddc4 100644 --- a/build/target +++ b/build/target @@ -11,6 +11,7 @@ #include #include #include +#include // move #include #include @@ -27,7 +28,7 @@ namespace build class target { public: - target (std::string n): name_ (n) {} + target (std::string n): name_ (std::move (n)) {} const std::string& name () const {return name_;} diff --git a/build/token b/build/token new file mode 100644 index 0000000..bade45c --- /dev/null +++ b/build/token @@ -0,0 +1,55 @@ +// file : build/token -*- C++ -*- +// copyright : Copyright (c) 2014-2015 Code Synthesis Tools CC +// license : MIT; see accompanying LICENSE file + +#ifndef BUILD_TOKEN +#define BUILD_TOKEN + +#include +#include // size_t +#include // uint64_t +#include +#include // move + +namespace build +{ + enum class token_type {eos, name, punctuation}; + enum class token_punctuation {newline, colon, lcbrace, rcbrace}; + + class token + { + public: + token_type + type () const {return t_;} + + std::string const& + name () const {assert (t_ == token_type::name); return n_;} + + token_punctuation + punctuation () const {assert (t_ == token_type::punctuation); return p_;} + + std::uint64_t line () const {return l_;} + std::uint64_t column () const {return c_;} + + public: + token (std::uint64_t l, std::uint64_t c) + : t_ (token_type::eos), l_ (l), c_ (c) {} + + token (std::string n, std::uint64_t l, std::uint64_t c) + : t_ (token_type::name), n_ (std::move (n)), l_ (l), c_ (c) {} + + token (token_punctuation p, std::uint64_t l, std::uint64_t c) + : t_ (token_type::punctuation), p_ (p), l_ (l), c_ (c) {} + + private: + token_type t_; + + token_punctuation p_; + std::string n_; + + std::uint64_t l_; + std::uint64_t c_; + }; +} + +#endif // BUILD_TOKEN -- cgit v1.1