From e6cee3c2f9b03852ed4837f9be05e0a2fa4542a8 Mon Sep 17 00:00:00 2001 From: Karen Arutyunov Date: Mon, 30 Sep 2019 13:48:28 +0300 Subject: Move path match to path-pattern.?xx --- libbutl/path-pattern.mxx | 242 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 242 insertions(+) create mode 100644 libbutl/path-pattern.mxx (limited to 'libbutl/path-pattern.mxx') diff --git a/libbutl/path-pattern.mxx b/libbutl/path-pattern.mxx new file mode 100644 index 0000000..2d37b58 --- /dev/null +++ b/libbutl/path-pattern.mxx @@ -0,0 +1,242 @@ +// file : libbutl/path-pattern.mxx -*- C++ -*- +// copyright : Copyright (c) 2014-2019 Code Synthesis Ltd +// license : MIT; see accompanying LICENSE file + +#ifndef __cpp_modules_ts +#pragma once +#endif + +#include + +#ifndef __cpp_lib_modules_ts +#include +#include // uint16_t +#include // ptrdiff_t, size_t +#include // input_iterator_tag +#endif + +// Other includes. +#ifdef __cpp_modules_ts +export module butl.path_pattern; + +#ifdef __cpp_lib_modules_ts +import std.core; +#endif + +import butl.path; +import butl.optional; +#else +#include +#include +#endif + +#include + +LIBBUTL_MODEXPORT namespace butl +{ + // Wildcard pattern match (aka glob). + // + // The wildcard pattern contains the literal characters that match + // themselves and the wildcard characters that match a single or multiple + // characters. Currently the following wildcards are supported: + // + // * - match any number of characters (including zero) + // ? - match any single character + // [...] - match a character with a "bracket expression"; currently we only + // support literal characters and ranges (no character/equivalence + // classes, etc; see Pattern Matching Notation section of the Shell + // Command Language POSIX specification for details) + // + // Note also that currently we don't support the special characters + // backslash-escaping (as mandated by POSIX). + + // Path match/search flags. + // + enum class path_match_flags: std::uint16_t + { + // Follow symlinks. This only applies to symlinks that are matched against + // the rightmost component of the pattern. In particular, this mean that + // such symlinks will never match a directory pattern and some results can + // be missing for the recursive rightmost component. + // + // Note that this flag is only used for path_search(). + // + follow_symlinks = 0x1, + + // Make wildcard-only pattern component (e.g., `*/...`, `.../*/...`, or + // `.../*`) match absent path component. For example, with this flag + // set, the `a/*/b` pattern matches not only `a/x/b` path, but also `a/b`. + // + // Note that this does not apply to single-component patterns and the + // pattern type is always preserved. In particular, the `a/*/` pattern + // matches `a/` but not `a`. + // + // Finally, keep in mind that only absent directory components can be + // matched this way. In particular, pattern `a*/*` does not match `ab` + // (but `a*/*/` matches `ab/`). + // + match_absent = 0x2, + + none = 0 + }; + + inline path_match_flags operator& (path_match_flags, path_match_flags); + inline path_match_flags operator| (path_match_flags, path_match_flags); + inline path_match_flags operator&= (path_match_flags&, path_match_flags); + inline path_match_flags operator|= (path_match_flags&, path_match_flags); + + // Return true if name matches pattern. Both must be single path components, + // possibly with a trailing directory separator to indicate a directory. + // + // If the pattern ends with a directory separator, then it only matches a + // directory name (i.e., ends with a directory separator, but potentially + // different). Otherwise, it only matches a non-directory name (no trailing + // directory separator). + // + LIBBUTL_SYMEXPORT bool + path_match (const std::string& name, const std::string& pattern); + + // Return true if path entry matches pattern. Note that the match is + // performed literally, with no paths normalization being performed. The + // start directory is used if the first pattern component is a self-matching + // wildcard (see below for the start directory and wildcard semantics). + // + // In addition to the wildcard characters, it also recognizes the ** and *** + // wildcard sequences (see path_search() for details). + // + LIBBUTL_SYMEXPORT bool + path_match (const path& entry, + const path& pattern, + const dir_path& start = dir_path (), + path_match_flags = path_match_flags::none); + + // Return true if a name contains the wildcard characters. + // + bool + path_pattern (const std::string&); + + // Return true if a name contains the ** wildcard sequences. + // + bool + path_pattern_recursive (const std::string&); + + // Return true if a name contains the *** wildcard sequences. + // + bool + path_pattern_self_matching (const std::string&); + + // Return true if a path contains the pattern components. + // + bool + path_pattern (const path&); + + // Return the number of recursive pattern components. + // + // Knowing the number of such components allows us to make some assumptions + // regarding the search result. For example, if it is zero or one, then the + // result contains no duplicates. + // + // Also note that the result can be used as bool. + // + std::size_t + path_pattern_recursive (const path&); + + // Return true if the path is not empty and its first component is a self- + // matching pattern. + // + bool + path_pattern_self_matching (const path&); + + // Iteration over pattern terminals. + // + enum class path_pattern_term_type + { + literal, // Literal character. + question, // Question mark wildcard. + star, // Star wildcard. + bracket // Bracket expression wildcard. + }; + + class path_pattern_term + { + public: + path_pattern_term_type type; + std::string::const_iterator begin; + std::string::const_iterator end; + + std::size_t + size () const {return end - begin;} + + // Predicates. + // + bool literal () const {return type == path_pattern_term_type::literal;} + bool question () const {return type == path_pattern_term_type::question;} + bool star () const {return type == path_pattern_term_type::star;} + bool bracket () const {return type == path_pattern_term_type::bracket;} + }; + + // Return the literal terminal character. + // + char + get_literal (const path_pattern_term&); + + // Match a character against the bracket expression terminal. + // + LIBBUTL_SYMEXPORT bool + match_bracket (char, const path_pattern_term&); + + class LIBBUTL_SYMEXPORT path_pattern_iterator + { + public: + using value_type = path_pattern_term; + using pointer = const path_pattern_term*; + using reference = const path_pattern_term&; + using difference_type = std::ptrdiff_t; + using iterator_category = std::input_iterator_tag; + + explicit + path_pattern_iterator (const std::string&); + + path_pattern_iterator (std::string::const_iterator begin, + std::string::const_iterator end); + + path_pattern_iterator () = default; // Create the end iterator. + + path_pattern_iterator& operator++ () {assert (t_); next (); return *this;} + + reference operator* () const {assert (t_); return *t_;} + pointer operator-> () const {assert (t_); return &*t_;} + + friend bool + operator== (const path_pattern_iterator&, const path_pattern_iterator&); + + friend bool + operator!= (const path_pattern_iterator&, const path_pattern_iterator&); + + private: + void + next (); + + private: + // nullopt denotes the end iterator. + // + // Note that the default-constructed i_ and e_ iterators (having singular + // values) may not represent the end iterator as are not comparable for + // equality. That's why we use an absent term to represent such an + // iterator. + // + optional t_; + + std::string::const_iterator i_; + std::string::const_iterator e_; + }; + + // Range-based for loop support. + // + // for (const path_pattern_term& t: path_pattern_iterator (pattern)) ... + // + path_pattern_iterator begin (const path_pattern_iterator&); + path_pattern_iterator end (const path_pattern_iterator&); +} + +#include -- cgit v1.1