From e6cee3c2f9b03852ed4837f9be05e0a2fa4542a8 Mon Sep 17 00:00:00 2001 From: Karen Arutyunov Date: Mon, 30 Sep 2019 13:48:28 +0300 Subject: Move path match to path-pattern.?xx --- libbutl/builtin.cxx | 3 + libbutl/filesystem.cxx | 410 +---------------------------------------- libbutl/filesystem.ixx | 127 ------------- libbutl/filesystem.mxx | 214 +--------------------- libbutl/path-pattern.cxx | 451 ++++++++++++++++++++++++++++++++++++++++++++++ libbutl/path-pattern.ixx | 133 ++++++++++++++ libbutl/path-pattern.mxx | 242 +++++++++++++++++++++++++ tests/wildcard/driver.cxx | 2 + 8 files changed, 840 insertions(+), 742 deletions(-) create mode 100644 libbutl/path-pattern.cxx create mode 100644 libbutl/path-pattern.ixx create mode 100644 libbutl/path-pattern.mxx diff --git a/libbutl/builtin.cxx b/libbutl/builtin.cxx index 3340271..0b7db7b 100644 --- a/libbutl/builtin.cxx +++ b/libbutl/builtin.cxx @@ -52,12 +52,15 @@ import butl.timestamp; import butl.regex; import butl.path_io; +import butl.utility; // operator<<(ostream,exception), + // throw_generic_error() import butl.optional; import butl.filesystem; import butl.small_vector; #else #include #include +#include #include #include #include diff --git a/libbutl/filesystem.cxx b/libbutl/filesystem.cxx index 832e1b6..b66dcfe 100644 --- a/libbutl/filesystem.cxx +++ b/libbutl/filesystem.cxx @@ -62,9 +62,10 @@ import std.core; #endif import butl.path; import butl.timestamp; +import butl.path_pattern; #endif -import butl.utility; // throw_generic_error(), lcase()[_WIN32] +import butl.utility; // throw_generic_error() import butl.fdstream; import butl.small_vector; #else @@ -1557,303 +1558,6 @@ namespace butl } #endif - // patterns - // - static inline bool - match (char c, char pc) - { -#ifndef _WIN32 - return c == pc; -#else - return lcase (c) == lcase (pc); -#endif - } - - bool - match_bracket (char c, const path_pattern_term& pt) - { - using iterator = string::const_iterator; - - assert (pt.bracket ()); - - iterator i (pt.begin + 1); // Position after '['. - iterator e (pt.end - 1); // Position at ']'. - - bool invert (*i == '!'); - if (invert) - ++i; - - bool r (false); - for (iterator b (i); i != e && !r; ++i) - { - char bc (*i); - - // If '-' is a first or last character in the bracket expression then - // match it literally and match the range otherwise. - // - if (bc == '-' && i != b && i + 1 != e) // Match the range? - { - // Note that we have already matched the range left endpoint character - // unsuccessfully (otherwise we wouldn't be here), so now we test if - // the character belongs to the (min-char, max-char] range. - // - // Also note that on Windows we match case insensitively and so can't - // just compare the character with the range endpoints. Thus, we - // fallback to matching each range character individually. - // -#ifndef _WIN32 - r = c > *(i - 1) && c <= *(i + 1); -#else - for (char bc (*(i - 1) + 1), mx (*(i + 1)); bc <= mx && !r; ++bc) - r = match (c, bc); -#endif - - ++i; // Position to the range max character. - } - else // Match against the expression character literally. - r = match (c, bc); - } - - return r != invert; - } - - // Match the name [ni, ne) to the pattern [pi, pe) that may not contain - // bracket expressions. Ranges can be empty. - // - static bool - match_no_brackets (string::const_iterator pi, string::const_iterator pe, - string::const_iterator ni, string::const_iterator ne) - { - using reverse_iterator = std::reverse_iterator; - - reverse_iterator rpi (pe); - reverse_iterator rpe (pi); - - reverse_iterator rni (ne); - reverse_iterator rne (ni); - - // Match the pattern suffix (follows the last *) to the name trailing - // characters. - // - char pc ('\0'); - for (; rpi != rpe && (pc = *rpi) != '*' && rni != rne; ++rpi, ++rni) - { - if (!match (*rni, pc) && pc != '?') - return false; - } - - // If we got to the (reversed) end of the pattern (no * is encountered) - // than we are done. The success depends on if we got to the (reversed) end - // of the name as well. - // - if (rpi == rpe) - return rni == rne; - - // If we didn't reach * in the pattern then we reached the (reversed) end - // of the name. That means we have unmatched non-star terms in the - // pattern, and so match failed. - // - if (pc != '*') - { - assert (rni == rne); - return false; - } - - // Match the pattern prefix (ends with the first *) to the name leading - // characters. If they mismatch we failed. Otherwise if this is an only * - // in the pattern (matches whatever is left in the name) then we succeed, - // otherwise we perform backtracking (recursively). - // - pe = rpi.base (); - ne = rni.base (); - - // Compare the pattern and the name term by char until the name suffix or - // * is encountered in the pattern (whichever happens first). Fail if a - // char mismatches. - // - for (; (pc = *pi) != '*' && ni != ne; ++pi, ++ni) - { - if (!match (*ni, pc) && pc != '?') - return false; - } - - // If we didn't get to * in the pattern then we got to the name suffix. - // That means that the pattern has unmatched non-star terms, and so match - // failed. - // - if (pc != '*') - { - assert (ni == ne); - return false; - } - - // If * that we have reached is the last one, then it matches whatever is - // left in the name (including an empty range). - // - if (++pi == pe) - return true; - - // Perform backtracking. - // - // From now on, we will call the pattern not-yet-matched part (starting - // the leftmost * and ending the rightmost one inclusively) as pattern, and - // the name not-yet-matched part as name. - // - // Here we sequentially assume that * that starts the pattern matches the - // name leading part (staring from an empty one and iterating till the full - // name). So if, at some iteration, the pattern trailing part (that follows - // the leftmost *) matches the name trailing part, then the pattern matches - // the name. - // - bool r; - for (; !(r = match_no_brackets (pi, pe, ni, ne)) && ni != ne; ++ni) ; - return r; - } - - // Match a character against the pattern term. - // - static inline bool - match (char c, const path_pattern_term& pt) - { - switch (pt.type) - { - // Matches any character. - // - case path_pattern_term_type::star: - case path_pattern_term_type::question: return true; - - case path_pattern_term_type::bracket: - { - return match_bracket (c, pt); - } - - case path_pattern_term_type::literal: - { - return match (c, get_literal (pt)); - } - } - - assert (false); // Can't be here. - return false; - } - - // Match the name [ni, ne) to the pattern [pi, pe). Ranges can be empty. - // - static bool - match (string::const_iterator pi, string::const_iterator pe, - string::const_iterator ni, string::const_iterator ne) - { - // If the pattern doesn't contain the bracket expressions then reduce to - // the "clever" approach (see the implementation notes below for details). - // - if (find (pi, pe, '[') == pe) - return match_no_brackets (pi, pe, ni, ne); - - // Match the pattern prefix (precedes the first *) to the name leading - // characters. - // - path_pattern_iterator ppi (pi, pe); - path_pattern_iterator ppe; - path_pattern_term pt; - - for (; ppi != ppe && !(pt = *ppi).star () && ni != ne; ++ppi, ++ni) - { - if (!match (*ni, pt)) - return false; - } - - // If we got to the end of the pattern (no * is encountered) than we are - // done. The success depends on if we got to the end of the name as well. - // - if (ppi == ppe) - return ni == ne; - - // If we didn't reach * in the pattern then we reached the end of the - // name. That means we have unmatched non-star terms in the pattern, and - // so match failed. - // - if (!pt.star ()) - { - assert (ni == ne); - return false; - } - - // If * that we have reached is the last term, then it matches whatever is - // left in the name (including an empty range). - // - if (++ppi == ppe) - return true; - - // Switch back to the string iterator and perform backtracking. - // - // From now on, we will call the pattern not-yet-matched part (starting - // the leftmost *) as pattern, and the name not-yet-matched part as name. - // - // Here we sequentially assume that * that starts the pattern matches the - // name leading part (see match_no_brackets() for details). - // - bool r; - for (pi = ppi->begin; !(r = match (pi, pe, ni, ne)) && ni != ne; ++ni) ; - return r; - } - - bool - path_match (const string& name, const string& pattern) - { - // Implementation notes: - // - // - This has a good potential of becoming hairy quickly so need to strive - // for an elegant way to implement this. - // - // - Most patterns will contains a single * wildcard with a prefix and/or - // suffix (e.g., *.txt, foo*, f*.txt). Something like this is not very - // common: *foo*. - // - // So it would be nice to have a clever implementation that first - // "anchors" itself with a literal prefix and/or suffix and only then - // continue with backtracking. In other words, reduce: - // - // *.txt vs foo.txt -> * vs foo - // foo* vs foo.txt -> * vs .txt - // f*.txt vs foo.txt -> * vs oo - // - // Note that this approach fails if the pattern may contain bracket - // expressions. You can't easily recognize a suffix scanning backwards - // since * semantics depends on the characters to the left: - // - // f[o*]o - * is not a wildcard - // fo*]o - * is a wildcard - // - // That's why we will start with the straightforward left-to-right - // matching and reduce to the "clever" approach when the remaining part - // of the pattern doesn't contain bracket expressions. - - auto pi (pattern.rbegin ()); - auto pe (pattern.rend ()); - - auto ni (name.rbegin ()); - auto ne (name.rend ()); - - // The name doesn't match the pattern if it is of a different type than the - // pattern is. - // - bool pd (pi != pe && path::traits_type::is_separator (*pi)); - bool nd (ni != ne && path::traits_type::is_separator (*ni)); - - if (pd != nd) - return false; - - // Skip trailing separators if present. - // - if (pd) - { - ++pi; - ++ni; - } - - return match (pattern.begin (), pi.base (), name.begin (), ni.base ()); - } - // Search for paths matching the pattern and call the specified function for // each matching path. Return false if the underlying func() call returns // false. Otherwise the function conforms to the path_search() description. @@ -2467,114 +2171,4 @@ namespace butl path_filesystem fs (start, entry); search (pattern, dir_path (), flags, func, fs); } - - bool - path_match (const path& entry, - const path& pattern, - const dir_path& start, - path_match_flags flags) - { - bool r (false); - - auto match = [&entry, &r] (path&& p, const string&, bool interim) - { - // If we found the entry (possibly through one of the recursive - // components) no need to search further. - // - if (p == entry && !interim) - { - r = true; - return false; - } - - return true; - }; - - path_search (pattern, entry, match, start, flags); - return r; - } - - // path_pattern_iterator - // - void path_pattern_iterator:: - next () - { - if (i_ == e_) - { - t_ = nullopt; // Convert the object into the end iterator. - return; - } - - auto next = [this] (path_pattern_term_type t) - { - assert (t != path_pattern_term_type::bracket); - - t_ = path_pattern_term {t, i_, i_ + 1}; - ++i_; - }; - - switch (*i_) - { - case '?': - { - next (path_pattern_term_type::question); - break; - } - case '*': - { - next (path_pattern_term_type::star); - break; - } - case '[': - { - // Try to find the bracket expression end. - // - // Note that '[' doesn't necessarily starts the bracket expression (no - // closing bracket, empty, etc). If that's the case, then we end up - // with the '[' literal terminal. - // - bool expr (false); - for (;;) // Breakout loop. - { - string::const_iterator i (i_ + 1); // Position after '['. - - if (i == e_) // Is '[' the pattern last character? - break; - - bool invert (*i == '!'); - if (invert && ++i == e_) // Is '!' the pattern last character? - break; - - // Find the bracket expression end. - // - // Note that the bracket expression may not be empty and ']' is a - // literal if it is the first expression character. - // - for (++i; i != e_ && *i != ']'; ++i) ; - - if (i == e_) // The closing bracket is not found? - break; - - expr = true; - - ++i; // Position after ']'. - - t_ = path_pattern_term {path_pattern_term_type::bracket, i_, i}; - - i_ = i; - break; - } - - // Fallback to '[' literal if it is not a bracket expression. - // - if (expr) - break; - } - // Fall through. - default: - { - next (path_pattern_term_type::literal); - } - } - } } diff --git a/libbutl/filesystem.ixx b/libbutl/filesystem.ixx index 1be5246..c9e3997 100644 --- a/libbutl/filesystem.ixx +++ b/libbutl/filesystem.ixx @@ -186,131 +186,4 @@ namespace butl { return dir_iterator (); } - - // path_pattern_iterator - // - inline path_pattern_iterator:: - path_pattern_iterator (std::string::const_iterator begin, - std::string::const_iterator end) - : i_ (begin), - e_ (end) - { - next (); // If i_ == e_ we will end up with the end iterator. - } - - inline path_pattern_iterator:: - path_pattern_iterator (const std::string& s) - : path_pattern_iterator (s.begin (), s.end ()) - { - } - - inline bool - operator== (const path_pattern_iterator& x, const path_pattern_iterator& y) - { - return x.t_.has_value () == y.t_.has_value () && - (!x.t_ || (x.i_ == y.i_ && x.e_ == y.e_)); - } - - inline bool - operator!= (const path_pattern_iterator& x, const path_pattern_iterator& y) - { - return !(x == y); - } - - inline path_pattern_iterator - begin (const path_pattern_iterator& i) - { - return i; - } - - inline path_pattern_iterator - end (const path_pattern_iterator&) - { - return path_pattern_iterator (); - } - - // patterns - // - inline char - get_literal (const path_pattern_term& t) - { - assert (t.literal ()); - return *t.begin; - } - - inline bool - path_pattern (const std::string& s) - { - for (const path_pattern_term& t: path_pattern_iterator (s)) - { - if (!t.literal ()) - return true; - } - - return false; - } - - // Return true for a pattern containing the specified number of the - // consecutive star wildcards. - // - inline bool - path_pattern_recursive (const std::string& s, size_t sn) - { - std::size_t n (0); - for (const path_pattern_term& t: path_pattern_iterator (s)) - { - if (t.star ()) - { - if (++n == sn) - return true; - } - else - n = 0; - } - - return false; - } - - inline bool - path_pattern_recursive (const std::string& s) - { - return path_pattern_recursive (s, 2); - } - - inline bool - path_pattern_self_matching (const std::string& s) - { - return path_pattern_recursive (s, 3); - } - - inline bool - path_pattern (const path& p) - { - for (auto i (p.begin ()); i != p.end (); ++i) - { - if (path_pattern (*i)) - return true; - } - - return false; - } - - inline size_t - path_pattern_recursive (const path& p) - { - std::size_t r (0); - for (auto i (p.begin ()); i != p.end (); ++i) - { - if (path_pattern_recursive (*i)) - ++r; - } - - return r; - } - - inline bool - path_pattern_self_matching (const path& p) - { - return !p.empty () && path_pattern_self_matching (*p.begin ()); - } } diff --git a/libbutl/filesystem.mxx b/libbutl/filesystem.mxx index 261b985..43c23b7 100644 --- a/libbutl/filesystem.mxx +++ b/libbutl/filesystem.mxx @@ -23,14 +23,12 @@ using mode_t = int; #endif -#include - #ifndef __cpp_lib_modules_ts #include -#include // ptrdiff_t, size_t +#include // ptrdiff_t #include // uint16_t, etc #include // move(), pair -#include +#include // input_iterator_tag #include #include //@@ MOD needed by timestamp module (no re-export). @@ -45,14 +43,14 @@ import std.core; #endif import butl.path; -import butl.optional; import butl.timestamp; +import butl.path_pattern; // path_match_flags -import butl.utility; +import butl.utility; // operator<<(ostream,exception), throw_generic_error() #else #include -#include #include +#include #include #endif @@ -717,79 +715,9 @@ LIBBUTL_MODEXPORT namespace butl inline dir_iterator begin (dir_iterator&&); #endif - // Wildcard pattern match and search (aka glob). - // - // The wildcard pattern contains the literal characters that match - // themselves and the wildcard characters that match a single or multiple - // characters. Currently the following wildcards are supported: - // - // * - match any number of characters (including zero) - // ? - match any single character - // [...] - match a character with a "bracket expression"; currently we only - // support literal characters and ranges (no character/equivalence - // classes, etc; see Pattern Matching Notation section of the Shell - // Command Language POSIX specification for details) - // - // Note also that currently we don't support the special characters - // backslash-escaping (as mandated by POSIX). - - // Path match/search flags. - // - enum class path_match_flags: std::uint16_t - { - // Follow symlinks. This only applies to symlinks that are matched against - // the rightmost component of the pattern. In particular, this mean that - // such symlinks will never match a directory pattern and some results can - // be missing for the recursive rightmost component. - // - follow_symlinks = 0x1, - - // Make wildcard-only pattern component (e.g., `*/...`, `.../*/...`, or - // `.../*`) match absent path component. For example, with this flag - // set, the `a/*/b` pattern matches not only `a/x/b` path, but also `a/b`. - // - // Note that this does not apply to single-component patterns and the - // pattern type is always preserved. In particular, the `a/*/` pattern - // matches `a/` but not `a`. - // - // Finally, keep in mind that only absent directory components can be - // matched this way. In particular, pattern `a*/*` does not match `ab` - // (but `a*/*/` matches `ab/`). - // - match_absent = 0x2, - - none = 0 - }; - - inline path_match_flags operator& (path_match_flags, path_match_flags); - inline path_match_flags operator| (path_match_flags, path_match_flags); - inline path_match_flags operator&= (path_match_flags&, path_match_flags); - inline path_match_flags operator|= (path_match_flags&, path_match_flags); - - // Return true if name matches pattern. Both must be single path components, - // possibly with a trailing directory separator to indicate a directory. - // - // If the pattern ends with a directory separator, then it only matches a - // directory name (i.e., ends with a directory separator, but potentially - // different). Otherwise, it only matches a non-directory name (no trailing - // directory separator). - // - LIBBUTL_SYMEXPORT bool - path_match (const std::string& name, const std::string& pattern); - - // Return true if path entry matches pattern. Note that the match is - // performed literally, with no paths normalization being performed. The - // start directory is used if the first pattern component is a self-matching - // wildcard (see below for the start directory and wildcard semantics). + // Wildcard pattern search (aka glob). // - // In addition to the wildcard characters, it also recognizes the ** and *** - // wildcard sequences (see path_search() for details). - // - LIBBUTL_SYMEXPORT bool - path_match (const path& entry, - const path& pattern, - const dir_path& start = dir_path (), - path_match_flags = path_match_flags::none); + // For details on the wildcard patterns see // Search for paths matching the pattern calling the specified function for // each matching path (see below for details). @@ -883,134 +811,6 @@ LIBBUTL_MODEXPORT namespace butl bool interm)>&, const dir_path& start = dir_path (), path_match_flags = path_match_flags::none); - - // Return true if a name contains the wildcard characters. - // - bool - path_pattern (const std::string&); - - // Return true if a name contains the ** wildcard sequences. - // - bool - path_pattern_recursive (const std::string&); - - // Return true if a name contains the *** wildcard sequences. - // - bool - path_pattern_self_matching (const std::string&); - - // Return true if a path contains the pattern components. - // - bool - path_pattern (const path&); - - // Return the number of recursive pattern components. - // - // Knowing the number of such components allows us to make some assumptions - // regarding the search result. For example, if it is zero or one, then the - // result contains no duplicates. - // - // Also note that the result can be used as bool. - // - size_t - path_pattern_recursive (const path&); - - // Return true if the path is not empty and its first component is a self- - // matching pattern. - // - bool - path_pattern_self_matching (const path&); - - // Iteration over pattern terminals. - // - enum class path_pattern_term_type - { - literal, // Literal character. - question, // Question mark wildcard. - star, // Star wildcard. - bracket // Bracket expression wildcard. - }; - - class path_pattern_term - { - public: - path_pattern_term_type type; - std::string::const_iterator begin; - std::string::const_iterator end; - - std::size_t - size () const {return end - begin;} - - // Predicates. - // - bool literal () const {return type == path_pattern_term_type::literal;} - bool question () const {return type == path_pattern_term_type::question;} - bool star () const {return type == path_pattern_term_type::star;} - bool bracket () const {return type == path_pattern_term_type::bracket;} - }; - - // Return the literal terminal character. - // - char - get_literal (const path_pattern_term&); - - // Match a character against the bracket expression terminal. - // - LIBBUTL_SYMEXPORT bool - match_bracket (char, const path_pattern_term&); - - class LIBBUTL_SYMEXPORT path_pattern_iterator - { - public: - using value_type = path_pattern_term; - using pointer = const path_pattern_term*; - using reference = const path_pattern_term&; - using difference_type = std::ptrdiff_t; - using iterator_category = std::input_iterator_tag; - - explicit - path_pattern_iterator (const std::string&); - - path_pattern_iterator (std::string::const_iterator begin, - std::string::const_iterator end); - - path_pattern_iterator () = default; // Create the end iterator. - - path_pattern_iterator& operator++ () {assert (t_); next (); return *this;} - - reference operator* () const {assert (t_); return *t_;} - pointer operator-> () const {assert (t_); return &*t_;} - - friend bool - operator== (const path_pattern_iterator&, const path_pattern_iterator&); - - friend bool - operator!= (const path_pattern_iterator&, const path_pattern_iterator&); - - private: - void - next (); - - private: - // nullopt denotes the end iterator. - // - // Note that the default-constructed i_ and e_ iterators (having singular - // values) may not represent the end iterator as are not comparable for - // equality. That's why we use an absent term to represent such an - // iterator. - // - optional t_; - - std::string::const_iterator i_; - std::string::const_iterator e_; - }; - - // Range-based for loop support. - // - // for (const path_pattern_term& t: path_pattern_iterator (pattern)) ... - // - path_pattern_iterator begin (const path_pattern_iterator&); - path_pattern_iterator end (const path_pattern_iterator&); } #include diff --git a/libbutl/path-pattern.cxx b/libbutl/path-pattern.cxx new file mode 100644 index 0000000..86be1ec --- /dev/null +++ b/libbutl/path-pattern.cxx @@ -0,0 +1,451 @@ +// file : libbutl/path-pattern.cxx -*- C++ -*- +// copyright : Copyright (c) 2014-2019 Code Synthesis Ltd +// license : MIT; see accompanying LICENSE file + +#ifndef __cpp_modules_ts +#include +#endif + +#include + +#ifndef __cpp_lib_modules_ts +#include +#include +#include +#include // reverse_iterator + +#include // find() +#endif + +// Other includes. + +#ifdef __cpp_modules_ts +module butl.path_pattern; + +// Only imports additional to interface. +#ifdef __clang__ +#ifdef __cpp_lib_modules_ts +import std.core; +#endif +import butl.path; +import butl.optional; +#endif + +import butl.utility; // lcase()[_WIN32] +import butl.filesystem; // path_search() +#else +#include +#include +#endif + +using namespace std; + +namespace butl +{ + // patterns + // + static inline bool + match (char c, char pc) + { +#ifndef _WIN32 + return c == pc; +#else + return lcase (c) == lcase (pc); +#endif + } + + bool + match_bracket (char c, const path_pattern_term& pt) + { + using iterator = string::const_iterator; + + assert (pt.bracket ()); + + iterator i (pt.begin + 1); // Position after '['. + iterator e (pt.end - 1); // Position at ']'. + + bool invert (*i == '!'); + if (invert) + ++i; + + bool r (false); + for (iterator b (i); i != e && !r; ++i) + { + char bc (*i); + + // If '-' is a first or last character in the bracket expression then + // match it literally and match the range otherwise. + // + if (bc == '-' && i != b && i + 1 != e) // Match the range? + { + // Note that we have already matched the range left endpoint character + // unsuccessfully (otherwise we wouldn't be here), so now we test if + // the character belongs to the (min-char, max-char] range. + // + // Also note that on Windows we match case insensitively and so can't + // just compare the character with the range endpoints. Thus, we + // fallback to matching each range character individually. + // +#ifndef _WIN32 + r = c > *(i - 1) && c <= *(i + 1); +#else + for (char bc (*(i - 1) + 1), mx (*(i + 1)); bc <= mx && !r; ++bc) + r = match (c, bc); +#endif + + ++i; // Position to the range max character. + } + else // Match against the expression character literally. + r = match (c, bc); + } + + return r != invert; + } + + // Match the name [ni, ne) to the pattern [pi, pe) that may not contain + // bracket expressions. Ranges can be empty. + // + static bool + match_no_brackets (string::const_iterator pi, string::const_iterator pe, + string::const_iterator ni, string::const_iterator ne) + { + using reverse_iterator = std::reverse_iterator; + + reverse_iterator rpi (pe); + reverse_iterator rpe (pi); + + reverse_iterator rni (ne); + reverse_iterator rne (ni); + + // Match the pattern suffix (follows the last *) to the name trailing + // characters. + // + char pc ('\0'); + for (; rpi != rpe && (pc = *rpi) != '*' && rni != rne; ++rpi, ++rni) + { + if (!match (*rni, pc) && pc != '?') + return false; + } + + // If we got to the (reversed) end of the pattern (no * is encountered) + // than we are done. The success depends on if we got to the (reversed) end + // of the name as well. + // + if (rpi == rpe) + return rni == rne; + + // If we didn't reach * in the pattern then we reached the (reversed) end + // of the name. That means we have unmatched non-star terms in the + // pattern, and so match failed. + // + if (pc != '*') + { + assert (rni == rne); + return false; + } + + // Match the pattern prefix (ends with the first *) to the name leading + // characters. If they mismatch we failed. Otherwise if this is an only * + // in the pattern (matches whatever is left in the name) then we succeed, + // otherwise we perform backtracking (recursively). + // + pe = rpi.base (); + ne = rni.base (); + + // Compare the pattern and the name term by char until the name suffix or + // * is encountered in the pattern (whichever happens first). Fail if a + // char mismatches. + // + for (; (pc = *pi) != '*' && ni != ne; ++pi, ++ni) + { + if (!match (*ni, pc) && pc != '?') + return false; + } + + // If we didn't get to * in the pattern then we got to the name suffix. + // That means that the pattern has unmatched non-star terms, and so match + // failed. + // + if (pc != '*') + { + assert (ni == ne); + return false; + } + + // If * that we have reached is the last one, then it matches whatever is + // left in the name (including an empty range). + // + if (++pi == pe) + return true; + + // Perform backtracking. + // + // From now on, we will call the pattern not-yet-matched part (starting + // the leftmost * and ending the rightmost one inclusively) as pattern, and + // the name not-yet-matched part as name. + // + // Here we sequentially assume that * that starts the pattern matches the + // name leading part (staring from an empty one and iterating till the full + // name). So if, at some iteration, the pattern trailing part (that follows + // the leftmost *) matches the name trailing part, then the pattern matches + // the name. + // + bool r; + for (; !(r = match_no_brackets (pi, pe, ni, ne)) && ni != ne; ++ni) ; + return r; + } + + // Match a character against the pattern term. + // + static inline bool + match (char c, const path_pattern_term& pt) + { + switch (pt.type) + { + // Matches any character. + // + case path_pattern_term_type::star: + case path_pattern_term_type::question: return true; + + case path_pattern_term_type::bracket: + { + return match_bracket (c, pt); + } + + case path_pattern_term_type::literal: + { + return match (c, get_literal (pt)); + } + } + + assert (false); // Can't be here. + return false; + } + + // Match the name [ni, ne) to the pattern [pi, pe). Ranges can be empty. + // + static bool + match (string::const_iterator pi, string::const_iterator pe, + string::const_iterator ni, string::const_iterator ne) + { + // If the pattern doesn't contain the bracket expressions then reduce to + // the "clever" approach (see the implementation notes below for details). + // + if (find (pi, pe, '[') == pe) + return match_no_brackets (pi, pe, ni, ne); + + // Match the pattern prefix (precedes the first *) to the name leading + // characters. + // + path_pattern_iterator ppi (pi, pe); + path_pattern_iterator ppe; + path_pattern_term pt; + + for (; ppi != ppe && !(pt = *ppi).star () && ni != ne; ++ppi, ++ni) + { + if (!match (*ni, pt)) + return false; + } + + // If we got to the end of the pattern (no * is encountered) than we are + // done. The success depends on if we got to the end of the name as well. + // + if (ppi == ppe) + return ni == ne; + + // If we didn't reach * in the pattern then we reached the end of the + // name. That means we have unmatched non-star terms in the pattern, and + // so match failed. + // + if (!pt.star ()) + { + assert (ni == ne); + return false; + } + + // If * that we have reached is the last term, then it matches whatever is + // left in the name (including an empty range). + // + if (++ppi == ppe) + return true; + + // Switch back to the string iterator and perform backtracking. + // + // From now on, we will call the pattern not-yet-matched part (starting + // the leftmost *) as pattern, and the name not-yet-matched part as name. + // + // Here we sequentially assume that * that starts the pattern matches the + // name leading part (see match_no_brackets() for details). + // + bool r; + for (pi = ppi->begin; !(r = match (pi, pe, ni, ne)) && ni != ne; ++ni) ; + return r; + } + + bool + path_match (const string& name, const string& pattern) + { + // Implementation notes: + // + // - This has a good potential of becoming hairy quickly so need to strive + // for an elegant way to implement this. + // + // - Most patterns will contains a single * wildcard with a prefix and/or + // suffix (e.g., *.txt, foo*, f*.txt). Something like this is not very + // common: *foo*. + // + // So it would be nice to have a clever implementation that first + // "anchors" itself with a literal prefix and/or suffix and only then + // continue with backtracking. In other words, reduce: + // + // *.txt vs foo.txt -> * vs foo + // foo* vs foo.txt -> * vs .txt + // f*.txt vs foo.txt -> * vs oo + // + // Note that this approach fails if the pattern may contain bracket + // expressions. You can't easily recognize a suffix scanning backwards + // since * semantics depends on the characters to the left: + // + // f[o*]o - * is not a wildcard + // fo*]o - * is a wildcard + // + // That's why we will start with the straightforward left-to-right + // matching and reduce to the "clever" approach when the remaining part + // of the pattern doesn't contain bracket expressions. + + auto pi (pattern.rbegin ()); + auto pe (pattern.rend ()); + + auto ni (name.rbegin ()); + auto ne (name.rend ()); + + // The name doesn't match the pattern if it is of a different type than the + // pattern is. + // + bool pd (pi != pe && path::traits_type::is_separator (*pi)); + bool nd (ni != ne && path::traits_type::is_separator (*ni)); + + if (pd != nd) + return false; + + // Skip trailing separators if present. + // + if (pd) + { + ++pi; + ++ni; + } + + return match (pattern.begin (), pi.base (), name.begin (), ni.base ()); + } + + bool + path_match (const path& entry, + const path& pattern, + const dir_path& start, + path_match_flags flags) + { + bool r (false); + + auto match = [&entry, &r] (path&& p, const string&, bool interim) + { + // If we found the entry (possibly through one of the recursive + // components) no need to search further. + // + if (p == entry && !interim) + { + r = true; + return false; + } + + return true; + }; + + path_search (pattern, entry, match, start, flags); + return r; + } + + // path_pattern_iterator + // + void path_pattern_iterator:: + next () + { + if (i_ == e_) + { + t_ = nullopt; // Convert the object into the end iterator. + return; + } + + auto next = [this] (path_pattern_term_type t) + { + assert (t != path_pattern_term_type::bracket); + + t_ = path_pattern_term {t, i_, i_ + 1}; + ++i_; + }; + + switch (*i_) + { + case '?': + { + next (path_pattern_term_type::question); + break; + } + case '*': + { + next (path_pattern_term_type::star); + break; + } + case '[': + { + // Try to find the bracket expression end. + // + // Note that '[' doesn't necessarily starts the bracket expression (no + // closing bracket, empty, etc). If that's the case, then we end up + // with the '[' literal terminal. + // + bool expr (false); + for (;;) // Breakout loop. + { + string::const_iterator i (i_ + 1); // Position after '['. + + if (i == e_) // Is '[' the pattern last character? + break; + + bool invert (*i == '!'); + if (invert && ++i == e_) // Is '!' the pattern last character? + break; + + // Find the bracket expression end. + // + // Note that the bracket expression may not be empty and ']' is a + // literal if it is the first expression character. + // + for (++i; i != e_ && *i != ']'; ++i) ; + + if (i == e_) // The closing bracket is not found? + break; + + expr = true; + + ++i; // Position after ']'. + + t_ = path_pattern_term {path_pattern_term_type::bracket, i_, i}; + + i_ = i; + break; + } + + // Fallback to '[' literal if it is not a bracket expression. + // + if (expr) + break; + } + // Fall through. + default: + { + next (path_pattern_term_type::literal); + } + } + } +} diff --git a/libbutl/path-pattern.ixx b/libbutl/path-pattern.ixx new file mode 100644 index 0000000..5adde3f --- /dev/null +++ b/libbutl/path-pattern.ixx @@ -0,0 +1,133 @@ +// file : libbutl/path-pattern.ixx -*- C++ -*- +// copyright : Copyright (c) 2014-2019 Code Synthesis Ltd +// license : MIT; see accompanying LICENSE file + +namespace butl +{ + // path_pattern_iterator + // + inline path_pattern_iterator:: + path_pattern_iterator (std::string::const_iterator begin, + std::string::const_iterator end) + : i_ (begin), + e_ (end) + { + next (); // If i_ == e_ we will end up with the end iterator. + } + + inline path_pattern_iterator:: + path_pattern_iterator (const std::string& s) + : path_pattern_iterator (s.begin (), s.end ()) + { + } + + inline bool + operator== (const path_pattern_iterator& x, const path_pattern_iterator& y) + { + return x.t_.has_value () == y.t_.has_value () && + (!x.t_ || (x.i_ == y.i_ && x.e_ == y.e_)); + } + + inline bool + operator!= (const path_pattern_iterator& x, const path_pattern_iterator& y) + { + return !(x == y); + } + + inline path_pattern_iterator + begin (const path_pattern_iterator& i) + { + return i; + } + + inline path_pattern_iterator + end (const path_pattern_iterator&) + { + return path_pattern_iterator (); + } + + // patterns + // + inline char + get_literal (const path_pattern_term& t) + { + assert (t.literal ()); + return *t.begin; + } + + inline bool + path_pattern (const std::string& s) + { + for (const path_pattern_term& t: path_pattern_iterator (s)) + { + if (!t.literal ()) + return true; + } + + return false; + } + + // Return true for a pattern containing the specified number of the + // consecutive star wildcards. + // + inline bool + path_pattern_recursive (const std::string& s, std::size_t sn) + { + std::size_t n (0); + for (const path_pattern_term& t: path_pattern_iterator (s)) + { + if (t.star ()) + { + if (++n == sn) + return true; + } + else + n = 0; + } + + return false; + } + + inline bool + path_pattern_recursive (const std::string& s) + { + return path_pattern_recursive (s, 2); + } + + inline bool + path_pattern_self_matching (const std::string& s) + { + return path_pattern_recursive (s, 3); + } + + inline bool + path_pattern (const path& p) + { + for (auto i (p.begin ()); i != p.end (); ++i) + { + if (path_pattern (*i)) + return true; + } + + return false; + } + + inline std::size_t + path_pattern_recursive (const path& p) + { + std::size_t r (0); + for (auto i (p.begin ()); i != p.end (); ++i) + { + if (path_pattern_recursive (*i)) + ++r; + } + + return r; + } + + inline bool + path_pattern_self_matching (const path& p) + { + return !p.empty () && path_pattern_self_matching (*p.begin ()); + } +} diff --git a/libbutl/path-pattern.mxx b/libbutl/path-pattern.mxx new file mode 100644 index 0000000..2d37b58 --- /dev/null +++ b/libbutl/path-pattern.mxx @@ -0,0 +1,242 @@ +// file : libbutl/path-pattern.mxx -*- C++ -*- +// copyright : Copyright (c) 2014-2019 Code Synthesis Ltd +// license : MIT; see accompanying LICENSE file + +#ifndef __cpp_modules_ts +#pragma once +#endif + +#include + +#ifndef __cpp_lib_modules_ts +#include +#include // uint16_t +#include // ptrdiff_t, size_t +#include // input_iterator_tag +#endif + +// Other includes. +#ifdef __cpp_modules_ts +export module butl.path_pattern; + +#ifdef __cpp_lib_modules_ts +import std.core; +#endif + +import butl.path; +import butl.optional; +#else +#include +#include +#endif + +#include + +LIBBUTL_MODEXPORT namespace butl +{ + // Wildcard pattern match (aka glob). + // + // The wildcard pattern contains the literal characters that match + // themselves and the wildcard characters that match a single or multiple + // characters. Currently the following wildcards are supported: + // + // * - match any number of characters (including zero) + // ? - match any single character + // [...] - match a character with a "bracket expression"; currently we only + // support literal characters and ranges (no character/equivalence + // classes, etc; see Pattern Matching Notation section of the Shell + // Command Language POSIX specification for details) + // + // Note also that currently we don't support the special characters + // backslash-escaping (as mandated by POSIX). + + // Path match/search flags. + // + enum class path_match_flags: std::uint16_t + { + // Follow symlinks. This only applies to symlinks that are matched against + // the rightmost component of the pattern. In particular, this mean that + // such symlinks will never match a directory pattern and some results can + // be missing for the recursive rightmost component. + // + // Note that this flag is only used for path_search(). + // + follow_symlinks = 0x1, + + // Make wildcard-only pattern component (e.g., `*/...`, `.../*/...`, or + // `.../*`) match absent path component. For example, with this flag + // set, the `a/*/b` pattern matches not only `a/x/b` path, but also `a/b`. + // + // Note that this does not apply to single-component patterns and the + // pattern type is always preserved. In particular, the `a/*/` pattern + // matches `a/` but not `a`. + // + // Finally, keep in mind that only absent directory components can be + // matched this way. In particular, pattern `a*/*` does not match `ab` + // (but `a*/*/` matches `ab/`). + // + match_absent = 0x2, + + none = 0 + }; + + inline path_match_flags operator& (path_match_flags, path_match_flags); + inline path_match_flags operator| (path_match_flags, path_match_flags); + inline path_match_flags operator&= (path_match_flags&, path_match_flags); + inline path_match_flags operator|= (path_match_flags&, path_match_flags); + + // Return true if name matches pattern. Both must be single path components, + // possibly with a trailing directory separator to indicate a directory. + // + // If the pattern ends with a directory separator, then it only matches a + // directory name (i.e., ends with a directory separator, but potentially + // different). Otherwise, it only matches a non-directory name (no trailing + // directory separator). + // + LIBBUTL_SYMEXPORT bool + path_match (const std::string& name, const std::string& pattern); + + // Return true if path entry matches pattern. Note that the match is + // performed literally, with no paths normalization being performed. The + // start directory is used if the first pattern component is a self-matching + // wildcard (see below for the start directory and wildcard semantics). + // + // In addition to the wildcard characters, it also recognizes the ** and *** + // wildcard sequences (see path_search() for details). + // + LIBBUTL_SYMEXPORT bool + path_match (const path& entry, + const path& pattern, + const dir_path& start = dir_path (), + path_match_flags = path_match_flags::none); + + // Return true if a name contains the wildcard characters. + // + bool + path_pattern (const std::string&); + + // Return true if a name contains the ** wildcard sequences. + // + bool + path_pattern_recursive (const std::string&); + + // Return true if a name contains the *** wildcard sequences. + // + bool + path_pattern_self_matching (const std::string&); + + // Return true if a path contains the pattern components. + // + bool + path_pattern (const path&); + + // Return the number of recursive pattern components. + // + // Knowing the number of such components allows us to make some assumptions + // regarding the search result. For example, if it is zero or one, then the + // result contains no duplicates. + // + // Also note that the result can be used as bool. + // + std::size_t + path_pattern_recursive (const path&); + + // Return true if the path is not empty and its first component is a self- + // matching pattern. + // + bool + path_pattern_self_matching (const path&); + + // Iteration over pattern terminals. + // + enum class path_pattern_term_type + { + literal, // Literal character. + question, // Question mark wildcard. + star, // Star wildcard. + bracket // Bracket expression wildcard. + }; + + class path_pattern_term + { + public: + path_pattern_term_type type; + std::string::const_iterator begin; + std::string::const_iterator end; + + std::size_t + size () const {return end - begin;} + + // Predicates. + // + bool literal () const {return type == path_pattern_term_type::literal;} + bool question () const {return type == path_pattern_term_type::question;} + bool star () const {return type == path_pattern_term_type::star;} + bool bracket () const {return type == path_pattern_term_type::bracket;} + }; + + // Return the literal terminal character. + // + char + get_literal (const path_pattern_term&); + + // Match a character against the bracket expression terminal. + // + LIBBUTL_SYMEXPORT bool + match_bracket (char, const path_pattern_term&); + + class LIBBUTL_SYMEXPORT path_pattern_iterator + { + public: + using value_type = path_pattern_term; + using pointer = const path_pattern_term*; + using reference = const path_pattern_term&; + using difference_type = std::ptrdiff_t; + using iterator_category = std::input_iterator_tag; + + explicit + path_pattern_iterator (const std::string&); + + path_pattern_iterator (std::string::const_iterator begin, + std::string::const_iterator end); + + path_pattern_iterator () = default; // Create the end iterator. + + path_pattern_iterator& operator++ () {assert (t_); next (); return *this;} + + reference operator* () const {assert (t_); return *t_;} + pointer operator-> () const {assert (t_); return &*t_;} + + friend bool + operator== (const path_pattern_iterator&, const path_pattern_iterator&); + + friend bool + operator!= (const path_pattern_iterator&, const path_pattern_iterator&); + + private: + void + next (); + + private: + // nullopt denotes the end iterator. + // + // Note that the default-constructed i_ and e_ iterators (having singular + // values) may not represent the end iterator as are not comparable for + // equality. That's why we use an absent term to represent such an + // iterator. + // + optional t_; + + std::string::const_iterator i_; + std::string::const_iterator e_; + }; + + // Range-based for loop support. + // + // for (const path_pattern_term& t: path_pattern_iterator (pattern)) ... + // + path_pattern_iterator begin (const path_pattern_iterator&); + path_pattern_iterator end (const path_pattern_iterator&); +} + +#include diff --git a/tests/wildcard/driver.cxx b/tests/wildcard/driver.cxx index ca8bb97..b1303bf 100644 --- a/tests/wildcard/driver.cxx +++ b/tests/wildcard/driver.cxx @@ -24,11 +24,13 @@ import butl.path; import butl.utility; // operator<<(ostream, exception) import butl.optional; import butl.filesystem; +import butl.path_pattern; #else #include #include #include #include +#include #endif using namespace std; -- cgit v1.1