aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorKaren Arutyunov <karen@codesynthesis.com>2019-09-30 13:48:28 +0300
committerKaren Arutyunov <karen@codesynthesis.com>2019-10-01 12:56:58 +0300
commite6cee3c2f9b03852ed4837f9be05e0a2fa4542a8 (patch)
tree544708bec82c3a025387ea738f73d67646e1a98e
parent55764e395c453b537c08c1e5cadfbb2ddd349279 (diff)
Move path match to path-pattern.?xx
-rw-r--r--libbutl/builtin.cxx3
-rw-r--r--libbutl/filesystem.cxx410
-rw-r--r--libbutl/filesystem.ixx127
-rw-r--r--libbutl/filesystem.mxx214
-rw-r--r--libbutl/path-pattern.cxx451
-rw-r--r--libbutl/path-pattern.ixx133
-rw-r--r--libbutl/path-pattern.mxx242
-rw-r--r--tests/wildcard/driver.cxx2
8 files changed, 840 insertions, 742 deletions
diff --git a/libbutl/builtin.cxx b/libbutl/builtin.cxx
index 3340271..0b7db7b 100644
--- a/libbutl/builtin.cxx
+++ b/libbutl/builtin.cxx
@@ -52,12 +52,15 @@ import butl.timestamp;
import butl.regex;
import butl.path_io;
+import butl.utility; // operator<<(ostream,exception),
+ // throw_generic_error()
import butl.optional;
import butl.filesystem;
import butl.small_vector;
#else
#include <libbutl/regex.mxx>
#include <libbutl/path-io.mxx>
+#include <libbutl/utility.mxx>
#include <libbutl/optional.mxx>
#include <libbutl/filesystem.mxx>
#include <libbutl/small-vector.mxx>
diff --git a/libbutl/filesystem.cxx b/libbutl/filesystem.cxx
index 832e1b6..b66dcfe 100644
--- a/libbutl/filesystem.cxx
+++ b/libbutl/filesystem.cxx
@@ -62,9 +62,10 @@ import std.core;
#endif
import butl.path;
import butl.timestamp;
+import butl.path_pattern;
#endif
-import butl.utility; // throw_generic_error(), lcase()[_WIN32]
+import butl.utility; // throw_generic_error()
import butl.fdstream;
import butl.small_vector;
#else
@@ -1557,303 +1558,6 @@ namespace butl
}
#endif
- // patterns
- //
- static inline bool
- match (char c, char pc)
- {
-#ifndef _WIN32
- return c == pc;
-#else
- return lcase (c) == lcase (pc);
-#endif
- }
-
- bool
- match_bracket (char c, const path_pattern_term& pt)
- {
- using iterator = string::const_iterator;
-
- assert (pt.bracket ());
-
- iterator i (pt.begin + 1); // Position after '['.
- iterator e (pt.end - 1); // Position at ']'.
-
- bool invert (*i == '!');
- if (invert)
- ++i;
-
- bool r (false);
- for (iterator b (i); i != e && !r; ++i)
- {
- char bc (*i);
-
- // If '-' is a first or last character in the bracket expression then
- // match it literally and match the range otherwise.
- //
- if (bc == '-' && i != b && i + 1 != e) // Match the range?
- {
- // Note that we have already matched the range left endpoint character
- // unsuccessfully (otherwise we wouldn't be here), so now we test if
- // the character belongs to the (min-char, max-char] range.
- //
- // Also note that on Windows we match case insensitively and so can't
- // just compare the character with the range endpoints. Thus, we
- // fallback to matching each range character individually.
- //
-#ifndef _WIN32
- r = c > *(i - 1) && c <= *(i + 1);
-#else
- for (char bc (*(i - 1) + 1), mx (*(i + 1)); bc <= mx && !r; ++bc)
- r = match (c, bc);
-#endif
-
- ++i; // Position to the range max character.
- }
- else // Match against the expression character literally.
- r = match (c, bc);
- }
-
- return r != invert;
- }
-
- // Match the name [ni, ne) to the pattern [pi, pe) that may not contain
- // bracket expressions. Ranges can be empty.
- //
- static bool
- match_no_brackets (string::const_iterator pi, string::const_iterator pe,
- string::const_iterator ni, string::const_iterator ne)
- {
- using reverse_iterator = std::reverse_iterator<string::const_iterator>;
-
- reverse_iterator rpi (pe);
- reverse_iterator rpe (pi);
-
- reverse_iterator rni (ne);
- reverse_iterator rne (ni);
-
- // Match the pattern suffix (follows the last *) to the name trailing
- // characters.
- //
- char pc ('\0');
- for (; rpi != rpe && (pc = *rpi) != '*' && rni != rne; ++rpi, ++rni)
- {
- if (!match (*rni, pc) && pc != '?')
- return false;
- }
-
- // If we got to the (reversed) end of the pattern (no * is encountered)
- // than we are done. The success depends on if we got to the (reversed) end
- // of the name as well.
- //
- if (rpi == rpe)
- return rni == rne;
-
- // If we didn't reach * in the pattern then we reached the (reversed) end
- // of the name. That means we have unmatched non-star terms in the
- // pattern, and so match failed.
- //
- if (pc != '*')
- {
- assert (rni == rne);
- return false;
- }
-
- // Match the pattern prefix (ends with the first *) to the name leading
- // characters. If they mismatch we failed. Otherwise if this is an only *
- // in the pattern (matches whatever is left in the name) then we succeed,
- // otherwise we perform backtracking (recursively).
- //
- pe = rpi.base ();
- ne = rni.base ();
-
- // Compare the pattern and the name term by char until the name suffix or
- // * is encountered in the pattern (whichever happens first). Fail if a
- // char mismatches.
- //
- for (; (pc = *pi) != '*' && ni != ne; ++pi, ++ni)
- {
- if (!match (*ni, pc) && pc != '?')
- return false;
- }
-
- // If we didn't get to * in the pattern then we got to the name suffix.
- // That means that the pattern has unmatched non-star terms, and so match
- // failed.
- //
- if (pc != '*')
- {
- assert (ni == ne);
- return false;
- }
-
- // If * that we have reached is the last one, then it matches whatever is
- // left in the name (including an empty range).
- //
- if (++pi == pe)
- return true;
-
- // Perform backtracking.
- //
- // From now on, we will call the pattern not-yet-matched part (starting
- // the leftmost * and ending the rightmost one inclusively) as pattern, and
- // the name not-yet-matched part as name.
- //
- // Here we sequentially assume that * that starts the pattern matches the
- // name leading part (staring from an empty one and iterating till the full
- // name). So if, at some iteration, the pattern trailing part (that follows
- // the leftmost *) matches the name trailing part, then the pattern matches
- // the name.
- //
- bool r;
- for (; !(r = match_no_brackets (pi, pe, ni, ne)) && ni != ne; ++ni) ;
- return r;
- }
-
- // Match a character against the pattern term.
- //
- static inline bool
- match (char c, const path_pattern_term& pt)
- {
- switch (pt.type)
- {
- // Matches any character.
- //
- case path_pattern_term_type::star:
- case path_pattern_term_type::question: return true;
-
- case path_pattern_term_type::bracket:
- {
- return match_bracket (c, pt);
- }
-
- case path_pattern_term_type::literal:
- {
- return match (c, get_literal (pt));
- }
- }
-
- assert (false); // Can't be here.
- return false;
- }
-
- // Match the name [ni, ne) to the pattern [pi, pe). Ranges can be empty.
- //
- static bool
- match (string::const_iterator pi, string::const_iterator pe,
- string::const_iterator ni, string::const_iterator ne)
- {
- // If the pattern doesn't contain the bracket expressions then reduce to
- // the "clever" approach (see the implementation notes below for details).
- //
- if (find (pi, pe, '[') == pe)
- return match_no_brackets (pi, pe, ni, ne);
-
- // Match the pattern prefix (precedes the first *) to the name leading
- // characters.
- //
- path_pattern_iterator ppi (pi, pe);
- path_pattern_iterator ppe;
- path_pattern_term pt;
-
- for (; ppi != ppe && !(pt = *ppi).star () && ni != ne; ++ppi, ++ni)
- {
- if (!match (*ni, pt))
- return false;
- }
-
- // If we got to the end of the pattern (no * is encountered) than we are
- // done. The success depends on if we got to the end of the name as well.
- //
- if (ppi == ppe)
- return ni == ne;
-
- // If we didn't reach * in the pattern then we reached the end of the
- // name. That means we have unmatched non-star terms in the pattern, and
- // so match failed.
- //
- if (!pt.star ())
- {
- assert (ni == ne);
- return false;
- }
-
- // If * that we have reached is the last term, then it matches whatever is
- // left in the name (including an empty range).
- //
- if (++ppi == ppe)
- return true;
-
- // Switch back to the string iterator and perform backtracking.
- //
- // From now on, we will call the pattern not-yet-matched part (starting
- // the leftmost *) as pattern, and the name not-yet-matched part as name.
- //
- // Here we sequentially assume that * that starts the pattern matches the
- // name leading part (see match_no_brackets() for details).
- //
- bool r;
- for (pi = ppi->begin; !(r = match (pi, pe, ni, ne)) && ni != ne; ++ni) ;
- return r;
- }
-
- bool
- path_match (const string& name, const string& pattern)
- {
- // Implementation notes:
- //
- // - This has a good potential of becoming hairy quickly so need to strive
- // for an elegant way to implement this.
- //
- // - Most patterns will contains a single * wildcard with a prefix and/or
- // suffix (e.g., *.txt, foo*, f*.txt). Something like this is not very
- // common: *foo*.
- //
- // So it would be nice to have a clever implementation that first
- // "anchors" itself with a literal prefix and/or suffix and only then
- // continue with backtracking. In other words, reduce:
- //
- // *.txt vs foo.txt -> * vs foo
- // foo* vs foo.txt -> * vs .txt
- // f*.txt vs foo.txt -> * vs oo
- //
- // Note that this approach fails if the pattern may contain bracket
- // expressions. You can't easily recognize a suffix scanning backwards
- // since * semantics depends on the characters to the left:
- //
- // f[o*]o - * is not a wildcard
- // fo*]o - * is a wildcard
- //
- // That's why we will start with the straightforward left-to-right
- // matching and reduce to the "clever" approach when the remaining part
- // of the pattern doesn't contain bracket expressions.
-
- auto pi (pattern.rbegin ());
- auto pe (pattern.rend ());
-
- auto ni (name.rbegin ());
- auto ne (name.rend ());
-
- // The name doesn't match the pattern if it is of a different type than the
- // pattern is.
- //
- bool pd (pi != pe && path::traits_type::is_separator (*pi));
- bool nd (ni != ne && path::traits_type::is_separator (*ni));
-
- if (pd != nd)
- return false;
-
- // Skip trailing separators if present.
- //
- if (pd)
- {
- ++pi;
- ++ni;
- }
-
- return match (pattern.begin (), pi.base (), name.begin (), ni.base ());
- }
-
// Search for paths matching the pattern and call the specified function for
// each matching path. Return false if the underlying func() call returns
// false. Otherwise the function conforms to the path_search() description.
@@ -2467,114 +2171,4 @@ namespace butl
path_filesystem fs (start, entry);
search (pattern, dir_path (), flags, func, fs);
}
-
- bool
- path_match (const path& entry,
- const path& pattern,
- const dir_path& start,
- path_match_flags flags)
- {
- bool r (false);
-
- auto match = [&entry, &r] (path&& p, const string&, bool interim)
- {
- // If we found the entry (possibly through one of the recursive
- // components) no need to search further.
- //
- if (p == entry && !interim)
- {
- r = true;
- return false;
- }
-
- return true;
- };
-
- path_search (pattern, entry, match, start, flags);
- return r;
- }
-
- // path_pattern_iterator
- //
- void path_pattern_iterator::
- next ()
- {
- if (i_ == e_)
- {
- t_ = nullopt; // Convert the object into the end iterator.
- return;
- }
-
- auto next = [this] (path_pattern_term_type t)
- {
- assert (t != path_pattern_term_type::bracket);
-
- t_ = path_pattern_term {t, i_, i_ + 1};
- ++i_;
- };
-
- switch (*i_)
- {
- case '?':
- {
- next (path_pattern_term_type::question);
- break;
- }
- case '*':
- {
- next (path_pattern_term_type::star);
- break;
- }
- case '[':
- {
- // Try to find the bracket expression end.
- //
- // Note that '[' doesn't necessarily starts the bracket expression (no
- // closing bracket, empty, etc). If that's the case, then we end up
- // with the '[' literal terminal.
- //
- bool expr (false);
- for (;;) // Breakout loop.
- {
- string::const_iterator i (i_ + 1); // Position after '['.
-
- if (i == e_) // Is '[' the pattern last character?
- break;
-
- bool invert (*i == '!');
- if (invert && ++i == e_) // Is '!' the pattern last character?
- break;
-
- // Find the bracket expression end.
- //
- // Note that the bracket expression may not be empty and ']' is a
- // literal if it is the first expression character.
- //
- for (++i; i != e_ && *i != ']'; ++i) ;
-
- if (i == e_) // The closing bracket is not found?
- break;
-
- expr = true;
-
- ++i; // Position after ']'.
-
- t_ = path_pattern_term {path_pattern_term_type::bracket, i_, i};
-
- i_ = i;
- break;
- }
-
- // Fallback to '[' literal if it is not a bracket expression.
- //
- if (expr)
- break;
- }
- // Fall through.
- default:
- {
- next (path_pattern_term_type::literal);
- }
- }
- }
}
diff --git a/libbutl/filesystem.ixx b/libbutl/filesystem.ixx
index 1be5246..c9e3997 100644
--- a/libbutl/filesystem.ixx
+++ b/libbutl/filesystem.ixx
@@ -186,131 +186,4 @@ namespace butl
{
return dir_iterator ();
}
-
- // path_pattern_iterator
- //
- inline path_pattern_iterator::
- path_pattern_iterator (std::string::const_iterator begin,
- std::string::const_iterator end)
- : i_ (begin),
- e_ (end)
- {
- next (); // If i_ == e_ we will end up with the end iterator.
- }
-
- inline path_pattern_iterator::
- path_pattern_iterator (const std::string& s)
- : path_pattern_iterator (s.begin (), s.end ())
- {
- }
-
- inline bool
- operator== (const path_pattern_iterator& x, const path_pattern_iterator& y)
- {
- return x.t_.has_value () == y.t_.has_value () &&
- (!x.t_ || (x.i_ == y.i_ && x.e_ == y.e_));
- }
-
- inline bool
- operator!= (const path_pattern_iterator& x, const path_pattern_iterator& y)
- {
- return !(x == y);
- }
-
- inline path_pattern_iterator
- begin (const path_pattern_iterator& i)
- {
- return i;
- }
-
- inline path_pattern_iterator
- end (const path_pattern_iterator&)
- {
- return path_pattern_iterator ();
- }
-
- // patterns
- //
- inline char
- get_literal (const path_pattern_term& t)
- {
- assert (t.literal ());
- return *t.begin;
- }
-
- inline bool
- path_pattern (const std::string& s)
- {
- for (const path_pattern_term& t: path_pattern_iterator (s))
- {
- if (!t.literal ())
- return true;
- }
-
- return false;
- }
-
- // Return true for a pattern containing the specified number of the
- // consecutive star wildcards.
- //
- inline bool
- path_pattern_recursive (const std::string& s, size_t sn)
- {
- std::size_t n (0);
- for (const path_pattern_term& t: path_pattern_iterator (s))
- {
- if (t.star ())
- {
- if (++n == sn)
- return true;
- }
- else
- n = 0;
- }
-
- return false;
- }
-
- inline bool
- path_pattern_recursive (const std::string& s)
- {
- return path_pattern_recursive (s, 2);
- }
-
- inline bool
- path_pattern_self_matching (const std::string& s)
- {
- return path_pattern_recursive (s, 3);
- }
-
- inline bool
- path_pattern (const path& p)
- {
- for (auto i (p.begin ()); i != p.end (); ++i)
- {
- if (path_pattern (*i))
- return true;
- }
-
- return false;
- }
-
- inline size_t
- path_pattern_recursive (const path& p)
- {
- std::size_t r (0);
- for (auto i (p.begin ()); i != p.end (); ++i)
- {
- if (path_pattern_recursive (*i))
- ++r;
- }
-
- return r;
- }
-
- inline bool
- path_pattern_self_matching (const path& p)
- {
- return !p.empty () && path_pattern_self_matching (*p.begin ());
- }
}
diff --git a/libbutl/filesystem.mxx b/libbutl/filesystem.mxx
index 261b985..43c23b7 100644
--- a/libbutl/filesystem.mxx
+++ b/libbutl/filesystem.mxx
@@ -23,14 +23,12 @@
using mode_t = int;
#endif
-#include <cassert>
-
#ifndef __cpp_lib_modules_ts
#include <string>
-#include <cstddef> // ptrdiff_t, size_t
+#include <cstddef> // ptrdiff_t
#include <cstdint> // uint16_t, etc
#include <utility> // move(), pair
-#include <iterator>
+#include <iterator> // input_iterator_tag
#include <functional>
#include <chrono> //@@ MOD needed by timestamp module (no re-export).
@@ -45,14 +43,14 @@ import std.core;
#endif
import butl.path;
-import butl.optional;
import butl.timestamp;
+import butl.path_pattern; // path_match_flags
-import butl.utility;
+import butl.utility; // operator<<(ostream,exception), throw_generic_error()
#else
#include <libbutl/path.mxx>
-#include <libbutl/optional.mxx>
#include <libbutl/timestamp.mxx>
+#include <libbutl/path-pattern.mxx>
#include <libbutl/utility.mxx>
#endif
@@ -717,79 +715,9 @@ LIBBUTL_MODEXPORT namespace butl
inline dir_iterator begin (dir_iterator&&);
#endif
- // Wildcard pattern match and search (aka glob).
- //
- // The wildcard pattern contains the literal characters that match
- // themselves and the wildcard characters that match a single or multiple
- // characters. Currently the following wildcards are supported:
- //
- // * - match any number of characters (including zero)
- // ? - match any single character
- // [...] - match a character with a "bracket expression"; currently we only
- // support literal characters and ranges (no character/equivalence
- // classes, etc; see Pattern Matching Notation section of the Shell
- // Command Language POSIX specification for details)
- //
- // Note also that currently we don't support the special characters
- // backslash-escaping (as mandated by POSIX).
-
- // Path match/search flags.
- //
- enum class path_match_flags: std::uint16_t
- {
- // Follow symlinks. This only applies to symlinks that are matched against
- // the rightmost component of the pattern. In particular, this mean that
- // such symlinks will never match a directory pattern and some results can
- // be missing for the recursive rightmost component.
- //
- follow_symlinks = 0x1,
-
- // Make wildcard-only pattern component (e.g., `*/...`, `.../*/...`, or
- // `.../*`) match absent path component. For example, with this flag
- // set, the `a/*/b` pattern matches not only `a/x/b` path, but also `a/b`.
- //
- // Note that this does not apply to single-component patterns and the
- // pattern type is always preserved. In particular, the `a/*/` pattern
- // matches `a/` but not `a`.
- //
- // Finally, keep in mind that only absent directory components can be
- // matched this way. In particular, pattern `a*/*` does not match `ab`
- // (but `a*/*/` matches `ab/`).
- //
- match_absent = 0x2,
-
- none = 0
- };
-
- inline path_match_flags operator& (path_match_flags, path_match_flags);
- inline path_match_flags operator| (path_match_flags, path_match_flags);
- inline path_match_flags operator&= (path_match_flags&, path_match_flags);
- inline path_match_flags operator|= (path_match_flags&, path_match_flags);
-
- // Return true if name matches pattern. Both must be single path components,
- // possibly with a trailing directory separator to indicate a directory.
- //
- // If the pattern ends with a directory separator, then it only matches a
- // directory name (i.e., ends with a directory separator, but potentially
- // different). Otherwise, it only matches a non-directory name (no trailing
- // directory separator).
- //
- LIBBUTL_SYMEXPORT bool
- path_match (const std::string& name, const std::string& pattern);
-
- // Return true if path entry matches pattern. Note that the match is
- // performed literally, with no paths normalization being performed. The
- // start directory is used if the first pattern component is a self-matching
- // wildcard (see below for the start directory and wildcard semantics).
+ // Wildcard pattern search (aka glob).
//
- // In addition to the wildcard characters, it also recognizes the ** and ***
- // wildcard sequences (see path_search() for details).
- //
- LIBBUTL_SYMEXPORT bool
- path_match (const path& entry,
- const path& pattern,
- const dir_path& start = dir_path (),
- path_match_flags = path_match_flags::none);
+ // For details on the wildcard patterns see <libbutl/path-pattern.mxx>
// Search for paths matching the pattern calling the specified function for
// each matching path (see below for details).
@@ -883,134 +811,6 @@ LIBBUTL_MODEXPORT namespace butl
bool interm)>&,
const dir_path& start = dir_path (),
path_match_flags = path_match_flags::none);
-
- // Return true if a name contains the wildcard characters.
- //
- bool
- path_pattern (const std::string&);
-
- // Return true if a name contains the ** wildcard sequences.
- //
- bool
- path_pattern_recursive (const std::string&);
-
- // Return true if a name contains the *** wildcard sequences.
- //
- bool
- path_pattern_self_matching (const std::string&);
-
- // Return true if a path contains the pattern components.
- //
- bool
- path_pattern (const path&);
-
- // Return the number of recursive pattern components.
- //
- // Knowing the number of such components allows us to make some assumptions
- // regarding the search result. For example, if it is zero or one, then the
- // result contains no duplicates.
- //
- // Also note that the result can be used as bool.
- //
- size_t
- path_pattern_recursive (const path&);
-
- // Return true if the path is not empty and its first component is a self-
- // matching pattern.
- //
- bool
- path_pattern_self_matching (const path&);
-
- // Iteration over pattern terminals.
- //
- enum class path_pattern_term_type
- {
- literal, // Literal character.
- question, // Question mark wildcard.
- star, // Star wildcard.
- bracket // Bracket expression wildcard.
- };
-
- class path_pattern_term
- {
- public:
- path_pattern_term_type type;
- std::string::const_iterator begin;
- std::string::const_iterator end;
-
- std::size_t
- size () const {return end - begin;}
-
- // Predicates.
- //
- bool literal () const {return type == path_pattern_term_type::literal;}
- bool question () const {return type == path_pattern_term_type::question;}
- bool star () const {return type == path_pattern_term_type::star;}
- bool bracket () const {return type == path_pattern_term_type::bracket;}
- };
-
- // Return the literal terminal character.
- //
- char
- get_literal (const path_pattern_term&);
-
- // Match a character against the bracket expression terminal.
- //
- LIBBUTL_SYMEXPORT bool
- match_bracket (char, const path_pattern_term&);
-
- class LIBBUTL_SYMEXPORT path_pattern_iterator
- {
- public:
- using value_type = path_pattern_term;
- using pointer = const path_pattern_term*;
- using reference = const path_pattern_term&;
- using difference_type = std::ptrdiff_t;
- using iterator_category = std::input_iterator_tag;
-
- explicit
- path_pattern_iterator (const std::string&);
-
- path_pattern_iterator (std::string::const_iterator begin,
- std::string::const_iterator end);
-
- path_pattern_iterator () = default; // Create the end iterator.
-
- path_pattern_iterator& operator++ () {assert (t_); next (); return *this;}
-
- reference operator* () const {assert (t_); return *t_;}
- pointer operator-> () const {assert (t_); return &*t_;}
-
- friend bool
- operator== (const path_pattern_iterator&, const path_pattern_iterator&);
-
- friend bool
- operator!= (const path_pattern_iterator&, const path_pattern_iterator&);
-
- private:
- void
- next ();
-
- private:
- // nullopt denotes the end iterator.
- //
- // Note that the default-constructed i_ and e_ iterators (having singular
- // values) may not represent the end iterator as are not comparable for
- // equality. That's why we use an absent term to represent such an
- // iterator.
- //
- optional<path_pattern_term> t_;
-
- std::string::const_iterator i_;
- std::string::const_iterator e_;
- };
-
- // Range-based for loop support.
- //
- // for (const path_pattern_term& t: path_pattern_iterator (pattern)) ...
- //
- path_pattern_iterator begin (const path_pattern_iterator&);
- path_pattern_iterator end (const path_pattern_iterator&);
}
#include <libbutl/filesystem.ixx>
diff --git a/libbutl/path-pattern.cxx b/libbutl/path-pattern.cxx
new file mode 100644
index 0000000..86be1ec
--- /dev/null
+++ b/libbutl/path-pattern.cxx
@@ -0,0 +1,451 @@
+// file : libbutl/path-pattern.cxx -*- C++ -*-
+// copyright : Copyright (c) 2014-2019 Code Synthesis Ltd
+// license : MIT; see accompanying LICENSE file
+
+#ifndef __cpp_modules_ts
+#include <libbutl/path-pattern.mxx>
+#endif
+
+#include <cassert>
+
+#ifndef __cpp_lib_modules_ts
+#include <string>
+#include <cstdint>
+#include <cstddef>
+#include <iterator> // reverse_iterator
+
+#include <algorithm> // find()
+#endif
+
+// Other includes.
+
+#ifdef __cpp_modules_ts
+module butl.path_pattern;
+
+// Only imports additional to interface.
+#ifdef __clang__
+#ifdef __cpp_lib_modules_ts
+import std.core;
+#endif
+import butl.path;
+import butl.optional;
+#endif
+
+import butl.utility; // lcase()[_WIN32]
+import butl.filesystem; // path_search()
+#else
+#include <libbutl/utility.mxx>
+#include <libbutl/filesystem.mxx>
+#endif
+
+using namespace std;
+
+namespace butl
+{
+ // patterns
+ //
+ static inline bool
+ match (char c, char pc)
+ {
+#ifndef _WIN32
+ return c == pc;
+#else
+ return lcase (c) == lcase (pc);
+#endif
+ }
+
+ bool
+ match_bracket (char c, const path_pattern_term& pt)
+ {
+ using iterator = string::const_iterator;
+
+ assert (pt.bracket ());
+
+ iterator i (pt.begin + 1); // Position after '['.
+ iterator e (pt.end - 1); // Position at ']'.
+
+ bool invert (*i == '!');
+ if (invert)
+ ++i;
+
+ bool r (false);
+ for (iterator b (i); i != e && !r; ++i)
+ {
+ char bc (*i);
+
+ // If '-' is a first or last character in the bracket expression then
+ // match it literally and match the range otherwise.
+ //
+ if (bc == '-' && i != b && i + 1 != e) // Match the range?
+ {
+ // Note that we have already matched the range left endpoint character
+ // unsuccessfully (otherwise we wouldn't be here), so now we test if
+ // the character belongs to the (min-char, max-char] range.
+ //
+ // Also note that on Windows we match case insensitively and so can't
+ // just compare the character with the range endpoints. Thus, we
+ // fallback to matching each range character individually.
+ //
+#ifndef _WIN32
+ r = c > *(i - 1) && c <= *(i + 1);
+#else
+ for (char bc (*(i - 1) + 1), mx (*(i + 1)); bc <= mx && !r; ++bc)
+ r = match (c, bc);
+#endif
+
+ ++i; // Position to the range max character.
+ }
+ else // Match against the expression character literally.
+ r = match (c, bc);
+ }
+
+ return r != invert;
+ }
+
+ // Match the name [ni, ne) to the pattern [pi, pe) that may not contain
+ // bracket expressions. Ranges can be empty.
+ //
+ static bool
+ match_no_brackets (string::const_iterator pi, string::const_iterator pe,
+ string::const_iterator ni, string::const_iterator ne)
+ {
+ using reverse_iterator = std::reverse_iterator<string::const_iterator>;
+
+ reverse_iterator rpi (pe);
+ reverse_iterator rpe (pi);
+
+ reverse_iterator rni (ne);
+ reverse_iterator rne (ni);
+
+ // Match the pattern suffix (follows the last *) to the name trailing
+ // characters.
+ //
+ char pc ('\0');
+ for (; rpi != rpe && (pc = *rpi) != '*' && rni != rne; ++rpi, ++rni)
+ {
+ if (!match (*rni, pc) && pc != '?')
+ return false;
+ }
+
+ // If we got to the (reversed) end of the pattern (no * is encountered)
+ // than we are done. The success depends on if we got to the (reversed) end
+ // of the name as well.
+ //
+ if (rpi == rpe)
+ return rni == rne;
+
+ // If we didn't reach * in the pattern then we reached the (reversed) end
+ // of the name. That means we have unmatched non-star terms in the
+ // pattern, and so match failed.
+ //
+ if (pc != '*')
+ {
+ assert (rni == rne);
+ return false;
+ }
+
+ // Match the pattern prefix (ends with the first *) to the name leading
+ // characters. If they mismatch we failed. Otherwise if this is an only *
+ // in the pattern (matches whatever is left in the name) then we succeed,
+ // otherwise we perform backtracking (recursively).
+ //
+ pe = rpi.base ();
+ ne = rni.base ();
+
+ // Compare the pattern and the name term by char until the name suffix or
+ // * is encountered in the pattern (whichever happens first). Fail if a
+ // char mismatches.
+ //
+ for (; (pc = *pi) != '*' && ni != ne; ++pi, ++ni)
+ {
+ if (!match (*ni, pc) && pc != '?')
+ return false;
+ }
+
+ // If we didn't get to * in the pattern then we got to the name suffix.
+ // That means that the pattern has unmatched non-star terms, and so match
+ // failed.
+ //
+ if (pc != '*')
+ {
+ assert (ni == ne);
+ return false;
+ }
+
+ // If * that we have reached is the last one, then it matches whatever is
+ // left in the name (including an empty range).
+ //
+ if (++pi == pe)
+ return true;
+
+ // Perform backtracking.
+ //
+ // From now on, we will call the pattern not-yet-matched part (starting
+ // the leftmost * and ending the rightmost one inclusively) as pattern, and
+ // the name not-yet-matched part as name.
+ //
+ // Here we sequentially assume that * that starts the pattern matches the
+ // name leading part (staring from an empty one and iterating till the full
+ // name). So if, at some iteration, the pattern trailing part (that follows
+ // the leftmost *) matches the name trailing part, then the pattern matches
+ // the name.
+ //
+ bool r;
+ for (; !(r = match_no_brackets (pi, pe, ni, ne)) && ni != ne; ++ni) ;
+ return r;
+ }
+
+ // Match a character against the pattern term.
+ //
+ static inline bool
+ match (char c, const path_pattern_term& pt)
+ {
+ switch (pt.type)
+ {
+ // Matches any character.
+ //
+ case path_pattern_term_type::star:
+ case path_pattern_term_type::question: return true;
+
+ case path_pattern_term_type::bracket:
+ {
+ return match_bracket (c, pt);
+ }
+
+ case path_pattern_term_type::literal:
+ {
+ return match (c, get_literal (pt));
+ }
+ }
+
+ assert (false); // Can't be here.
+ return false;
+ }
+
+ // Match the name [ni, ne) to the pattern [pi, pe). Ranges can be empty.
+ //
+ static bool
+ match (string::const_iterator pi, string::const_iterator pe,
+ string::const_iterator ni, string::const_iterator ne)
+ {
+ // If the pattern doesn't contain the bracket expressions then reduce to
+ // the "clever" approach (see the implementation notes below for details).
+ //
+ if (find (pi, pe, '[') == pe)
+ return match_no_brackets (pi, pe, ni, ne);
+
+ // Match the pattern prefix (precedes the first *) to the name leading
+ // characters.
+ //
+ path_pattern_iterator ppi (pi, pe);
+ path_pattern_iterator ppe;
+ path_pattern_term pt;
+
+ for (; ppi != ppe && !(pt = *ppi).star () && ni != ne; ++ppi, ++ni)
+ {
+ if (!match (*ni, pt))
+ return false;
+ }
+
+ // If we got to the end of the pattern (no * is encountered) than we are
+ // done. The success depends on if we got to the end of the name as well.
+ //
+ if (ppi == ppe)
+ return ni == ne;
+
+ // If we didn't reach * in the pattern then we reached the end of the
+ // name. That means we have unmatched non-star terms in the pattern, and
+ // so match failed.
+ //
+ if (!pt.star ())
+ {
+ assert (ni == ne);
+ return false;
+ }
+
+ // If * that we have reached is the last term, then it matches whatever is
+ // left in the name (including an empty range).
+ //
+ if (++ppi == ppe)
+ return true;
+
+ // Switch back to the string iterator and perform backtracking.
+ //
+ // From now on, we will call the pattern not-yet-matched part (starting
+ // the leftmost *) as pattern, and the name not-yet-matched part as name.
+ //
+ // Here we sequentially assume that * that starts the pattern matches the
+ // name leading part (see match_no_brackets() for details).
+ //
+ bool r;
+ for (pi = ppi->begin; !(r = match (pi, pe, ni, ne)) && ni != ne; ++ni) ;
+ return r;
+ }
+
+ bool
+ path_match (const string& name, const string& pattern)
+ {
+ // Implementation notes:
+ //
+ // - This has a good potential of becoming hairy quickly so need to strive
+ // for an elegant way to implement this.
+ //
+ // - Most patterns will contains a single * wildcard with a prefix and/or
+ // suffix (e.g., *.txt, foo*, f*.txt). Something like this is not very
+ // common: *foo*.
+ //
+ // So it would be nice to have a clever implementation that first
+ // "anchors" itself with a literal prefix and/or suffix and only then
+ // continue with backtracking. In other words, reduce:
+ //
+ // *.txt vs foo.txt -> * vs foo
+ // foo* vs foo.txt -> * vs .txt
+ // f*.txt vs foo.txt -> * vs oo
+ //
+ // Note that this approach fails if the pattern may contain bracket
+ // expressions. You can't easily recognize a suffix scanning backwards
+ // since * semantics depends on the characters to the left:
+ //
+ // f[o*]o - * is not a wildcard
+ // fo*]o - * is a wildcard
+ //
+ // That's why we will start with the straightforward left-to-right
+ // matching and reduce to the "clever" approach when the remaining part
+ // of the pattern doesn't contain bracket expressions.
+
+ auto pi (pattern.rbegin ());
+ auto pe (pattern.rend ());
+
+ auto ni (name.rbegin ());
+ auto ne (name.rend ());
+
+ // The name doesn't match the pattern if it is of a different type than the
+ // pattern is.
+ //
+ bool pd (pi != pe && path::traits_type::is_separator (*pi));
+ bool nd (ni != ne && path::traits_type::is_separator (*ni));
+
+ if (pd != nd)
+ return false;
+
+ // Skip trailing separators if present.
+ //
+ if (pd)
+ {
+ ++pi;
+ ++ni;
+ }
+
+ return match (pattern.begin (), pi.base (), name.begin (), ni.base ());
+ }
+
+ bool
+ path_match (const path& entry,
+ const path& pattern,
+ const dir_path& start,
+ path_match_flags flags)
+ {
+ bool r (false);
+
+ auto match = [&entry, &r] (path&& p, const string&, bool interim)
+ {
+ // If we found the entry (possibly through one of the recursive
+ // components) no need to search further.
+ //
+ if (p == entry && !interim)
+ {
+ r = true;
+ return false;
+ }
+
+ return true;
+ };
+
+ path_search (pattern, entry, match, start, flags);
+ return r;
+ }
+
+ // path_pattern_iterator
+ //
+ void path_pattern_iterator::
+ next ()
+ {
+ if (i_ == e_)
+ {
+ t_ = nullopt; // Convert the object into the end iterator.
+ return;
+ }
+
+ auto next = [this] (path_pattern_term_type t)
+ {
+ assert (t != path_pattern_term_type::bracket);
+
+ t_ = path_pattern_term {t, i_, i_ + 1};
+ ++i_;
+ };
+
+ switch (*i_)
+ {
+ case '?':
+ {
+ next (path_pattern_term_type::question);
+ break;
+ }
+ case '*':
+ {
+ next (path_pattern_term_type::star);
+ break;
+ }
+ case '[':
+ {
+ // Try to find the bracket expression end.
+ //
+ // Note that '[' doesn't necessarily starts the bracket expression (no
+ // closing bracket, empty, etc). If that's the case, then we end up
+ // with the '[' literal terminal.
+ //
+ bool expr (false);
+ for (;;) // Breakout loop.
+ {
+ string::const_iterator i (i_ + 1); // Position after '['.
+
+ if (i == e_) // Is '[' the pattern last character?
+ break;
+
+ bool invert (*i == '!');
+ if (invert && ++i == e_) // Is '!' the pattern last character?
+ break;
+
+ // Find the bracket expression end.
+ //
+ // Note that the bracket expression may not be empty and ']' is a
+ // literal if it is the first expression character.
+ //
+ for (++i; i != e_ && *i != ']'; ++i) ;
+
+ if (i == e_) // The closing bracket is not found?
+ break;
+
+ expr = true;
+
+ ++i; // Position after ']'.
+
+ t_ = path_pattern_term {path_pattern_term_type::bracket, i_, i};
+
+ i_ = i;
+ break;
+ }
+
+ // Fallback to '[' literal if it is not a bracket expression.
+ //
+ if (expr)
+ break;
+ }
+ // Fall through.
+ default:
+ {
+ next (path_pattern_term_type::literal);
+ }
+ }
+ }
+}
diff --git a/libbutl/path-pattern.ixx b/libbutl/path-pattern.ixx
new file mode 100644
index 0000000..5adde3f
--- /dev/null
+++ b/libbutl/path-pattern.ixx
@@ -0,0 +1,133 @@
+// file : libbutl/path-pattern.ixx -*- C++ -*-
+// copyright : Copyright (c) 2014-2019 Code Synthesis Ltd
+// license : MIT; see accompanying LICENSE file
+
+namespace butl
+{
+ // path_pattern_iterator
+ //
+ inline path_pattern_iterator::
+ path_pattern_iterator (std::string::const_iterator begin,
+ std::string::const_iterator end)
+ : i_ (begin),
+ e_ (end)
+ {
+ next (); // If i_ == e_ we will end up with the end iterator.
+ }
+
+ inline path_pattern_iterator::
+ path_pattern_iterator (const std::string& s)
+ : path_pattern_iterator (s.begin (), s.end ())
+ {
+ }
+
+ inline bool
+ operator== (const path_pattern_iterator& x, const path_pattern_iterator& y)
+ {
+ return x.t_.has_value () == y.t_.has_value () &&
+ (!x.t_ || (x.i_ == y.i_ && x.e_ == y.e_));
+ }
+
+ inline bool
+ operator!= (const path_pattern_iterator& x, const path_pattern_iterator& y)
+ {
+ return !(x == y);
+ }
+
+ inline path_pattern_iterator
+ begin (const path_pattern_iterator& i)
+ {
+ return i;
+ }
+
+ inline path_pattern_iterator
+ end (const path_pattern_iterator&)
+ {
+ return path_pattern_iterator ();
+ }
+
+ // patterns
+ //
+ inline char
+ get_literal (const path_pattern_term& t)
+ {
+ assert (t.literal ());
+ return *t.begin;
+ }
+
+ inline bool
+ path_pattern (const std::string& s)
+ {
+ for (const path_pattern_term& t: path_pattern_iterator (s))
+ {
+ if (!t.literal ())
+ return true;
+ }
+
+ return false;
+ }
+
+ // Return true for a pattern containing the specified number of the
+ // consecutive star wildcards.
+ //
+ inline bool
+ path_pattern_recursive (const std::string& s, std::size_t sn)
+ {
+ std::size_t n (0);
+ for (const path_pattern_term& t: path_pattern_iterator (s))
+ {
+ if (t.star ())
+ {
+ if (++n == sn)
+ return true;
+ }
+ else
+ n = 0;
+ }
+
+ return false;
+ }
+
+ inline bool
+ path_pattern_recursive (const std::string& s)
+ {
+ return path_pattern_recursive (s, 2);
+ }
+
+ inline bool
+ path_pattern_self_matching (const std::string& s)
+ {
+ return path_pattern_recursive (s, 3);
+ }
+
+ inline bool
+ path_pattern (const path& p)
+ {
+ for (auto i (p.begin ()); i != p.end (); ++i)
+ {
+ if (path_pattern (*i))
+ return true;
+ }
+
+ return false;
+ }
+
+ inline std::size_t
+ path_pattern_recursive (const path& p)
+ {
+ std::size_t r (0);
+ for (auto i (p.begin ()); i != p.end (); ++i)
+ {
+ if (path_pattern_recursive (*i))
+ ++r;
+ }
+
+ return r;
+ }
+
+ inline bool
+ path_pattern_self_matching (const path& p)
+ {
+ return !p.empty () && path_pattern_self_matching (*p.begin ());
+ }
+}
diff --git a/libbutl/path-pattern.mxx b/libbutl/path-pattern.mxx
new file mode 100644
index 0000000..2d37b58
--- /dev/null
+++ b/libbutl/path-pattern.mxx
@@ -0,0 +1,242 @@
+// file : libbutl/path-pattern.mxx -*- C++ -*-
+// copyright : Copyright (c) 2014-2019 Code Synthesis Ltd
+// license : MIT; see accompanying LICENSE file
+
+#ifndef __cpp_modules_ts
+#pragma once
+#endif
+
+#include <cassert>
+
+#ifndef __cpp_lib_modules_ts
+#include <string>
+#include <cstdint> // uint16_t
+#include <cstddef> // ptrdiff_t, size_t
+#include <iterator> // input_iterator_tag
+#endif
+
+// Other includes.
+#ifdef __cpp_modules_ts
+export module butl.path_pattern;
+
+#ifdef __cpp_lib_modules_ts
+import std.core;
+#endif
+
+import butl.path;
+import butl.optional;
+#else
+#include <libbutl/path.mxx>
+#include <libbutl/optional.mxx>
+#endif
+
+#include <libbutl/export.hxx>
+
+LIBBUTL_MODEXPORT namespace butl
+{
+ // Wildcard pattern match (aka glob).
+ //
+ // The wildcard pattern contains the literal characters that match
+ // themselves and the wildcard characters that match a single or multiple
+ // characters. Currently the following wildcards are supported:
+ //
+ // * - match any number of characters (including zero)
+ // ? - match any single character
+ // [...] - match a character with a "bracket expression"; currently we only
+ // support literal characters and ranges (no character/equivalence
+ // classes, etc; see Pattern Matching Notation section of the Shell
+ // Command Language POSIX specification for details)
+ //
+ // Note also that currently we don't support the special characters
+ // backslash-escaping (as mandated by POSIX).
+
+ // Path match/search flags.
+ //
+ enum class path_match_flags: std::uint16_t
+ {
+ // Follow symlinks. This only applies to symlinks that are matched against
+ // the rightmost component of the pattern. In particular, this mean that
+ // such symlinks will never match a directory pattern and some results can
+ // be missing for the recursive rightmost component.
+ //
+ // Note that this flag is only used for path_search().
+ //
+ follow_symlinks = 0x1,
+
+ // Make wildcard-only pattern component (e.g., `*/...`, `.../*/...`, or
+ // `.../*`) match absent path component. For example, with this flag
+ // set, the `a/*/b` pattern matches not only `a/x/b` path, but also `a/b`.
+ //
+ // Note that this does not apply to single-component patterns and the
+ // pattern type is always preserved. In particular, the `a/*/` pattern
+ // matches `a/` but not `a`.
+ //
+ // Finally, keep in mind that only absent directory components can be
+ // matched this way. In particular, pattern `a*/*` does not match `ab`
+ // (but `a*/*/` matches `ab/`).
+ //
+ match_absent = 0x2,
+
+ none = 0
+ };
+
+ inline path_match_flags operator& (path_match_flags, path_match_flags);
+ inline path_match_flags operator| (path_match_flags, path_match_flags);
+ inline path_match_flags operator&= (path_match_flags&, path_match_flags);
+ inline path_match_flags operator|= (path_match_flags&, path_match_flags);
+
+ // Return true if name matches pattern. Both must be single path components,
+ // possibly with a trailing directory separator to indicate a directory.
+ //
+ // If the pattern ends with a directory separator, then it only matches a
+ // directory name (i.e., ends with a directory separator, but potentially
+ // different). Otherwise, it only matches a non-directory name (no trailing
+ // directory separator).
+ //
+ LIBBUTL_SYMEXPORT bool
+ path_match (const std::string& name, const std::string& pattern);
+
+ // Return true if path entry matches pattern. Note that the match is
+ // performed literally, with no paths normalization being performed. The
+ // start directory is used if the first pattern component is a self-matching
+ // wildcard (see below for the start directory and wildcard semantics).
+ //
+ // In addition to the wildcard characters, it also recognizes the ** and ***
+ // wildcard sequences (see path_search() for details).
+ //
+ LIBBUTL_SYMEXPORT bool
+ path_match (const path& entry,
+ const path& pattern,
+ const dir_path& start = dir_path (),
+ path_match_flags = path_match_flags::none);
+
+ // Return true if a name contains the wildcard characters.
+ //
+ bool
+ path_pattern (const std::string&);
+
+ // Return true if a name contains the ** wildcard sequences.
+ //
+ bool
+ path_pattern_recursive (const std::string&);
+
+ // Return true if a name contains the *** wildcard sequences.
+ //
+ bool
+ path_pattern_self_matching (const std::string&);
+
+ // Return true if a path contains the pattern components.
+ //
+ bool
+ path_pattern (const path&);
+
+ // Return the number of recursive pattern components.
+ //
+ // Knowing the number of such components allows us to make some assumptions
+ // regarding the search result. For example, if it is zero or one, then the
+ // result contains no duplicates.
+ //
+ // Also note that the result can be used as bool.
+ //
+ std::size_t
+ path_pattern_recursive (const path&);
+
+ // Return true if the path is not empty and its first component is a self-
+ // matching pattern.
+ //
+ bool
+ path_pattern_self_matching (const path&);
+
+ // Iteration over pattern terminals.
+ //
+ enum class path_pattern_term_type
+ {
+ literal, // Literal character.
+ question, // Question mark wildcard.
+ star, // Star wildcard.
+ bracket // Bracket expression wildcard.
+ };
+
+ class path_pattern_term
+ {
+ public:
+ path_pattern_term_type type;
+ std::string::const_iterator begin;
+ std::string::const_iterator end;
+
+ std::size_t
+ size () const {return end - begin;}
+
+ // Predicates.
+ //
+ bool literal () const {return type == path_pattern_term_type::literal;}
+ bool question () const {return type == path_pattern_term_type::question;}
+ bool star () const {return type == path_pattern_term_type::star;}
+ bool bracket () const {return type == path_pattern_term_type::bracket;}
+ };
+
+ // Return the literal terminal character.
+ //
+ char
+ get_literal (const path_pattern_term&);
+
+ // Match a character against the bracket expression terminal.
+ //
+ LIBBUTL_SYMEXPORT bool
+ match_bracket (char, const path_pattern_term&);
+
+ class LIBBUTL_SYMEXPORT path_pattern_iterator
+ {
+ public:
+ using value_type = path_pattern_term;
+ using pointer = const path_pattern_term*;
+ using reference = const path_pattern_term&;
+ using difference_type = std::ptrdiff_t;
+ using iterator_category = std::input_iterator_tag;
+
+ explicit
+ path_pattern_iterator (const std::string&);
+
+ path_pattern_iterator (std::string::const_iterator begin,
+ std::string::const_iterator end);
+
+ path_pattern_iterator () = default; // Create the end iterator.
+
+ path_pattern_iterator& operator++ () {assert (t_); next (); return *this;}
+
+ reference operator* () const {assert (t_); return *t_;}
+ pointer operator-> () const {assert (t_); return &*t_;}
+
+ friend bool
+ operator== (const path_pattern_iterator&, const path_pattern_iterator&);
+
+ friend bool
+ operator!= (const path_pattern_iterator&, const path_pattern_iterator&);
+
+ private:
+ void
+ next ();
+
+ private:
+ // nullopt denotes the end iterator.
+ //
+ // Note that the default-constructed i_ and e_ iterators (having singular
+ // values) may not represent the end iterator as are not comparable for
+ // equality. That's why we use an absent term to represent such an
+ // iterator.
+ //
+ optional<path_pattern_term> t_;
+
+ std::string::const_iterator i_;
+ std::string::const_iterator e_;
+ };
+
+ // Range-based for loop support.
+ //
+ // for (const path_pattern_term& t: path_pattern_iterator (pattern)) ...
+ //
+ path_pattern_iterator begin (const path_pattern_iterator&);
+ path_pattern_iterator end (const path_pattern_iterator&);
+}
+
+#include <libbutl/path-pattern.ixx>
diff --git a/tests/wildcard/driver.cxx b/tests/wildcard/driver.cxx
index ca8bb97..b1303bf 100644
--- a/tests/wildcard/driver.cxx
+++ b/tests/wildcard/driver.cxx
@@ -24,11 +24,13 @@ import butl.path;
import butl.utility; // operator<<(ostream, exception)
import butl.optional;
import butl.filesystem;
+import butl.path_pattern;
#else
#include <libbutl/path.mxx>
#include <libbutl/utility.mxx>
#include <libbutl/optional.mxx>
#include <libbutl/filesystem.mxx>
+#include <libbutl/path-pattern.mxx>
#endif
using namespace std;