aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorKaren Arutyunov <karen@codesynthesis.com>2017-06-07 17:18:54 +0300
committerKaren Arutyunov <karen@codesynthesis.com>2017-06-08 19:57:50 +0300
commit04f4bb3e2492bb6e4b0769b4c7f020493cfa5cc4 (patch)
tree9ab6bf532ea9a906fa9d82570dbd8dd63e582f0a
parent3ea3b1c94f320bb55849d4a9fea1c3009ab3b298 (diff)
Add path_match() and path_search() overloads
-rw-r--r--libbutl/filesystem.cxx491
-rw-r--r--libbutl/filesystem.hxx42
-rw-r--r--tests/wildcard/driver.cxx55
3 files changed, 447 insertions, 141 deletions
diff --git a/libbutl/filesystem.cxx b/libbutl/filesystem.cxx
index 18b4d22..3002c93 100644
--- a/libbutl/filesystem.cxx
+++ b/libbutl/filesystem.cxx
@@ -1097,6 +1097,192 @@ namespace butl
return match (pattern.begin (), pi.base (), name.begin (), ni.base ());
}
+ // Search for paths matching the pattern and call the specified function for
+ // each matching path. Return false if the underlying func() call returns
+ // false. Otherwise the function conforms to the path_search() description.
+ //
+ // Note that the access to the traversed directory tree (real or virtual) is
+ // performed through the provided filesystem object.
+ //
+ static const string any_dir ("*/");
+
+ template <typename FS>
+ static bool
+ search (
+ path pattern,
+ dir_path pattern_dir,
+ bool follow_symlinks,
+ const function<bool (path&&, const string& pattern, bool interm)>& func,
+ FS& filesystem)
+ {
+ // Fast-forward the leftmost pattern non-wildcard components. So, for
+ // example, search for foo/f* in /bar/ becomes search for f* in /bar/foo/.
+ //
+ {
+ auto b (pattern.begin ());
+ auto e (pattern.end ());
+ auto i (b);
+ for (; i != e && (*i).find_first_of ("*?") == string::npos; ++i) ;
+
+ // If the pattern has no wildcards then we reduce to checking for the
+ // filesystem entry existence. It matches if exists and is of the proper
+ // type.
+ //
+ if (i == e)
+ {
+ path p (pattern_dir / pattern);
+ auto pe (filesystem.path_entry (p, follow_symlinks));
+
+ if (pe.first &&
+ ((pe.second.type == entry_type::directory) == p.to_directory ()))
+ return func (move (p), string (), false);
+
+ return true;
+ }
+ else if (i != b) // There are non-wildcard components, so fast-forward.
+ {
+ path p (b, i);
+ pattern = pattern.leaf (p);
+ pattern_dir /= path_cast<dir_path> (move (p));
+ }
+ }
+
+ assert (!pattern.empty ());
+
+ // The pattern leftmost component. Will use it to match the start directory
+ // sub-entries.
+ //
+ path pc (pattern.begin (), ++pattern.begin ());
+ string pcr (pc.representation ());
+
+ // Note that if the pattern has multiple components (is not a simple path),
+ // then the leftmost one has a trailing separator, and so will match
+ // sub-directories only.
+ //
+ bool simple (pattern.simple ());
+
+ // Note that we rely on "small function object" optimization here.
+ //
+ typename FS::iterator_type i (filesystem.iterator (
+ pattern_dir,
+ pcr.find ("**") != string::npos, // Recursive.
+ pcr.find ("***") != string::npos, // Self-inclusive.
+ follow_symlinks || !simple,
+ [&pattern_dir, &func] (const dir_path& p) -> bool // Preopen.
+ {
+ return func (pattern_dir / p, any_dir, true);
+ }));
+
+ // Canonicalize the pattern component collapsing consecutive stars (used to
+ // express that it is recursive) into a single one.
+ //
+ size_t j (0);
+ size_t n (pcr.size ());
+ for (size_t i (0); i < n; ++i)
+ {
+ char c (pcr[i]);
+ if (!(c == '*' && i > 0 && pcr[i - 1] == '*'))
+ pcr[j++] = c;
+ }
+
+ if (j != n)
+ pcr.resize (j);
+
+ // Note that the callback function can be called for the same directory
+ // twice: first time as intermediate match from iterator's preopen() call,
+ // and then, if the first call succeed, from the iterating loop (possibly
+ // as the final match).
+ //
+ path p;
+ while (i.next (p))
+ {
+ // Skip sub-entry if its name doesn't match the pattern leftmost
+ // component.
+ //
+ // Matching the directory we are iterating through (as for a pattern
+ // component containing ***) is a bit tricky. This directory is
+ // represented by the iterator as an empty path, and so we need to
+ // compute it (the leaf would actually be enough) for matching. This
+ // leaf can be acquired from the start_dir / pattern_dir. We don't expect
+ // this path to be empty, as the filesystem object must replace an empty
+ // start directory with the current one. This is the case when we search
+ // in the current directory (start_dir is empty) with a pattern that
+ // starts with *** wildcard (for example f***/bar). Note that this will
+ // be the only case per path_search() as the next time pattern_dir will
+ // not be empty.
+ //
+ const path& se (!p.empty ()
+ ? p
+ : path_cast<path> (!pattern_dir.empty ()
+ ? pattern_dir
+ : filesystem.start_dir ()));
+
+ if (!path_match (pcr, se.leaf ().representation ()))
+ continue;
+
+ // If the callback function returns false, then we stop the entire search
+ // for the final match, or do not search below the path for the
+ // intermediate one.
+ //
+ if (!func (pattern_dir / p, pcr, !simple))
+ {
+ if (simple) // Final match.
+ return false;
+ else
+ continue;
+ }
+
+ // If the pattern is not a simple one, and it's leftmost component
+ // matches the sub-entry, then the sub-entry is a directory (see the note
+ // above), and we search in it using the trailing part of the pattern.
+ //
+ if (!simple && !search (pattern.leaf (pc),
+ pattern_dir / path_cast<dir_path> (move (p)),
+ follow_symlinks,
+ func,
+ filesystem))
+ return false;
+ }
+
+ return true;
+ }
+
+ // Path search implementations.
+ //
+ static const dir_path empty_dir;
+
+ using preopen = std::function<bool (const dir_path&)>;
+
+ // Base for filesystem (see above) implementations.
+ //
+ // Don't copy start directory. It is expected to exist till the end of the
+ // object lifetime.
+ //
+ class filesystem_base
+ {
+ protected:
+ filesystem_base (const dir_path& start): start_ (start) {}
+
+ public:
+ const dir_path&
+ start_dir ()
+ {
+ if (!start_.empty ())
+ return start_;
+
+ if (current_.empty ())
+ current_ = dir_path::current_directory ();
+
+ return current_;
+ }
+
+ protected:
+ const dir_path& start_;
+ dir_path current_;
+ };
+
+ // Search path in the real filesystem.
+ //
// Iterate over directory sub-entries, recursively and including itself if
// requested. Note that recursive iterating goes depth-first which make
// sense for the cleanup use cases (@@ maybe this should be controllable
@@ -1109,8 +1295,6 @@ namespace butl
// Note that iterating over non-existent directory is not en error. The
// subsequent next() call returns false for such a directory.
//
- using preopen = std::function<bool (const dir_path&)>;
-
class recursive_dir_iterator
{
public:
@@ -1128,10 +1312,11 @@ namespace butl
open (dir_path (), self_);
}
- // Non-copyable, non-movable type.
+ // Move constructible-only, non-assignable type.
//
recursive_dir_iterator (const recursive_dir_iterator&) = delete;
recursive_dir_iterator& operator= (const recursive_dir_iterator&) = delete;
+ recursive_dir_iterator (recursive_dir_iterator&&) = default;
// Return false if no more entries left. Otherwise save the next entry path
// and return true. The path is relative against the directory being
@@ -1235,165 +1420,207 @@ namespace butl
small_vector<pair<dir_iterator, dir_path>, 1> iters_;
};
- // Search for paths matching the pattern and call the specified function for
- // each matching path. Return false if the underlying func() call returns
- // false. Otherwise the function conforms to the path_search() description.
+ // Provide an access to the real filesystem.
//
- static const string any_dir ("*/");
-
- static bool
- search (
- path pattern,
- dir_path pattern_dir,
- const dir_path start_dir,
- bool follow_symlinks,
- const function<bool (path&&, const string& pattern, bool interm)>& func)
+ class real_filesystem: public filesystem_base
{
- // Fast-forward the leftmost pattern non-wildcard components. So, for
- // example, search for foo/f* in /bar/ becomes search for f* in /bar/foo/.
- //
- {
- auto b (pattern.begin ());
- auto e (pattern.end ());
- auto i (b);
- for (; i != e && (*i).find_first_of ("*?") == string::npos; ++i) ;
+ public:
+ using iterator_type = recursive_dir_iterator;
- // If the pattern has no wildcards then we reduce to checking for the
- // filesystem entry existence. It matches if exists and is of the proper
- // type.
- //
- if (i == e)
- {
- path p (pattern_dir / pattern);
- auto pe (path_entry (start_dir / p, true));
+ real_filesystem (const dir_path& start): filesystem_base (start) {}
- if (pe.first &&
- ((pe.second.type == entry_type::directory) == p.to_directory ()))
- return func (move (p), string (), false);
+ pair<bool, entry_stat>
+ path_entry (const path& p, bool follow_symlinks) const
+ {
+ return butl::path_entry (start_ / p, follow_symlinks);
+ }
- return true;
- }
- else if (i != b) // There are non-wildcard components, so fast-forward.
- {
- path p (b, i);
- pattern = pattern.leaf (p);
- pattern_dir /= path_cast<dir_path> (move (p));
- }
+ iterator_type
+ iterator (const dir_path& p,
+ bool recursive,
+ bool self,
+ bool follow_symlinks,
+ preopen po) const
+ {
+ return iterator_type (start_ / p, recursive, self, follow_symlinks, po);
}
+ };
- assert (!pattern.empty ());
+ void
+ path_search (
+ const path& pattern,
+ const function<bool (path&&, const string& pattern, bool interm)>& func,
+ const dir_path& start,
+ bool follow_symlinks)
+ {
+ real_filesystem fs (pattern.relative () ? start : empty_dir);
+ search (pattern, dir_path (), follow_symlinks, func, fs);
+ }
- // The pattern leftmost component. Will use it to match the start directory
- // sub-entries.
- //
- path pc (pattern.begin (), ++pattern.begin ());
- string pcr (pc.representation ());
+ // Search path in the directory tree represented by a path.
+ //
+ // Iterate over path prefixes, as recursive_dir_iterator (see above) would
+ // iterate over the real directory tree.
+ //
+ class path_iterator
+ {
+ public:
+ path_iterator (path p, bool recursive, bool self, preopen po)
+ : path_ (move (p)),
+ recursive_ (recursive),
+ self_ (self),
+ preopen_ (move (po)),
+ iter_ (path_.begin ())
+ {
+ open (dir_path (), self_);
+ }
- // Note that if the pattern has multiple components (is not a simple path),
- // then the leftmost one has a trailing separator, and so will match
- // sub-directories only.
+ // Move constructible-only, non-assignable type.
//
- bool simple (pattern.simple ());
+ path_iterator (const path_iterator&) = delete;
+ path_iterator& operator= (const path_iterator&) = delete;
+ path_iterator (path_iterator&&) = default;
- // Note that we rely on "small function object" optimization here.
+ // Return false if no more entries left. Otherwise save the next entry path
+ // and return true.
//
- recursive_dir_iterator i (
- start_dir / pattern_dir,
- pcr.find ("**") != string::npos, // Recursive.
- pcr.find ("***") != string::npos, // Self-inclusive.
- follow_symlinks,
- [&pattern_dir, &func] (const dir_path& p) -> bool // Preopen.
+ bool
+ next (path& p)
+ {
+ if (iter_ == path_.begin ())
{
- return func (pattern_dir / p, any_dir, true);
- });
+ if (!self_)
+ return false;
- // Canonicalize the pattern component collapsing consecutive stars (used to
- // express that it is recursive) into a single one.
- //
- size_t j (0);
- size_t n (pcr.size ());
- for (size_t i (0); i < n; ++i)
- {
- char c (pcr[i]);
- if (!(c == '*' && i > 0 && pcr[i - 1] == '*'))
- pcr[j++] = c;
- }
+ p = path ();
+ self_ = false; // To bail out the next time.
+ return true;
+ }
- if (j != n)
- pcr.resize (j);
+ path pe (path_.begin (), iter_);
+ if (recursive_ && pe.to_directory ())
+ {
+ open (path_cast<dir_path> (pe), true);
+ return next (p);
+ }
- // Note that the callback function can be called for the same directory
- // twice: first time as intermediate match from iterator's preopen() call,
- // and then, if the first call succeed, from the iterating loop (possibly
- // as the final match).
- //
- path p;
- while (i.next (p))
+ --iter_; // Return one level up.
+
+ p = move (pe);
+ return true;
+ }
+
+ private:
+ void
+ open (const dir_path& p, bool preopen)
{
- // Skip sub-entry if its name doesn't match the pattern leftmost
- // component.
+ // If preopen_() returns false, then the directory will not be
+ // traversed (as we reset the recursive flag) but still be returned by
+ // the next() call as a sub-entry.
//
- // Matching the directory we are iterating through (as for a pattern
- // component containing ***) is a bit tricky. This directory is
- // represented by the iterator as an empty path, and so we need to
- // compute it (the leaf would actually be enough) for matching. This
- // leaf can be aquired from the pattern_dir / start_dir path except the
- // case when both directories are empty. This is the case when we search
- // in the current directory (start_dir is empty) with a pattern that
- // starts with *** wildcard (for example f***/bar). All we can do here is
- // to fallback to path::current_directory() call. Note that this will be
- // the only call per path_search() as the next time pattern_dir will not
- // be empty.
+ if (preopen && !preopen_ (p))
+ recursive_ = false;
+ else if (iter_ != path_.end ())
+ ++iter_;
+
+ // If the rightmost component is reached, then all the directories were
+ // traversed, so we reset the recursive flag.
//
- const path& se (!p.empty ()
- ? p
- : path_cast<path> (!pattern_dir.empty ()
- ? pattern_dir
- : !start_dir.empty ()
- ? start_dir
- : path::current_directory ()));
+ if (iter_ == path_.end ())
+ recursive_ = false;
+ }
- if (!path_match (pcr, se.leaf ().representation ()))
- continue;
+ private:
+ path path_;
+ bool recursive_;
+ bool self_;
+ preopen preopen_;
+ path::iterator iter_;
+ };
- // If the callback function returns false, then we stop the entire search
- // for the final match, or do not search below the path for the
- // intermediate one.
- //
- if (!func (pattern_dir / p, pcr, !simple))
- {
- if (simple) // Final match.
- return false;
- else
- continue;
- }
+ // Provide an access to a directory tree, that is represented by the path.
+ //
+ // Note that symlinks are meaningless for this filesystem.
+ //
+ class path_filesystem: public filesystem_base
+ {
+ public:
+ using iterator_type = path_iterator;
- // If the pattern is not a simple one, and it's leftmost component
- // matches the sub-entry, then the sub-entry is a directory (see the note
- // above), and we search in it using the trailing part of the pattern.
+ path_filesystem (const dir_path& start, const path& p)
+ : filesystem_base (start),
+ path_ (p) {}
+
+ pair<bool, entry_stat>
+ path_entry (const path& p, bool /*follow_symlinks*/) const
+ {
+ // Note that paths are not required to be normalized, so we just check
+ // that one path is a literal prefix of the other one.
//
- if (!simple && !search (pattern.leaf (pc),
- pattern_dir / path_cast<dir_path> (move (p)),
- start_dir,
- follow_symlinks,
- func))
- return false;
+ if (!path_.sub (p))
+ return make_pair (false, entry_stat {entry_type::unknown, 0});
+
+ entry_type t (p == path_ && !p.to_directory ()
+ ? entry_type::regular
+ : entry_type::directory);
+
+ return make_pair (true, entry_stat {t, 0});
}
- return true;
- }
+ iterator_type
+ iterator (const dir_path& p,
+ bool recursive,
+ bool self,
+ bool /*follow_symlinks*/,
+ preopen po) const
+ {
+ assert (path_.sub (p));
+ return iterator_type (path_.leaf (p), recursive, self, po);
+ }
+
+ private:
+ const path& path_;
+ };
void
path_search (
const path& pattern,
+ const path& entry,
const function<bool (path&&, const string& pattern, bool interm)>& func,
- const dir_path& start,
- bool follow_symlinks)
+ const dir_path& start)
+ {
+ path_filesystem fs (pattern.relative () ? start : empty_dir, entry);
+ search (pattern, dir_path (), true, func, fs);
+ }
+
+ bool
+ path_match (const path& pattern, const path& entry, const dir_path& start)
{
- search (pattern,
- dir_path (),
- pattern.relative () ? start : dir_path (),
- follow_symlinks,
- func);
+ bool r (false);
+
+ auto match = [&entry, &r] (path&& p, const std::string&, bool interim)
+ {
+ if (p == entry)
+ {
+ // If we found the entry (possibly through one of the recursive
+ // components) no need to search further.
+ //
+ if (!interim)
+ {
+ r = true;
+ return false;
+ }
+ else
+ // For self-matching the callback is first called in the interim
+ // mode (through the preopen function) with an empty path.
+ //
+ assert (p.empty ());
+ }
+
+ return true;
+ };
+
+ path_search (pattern, entry, match, start);
+ return r;
}
}
diff --git a/libbutl/filesystem.hxx b/libbutl/filesystem.hxx
index 6ea7d2c..b4f8d96 100644
--- a/libbutl/filesystem.hxx
+++ b/libbutl/filesystem.hxx
@@ -489,6 +489,10 @@ namespace butl
// Wildcard pattern match and search (aka glob).
//
+ // Currently the following wildcard characters are supported:
+ //
+ // * - match any number of characters (including zero)
+ // ? - match any single character
// Return true if name matches pattern. Both must be single path components,
// possibly with a trailing directory separator to indicate a directory.
@@ -498,14 +502,19 @@ namespace butl
// different). Otherwise, it only matches a non-directory name (no trailing
// directory separator).
//
- // Currently the following wildcard characters are supported:
- //
- // * - match any number of characters (including zero)
- // ? - match any single character
- //
LIBBUTL_EXPORT bool
path_match (const std::string& pattern, const std::string& name);
+ // Return true if path entry matches pattern. Note that the match is
+ // performed literally, with no paths normalization being performed. The
+ // start directory is used if the first pattern component is a self-matching
+ // wildcard (see below for the start directory and wildcard semantics).
+ //
+ LIBBUTL_EXPORT bool
+ path_match (const path& pattern,
+ const path& entry,
+ const dir_path& start = dir_path ());
+
// Search for paths matching the pattern calling the specified function for
// each matching path (see below for details).
//
@@ -521,7 +530,10 @@ namespace butl
// path_search() also recognizes the ** and *** wildcard sequences. If a
// path component contains **, then it is matched just like * but in all the
// subdirectories, recursively. The *** wildcard behaves like ** but also
- // matches the start directory itself.
+ // matches the start directory itself. Note that if the first pattern
+ // component contains ***, then the start directory must be empty or be
+ // terminated with a "meaningful" component (e.g., probably not '.' or
+ // '..').
//
// So, for example, foo/bar-**.txt will return all the files matching the
// bar-*.txt pattern in all the subdirectoris of foo/. And foo/f***/ will
@@ -567,6 +579,12 @@ namespace butl
// (a/b/, b*/, true)
// (a/b/c/, c*/, false)
//
+ // Symlinks are not followed if the follow_symlinks argument is false. This
+ // rule is only applied for symlinks that are matched against the rightmost
+ // component of the pattern. In particular, this mean that such symlinks will
+ // never match a directory pattern, and some results can be missing for the
+ // recursive rightmost component.
+ //
// Note that recursive iterating through directories currently goes
// depth-first which make sense for the cleanup use cases. In future we may
// want to make it controllable.
@@ -578,6 +596,18 @@ namespace butl
bool interm)>&,
const dir_path& start = dir_path (),
bool follow_symlinks = true);
+
+ // Same as above, but behaves as if the directory tree being searched
+ // through contains only the specified entry. The start directory is used if
+ // the first pattern component is a self-matching wildcard (see above).
+ //
+ LIBBUTL_EXPORT void
+ path_search (const path& pattern,
+ const path& entry,
+ const std::function<bool (path&&,
+ const std::string& pattern,
+ bool interm)>&,
+ const dir_path& start = dir_path ());
}
#include <libbutl/filesystem.ixx>
diff --git a/tests/wildcard/driver.cxx b/tests/wildcard/driver.cxx
index 2397fc8..1e600f6 100644
--- a/tests/wildcard/driver.cxx
+++ b/tests/wildcard/driver.cxx
@@ -2,6 +2,7 @@
// copyright : Copyright (c) 2014-2017 Code Synthesis Ltd
// license : MIT; see accompanying LICENSE file
+#include <map>
#include <string>
#include <vector>
#include <cassert>
@@ -90,8 +91,11 @@ try
assert (i == argc); // All args parsed,
vector<path> paths;
- auto add =
- [&paths, &start] (path&& p, const std::string& pt, bool interim) -> bool
+ map<path, size_t> path_count;
+
+ auto add = [&paths, &path_count, &start] (path&& p,
+ const string& pt,
+ bool interim)
{
bool pd (!pt.empty () && pt[0] == '.'); // Dot-started pattern.
@@ -114,13 +118,58 @@ try
return !skip;
if (!skip)
- paths.emplace_back (move (p.canonicalize ()));
+ {
+ p.canonicalize ();
+
+ auto i (path_count.find (p));
+ if (i == path_count.end ())
+ path_count[p] = 1;
+ else
+ ++(i->second);
+
+ paths.emplace_back (move (p));
+ }
return true;
};
path_search (pattern, add, start);
+ // Test search in the directory tree represented by the path.
+ //
+ for (const auto& p: path_count)
+ {
+ // Will match multiple times if the pattern contains several recursive
+ // components.
+ //
+ size_t match_count (0);
+
+ auto check = [&p, &match_count] (path&& pe, const string&, bool interim)
+ {
+ if (pe == p.first)
+ {
+ if (!interim)
+ ++match_count;
+ else
+ // For self-matching the callback is first called in the interim
+ // mode (through the preopen function) with an empty path.
+ //
+ assert (pe.empty ());
+ }
+
+ return true;
+ };
+
+ path_search (pattern, p.first, check, start);
+ assert (match_count == p.second);
+
+ // Test path match.
+ //
+ assert (path_match (pattern, p.first, start));
+ }
+
+ // Print the found paths.
+ //
if (sort)
std::sort (paths.begin (), paths.end ());