From 78910e3cb0b9cc215e53142c28f8b9f52c30af60 Mon Sep 17 00:00:00 2001 From: Boris Kolpackov Date: Fri, 3 Feb 2017 14:57:03 +0200 Subject: Implement path_match() and path_search() --- butl/filesystem | 53 ++++++++ butl/filesystem.cxx | 369 ++++++++++++++++++++++++++++++++++++++++++++++++++++ butl/path | 2 +- 3 files changed, 423 insertions(+), 1 deletion(-) (limited to 'butl') diff --git a/butl/filesystem b/butl/filesystem index ef715c6..a125dab 100644 --- a/butl/filesystem +++ b/butl/filesystem @@ -24,6 +24,7 @@ #include // uint16_t #include // move(), pair #include +#include #include @@ -396,6 +397,58 @@ namespace butl // inline dir_iterator begin (dir_iterator&); inline dir_iterator end (const dir_iterator&); + + // Wildcard pattern match and search (aka glob). + // + + // Return true if name matches pattern. Both must be single path components, + // possibly with a trailing directory separator to indicate a directory. + // + // If the pattern ends with a directory separator, then it only matches a + // directory name (i.e., ends with a directory separator, but potentially + // different). Otherwise, it only matches a non-directory name (no trailing + // directory separator). + // + // Currently the following wildcard characters are supported: + // + // * - match any number of characters (including zero) + // ? - match any single character + // + LIBBUTL_EXPORT bool + path_match (const std::string& pattern, const std::string& name); + + // Search for paths matching the pattern calling the specified function for + // each matching path. Stop the search if the function returns false. + // + // If the pattern is relative, then search in the start directory. If the + // start directory is empty, then search in the current working directory. + // Searching in non-existent directories is not an error. Throw + // std::system_error in case of a failure (insufficient permissions, etc). + // + // The pattern may contain multiple components that include wildcards. On + // Windows the drive letter may not be a wildcard. + // + // In addition to the wildcard characters listed in path_match(), + // path_search() also recognizes the ** and *** wildcard sequences. If a + // path component contains **, then it is matched just like * but in all the + // subdirectories, recursively. The *** wildcard behaves like ** but also + // matches the start directory itself. + // + // So, for example, foo/bar-**.txt will return all the files matching the + // bar-*.txt pattern in all the subdirectoris of foo/. And foo/f***/ will + // return all the subdirectories matching the f*/ pattern plus foo/ itself. + // + // Note that having multiple recursive components in the pattern we can end + // up with calling func() multiple times (once per such a component) for the + // same path. For example the search with pattern f***/b**/ starting in + // directory foo, that has the foo/fox/box/ structure, will result in + // calling func(foo/fox/box/) twice: first time for being a child of fox/, + // second time for being a child of foo/. + // + LIBBUTL_EXPORT void + path_search (const path& pattern, + const std::function&, + const dir_path& start = dir_path ()); } #include diff --git a/butl/filesystem.cxx b/butl/filesystem.cxx index e7e54a8..1553324 100644 --- a/butl/filesystem.cxx +++ b/butl/filesystem.cxx @@ -25,10 +25,16 @@ #include // errno, E* +#include +#include #include // unique_ptr +#include // pair +#include // reverse_iterator #include +#include #include +#include using namespace std; @@ -728,4 +734,367 @@ namespace butl } } #endif + + // Match the name [ni, ne) to the pattern [pi, pe). Ranges can be empty. + // + static bool + match (string::const_iterator pi, string::const_iterator pe, + string::const_iterator ni, string::const_iterator ne) + { + using reverse_iterator = std::reverse_iterator; + + reverse_iterator rpi (pe); + reverse_iterator rpe (pi); + + reverse_iterator rni (ne); + reverse_iterator rne (ni); + + // Match the pattern suffix (follows the last *) to the name trailing + // characters. + // + char pc; + for (; rpi != rpe && (pc = *rpi) != '*' && rni != rne; ++rpi, ++rni) + { + if (*rni != pc && pc != '?') + return false; + } + + // If we got to the (reversed) end of the pattern (no * is encountered) + // than we are done. The success depends on if we got to the (reversed) end + // of the name as well. + // + if (rpi == rpe) + return rni == rne; + + // If we didn't reach * in the pattern then we reached the (reversed) end + // of the name. That means we have unmatched non-star characters in the + // pattern, and so match failed. + // + if (pc != '*') + { + assert (rni == rne); + return false; + } + + // Match the pattern prefix (ends with the first *) to the name leading + // characters. If they mismatch we failed. Otherwise if this is an only * + // in the pattern (matches whatever is left in the name) then we succeed, + // otherwise we perform backtracking (recursively). + // + pe = rpi.base (); + ne = rni.base (); + + // Compare the pattern and the name char by char until the name suffix or + // * is encountered in the pattern (whichever happens first). Fail if a + // char mismatches. + // + for (; (pc = *pi) != '*' && ni != ne; ++pi, ++ni) + { + if (*ni != pc && pc != '?') + return false; + } + + // If we didn't get to * in the pattern then we got to the name suffix. + // That means that the pattern has unmatched non-star characters, and so + // match failed. + // + if (pc != '*') + { + assert (ni == ne); + return false; + } + + // If * that we have reached is the last one, then it matches whatever is + // left in the name (including an empty range). + // + if (++pi == pe) + return true; + + // Perform backtracking. + // + // From now on, we will call the pattern not-yet-matched part (starting + // the leftmost * and ending the rightmost one inclusively) as pattern, and + // the name not-yet-matched part as name. + // + // Here we sequentially assume that * that starts the pattern matches the + // name leading part (staring from an empty one and iterating till the full + // name). So if, at some iteration, the pattern trailing part (that follows + // the leftmost *) matches the name trailing part, then the pattern matches + // the name. + // + bool r; + for (; !(r = match (pi, pe, ni, ne)) && ni != ne; ++ni) ; + return r; + } + + bool + path_match (const string& pattern, const string& name) + { + // Implementation notes: + // + // - This has a good potential of becoming hairy quickly so need to strive + // for an elegant way to implement this. + // + // - Most patterns will contains a single * wildcard with a prefix and/or + // suffix (e.g., *.txt, foo*, f*.txt). Something like this is not very + // common: *foo*. + // + // So it would be nice to have a clever implementation that first + // "anchors" itself with a literal prefix and/or suffix and only then + // continue with backtracking. In other words, reduce: + // + // *.txt vs foo.txt -> * vs foo + // foo* vs foo.txt -> * vs .txt + // f*.txt vs foo.txt -> * vs oo + // + + auto pi (pattern.rbegin ()); + auto pe (pattern.rend ()); + + auto ni (name.rbegin ()); + auto ne (name.rend ()); + + // The name doesn't match the pattern if it is of a different type than the + // pattern is. + // + bool pd (pi != pe && path::traits::is_separator (*pi)); + bool nd (ni != ne && path::traits::is_separator (*ni)); + + if (pd != nd) + return false; + + // Skip trailing separators if present. + // + if (pd) + { + ++pi; + ++ni; + } + + return match (pattern.begin (), pi.base (), name.begin (), ni.base ()); + } + + // Iterate over directory sub-entries, recursively and including itself if + // requested. Note that recursive iterating goes depth-first which make + // sense for the cleanup use cases (@@ maybe this should be controllable + // since for directory creation it won't make sense). + // + // Note that iterating over non-existent directory is not en error. The + // subsequent next() call returns false for such a directory. + // + class recursive_dir_iterator + { + public: + recursive_dir_iterator (dir_path p, bool recursive, bool self) + : recursive_ (recursive), self_ (self), start_ (p) + { + open (dir_path ()); + } + + // Non-copyable, non-movable type. + // + recursive_dir_iterator (const recursive_dir_iterator&) = delete; + recursive_dir_iterator& operator= (const recursive_dir_iterator&) = delete; + + // Return false if no more entries left. Otherwise save the next entry path + // and return true. The path is relative against the directory being + // traversed and contains a trailing separator for sub-directories. Throw + // std::system_error in case of a failure (insufficient permissions, + // dangling symlink encountered, etc). + // + bool + next (path& p) + { + if (iters_.empty ()) + return false; + + auto& i (iters_.back ()); + + // If we got to the end of directory sub-entries, then go one level up + // and return this directory path. + // + if (i.first == dir_iterator ()) + { + path d (move (i.second)); + iters_.pop_back (); + + // Return the path unless it is the last one (the directory we started + // to iterate from) and the self flag is not set. + // + if (iters_.empty () && !self_) + return false; + + p = move (d); + return true; + } + + const dir_entry& de (*i.first); + + // Append separator if a directory. Note that dir_entry::type() can + // throw. + // + path pe (de.type () == entry_type::directory + ? path_cast (i.second / de.path ()) + : i.second / de.path ()); + + ++i.first; + + if (recursive_ && pe.to_directory ()) + { + open (path_cast (move (pe))); + return next (p); + } + + p = move (pe); + return true; + } + + private: + void + open (dir_path p) + { + // We should consider a racing condition here. The directory can be + // removed before we create an iterator for it. In this case we just do + // nothing, so the directory is silently skipped. + // + try + { + dir_path d (start_ / p); + dir_iterator i (!d.empty () ? d : dir_path (".")); + iters_.emplace_back (move (i), move (p)); + } + catch (const system_error& e) + { + // Ignore non-existent directory (ENOENT or ENOTDIR). Rethrow for any + // other error. We consider ENOTDIR as a variety of removal, with a + // new filesystem entry being created afterwards. + // + int ec (e.code ().value ()); + if (ec != ENOENT && ec != ENOTDIR) + throw; + } + } + + private: + bool recursive_; + bool self_; + dir_path start_; + small_vector, 1> iters_; + }; + + // Search for paths matching the pattern and call the specified function for + // each matching path. Return false if the underlying func() call returns + // false. Otherwise the function conforms to the path_search() description. + // + static bool + search (path pattern, dir_path pattern_dir, const dir_path start_dir, + const function& func) + { + // Fast-forward the leftmost pattern non-wildcard components. So, for + // example, search for foo/f* in /bar/ becomes search for f* in /bar/foo/. + // + { + auto b (pattern.begin ()); + auto e (pattern.end ()); + auto i (b); + for (; i != e && (*i).find_first_of ("*?") == string::npos; ++i) ; + + // If the pattern has no wildcards then we reduce to checking for the + // filesystem entry existence. It matches if exists and is of the proper + // type. + // + if (i == e) + { + path p (pattern_dir / pattern); + auto pe (path_entry (start_dir / p, true)); + + if (pe.first && + ((pe.second == entry_type::directory) == p.to_directory ())) + return func (move (p)); + + return true; + } + else if (i != b) // There are non-wildcard components, so fast-forward. + { + path p (b, i); + pattern = pattern.leaf (p); + pattern_dir /= path_cast (move (p)); + } + } + + assert (!pattern.empty ()); + + // The pattern leftmost component. Will use it to match the start directory + // sub-entries. + // + path pc (pattern.begin (), ++pattern.begin ()); + string pcr (pc.representation ()); + + // Note that if the pattern has multiple components (is not a simple path), + // then the leftmost one has a trailing separator, and so will match + // sub-directories only. + // + bool simple (pattern.simple ()); + + recursive_dir_iterator i ( + start_dir / pattern_dir, + pcr.find ("**") != string::npos, // Recursive. + pcr.find ("***") != string::npos); // Self-inclusive. + + path p; + while (i.next (p)) + { + // Skip sub-entry if its name doesn't match the pattern leftmost + // component. + // + // Matching the directory we are iterating through (as for a pattern + // component containing ***) is a bit tricky. This directory is + // represented by the iterator as an empty path, and so we need to + // compute it (the leaf would actually be enough) for matching. This + // leaf can be aquired from the pattern_dir / start_dir path except the + // case when both directories are empty. This is the case when we search + // in the current directory (start_dir is empty) with a pattern that + // starts with *** wildcard (for example f***/bar). All we can do here is + // to fallback to path::current_directory() call. Note that this will be + // the only call per path_search() as the next time pattern_dir will not + // be empty. + // + const path& se (!p.empty () + ? p + : path_cast (!pattern_dir.empty () + ? pattern_dir + : !start_dir.empty () + ? start_dir + : path::current_directory ())); + + if (!path_match (pcr, se.leaf ().representation ())) + continue; + + // If the pattern is a simple path then call func() for the sub-entry. + // Otherwise the sub-entry is a directory (read above), and we search in + // it using the trailing part of the pattern. + // + if (!( + simple + ? func (pattern_dir / p) + : search (pattern.leaf (pc), + pattern_dir / path_cast (move (p)), + start_dir, + func))) + return false; + } + + return true; + } + + void + path_search (const path& pattern, + const function& func, + const dir_path& start) + { + search (pattern, + dir_path (), + pattern.relative () ? start : dir_path (), + func); + } } diff --git a/butl/path b/butl/path index a81848b..19e3d42 100644 --- a/butl/path +++ b/butl/path @@ -27,7 +27,7 @@ namespace butl // p -= "*/"; // leaf // p -= ".*"; // base // - // - Faster normalize() implementation. In many cases (e.g., in buil2) + // - Faster normalize() implementation. In many cases (e.g., in build2) // the path is either already normal or the difference is just slashes // (i.e., there are no '.' or '..' components). So a fast path case // might be in order. -- cgit v1.1