From e6cee3c2f9b03852ed4837f9be05e0a2fa4542a8 Mon Sep 17 00:00:00 2001 From: Karen Arutyunov Date: Mon, 30 Sep 2019 13:48:28 +0300 Subject: Move path match to path-pattern.?xx --- libbutl/path-pattern.cxx | 451 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 451 insertions(+) create mode 100644 libbutl/path-pattern.cxx (limited to 'libbutl/path-pattern.cxx') diff --git a/libbutl/path-pattern.cxx b/libbutl/path-pattern.cxx new file mode 100644 index 0000000..86be1ec --- /dev/null +++ b/libbutl/path-pattern.cxx @@ -0,0 +1,451 @@ +// file : libbutl/path-pattern.cxx -*- C++ -*- +// copyright : Copyright (c) 2014-2019 Code Synthesis Ltd +// license : MIT; see accompanying LICENSE file + +#ifndef __cpp_modules_ts +#include +#endif + +#include + +#ifndef __cpp_lib_modules_ts +#include +#include +#include +#include // reverse_iterator + +#include // find() +#endif + +// Other includes. + +#ifdef __cpp_modules_ts +module butl.path_pattern; + +// Only imports additional to interface. +#ifdef __clang__ +#ifdef __cpp_lib_modules_ts +import std.core; +#endif +import butl.path; +import butl.optional; +#endif + +import butl.utility; // lcase()[_WIN32] +import butl.filesystem; // path_search() +#else +#include +#include +#endif + +using namespace std; + +namespace butl +{ + // patterns + // + static inline bool + match (char c, char pc) + { +#ifndef _WIN32 + return c == pc; +#else + return lcase (c) == lcase (pc); +#endif + } + + bool + match_bracket (char c, const path_pattern_term& pt) + { + using iterator = string::const_iterator; + + assert (pt.bracket ()); + + iterator i (pt.begin + 1); // Position after '['. + iterator e (pt.end - 1); // Position at ']'. + + bool invert (*i == '!'); + if (invert) + ++i; + + bool r (false); + for (iterator b (i); i != e && !r; ++i) + { + char bc (*i); + + // If '-' is a first or last character in the bracket expression then + // match it literally and match the range otherwise. + // + if (bc == '-' && i != b && i + 1 != e) // Match the range? + { + // Note that we have already matched the range left endpoint character + // unsuccessfully (otherwise we wouldn't be here), so now we test if + // the character belongs to the (min-char, max-char] range. + // + // Also note that on Windows we match case insensitively and so can't + // just compare the character with the range endpoints. Thus, we + // fallback to matching each range character individually. + // +#ifndef _WIN32 + r = c > *(i - 1) && c <= *(i + 1); +#else + for (char bc (*(i - 1) + 1), mx (*(i + 1)); bc <= mx && !r; ++bc) + r = match (c, bc); +#endif + + ++i; // Position to the range max character. + } + else // Match against the expression character literally. + r = match (c, bc); + } + + return r != invert; + } + + // Match the name [ni, ne) to the pattern [pi, pe) that may not contain + // bracket expressions. Ranges can be empty. + // + static bool + match_no_brackets (string::const_iterator pi, string::const_iterator pe, + string::const_iterator ni, string::const_iterator ne) + { + using reverse_iterator = std::reverse_iterator; + + reverse_iterator rpi (pe); + reverse_iterator rpe (pi); + + reverse_iterator rni (ne); + reverse_iterator rne (ni); + + // Match the pattern suffix (follows the last *) to the name trailing + // characters. + // + char pc ('\0'); + for (; rpi != rpe && (pc = *rpi) != '*' && rni != rne; ++rpi, ++rni) + { + if (!match (*rni, pc) && pc != '?') + return false; + } + + // If we got to the (reversed) end of the pattern (no * is encountered) + // than we are done. The success depends on if we got to the (reversed) end + // of the name as well. + // + if (rpi == rpe) + return rni == rne; + + // If we didn't reach * in the pattern then we reached the (reversed) end + // of the name. That means we have unmatched non-star terms in the + // pattern, and so match failed. + // + if (pc != '*') + { + assert (rni == rne); + return false; + } + + // Match the pattern prefix (ends with the first *) to the name leading + // characters. If they mismatch we failed. Otherwise if this is an only * + // in the pattern (matches whatever is left in the name) then we succeed, + // otherwise we perform backtracking (recursively). + // + pe = rpi.base (); + ne = rni.base (); + + // Compare the pattern and the name term by char until the name suffix or + // * is encountered in the pattern (whichever happens first). Fail if a + // char mismatches. + // + for (; (pc = *pi) != '*' && ni != ne; ++pi, ++ni) + { + if (!match (*ni, pc) && pc != '?') + return false; + } + + // If we didn't get to * in the pattern then we got to the name suffix. + // That means that the pattern has unmatched non-star terms, and so match + // failed. + // + if (pc != '*') + { + assert (ni == ne); + return false; + } + + // If * that we have reached is the last one, then it matches whatever is + // left in the name (including an empty range). + // + if (++pi == pe) + return true; + + // Perform backtracking. + // + // From now on, we will call the pattern not-yet-matched part (starting + // the leftmost * and ending the rightmost one inclusively) as pattern, and + // the name not-yet-matched part as name. + // + // Here we sequentially assume that * that starts the pattern matches the + // name leading part (staring from an empty one and iterating till the full + // name). So if, at some iteration, the pattern trailing part (that follows + // the leftmost *) matches the name trailing part, then the pattern matches + // the name. + // + bool r; + for (; !(r = match_no_brackets (pi, pe, ni, ne)) && ni != ne; ++ni) ; + return r; + } + + // Match a character against the pattern term. + // + static inline bool + match (char c, const path_pattern_term& pt) + { + switch (pt.type) + { + // Matches any character. + // + case path_pattern_term_type::star: + case path_pattern_term_type::question: return true; + + case path_pattern_term_type::bracket: + { + return match_bracket (c, pt); + } + + case path_pattern_term_type::literal: + { + return match (c, get_literal (pt)); + } + } + + assert (false); // Can't be here. + return false; + } + + // Match the name [ni, ne) to the pattern [pi, pe). Ranges can be empty. + // + static bool + match (string::const_iterator pi, string::const_iterator pe, + string::const_iterator ni, string::const_iterator ne) + { + // If the pattern doesn't contain the bracket expressions then reduce to + // the "clever" approach (see the implementation notes below for details). + // + if (find (pi, pe, '[') == pe) + return match_no_brackets (pi, pe, ni, ne); + + // Match the pattern prefix (precedes the first *) to the name leading + // characters. + // + path_pattern_iterator ppi (pi, pe); + path_pattern_iterator ppe; + path_pattern_term pt; + + for (; ppi != ppe && !(pt = *ppi).star () && ni != ne; ++ppi, ++ni) + { + if (!match (*ni, pt)) + return false; + } + + // If we got to the end of the pattern (no * is encountered) than we are + // done. The success depends on if we got to the end of the name as well. + // + if (ppi == ppe) + return ni == ne; + + // If we didn't reach * in the pattern then we reached the end of the + // name. That means we have unmatched non-star terms in the pattern, and + // so match failed. + // + if (!pt.star ()) + { + assert (ni == ne); + return false; + } + + // If * that we have reached is the last term, then it matches whatever is + // left in the name (including an empty range). + // + if (++ppi == ppe) + return true; + + // Switch back to the string iterator and perform backtracking. + // + // From now on, we will call the pattern not-yet-matched part (starting + // the leftmost *) as pattern, and the name not-yet-matched part as name. + // + // Here we sequentially assume that * that starts the pattern matches the + // name leading part (see match_no_brackets() for details). + // + bool r; + for (pi = ppi->begin; !(r = match (pi, pe, ni, ne)) && ni != ne; ++ni) ; + return r; + } + + bool + path_match (const string& name, const string& pattern) + { + // Implementation notes: + // + // - This has a good potential of becoming hairy quickly so need to strive + // for an elegant way to implement this. + // + // - Most patterns will contains a single * wildcard with a prefix and/or + // suffix (e.g., *.txt, foo*, f*.txt). Something like this is not very + // common: *foo*. + // + // So it would be nice to have a clever implementation that first + // "anchors" itself with a literal prefix and/or suffix and only then + // continue with backtracking. In other words, reduce: + // + // *.txt vs foo.txt -> * vs foo + // foo* vs foo.txt -> * vs .txt + // f*.txt vs foo.txt -> * vs oo + // + // Note that this approach fails if the pattern may contain bracket + // expressions. You can't easily recognize a suffix scanning backwards + // since * semantics depends on the characters to the left: + // + // f[o*]o - * is not a wildcard + // fo*]o - * is a wildcard + // + // That's why we will start with the straightforward left-to-right + // matching and reduce to the "clever" approach when the remaining part + // of the pattern doesn't contain bracket expressions. + + auto pi (pattern.rbegin ()); + auto pe (pattern.rend ()); + + auto ni (name.rbegin ()); + auto ne (name.rend ()); + + // The name doesn't match the pattern if it is of a different type than the + // pattern is. + // + bool pd (pi != pe && path::traits_type::is_separator (*pi)); + bool nd (ni != ne && path::traits_type::is_separator (*ni)); + + if (pd != nd) + return false; + + // Skip trailing separators if present. + // + if (pd) + { + ++pi; + ++ni; + } + + return match (pattern.begin (), pi.base (), name.begin (), ni.base ()); + } + + bool + path_match (const path& entry, + const path& pattern, + const dir_path& start, + path_match_flags flags) + { + bool r (false); + + auto match = [&entry, &r] (path&& p, const string&, bool interim) + { + // If we found the entry (possibly through one of the recursive + // components) no need to search further. + // + if (p == entry && !interim) + { + r = true; + return false; + } + + return true; + }; + + path_search (pattern, entry, match, start, flags); + return r; + } + + // path_pattern_iterator + // + void path_pattern_iterator:: + next () + { + if (i_ == e_) + { + t_ = nullopt; // Convert the object into the end iterator. + return; + } + + auto next = [this] (path_pattern_term_type t) + { + assert (t != path_pattern_term_type::bracket); + + t_ = path_pattern_term {t, i_, i_ + 1}; + ++i_; + }; + + switch (*i_) + { + case '?': + { + next (path_pattern_term_type::question); + break; + } + case '*': + { + next (path_pattern_term_type::star); + break; + } + case '[': + { + // Try to find the bracket expression end. + // + // Note that '[' doesn't necessarily starts the bracket expression (no + // closing bracket, empty, etc). If that's the case, then we end up + // with the '[' literal terminal. + // + bool expr (false); + for (;;) // Breakout loop. + { + string::const_iterator i (i_ + 1); // Position after '['. + + if (i == e_) // Is '[' the pattern last character? + break; + + bool invert (*i == '!'); + if (invert && ++i == e_) // Is '!' the pattern last character? + break; + + // Find the bracket expression end. + // + // Note that the bracket expression may not be empty and ']' is a + // literal if it is the first expression character. + // + for (++i; i != e_ && *i != ']'; ++i) ; + + if (i == e_) // The closing bracket is not found? + break; + + expr = true; + + ++i; // Position after ']'. + + t_ = path_pattern_term {path_pattern_term_type::bracket, i_, i}; + + i_ = i; + break; + } + + // Fallback to '[' literal if it is not a bracket expression. + // + if (expr) + break; + } + // Fall through. + default: + { + next (path_pattern_term_type::literal); + } + } + } +} -- cgit v1.1