diff options
Diffstat (limited to 'libbutl')
-rw-r--r-- | libbutl/b.cxx | 186 | ||||
-rw-r--r-- | libbutl/b.hxx | 150 | ||||
-rw-r--r-- | libbutl/b.ixx | 31 | ||||
-rw-r--r-- | libbutl/b.mxx | 115 | ||||
-rw-r--r-- | libbutl/backtrace.cxx | 30 | ||||
-rw-r--r-- | libbutl/backtrace.hxx (renamed from libbutl/backtrace.mxx) | 19 | ||||
-rw-r--r-- | libbutl/base64.cxx | 102 | ||||
-rw-r--r-- | libbutl/base64.hxx (renamed from libbutl/base64.mxx) | 39 | ||||
-rw-r--r-- | libbutl/bufstreambuf.cxx | 13 | ||||
-rw-r--r-- | libbutl/bufstreambuf.hxx | 67 | ||||
-rw-r--r-- | libbutl/buildfile | 65 | ||||
-rw-r--r-- | libbutl/builtin-options.cxx | 455 | ||||
-rw-r--r-- | libbutl/builtin-options.hxx | 118 | ||||
-rw-r--r-- | libbutl/builtin-options.ixx | 35 | ||||
-rw-r--r-- | libbutl/builtin.cli | 5 | ||||
-rw-r--r-- | libbutl/builtin.cxx | 460 | ||||
-rw-r--r-- | libbutl/builtin.hxx (renamed from libbutl/builtin.mxx) | 61 | ||||
-rw-r--r-- | libbutl/builtin.ixx | 16 | ||||
-rw-r--r-- | libbutl/char-scanner.hxx (renamed from libbutl/char-scanner.mxx) | 45 | ||||
-rw-r--r-- | libbutl/char-scanner.ixx | 6 | ||||
-rw-r--r-- | libbutl/char-scanner.txx | 7 | ||||
-rw-r--r-- | libbutl/command.cxx | 50 | ||||
-rw-r--r-- | libbutl/command.hxx (renamed from libbutl/command.mxx) | 23 | ||||
-rw-r--r-- | libbutl/const-ptr.hxx (renamed from libbutl/const-ptr.mxx) | 21 | ||||
-rw-r--r-- | libbutl/curl.cxx | 173 | ||||
-rw-r--r-- | libbutl/curl.hxx (renamed from libbutl/curl.mxx) | 121 | ||||
-rw-r--r-- | libbutl/curl.ixx | 79 | ||||
-rw-r--r-- | libbutl/curl.txx | 10 | ||||
-rw-r--r-- | libbutl/default-options.cxx | 73 | ||||
-rw-r--r-- | libbutl/default-options.hxx (renamed from libbutl/default-options.mxx) | 70 | ||||
-rw-r--r-- | libbutl/default-options.ixx | 2 | ||||
-rw-r--r-- | libbutl/default-options.txx | 90 | ||||
-rw-r--r-- | libbutl/diagnostics.cxx | 89 | ||||
-rw-r--r-- | libbutl/diagnostics.hxx (renamed from libbutl/diagnostics.mxx) | 138 | ||||
-rw-r--r-- | libbutl/export.hxx | 8 | ||||
-rw-r--r-- | libbutl/fdstream.cxx | 356 | ||||
-rw-r--r-- | libbutl/fdstream.hxx (renamed from libbutl/fdstream.mxx) | 172 | ||||
-rw-r--r-- | libbutl/fdstream.ixx | 12 | ||||
-rw-r--r-- | libbutl/filesystem.cxx | 938 | ||||
-rw-r--r-- | libbutl/filesystem.hxx (renamed from libbutl/filesystem.mxx) | 213 | ||||
-rw-r--r-- | libbutl/filesystem.ixx | 62 | ||||
-rw-r--r-- | libbutl/ft/lang.hxx | 9 | ||||
-rw-r--r-- | libbutl/git.cxx | 44 | ||||
-rw-r--r-- | libbutl/git.hxx (renamed from libbutl/git.mxx) | 28 | ||||
-rw-r--r-- | libbutl/host-os-release.cxx | 323 | ||||
-rw-r--r-- | libbutl/host-os-release.hxx | 86 | ||||
-rw-r--r-- | libbutl/json/event.hxx | 27 | ||||
-rw-r--r-- | libbutl/json/parser.cxx | 645 | ||||
-rw-r--r-- | libbutl/json/parser.hxx | 705 | ||||
-rw-r--r-- | libbutl/json/parser.ixx | 552 | ||||
-rw-r--r-- | libbutl/json/pdjson.c | 1044 | ||||
-rw-r--r-- | libbutl/json/pdjson.h | 147 | ||||
-rw-r--r-- | libbutl/json/serializer.cxx | 671 | ||||
-rw-r--r-- | libbutl/json/serializer.hxx | 413 | ||||
-rw-r--r-- | libbutl/json/serializer.ixx | 247 | ||||
-rw-r--r-- | libbutl/lz4-stream.cxx | 281 | ||||
-rw-r--r-- | libbutl/lz4-stream.hxx | 280 | ||||
-rw-r--r-- | libbutl/lz4.c | 2495 | ||||
-rw-r--r-- | libbutl/lz4.cxx | 555 | ||||
-rw-r--r-- | libbutl/lz4.h | 774 | ||||
-rw-r--r-- | libbutl/lz4.hxx | 205 | ||||
-rw-r--r-- | libbutl/lz4frame.c | 1899 | ||||
-rw-r--r-- | libbutl/lz4frame.h | 623 | ||||
-rw-r--r-- | libbutl/lz4hc.c | 1615 | ||||
-rw-r--r-- | libbutl/lz4hc.h | 413 | ||||
-rw-r--r-- | libbutl/manifest-parser.cxx | 238 | ||||
-rw-r--r-- | libbutl/manifest-parser.hxx (renamed from libbutl/manifest-parser.mxx) | 38 | ||||
-rw-r--r-- | libbutl/manifest-rewriter.cxx | 46 | ||||
-rw-r--r-- | libbutl/manifest-rewriter.hxx (renamed from libbutl/manifest-rewriter.mxx) | 28 | ||||
-rw-r--r-- | libbutl/manifest-serializer.cxx | 153 | ||||
-rw-r--r-- | libbutl/manifest-serializer.hxx (renamed from libbutl/manifest-serializer.mxx) | 43 | ||||
-rw-r--r-- | libbutl/manifest-types.hxx (renamed from libbutl/manifest-types.mxx) | 22 | ||||
-rw-r--r-- | libbutl/mingw-condition_variable.hxx | 275 | ||||
-rw-r--r-- | libbutl/mingw-invoke.hxx | 109 | ||||
-rw-r--r-- | libbutl/mingw-mutex.hxx | 210 | ||||
-rw-r--r-- | libbutl/mingw-shared_mutex.hxx | 124 | ||||
-rw-r--r-- | libbutl/mingw-thread.hxx | 330 | ||||
-rw-r--r-- | libbutl/move-only-function.hxx | 177 | ||||
-rw-r--r-- | libbutl/multi-index.hxx (renamed from libbutl/multi-index.mxx) | 21 | ||||
-rw-r--r-- | libbutl/openssl.cxx | 27 | ||||
-rw-r--r-- | libbutl/openssl.hxx (renamed from libbutl/openssl.mxx) | 69 | ||||
-rw-r--r-- | libbutl/openssl.ixx | 14 | ||||
-rw-r--r-- | libbutl/openssl.txx | 68 | ||||
-rw-r--r-- | libbutl/optional.hxx (renamed from libbutl/optional.mxx) | 185 | ||||
-rw-r--r-- | libbutl/optional.ixx | 5 | ||||
-rw-r--r-- | libbutl/pager.cxx | 40 | ||||
-rw-r--r-- | libbutl/pager.hxx (renamed from libbutl/pager.mxx) | 26 | ||||
-rw-r--r-- | libbutl/path-io.hxx (renamed from libbutl/path-io.mxx) | 24 | ||||
-rw-r--r-- | libbutl/path-map.hxx (renamed from libbutl/path-map.mxx) | 35 | ||||
-rw-r--r-- | libbutl/path-pattern.cxx | 33 | ||||
-rw-r--r-- | libbutl/path-pattern.hxx (renamed from libbutl/path-pattern.mxx) | 27 | ||||
-rw-r--r-- | libbutl/path-pattern.ixx | 26 | ||||
-rw-r--r-- | libbutl/path.cxx | 58 | ||||
-rw-r--r-- | libbutl/path.hxx (renamed from libbutl/path.mxx) | 126 | ||||
-rw-r--r-- | libbutl/path.ixx | 93 | ||||
-rw-r--r-- | libbutl/path.txx | 18 | ||||
-rw-r--r-- | libbutl/prefix-map.hxx (renamed from libbutl/prefix-map.mxx) | 62 | ||||
-rw-r--r-- | libbutl/prefix-map.txx | 126 | ||||
-rw-r--r-- | libbutl/process-details.hxx | 32 | ||||
-rw-r--r-- | libbutl/process-io.cxx | 29 | ||||
-rw-r--r-- | libbutl/process-io.hxx (renamed from libbutl/process-io.mxx) | 23 | ||||
-rw-r--r-- | libbutl/process-run.cxx | 29 | ||||
-rw-r--r-- | libbutl/process-run.txx | 83 | ||||
-rw-r--r-- | libbutl/process.cxx | 372 | ||||
-rw-r--r-- | libbutl/process.hxx (renamed from libbutl/process.mxx) | 231 | ||||
-rw-r--r-- | libbutl/process.ixx | 233 | ||||
-rw-r--r-- | libbutl/project-name.cxx | 30 | ||||
-rw-r--r-- | libbutl/project-name.hxx (renamed from libbutl/project-name.mxx) | 23 | ||||
-rw-r--r-- | libbutl/prompt.cxx | 30 | ||||
-rw-r--r-- | libbutl/prompt.hxx (renamed from libbutl/prompt.mxx) | 23 | ||||
-rw-r--r-- | libbutl/regex.cxx | 31 | ||||
-rw-r--r-- | libbutl/regex.hxx (renamed from libbutl/regex.mxx) | 71 | ||||
-rw-r--r-- | libbutl/regex.ixx | 30 | ||||
-rw-r--r-- | libbutl/regex.txx | 77 | ||||
-rw-r--r-- | libbutl/semantic-version.cxx | 98 | ||||
-rw-r--r-- | libbutl/semantic-version.hxx (renamed from libbutl/semantic-version.mxx) | 93 | ||||
-rw-r--r-- | libbutl/semantic-version.ixx | 64 | ||||
-rw-r--r-- | libbutl/sendmail.cxx | 27 | ||||
-rw-r--r-- | libbutl/sendmail.hxx (renamed from libbutl/sendmail.mxx) | 31 | ||||
-rw-r--r-- | libbutl/sendmail.ixx | 5 | ||||
-rw-r--r-- | libbutl/sha1.c | 10 | ||||
-rw-r--r-- | libbutl/sha1.cxx | 34 | ||||
-rw-r--r-- | libbutl/sha1.hxx (renamed from libbutl/sha1.mxx) | 33 | ||||
-rw-r--r-- | libbutl/sha256.cxx | 44 | ||||
-rw-r--r-- | libbutl/sha256.hxx (renamed from libbutl/sha256.mxx) | 29 | ||||
-rw-r--r-- | libbutl/small-allocator.hxx (renamed from libbutl/small-allocator.mxx) | 18 | ||||
-rw-r--r-- | libbutl/small-forward-list.hxx (renamed from libbutl/small-forward-list.mxx) | 29 | ||||
-rw-r--r-- | libbutl/small-list.hxx (renamed from libbutl/small-list.mxx) | 31 | ||||
-rw-r--r-- | libbutl/small-vector-odb.hxx | 2 | ||||
-rw-r--r-- | libbutl/small-vector.hxx (renamed from libbutl/small-vector.mxx) | 55 | ||||
-rw-r--r-- | libbutl/standard-version.cxx | 32 | ||||
-rw-r--r-- | libbutl/standard-version.hxx (renamed from libbutl/standard-version.mxx) | 25 | ||||
-rw-r--r-- | libbutl/string-parser.cxx | 30 | ||||
-rw-r--r-- | libbutl/string-parser.hxx (renamed from libbutl/string-parser.mxx) | 19 | ||||
-rw-r--r-- | libbutl/string-table.hxx (renamed from libbutl/string-table.mxx) | 26 | ||||
-rw-r--r-- | libbutl/string-table.txx | 3 | ||||
-rw-r--r-- | libbutl/tab-parser.cxx | 31 | ||||
-rw-r--r-- | libbutl/tab-parser.hxx (renamed from libbutl/tab-parser.mxx) | 20 | ||||
-rw-r--r-- | libbutl/target-triplet.cxx | 53 | ||||
-rw-r--r-- | libbutl/target-triplet.hxx (renamed from libbutl/target-triplet.mxx) | 30 | ||||
-rw-r--r-- | libbutl/timestamp.cxx | 127 | ||||
-rw-r--r-- | libbutl/timestamp.hxx (renamed from libbutl/timestamp.mxx) | 32 | ||||
-rw-r--r-- | libbutl/unicode.cxx | 23 | ||||
-rw-r--r-- | libbutl/unicode.hxx (renamed from libbutl/unicode.mxx) | 20 | ||||
-rw-r--r-- | libbutl/url.hxx (renamed from libbutl/url.mxx) | 41 | ||||
-rw-r--r-- | libbutl/url.ixx | 2 | ||||
-rw-r--r-- | libbutl/url.txx | 7 | ||||
-rw-r--r-- | libbutl/utf8.hxx (renamed from libbutl/utf8.mxx) | 22 | ||||
-rw-r--r-- | libbutl/utf8.ixx | 2 | ||||
-rw-r--r-- | libbutl/utility.cxx | 119 | ||||
-rw-r--r-- | libbutl/utility.hxx (renamed from libbutl/utility.mxx) | 145 | ||||
-rw-r--r-- | libbutl/utility.ixx | 134 | ||||
-rw-r--r-- | libbutl/uuid-linux.cxx | 2 | ||||
-rw-r--r-- | libbutl/uuid-openbsd.cxx | 80 | ||||
-rw-r--r-- | libbutl/uuid.cxx | 23 | ||||
-rw-r--r-- | libbutl/uuid.hxx | 20 | ||||
-rw-r--r-- | libbutl/uuid.ixx | 4 | ||||
-rw-r--r-- | libbutl/vector-view.hxx (renamed from libbutl/vector-view.mxx) | 19 | ||||
-rw-r--r-- | libbutl/win32-utility.cxx | 9 | ||||
-rw-r--r-- | libbutl/win32-utility.hxx | 4 | ||||
-rw-r--r-- | libbutl/xxhash.c | 1030 | ||||
-rw-r--r-- | libbutl/xxhash.h | 328 |
162 files changed, 22581 insertions, 3513 deletions
diff --git a/libbutl/b.cxx b/libbutl/b.cxx index e1caa4c..0b4472f 100644 --- a/libbutl/b.cxx +++ b/libbutl/b.cxx @@ -1,58 +1,19 @@ // file : libbutl/b.cxx -*- C++ -*- // license : MIT; see accompanying LICENSE file -#ifndef __cpp_modules_ts -#include <libbutl/b.mxx> -#endif - -// C includes. +#include <libbutl/b.hxx> +#include <ios> // ios::failure #include <cassert> - -#ifndef __cpp_lib_modules_ts -#include <string> -#include <vector> -#include <cstddef> -#include <cstdint> -#include <stdexcept> -#include <functional> - -#include <ios> // ios::failure -#include <utility> // move() +#include <utility> // move() #include <sstream> -#endif - -// Other includes. - -#ifdef __cpp_modules_ts -module butl.b; - -// Only imports additional to interface. -#ifdef __clang__ -#ifdef __cpp_lib_modules_ts -import std.core; -import std.io; -#endif -import butl.url; -import butl.path; -import butl.process; -import butl.optional; -import butl.project_name; -import butl.standard_version; -#endif - -import butl.utility; // next_word(), eof(), etc -import butl.path_io; -import butl.fdstream; -import butl.process_io; // operator<<(ostream, process_path) -import butl.small_vector; -#else -#include <libbutl/utility.mxx> -#include <libbutl/path-io.mxx> -#include <libbutl/fdstream.mxx> -#include <libbutl/process-io.mxx> -#include <libbutl/small-vector.mxx> -#endif +#include <algorithm> + +#include <libbutl/utility.hxx> // next_word(), eof(), etc +#include <libbutl/path-io.hxx> +#include <libbutl/fdstream.hxx> +#include <libbutl/process-io.hxx> // operator<<(ostream, process_path) +#include <libbutl/small-vector.hxx> using namespace std; @@ -71,15 +32,30 @@ namespace butl throw runtime_error ("invalid " + d); } - b_project_info - b_info (const dir_path& project, - bool ext_mods, + void + b_info (std::vector<b_project_info>& r, + const vector<dir_path>& projects, + b_info_flags fl, uint16_t verb, const function<b_callback>& cmd_callback, const path& program, const dir_path& search_fallback, 
const vector<string>& ops) { + // Bail out if the project list is empty. + // + if (projects.empty ()) + return; + + // Reserve enough space in the result and save its original size. + // + size_t rn (r.size ()); + { + size_t n (rn + projects.size ()); + if (r.capacity () < n) + r.reserve (n); + } + try { process_path pp ( @@ -105,6 +81,23 @@ namespace butl else vops.push_back ("-q"); + string spec ("info("); + + // Note that quoting is essential here. + // + for (size_t i (0); i != projects.size(); ++i) + { + if (i != 0) + spec += ' '; + + spec += '\'' + projects[i].representation () + '\''; + } + + if ((fl & b_info_flags::subprojects) == b_info_flags::none) + spec += ",no_subprojects"; + + spec += ')'; + pr = process_start_callback ( cmd_callback ? cmd_callback : [] (const char* const*, size_t) {}, 0 /* stdin */, @@ -112,10 +105,12 @@ namespace butl 2 /* stderr */, pp, vops, - ext_mods ? nullptr : "--no-external-modules", + ((fl & b_info_flags::ext_mods) == b_info_flags::none + ? "--no-external-modules" + : nullptr), "-s", ops, - "info:", "'" + project.representation () + "'"); + spec); pipe.out.close (); ifdstream is (move (pipe.in), fdstream_mode::skip, ifdstream::badbit); @@ -145,31 +140,52 @@ namespace butl } }; - b_project_info r; - for (string l; !eof (getline (is, l)); ) + b_project_info pi; + auto add_project = [&r, &pi] () { - if (l.compare (0, 9, "project: ") == 0) - { - string v (l, 9); - if (!v.empty ()) - r.project = parse_name (move (v), "project"); - } - else if (l.compare (0, 9, "version: ") == 0) + // Parse version string to standard version if the project loaded + // the version module. 
+ // + const auto& ms (pi.modules); + if (find (ms.begin (), ms.end (), "version") != ms.end ()) { - string v (l, 9); - if (!v.empty ()) try { - r.version = standard_version (v, standard_version::allow_stub); + pi.version = standard_version (pi.version_string, + standard_version::allow_stub); } catch (const invalid_argument& e) { - bad_value ("version '" + v + "': " + e.what ()); + bad_value ("version '" + pi.version_string + "': " + e.what ()); } } + + // Add the project info and prepare for the next project info + // parsing. + // + r.push_back (move (pi)); + pi = b_project_info (); + }; + + for (string l; !eof (getline (is, l)); ) + { + if (l.empty ()) + { + add_project (); + } + else if (l.compare (0, 9, "project: ") == 0) + { + string v (l, 9); + if (!v.empty ()) + pi.project = parse_name (move (v), "project"); + } + else if (l.compare (0, 9, "version: ") == 0) + { + pi.version_string = string (l, 9); + } else if (l.compare (0, 9, "summary: ") == 0) { - r.summary = string (l, 9); + pi.summary = string (l, 9); } else if (l.compare (0, 5, "url: ") == 0) { @@ -177,7 +193,7 @@ namespace butl if (!v.empty ()) try { - r.url = url (v); + pi.url = url (v); } catch (const invalid_argument& e) { @@ -186,17 +202,17 @@ namespace butl } else if (l.compare (0, 10, "src_root: ") == 0) { - r.src_root = parse_dir (string (l, 10), "src_root"); + pi.src_root = parse_dir (string (l, 10), "src_root"); } else if (l.compare (0, 10, "out_root: ") == 0) { - r.out_root = parse_dir (string (l, 10), "out_root"); + pi.out_root = parse_dir (string (l, 10), "out_root"); } else if (l.compare (0, 14, "amalgamation: ") == 0) { string v (l, 14); if (!v.empty ()) - r.amalgamation = parse_dir (move (v), "amalgamation"); + pi.amalgamation = parse_dir (move (v), "amalgamation"); } else if (l.compare (0, 13, "subprojects: ") == 0) { @@ -214,7 +230,7 @@ namespace butl if (p != 0) sn = parse_name (string (s, 0, p), "subproject"); - r.subprojects.push_back ( + pi.subprojects.push_back ( 
b_project_info::subproject {move (sn), parse_dir (string (s, p + 1), "subproject")}); @@ -224,20 +240,36 @@ namespace butl { string v (l, 12); for (size_t b (0), e (0); next_word (v, b, e); ) - r.operations.push_back (string (v, b, e - b)); + pi.operations.push_back (string (v, b, e - b)); } else if (l.compare (0, 17, "meta-operations: ") == 0) { string v (l, 17); for (size_t b (0), e (0); next_word (v, b, e); ) - r.meta_operations.push_back (string (v, b, e - b)); + pi.meta_operations.push_back (string (v, b, e - b)); + } + else if (l.compare (0, 9, "modules: ") == 0) + { + string v (l, 9); + for (size_t b (0), e (0); next_word (v, b, e); ) + pi.modules.push_back (string (v, b, e - b)); } } is.close (); // Detect errors. if (pr.wait ()) - return r; + { + add_project (); // Add the remaining project info. + + if (r.size () - rn == projects.size ()) + return; + + ostringstream os; + os << "invalid " << pp << " output: expected information for " + << projects.size () << " projects instead of " << r.size () - rn; + throw b_error (os.str (), move (pr.exit)); + } } // Note that ios::failure inherits from std::runtime_error, so this // catch-clause must go last. 
@@ -276,7 +308,7 @@ namespace butl assert (!pr.wait ()); throw b_error ( - string ("process ") + pp.recall_string () + " " + to_string (*pr.exit), + string ("process ") + pp.recall_string () + ' ' + to_string (*pr.exit), move (pr.exit)); } catch (const process_error& e) diff --git a/libbutl/b.hxx b/libbutl/b.hxx new file mode 100644 index 0000000..d3fd2bf --- /dev/null +++ b/libbutl/b.hxx @@ -0,0 +1,150 @@ +// file : libbutl/b.hxx -*- C++ -*- +// license : MIT; see accompanying LICENSE file + +#pragma once + +#include <string> +#include <vector> +#include <utility> // move() +#include <cstddef> // size_tu +#include <cstdint> // uint16_t +#include <stdexcept> // runtime_error +#include <functional> + +#include <libbutl/url.hxx> +#include <libbutl/path.hxx> +#include <libbutl/process.hxx> +#include <libbutl/optional.hxx> +#include <libbutl/project-name.hxx> +#include <libbutl/standard-version.hxx> + +#include <libbutl/export.hxx> + +namespace butl +{ + class LIBBUTL_SYMEXPORT b_error: public std::runtime_error + { + public: + // Build system program exit information. May be absent if the error + // occured before the process has been started. + // + // Can be used by the caller to decide if to print the error message to + // stderr. Normally, it is not required if the process exited normally + // with non-zero code, since presumably it has issued diagnostics. Note + // that the normal() function can be used to check for this. + // + optional<process_exit> exit; + + // Return true if the build2 process exited normally with non-zero code. + // + bool + normal () const {return exit && exit->normal () && !*exit;} + + explicit + b_error (const std::string& description, optional<process_exit> = nullopt); + }; + + // Run `b info: <project-dir>...` command and parse and return (via argument + // to allow appending and for error position; see below) the build2 projects + // information it prints to stdout. Return the empty list if the specified + // project list is empty. 
Throw b_error on error. Note that the size of the + // result vector can be used to determine which project information caused + // the error. + // + // You can also specify the build2 verbosity level, command line callback + // (see process_run_callback() for details), build program search details, + // and additional options. + // + // Note that version_string is only parsed to standard_version if a project + // uses the version module. Otherwise, standard_version is empty. + // + struct b_project_info + { + using url_type = butl::url; + + struct subproject + { + project_name name; // Empty if anonymous. + dir_path path; // Relative to the project root. + }; + + project_name project; + std::string version_string; + standard_version version; + std::string summary; + url_type url; + + dir_path src_root; + dir_path out_root; + + dir_path amalgamation; // Relative to project root and + // empty if not amalgmated. + std::vector<subproject> subprojects; + + std::vector<std::string> operations; + std::vector<std::string> meta_operations; + + std::vector<std::string> modules; + }; + + enum class b_info_flags: std::uint16_t + { + // Retrieve information that may come from external modules (operations, + // meta-operations, etc). Omitting this flag results in passing + // --no-external-modules to the build2 program and speeds up its + // execution. + // + ext_mods = 0x1, + + // Discover subprojects. Omitting this flag results in passing + // no_subprojects info meta-operation parameter to the build2 program and + // speeds up its execution. 
+ // + subprojects = 0x2, + + none = 0 + }; + + inline b_info_flags operator& (b_info_flags, b_info_flags); + inline b_info_flags operator| (b_info_flags, b_info_flags); + inline b_info_flags operator&= (b_info_flags&, b_info_flags); + inline b_info_flags operator|= (b_info_flags&, b_info_flags); + + using b_callback = void (const char* const args[], std::size_t n); + + LIBBUTL_SYMEXPORT void + b_info (std::vector<b_project_info>& result, + const std::vector<dir_path>& projects, + b_info_flags, + std::uint16_t verb = 1, + const std::function<b_callback>& cmd_callback = {}, + const path& program = path ("b"), + const dir_path& search_fallback = {}, + const std::vector<std::string>& options = {}); + + // As above but retrieve information for a single project. + // + inline b_project_info + b_info (const dir_path& project, + b_info_flags fl, + std::uint16_t verb = 1, + const std::function<b_callback>& cmd_callback = {}, + const path& program = path ("b"), + const dir_path& search_fallback = {}, + const std::vector<std::string>& options = {}) + { + std::vector<b_project_info> r; + b_info (r, + std::vector<dir_path> ({project}), + fl, + verb, + cmd_callback, + program, + search_fallback, + options); + + return std::move (r[0]); + } +} + +#include <libbutl/b.ixx> diff --git a/libbutl/b.ixx b/libbutl/b.ixx new file mode 100644 index 0000000..1667101 --- /dev/null +++ b/libbutl/b.ixx @@ -0,0 +1,31 @@ +// file : libbutl/b.ixx -*- C++ -*- +// license : MIT; see accompanying LICENSE file + +namespace butl +{ + // b_info_flags + // + inline b_info_flags operator& (b_info_flags x, b_info_flags y) + { + return x &= y; + } + + inline b_info_flags operator| (b_info_flags x, b_info_flags y) + { + return x |= y; + } + + inline b_info_flags operator&= (b_info_flags& x, b_info_flags y) + { + return x = static_cast<b_info_flags> ( + static_cast<std::uint16_t> (x) & + static_cast<std::uint16_t> (y)); + } + + inline b_info_flags operator|= (b_info_flags& x, b_info_flags y) + { + return x 
= static_cast<b_info_flags> ( + static_cast<std::uint16_t> (x) | + static_cast<std::uint16_t> (y)); + } +} diff --git a/libbutl/b.mxx b/libbutl/b.mxx deleted file mode 100644 index 9e12711..0000000 --- a/libbutl/b.mxx +++ /dev/null @@ -1,115 +0,0 @@ -// file : libbutl/b.mxx -*- C++ -*- -// license : MIT; see accompanying LICENSE file - -#ifndef __cpp_modules_ts -#pragma once -#endif - -// C includes. - -#ifndef __cpp_lib_modules_ts -#include <string> -#include <vector> -#include <cstddef> // size_tu -#include <cstdint> // uint16_t -#include <stdexcept> // runtime_error -#include <functional> -#endif - -// Other includes. - -#ifdef __cpp_modules_ts -export module butl.b; -#ifdef __cpp_lib_modules_ts -import std.core; -#endif -import butl.url; -import butl.path; -import butl.process; -import butl.optional; -import butl.project_name; -import butl.standard_version; -#else -#include <libbutl/url.mxx> -#include <libbutl/path.mxx> -#include <libbutl/process.mxx> -#include <libbutl/optional.mxx> -#include <libbutl/project-name.mxx> -#include <libbutl/standard-version.mxx> -#endif - -#include <libbutl/export.hxx> - -LIBBUTL_MODEXPORT namespace butl -{ - class LIBBUTL_SYMEXPORT b_error: public std::runtime_error - { - public: - // Build system program exit information. May be absent if the error - // occured before the process has been started. - // - // Can be used by the caller to decide if to print the error message to - // stderr. Normally, it is not required if the process exited normally - // with non-zero code, since presumably it has issued diagnostics. Note - // that the normal() function can be used to check for this. - // - optional<process_exit> exit; - - // Return true if the build2 process exited normally with non-zero code. 
- // - bool - normal () const {return exit && exit->normal () && !*exit;} - - explicit - b_error (const std::string& description, optional<process_exit> = nullopt); - }; - - // Run `b info: <project-dir>` command and parse and return the build2 - // project information it prints to stdout. Throw b_error on error. - // - // Unless you need information that may come from external modules - // (operations, meta-operations, etc), pass false as the ext_mods argument, - // which results in passing --no-external-modules to the build2 program and - // speeds up its execution. - // - // You can also specify the build2 verbosity level, command line callback - // (see process_run_callback() for details), build program search details - // and additional options. - // - struct b_project_info - { - using url_type = butl::url; - - struct subproject - { - project_name name; // Empty if anonymous. - dir_path path; // Relative to the project root. - }; - - project_name project; - standard_version version; - std::string summary; - url_type url; - - dir_path src_root; - dir_path out_root; - - dir_path amalgamation; // Relative to project root and - // empty if not amalgmated. 
- std::vector<subproject> subprojects; - - std::vector<std::string> operations; - std::vector<std::string> meta_operations; - }; - - using b_callback = void (const char* const args[], std::size_t n); - - LIBBUTL_SYMEXPORT b_project_info - b_info (const dir_path& project, - bool ext_mods, - std::uint16_t verb = 1, - const std::function<b_callback>& cmd_callback = {}, - const path& program = path ("b"), - const dir_path& search_fallback = {}, - const std::vector<std::string>& options = {}); -} diff --git a/libbutl/backtrace.cxx b/libbutl/backtrace.cxx index 8c9c6ae..347e231 100644 --- a/libbutl/backtrace.cxx +++ b/libbutl/backtrace.cxx @@ -1,15 +1,14 @@ // file : libbutl/backtrace.cxx -*- C++ -*- // license : MIT; see accompanying LICENSE file -#ifndef __cpp_modules_ts -#include <libbutl/backtrace.mxx> -#endif +#include <libbutl/backtrace.hxx> // We only enable backtrace during bootstrap if we can do it without any // complications of the build scripts/makefiles. // // With glibc linking with -rdynamic gives (non-static) function names. -// FreeBSD/NetBSD requires explicitly linking -lexecinfo. +// FreeBSD/NetBSD requires explicitly linking -lexecinfo. OpenBSD only has +// this functionality built-in from 7.0 and requires -lexecinfo. // // Note that some libc implementation on Linux (most notably, musl), don't // support this, at least not out of the box. @@ -20,6 +19,11 @@ defined(__FreeBSD__) || \ defined(__NetBSD__) # define LIBBUTL_BACKTRACE +# elif defined (__OpenBSD__) +# include <sys/param.h> // OpenBSD (yyyymm) +# if OpenBSD >= 202110 // 7.0 was released in October 2021. +# define LIBBUTL_BACKTRACE +# endif # endif #else # if defined(__GLIBC__) || \ @@ -35,30 +39,12 @@ #include <cassert> -#ifndef __cpp_lib_modules_ts -#include <string> - #ifdef LIBBUTL_BACKTRACE # include <memory> // unique_ptr # include <cstddef> // size_t #endif #include <exception> -#endif - -// Other includes. 
- -#ifdef __cpp_modules_ts -module butl.backtrace; - -// Only imports additional to interface. -#ifdef __clang__ -#ifdef __cpp_lib_modules_ts -import std.core; -#endif -#endif - -#endif using namespace std; diff --git a/libbutl/backtrace.mxx b/libbutl/backtrace.hxx index f5a63d5..6afb6ea 100644 --- a/libbutl/backtrace.mxx +++ b/libbutl/backtrace.hxx @@ -1,28 +1,13 @@ -// file : libbutl/backtrace.mxx -*- C++ -*- +// file : libbutl/backtrace.hxx -*- C++ -*- // license : MIT; see accompanying LICENSE file -#ifndef __cpp_modules_ts #pragma once -#endif -// C includes. - -#ifndef __cpp_lib_modules_ts #include <string> -#endif - -// Other includes. - -#ifdef __cpp_modules_ts -export module butl.backtrace; -#ifdef __cpp_lib_modules_ts -import std.core; -#endif -#endif #include <libbutl/export.hxx> -LIBBUTL_MODEXPORT namespace butl +namespace butl { // Return the calling thread's backtrace or empty string if this // functionality is not supported or an error has occurred. The exact diff --git a/libbutl/base64.cxx b/libbutl/base64.cxx index 527c6af..282f7c2 100644 --- a/libbutl/base64.cxx +++ b/libbutl/base64.cxx @@ -1,37 +1,13 @@ // file : libbutl/base64.cxx -*- C++ -*- // license : MIT; see accompanying LICENSE file -#ifndef __cpp_modules_ts -#include <libbutl/base64.mxx> -#endif - -// C includes. - -#ifndef __cpp_lib_modules_ts -#include <string> -#include <vector> +#include <libbutl/base64.hxx> #include <cstddef> // size_t #include <istream> #include <ostream> #include <iterator> // {istreambuf, ostreambuf, back_insert}_iterator #include <stdexcept> // invalid_argument -#endif - -// Other includes. - -#ifdef __cpp_modules_ts -module butl.base64; - -// Only imports additional to interface. 
-#ifdef __clang__ -#ifdef __cpp_lib_modules_ts -import std.core; -import std.io; -#endif -#endif - -#endif using namespace std; @@ -40,19 +16,20 @@ namespace butl static const char codes[] = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"; + static const char codes_url[] = + "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_"; + // base64-encode the data in the iterator range [i, e). Write the encoded - // data starting at the iterator position o. + // data starting at the iterator position o. If url is true, encode using + // base64url. // template <typename I, typename O> static void - base64_encode (I& i, const I& e, O& o) + base64_encode (I& i, const I& e, O& o, bool url = false) { const size_t un (65); // Non-existing index of the codes string. for (size_t n (0); i != e; ++n) { - if (n && n % 19 == 0) - *o++ = '\n'; // Split into lines, like the base64 utility does. - auto next = [&i] () {return static_cast<unsigned char> (*i++);}; unsigned char c (next ()); @@ -75,10 +52,26 @@ namespace butl i4 = c & 0x3F; } - *o++ = codes[i1]; - *o++ = codes[i2]; - *o++ = i3 == un ? '=' : codes[i3]; - *o++ = i4 == un ? '=' : codes[i4]; + if (!url) + { + if (n && n % 19 == 0) + *o++ = '\n'; // Split into lines, like the base64 utility does. + + *o++ = codes[i1]; + *o++ = codes[i2]; + *o++ = i3 == un ? '=' : codes[i3]; + *o++ = i4 == un ? '=' : codes[i4]; + } + // base64url: different 63rd and 64th characters and no padding or + // newlines. 
+ // + else + { + *o++ = codes_url[i1]; + *o++ = codes_url[i2]; + if (i3 != un) *o++ = codes_url[i3]; + if (i4 != un) *o++ = codes_url[i4]; + } } } @@ -194,6 +187,47 @@ namespace butl return r; } + string + base64url_encode (istream& is) + { + if (!is.good ()) + throw invalid_argument ("bad stream"); + + string r; + istreambuf_iterator<char> i (is); + back_insert_iterator<string> o (r); + + base64_encode (i, istreambuf_iterator<char> (), o, true /* url */); + is.setstate (istream::eofbit); + return r; + } + + void + base64url_encode (ostream& os, istream& is) + { + if (!os.good () || !is.good ()) + throw invalid_argument ("bad stream"); + + istreambuf_iterator<char> i (is); + ostreambuf_iterator<char> o (os); + base64_encode (i, istreambuf_iterator<char> (), o, true /* url */); + + if (o.failed ()) + os.setstate (istream::badbit); + + is.setstate (istream::eofbit); + } + + string + base64url_encode (const std::vector<char>& v) + { + string r; + back_insert_iterator<string> o (r); + auto i (v.begin ()); + base64_encode (i, v.end (), o, true /* url */); + return r; + } + void base64_decode (ostream& os, istream& is) { diff --git a/libbutl/base64.mxx b/libbutl/base64.hxx index 698b7e2..a0d1450 100644 --- a/libbutl/base64.mxx +++ b/libbutl/base64.hxx @@ -1,31 +1,15 @@ -// file : libbutl/base64.mxx -*- C++ -*- +// file : libbutl/base64.hxx -*- C++ -*- // license : MIT; see accompanying LICENSE file -#ifndef __cpp_modules_ts #pragma once -#endif -// C includes. - -#ifndef __cpp_lib_modules_ts #include <iosfwd> #include <string> #include <vector> -#endif - -// Other includes. - -#ifdef __cpp_modules_ts -export module butl.base64; -#ifdef __cpp_lib_modules_ts -import std.core; -import std.io; -#endif -#endif #include <libbutl/export.hxx> -LIBBUTL_MODEXPORT namespace butl +namespace butl { // Base64-encode a stream or a buffer. Split the output into 76 char-long // lines (new line is the 77th). 
If reading from a stream, check if it has @@ -43,6 +27,25 @@ LIBBUTL_MODEXPORT namespace butl LIBBUTL_SYMEXPORT std::string base64_encode (const std::vector<char>&); + // Encode a stream or a buffer using base64url (RFC4648), a base64 variant + // with different 62nd and 63rd alphabet characters (- and _ instead of ~ + // and .; to make it filesystem safe) and optional padding because the + // padding character `=` would have to be percent-encoded to be safe in + // URLs. This implementation does not output any padding, newlines or any + // other whitespace (which is required, for example, by RFC7519: JSON Web + // Token (JWT) and RFC7515: JSON Web Signature (JWS)). + // + // Note that base64url decoding has not yet been implemented. + // + LIBBUTL_SYMEXPORT void + base64url_encode (std::ostream&, std::istream&); + + LIBBUTL_SYMEXPORT std::string + base64url_encode (std::istream&); + + LIBBUTL_SYMEXPORT std::string + base64url_encode (const std::vector<char>&); + // Base64-decode a stream or a string. Throw invalid_argument if the input // is not a valid base64 representation. If reading from a stream, check if // it has badbit, failbit, or eofbit set and throw invalid_argument if diff --git a/libbutl/bufstreambuf.cxx b/libbutl/bufstreambuf.cxx new file mode 100644 index 0000000..d152166 --- /dev/null +++ b/libbutl/bufstreambuf.cxx @@ -0,0 +1,13 @@ +// file : libbutl/bufstreambuf.cxx -*- C++ -*- +// license : MIT; see accompanying LICENSE file + +#include <libbutl/bufstreambuf.hxx> + +namespace butl +{ + bufstreambuf:: + ~bufstreambuf () + { + // Vtable. 
+ } +} diff --git a/libbutl/bufstreambuf.hxx b/libbutl/bufstreambuf.hxx new file mode 100644 index 0000000..a49b2d0 --- /dev/null +++ b/libbutl/bufstreambuf.hxx @@ -0,0 +1,67 @@ +// file : libbutl/bufstreambuf.hxx -*- C++ -*- +// license : MIT; see accompanying LICENSE file + +#pragma once + +#include <cstdint> // uint64_t +#include <streambuf> + +#include <libbutl/export.hxx> + +namespace butl +{ + // A buffered streambuf interface that exposes its buffer for direct scan + // and provides a notion of logical position. See fdstreambuf for background + // and motivation. + // + class LIBBUTL_SYMEXPORT bufstreambuf: public std::basic_streambuf<char> + { + public: + using base = std::basic_streambuf<char>; + + using int_type = base::int_type; + using traits_type = base::traits_type; + + using pos_type = base::pos_type; // std::streampos + using off_type = base::off_type; // std::streamoff + + public: + explicit + bufstreambuf (std::uint64_t pos = 0): off_ (pos) {} + + virtual + ~bufstreambuf (); + + // basic_streambuf input interface. + // + public: + + // Direct access to the get area. Use with caution. + // + using base::gptr; + using base::egptr; + using base::gbump; + + // Return the (logical) position of the next byte to be read. + // + // Note that on Windows when reading in the text mode the logical position + // may differ from the physical file descriptor position due to the CRLF + // character sequence translation. See the fdstreambuf::seekoff() + // implementation for more background on this issue. + // + std::uint64_t + tellg () const {return off_ - (egptr () - gptr ());} + + // basic_streambuf output interface. + // + public: + + // Return the (logical) position of the next byte to be written. 
+ // + std::uint64_t + tellp () const {return off_ + (pptr () - pbase ());} + + protected: + std::uint64_t off_; + }; +} diff --git a/libbutl/buildfile b/libbutl/buildfile index 6526900..bbecf43 100644 --- a/libbutl/buildfile +++ b/libbutl/buildfile @@ -1,37 +1,37 @@ # file : libbutl/buildfile # license : MIT; see accompanying LICENSE file -# This library was modularized using the Modules TS semantics (with support -# for dual, module/header consumption) which was subsequently partially -# dismantled. We, however, kept some of the changes in anticipation that they -# would be useful when attempting to modularize using the merged modules -# semantics. Specifically, there are currently headers with both .mxx and .hxx -# extensions and the code is littered with the `#if __cpp_[lib_]modules_ts` -# blocks. Note that it's important for the auto-generated header support -# that the default extension for hxx{} is .hxx. -# -# @@ If/when going back to using mxx{}, make sure to cleanup explicit .mxx. -# -lib{butl}: {hxx ixx txx cxx}{** -uuid-* +uuid-io \ - -win32-utility \ - -version \ - -builtin-options} \ - hxx{**.mxx} {hxx}{version} {hxx ixx cxx}{builtin-options} +lib{butl}: {hxx ixx txx cxx}{** -uuid-* +uuid-io \ + -win32-utility \ + -mingw-* \ + -version \ + -builtin-options} \ + {hxx}{version} {hxx ixx cxx}{builtin-options} tclass = $cxx.target.class tsys = $cxx.target.system windows = ($tclass == 'windows') -# Exclude these from compilation on non-Windows targets. +# Whether to use our own implementation of C++14 threads on MinGW (note: +# requires Windows 7 or later). +# +# Note that for now we use built-in POSIX thread support during bootstrap +# (which, as a side effect, verifies we still use MinGW GCC configured with +# POSIX support, which we still need for TLS, exceptions, and thread-safe +# static locals). +# +mingw_stdthread = ($tsys == 'mingw32') + +# Exclude these from compilation on targets where they do not apply. 
# lib{butl}: {hxx ixx cxx}{win32-utility}: include = $windows +lib{butl}: hxx{mingw-*}: include = $mingw_stdthread -# Our C-files are included into sha256.cxx (sha256c.c) and timestamp.cxx -# (strptime.c timelocal.h timelocal.c), so treat them as files exclude from -# the compilation. +# Our C-files are always included into C++-files that wrap the corresponding +# API so treat them as files to exclude from the compilation. # -lib{butl}: file{*.c *.h} +lib{butl}: file{**.c **.h} # Platform-specific UUID implementations. # @@ -39,6 +39,13 @@ lib{butl}: cxx{uuid-linux}: include = ($tclass == 'linux') lib{butl}: cxx{uuid-macos}: include = ($tclass == 'macos') lib{butl}: cxx{uuid-windows}: include = $windows lib{butl}: cxx{uuid-freebsd}: include = ($tsys == 'freebsd' || $tsys == 'netbsd') +lib{butl}: cxx{uuid-openbsd}: include = ($tsys == 'openbsd') + +# GCC prior to version 6 has flaky `#pragma GCC diagnostic` so we have to +# disable certain warnings outright. +# +if ($cxx.id == 'gcc' && $cxx.version.major < 6) + cc.coptions += -Wno-unused-function # Additional system libraries. # @@ -58,10 +65,18 @@ switch $tclass, $tsys case 'bsd', 'freebsd' | 'netbsd' cxx.libs += -lexecinfo + + case 'bsd', 'openbsd' + { + # Built-in libexecinfo is only available since OpenBSD 7.0. + # + if (([uint64] $regex.replace($cxx.target.version, '(\d+)\..+', '\1')) >= 7) + cxx.libs += -lexecinfo + } } if! 
$windows - cxx.libs += -lpthread + cxx.libs += -pthread # Include the generated version header into the distribution (so that we don't # pick up an installed one) and don't remove it when cleaning in src (so that @@ -78,6 +93,9 @@ hxx{version}: # cxx.poptions =+ "-I$out_root" "-I$src_root" +if $mingw_stdthread + cxx.poptions += -D_WIN32_WINNT=0x0601 -DLIBBUTL_MINGW_STDTHREAD + obja{*} bmia{*}: cxx.poptions += -DLIBBUTL_STATIC_BUILD objs{*} bmis{*}: cxx.poptions += -DLIBBUTL_SHARED_BUILD @@ -85,6 +103,9 @@ objs{*} bmis{*}: cxx.poptions += -DLIBBUTL_SHARED_BUILD # lib{butl}: cxx.export.poptions = "-I$out_root" "-I$src_root" +if $mingw_stdthread + lib{butl}: cxx.export.poptions += -D_WIN32_WINNT=0x0601 -DLIBBUTL_MINGW_STDTHREAD + liba{butl}: cxx.export.poptions += -DLIBBUTL_STATIC libs{butl}: cxx.export.poptions += -DLIBBUTL_SHARED diff --git a/libbutl/builtin-options.cxx b/libbutl/builtin-options.cxx index 536f97d..98a47cf 100644 --- a/libbutl/builtin-options.cxx +++ b/libbutl/builtin-options.cxx @@ -15,8 +15,10 @@ #include <set> #include <string> #include <vector> +#include <utility> #include <ostream> #include <sstream> +#include <cstring> namespace butl { @@ -25,7 +27,7 @@ namespace butl // unknown_option // unknown_option:: - ~unknown_option () throw () + ~unknown_option () noexcept { } @@ -36,7 +38,7 @@ namespace butl } const char* unknown_option:: - what () const throw () + what () const noexcept { return "unknown option"; } @@ -44,7 +46,7 @@ namespace butl // unknown_argument // unknown_argument:: - ~unknown_argument () throw () + ~unknown_argument () noexcept { } @@ -55,7 +57,7 @@ namespace butl } const char* unknown_argument:: - what () const throw () + what () const noexcept { return "unknown argument"; } @@ -63,7 +65,7 @@ namespace butl // missing_value // missing_value:: - ~missing_value () throw () + ~missing_value () noexcept { } @@ -74,7 +76,7 @@ namespace butl } const char* missing_value:: - what () const throw () + what () const noexcept { return 
"missing option value"; } @@ -82,7 +84,7 @@ namespace butl // invalid_value // invalid_value:: - ~invalid_value () throw () + ~invalid_value () noexcept { } @@ -97,7 +99,7 @@ namespace butl } const char* invalid_value:: - what () const throw () + what () const noexcept { return "invalid option value"; } @@ -111,7 +113,7 @@ namespace butl } const char* eos_reached:: - what () const throw () + what () const noexcept { return "end of argument stream reached"; } @@ -158,6 +160,7 @@ namespace butl else ++i_; + ++start_position_; return r; } else @@ -168,11 +171,20 @@ namespace butl skip () { if (i_ < argc_) + { ++i_; + ++start_position_; + } else throw eos_reached (); } + std::size_t argv_scanner:: + position () + { + return start_position_; + } + // vector_scanner // bool vector_scanner:: @@ -208,6 +220,12 @@ namespace butl throw eos_reached (); } + std::size_t vector_scanner:: + position () + { + return start_position_ + i_; + } + template <typename X> struct parser { @@ -235,10 +253,31 @@ namespace butl struct parser<bool> { static void - parse (bool& x, scanner& s) + parse (bool& x, bool& xs, scanner& s) { - s.next (); - x = true; + const char* o (s.next ()); + + if (s.more ()) + { + const char* v (s.next ()); + + if (std::strcmp (v, "1") == 0 || + std::strcmp (v, "true") == 0 || + std::strcmp (v, "TRUE") == 0 || + std::strcmp (v, "True") == 0) + x = true; + else if (std::strcmp (v, "0") == 0 || + std::strcmp (v, "false") == 0 || + std::strcmp (v, "FALSE") == 0 || + std::strcmp (v, "False") == 0) + x = false; + else + throw invalid_value (o, v); + } + else + throw missing_value (o); + + xs = true; } }; @@ -260,6 +299,17 @@ namespace butl }; template <typename X> + struct parser<std::pair<X, std::size_t> > + { + static void + parse (std::pair<X, std::size_t>& x, bool& xs, scanner& s) + { + x.second = s.position (); + parser<X>::parse (x.first, xs, s); + } + }; + + template <typename X> struct parser<std::vector<X> > { static void @@ -297,6 +347,7 @@ namespace butl if 
(s.more ()) { + std::size_t pos (s.position ()); std::string ov (s.next ()); std::string::size_type p = ov.find ('='); @@ -316,14 +367,14 @@ namespace butl if (!kstr.empty ()) { av[1] = const_cast<char*> (kstr.c_str ()); - argv_scanner s (0, ac, av); + argv_scanner s (0, ac, av, false, pos); parser<K>::parse (k, dummy, s); } if (!vstr.empty ()) { av[1] = const_cast<char*> (vstr.c_str ()); - argv_scanner s (0, ac, av); + argv_scanner s (0, ac, av, false, pos); parser<V>::parse (v, dummy, s); } @@ -336,6 +387,56 @@ namespace butl } }; + template <typename K, typename V, typename C> + struct parser<std::multimap<K, V, C> > + { + static void + parse (std::multimap<K, V, C>& m, bool& xs, scanner& s) + { + const char* o (s.next ()); + + if (s.more ()) + { + std::size_t pos (s.position ()); + std::string ov (s.next ()); + std::string::size_type p = ov.find ('='); + + K k = K (); + V v = V (); + std::string kstr (ov, 0, p); + std::string vstr (ov, (p != std::string::npos ? p + 1 : ov.size ())); + + int ac (2); + char* av[] = + { + const_cast<char*> (o), + 0 + }; + + bool dummy; + if (!kstr.empty ()) + { + av[1] = const_cast<char*> (kstr.c_str ()); + argv_scanner s (0, ac, av, false, pos); + parser<K>::parse (k, dummy, s); + } + + if (!vstr.empty ()) + { + av[1] = const_cast<char*> (vstr.c_str ()); + argv_scanner s (0, ac, av, false, pos); + parser<V>::parse (v, dummy, s); + } + + m.insert (typename std::multimap<K, V, C>::value_type (k, v)); + } + else + throw missing_value (o); + + xs = true; + } + }; + template <typename X, typename T, T X::*M> void thunk (X& x, scanner& s) @@ -343,6 +444,14 @@ namespace butl parser<T>::parse (x.*M, s); } + template <typename X, bool X::*M> + void + thunk (X& x, scanner& s) + { + s.next (); + x.*M = true; + } + template <typename X, typename T, T X::*M, bool X::*S> void thunk (X& x, scanner& s) @@ -353,7 +462,6 @@ namespace butl } #include <map> -#include <cstring> namespace butl { @@ -704,15 +812,15 @@ namespace butl 
_cli_cp_options_map_init () { _cli_cp_options_map_["--recursive"] = - &::butl::cli::thunk< cp_options, bool, &cp_options::recursive_ >; + &::butl::cli::thunk< cp_options, &cp_options::recursive_ >; _cli_cp_options_map_["-R"] = - &::butl::cli::thunk< cp_options, bool, &cp_options::recursive_ >; + &::butl::cli::thunk< cp_options, &cp_options::recursive_ >; _cli_cp_options_map_["-r"] = - &::butl::cli::thunk< cp_options, bool, &cp_options::recursive_ >; + &::butl::cli::thunk< cp_options, &cp_options::recursive_ >; _cli_cp_options_map_["--preserve"] = - &::butl::cli::thunk< cp_options, bool, &cp_options::preserve_ >; + &::butl::cli::thunk< cp_options, &cp_options::preserve_ >; _cli_cp_options_map_["-p"] = - &::butl::cli::thunk< cp_options, bool, &cp_options::preserve_ >; + &::butl::cli::thunk< cp_options, &cp_options::preserve_ >; } }; @@ -978,9 +1086,9 @@ namespace butl _cli_date_options_map_init () { _cli_date_options_map_["--utc"] = - &::butl::cli::thunk< date_options, bool, &date_options::utc_ >; + &::butl::cli::thunk< date_options, &date_options::utc_ >; _cli_date_options_map_["-u"] = - &::butl::cli::thunk< date_options, bool, &date_options::utc_ >; + &::butl::cli::thunk< date_options, &date_options::utc_ >; } }; @@ -1163,6 +1271,269 @@ namespace butl return r; } + // find_options + // + + find_options:: + find_options () + { + } + + bool find_options:: + parse (int& argc, + char** argv, + bool erase, + ::butl::cli::unknown_mode opt, + ::butl::cli::unknown_mode arg) + { + ::butl::cli::argv_scanner s (argc, argv, erase); + bool r = _parse (s, opt, arg); + return r; + } + + bool find_options:: + parse (int start, + int& argc, + char** argv, + bool erase, + ::butl::cli::unknown_mode opt, + ::butl::cli::unknown_mode arg) + { + ::butl::cli::argv_scanner s (start, argc, argv, erase); + bool r = _parse (s, opt, arg); + return r; + } + + bool find_options:: + parse (int& argc, + char** argv, + int& end, + bool erase, + ::butl::cli::unknown_mode opt, + 
::butl::cli::unknown_mode arg) + { + ::butl::cli::argv_scanner s (argc, argv, erase); + bool r = _parse (s, opt, arg); + end = s.end (); + return r; + } + + bool find_options:: + parse (int start, + int& argc, + char** argv, + int& end, + bool erase, + ::butl::cli::unknown_mode opt, + ::butl::cli::unknown_mode arg) + { + ::butl::cli::argv_scanner s (start, argc, argv, erase); + bool r = _parse (s, opt, arg); + end = s.end (); + return r; + } + + bool find_options:: + parse (::butl::cli::scanner& s, + ::butl::cli::unknown_mode opt, + ::butl::cli::unknown_mode arg) + { + bool r = _parse (s, opt, arg); + return r; + } + + typedef + std::map<std::string, void (*) (find_options&, ::butl::cli::scanner&)> + _cli_find_options_map; + + static _cli_find_options_map _cli_find_options_map_; + + struct _cli_find_options_map_init + { + _cli_find_options_map_init () + { + } + }; + + static _cli_find_options_map_init _cli_find_options_map_init_; + + bool find_options:: + _parse (const char* o, ::butl::cli::scanner& s) + { + _cli_find_options_map::const_iterator i (_cli_find_options_map_.find (o)); + + if (i != _cli_find_options_map_.end ()) + { + (*(i->second)) (*this, s); + return true; + } + + return false; + } + + bool find_options:: + _parse (::butl::cli::scanner& s, + ::butl::cli::unknown_mode opt_mode, + ::butl::cli::unknown_mode arg_mode) + { + // Can't skip combined flags (--no-combined-flags). + // + assert (opt_mode != ::butl::cli::unknown_mode::skip); + + bool r = false; + bool opt = true; + + while (s.more ()) + { + const char* o = s.peek (); + + if (std::strcmp (o, "--") == 0) + { + opt = false; + } + + if (opt) + { + if (_parse (o, s)) + { + r = true; + continue; + } + + if (std::strncmp (o, "-", 1) == 0 && o[1] != '\0') + { + // Handle combined option values. 
+ // + std::string co; + if (const char* v = std::strchr (o, '=')) + { + co.assign (o, 0, v - o); + ++v; + + int ac (2); + char* av[] = + { + const_cast<char*> (co.c_str ()), + const_cast<char*> (v) + }; + + ::butl::cli::argv_scanner ns (0, ac, av); + + if (_parse (co.c_str (), ns)) + { + // Parsed the option but not its value? + // + if (ns.end () != 2) + throw ::butl::cli::invalid_value (co, v); + + s.next (); + r = true; + continue; + } + else + { + // Set the unknown option and fall through. + // + o = co.c_str (); + } + } + + // Handle combined flags. + // + char cf[3]; + { + const char* p = o + 1; + for (; *p != '\0'; ++p) + { + if (!((*p >= 'a' && *p <= 'z') || + (*p >= 'A' && *p <= 'Z') || + (*p >= '0' && *p <= '9'))) + break; + } + + if (*p == '\0') + { + for (p = o + 1; *p != '\0'; ++p) + { + std::strcpy (cf, "-"); + cf[1] = *p; + cf[2] = '\0'; + + int ac (1); + char* av[] = + { + cf + }; + + ::butl::cli::argv_scanner ns (0, ac, av); + + if (!_parse (cf, ns)) + break; + } + + if (*p == '\0') + { + // All handled. + // + s.next (); + r = true; + continue; + } + else + { + // Set the unknown option and fall through. 
+ // + o = cf; + } + } + } + + switch (opt_mode) + { + case ::butl::cli::unknown_mode::skip: + { + s.skip (); + r = true; + continue; + } + case ::butl::cli::unknown_mode::stop: + { + break; + } + case ::butl::cli::unknown_mode::fail: + { + throw ::butl::cli::unknown_option (o); + } + } + + break; + } + } + + switch (arg_mode) + { + case ::butl::cli::unknown_mode::skip: + { + s.skip (); + r = true; + continue; + } + case ::butl::cli::unknown_mode::stop: + { + break; + } + case ::butl::cli::unknown_mode::fail: + { + throw ::butl::cli::unknown_argument (o); + } + } + + break; + } + + return r; + } + // ln_options // @@ -1246,9 +1617,9 @@ namespace butl _cli_ln_options_map_init () { _cli_ln_options_map_["--symbolic"] = - &::butl::cli::thunk< ln_options, bool, &ln_options::symbolic_ >; + &::butl::cli::thunk< ln_options, &ln_options::symbolic_ >; _cli_ln_options_map_["-s"] = - &::butl::cli::thunk< ln_options, bool, &ln_options::symbolic_ >; + &::butl::cli::thunk< ln_options, &ln_options::symbolic_ >; } }; @@ -1514,9 +1885,9 @@ namespace butl _cli_mkdir_options_map_init () { _cli_mkdir_options_map_["--parents"] = - &::butl::cli::thunk< mkdir_options, bool, &mkdir_options::parents_ >; + &::butl::cli::thunk< mkdir_options, &mkdir_options::parents_ >; _cli_mkdir_options_map_["-p"] = - &::butl::cli::thunk< mkdir_options, bool, &mkdir_options::parents_ >; + &::butl::cli::thunk< mkdir_options, &mkdir_options::parents_ >; } }; @@ -1782,9 +2153,9 @@ namespace butl _cli_mv_options_map_init () { _cli_mv_options_map_["--force"] = - &::butl::cli::thunk< mv_options, bool, &mv_options::force_ >; + &::butl::cli::thunk< mv_options, &mv_options::force_ >; _cli_mv_options_map_["-f"] = - &::butl::cli::thunk< mv_options, bool, &mv_options::force_ >; + &::butl::cli::thunk< mv_options, &mv_options::force_ >; } }; @@ -2051,13 +2422,13 @@ namespace butl _cli_rm_options_map_init () { _cli_rm_options_map_["--recursive"] = - &::butl::cli::thunk< rm_options, bool, &rm_options::recursive_ >; + 
&::butl::cli::thunk< rm_options, &rm_options::recursive_ >; _cli_rm_options_map_["-r"] = - &::butl::cli::thunk< rm_options, bool, &rm_options::recursive_ >; + &::butl::cli::thunk< rm_options, &rm_options::recursive_ >; _cli_rm_options_map_["--force"] = - &::butl::cli::thunk< rm_options, bool, &rm_options::force_ >; + &::butl::cli::thunk< rm_options, &rm_options::force_ >; _cli_rm_options_map_["-f"] = - &::butl::cli::thunk< rm_options, bool, &rm_options::force_ >; + &::butl::cli::thunk< rm_options, &rm_options::force_ >; } }; @@ -2323,9 +2694,9 @@ namespace butl _cli_rmdir_options_map_init () { _cli_rmdir_options_map_["--force"] = - &::butl::cli::thunk< rmdir_options, bool, &rmdir_options::force_ >; + &::butl::cli::thunk< rmdir_options, &rmdir_options::force_ >; _cli_rmdir_options_map_["-f"] = - &::butl::cli::thunk< rmdir_options, bool, &rmdir_options::force_ >; + &::butl::cli::thunk< rmdir_options, &rmdir_options::force_ >; } }; @@ -2594,13 +2965,13 @@ namespace butl _cli_sed_options_map_init () { _cli_sed_options_map_["--quiet"] = - &::butl::cli::thunk< sed_options, bool, &sed_options::quiet_ >; + &::butl::cli::thunk< sed_options, &sed_options::quiet_ >; _cli_sed_options_map_["-n"] = - &::butl::cli::thunk< sed_options, bool, &sed_options::quiet_ >; + &::butl::cli::thunk< sed_options, &sed_options::quiet_ >; _cli_sed_options_map_["--in-place"] = - &::butl::cli::thunk< sed_options, bool, &sed_options::in_place_ >; + &::butl::cli::thunk< sed_options, &sed_options::in_place_ >; _cli_sed_options_map_["-i"] = - &::butl::cli::thunk< sed_options, bool, &sed_options::in_place_ >; + &::butl::cli::thunk< sed_options, &sed_options::in_place_ >; _cli_sed_options_map_["--expression"] = &::butl::cli::thunk< sed_options, std::vector<std::string>, &sed_options::expression_, &sed_options::expression_specified_ >; @@ -3136,13 +3507,13 @@ namespace butl _cli_test_options_map_init () { _cli_test_options_map_["--file"] = - &::butl::cli::thunk< test_options, bool, &test_options::file_ 
>; + &::butl::cli::thunk< test_options, &test_options::file_ >; _cli_test_options_map_["-f"] = - &::butl::cli::thunk< test_options, bool, &test_options::file_ >; + &::butl::cli::thunk< test_options, &test_options::file_ >; _cli_test_options_map_["--directory"] = - &::butl::cli::thunk< test_options, bool, &test_options::directory_ >; + &::butl::cli::thunk< test_options, &test_options::directory_ >; _cli_test_options_map_["-d"] = - &::butl::cli::thunk< test_options, bool, &test_options::directory_ >; + &::butl::cli::thunk< test_options, &test_options::directory_ >; } }; diff --git a/libbutl/builtin-options.hxx b/libbutl/builtin-options.hxx index b389298..70179dd 100644 --- a/libbutl/builtin-options.hxx +++ b/libbutl/builtin-options.hxx @@ -68,7 +68,7 @@ namespace butl { public: virtual - ~unknown_option () throw (); + ~unknown_option () noexcept; unknown_option (const std::string& option); @@ -79,7 +79,7 @@ namespace butl print (::std::ostream&) const; virtual const char* - what () const throw (); + what () const noexcept; private: std::string option_; @@ -89,7 +89,7 @@ namespace butl { public: virtual - ~unknown_argument () throw (); + ~unknown_argument () noexcept; unknown_argument (const std::string& argument); @@ -100,7 +100,7 @@ namespace butl print (::std::ostream&) const; virtual const char* - what () const throw (); + what () const noexcept; private: std::string argument_; @@ -110,7 +110,7 @@ namespace butl { public: virtual - ~missing_value () throw (); + ~missing_value () noexcept; missing_value (const std::string& option); @@ -121,7 +121,7 @@ namespace butl print (::std::ostream&) const; virtual const char* - what () const throw (); + what () const noexcept; private: std::string option_; @@ -131,7 +131,7 @@ namespace butl { public: virtual - ~invalid_value () throw (); + ~invalid_value () noexcept; invalid_value (const std::string& option, const std::string& value, @@ -150,7 +150,7 @@ namespace butl print (::std::ostream&) const; virtual const char* - what 
() const throw (); + what () const noexcept; private: std::string option_; @@ -165,7 +165,7 @@ namespace butl print (::std::ostream&) const; virtual const char* - what () const throw (); + what () const noexcept; }; // Command line argument scanner interface. @@ -174,6 +174,14 @@ namespace butl // for the two previous arguments up until a call to a third // peek() or next(). // + // The position() function returns a monotonically-increasing + // number which, if stored, can later be used to determine the + // relative position of the argument returned by the following + // call to next(). Note that if multiple scanners are used to + // extract arguments from multiple sources, then the end + // position of the previous scanner should be used as the + // start position of the next. + // class scanner { public: @@ -191,13 +199,24 @@ namespace butl virtual void skip () = 0; + + virtual std::size_t + position () = 0; }; class argv_scanner: public scanner { public: - argv_scanner (int& argc, char** argv, bool erase = false); - argv_scanner (int start, int& argc, char** argv, bool erase = false); + argv_scanner (int& argc, + char** argv, + bool erase = false, + std::size_t start_position = 0); + + argv_scanner (int start, + int& argc, + char** argv, + bool erase = false, + std::size_t start_position = 0); int end () const; @@ -214,7 +233,11 @@ namespace butl virtual void skip (); - private: + virtual std::size_t + position (); + + protected: + std::size_t start_position_; int i_; int& argc_; char** argv_; @@ -224,13 +247,15 @@ namespace butl class vector_scanner: public scanner { public: - vector_scanner (const std::vector<std::string>&, std::size_t start = 0); + vector_scanner (const std::vector<std::string>&, + std::size_t start = 0, + std::size_t start_position = 0); std::size_t end () const; void - reset (std::size_t start = 0); + reset (std::size_t start = 0, std::size_t start_position = 0); virtual bool more (); @@ -244,7 +269,11 @@ namespace butl virtual void skip 
(); + virtual std::size_t + position (); + private: + std::size_t start_position_; const std::vector<std::string>& v_; std::size_t i_; }; @@ -455,6 +484,67 @@ namespace butl bool utc_; }; + class find_options + { + public: + find_options (); + + // Return true if anything has been parsed. + // + bool + parse (int& argc, + char** argv, + bool erase = false, + ::butl::cli::unknown_mode option = ::butl::cli::unknown_mode::fail, + ::butl::cli::unknown_mode argument = ::butl::cli::unknown_mode::stop); + + bool + parse (int start, + int& argc, + char** argv, + bool erase = false, + ::butl::cli::unknown_mode option = ::butl::cli::unknown_mode::fail, + ::butl::cli::unknown_mode argument = ::butl::cli::unknown_mode::stop); + + bool + parse (int& argc, + char** argv, + int& end, + bool erase = false, + ::butl::cli::unknown_mode option = ::butl::cli::unknown_mode::fail, + ::butl::cli::unknown_mode argument = ::butl::cli::unknown_mode::stop); + + bool + parse (int start, + int& argc, + char** argv, + int& end, + bool erase = false, + ::butl::cli::unknown_mode option = ::butl::cli::unknown_mode::fail, + ::butl::cli::unknown_mode argument = ::butl::cli::unknown_mode::stop); + + bool + parse (::butl::cli::scanner&, + ::butl::cli::unknown_mode option = ::butl::cli::unknown_mode::fail, + ::butl::cli::unknown_mode argument = ::butl::cli::unknown_mode::stop); + + // Option accessors. + // + // Implementation details. 
+ // + protected: + bool + _parse (const char*, ::butl::cli::scanner&); + + private: + bool + _parse (::butl::cli::scanner&, + ::butl::cli::unknown_mode option, + ::butl::cli::unknown_mode argument); + + public: + }; + class ln_options { public: diff --git a/libbutl/builtin-options.ixx b/libbutl/builtin-options.ixx index f10f82d..e118156 100644 --- a/libbutl/builtin-options.ixx +++ b/libbutl/builtin-options.ixx @@ -107,14 +107,29 @@ namespace butl // argv_scanner // inline argv_scanner:: - argv_scanner (int& argc, char** argv, bool erase) - : i_ (1), argc_ (argc), argv_ (argv), erase_ (erase) + argv_scanner (int& argc, + char** argv, + bool erase, + std::size_t sp) + : start_position_ (sp + 1), + i_ (1), + argc_ (argc), + argv_ (argv), + erase_ (erase) { } inline argv_scanner:: - argv_scanner (int start, int& argc, char** argv, bool erase) - : i_ (start), argc_ (argc), argv_ (argv), erase_ (erase) + argv_scanner (int start, + int& argc, + char** argv, + bool erase, + std::size_t sp) + : start_position_ (sp + static_cast<std::size_t> (start)), + i_ (start), + argc_ (argc), + argv_ (argv), + erase_ (erase) { } @@ -127,8 +142,10 @@ namespace butl // vector_scanner // inline vector_scanner:: - vector_scanner (const std::vector<std::string>& v, std::size_t i) - : v_ (v), i_ (i) + vector_scanner (const std::vector<std::string>& v, + std::size_t i, + std::size_t sp) + : start_position_ (sp), v_ (v), i_ (i) { } @@ -139,9 +156,10 @@ namespace butl } inline void vector_scanner:: - reset (std::size_t i) + reset (std::size_t i, std::size_t sp) { i_ = i; + start_position_ = sp; } } } @@ -175,6 +193,9 @@ namespace butl return this->utc_; } + // find_options + // + // ln_options // diff --git a/libbutl/builtin.cli b/libbutl/builtin.cli index adc47fa..23a5708 100644 --- a/libbutl/builtin.cli +++ b/libbutl/builtin.cli @@ -34,6 +34,11 @@ namespace butl bool --utc|-u; }; + class find_options + { + // No options so far (expression/primaries handled as arguments). 
+ }; + class ln_options { bool --symbolic|-s; diff --git a/libbutl/builtin.cxx b/libbutl/builtin.cxx index 79ff968..a5861d4 100644 --- a/libbutl/builtin.cxx +++ b/libbutl/builtin.cxx @@ -1,28 +1,16 @@ // file : libbutl/builtin.cxx -*- C++ -*- // license : MIT; see accompanying LICENSE file -#ifndef __cpp_modules_ts -#include <libbutl/builtin.mxx> -#endif +#include <libbutl/builtin.hxx> #ifdef _WIN32 # include <libbutl/win32-utility.hxx> #endif -#include <cassert> - -#ifndef __cpp_lib_modules_ts -#include <map> -#include <string> -#include <vector> -#include <thread> -#include <utility> // move(), forward() -#include <cstdint> // uint*_t -#include <functional> - #include <ios> #include <chrono> #include <cerrno> +#include <cassert> #include <ostream> #include <sstream> #include <cstdlib> // strtoull() @@ -30,41 +18,16 @@ #include <exception> #include <system_error> -#endif +#include <libbutl/regex.hxx> +#include <libbutl/path-io.hxx> +#include <libbutl/utility.hxx> // operator<<(ostream,exception), + // throw_generic_error() +#include <libbutl/optional.hxx> +#include <libbutl/filesystem.hxx> +#include <libbutl/small-vector.hxx> #include <libbutl/builtin-options.hxx> -#ifdef __cpp_modules_ts -module butl.builtin; - -// Only imports additional to interface. 
-#ifdef __clang__ -#ifdef __cpp_lib_modules_ts -import std.core; -import std.io; -import std.threading; -#endif -import butl.path; -import butl.fdstream; -import butl.timestamp; -#endif - -import butl.regex; -import butl.path_io; -import butl.utility; // operator<<(ostream,exception), - // throw_generic_error() -import butl.optional; -import butl.filesystem; -import butl.small_vector; -#else -#include <libbutl/regex.mxx> -#include <libbutl/path-io.mxx> -#include <libbutl/utility.mxx> -#include <libbutl/optional.mxx> -#include <libbutl/filesystem.mxx> -#include <libbutl/small-vector.mxx> -#endif - // Strictly speaking a builtin which reads/writes from/to standard streams // must be asynchronous so that the caller can communicate with it through // pipes without being blocked on I/O operations. However, as an optimization, @@ -280,7 +243,7 @@ namespace butl // completed using the current directory if it is relative. Fail if // std::system_error is thrown by the underlying function call. // - dir_path + static dir_path current_directory (const dir_path& wd, const function<error_record ()>& fail) { try @@ -507,7 +470,7 @@ namespace butl if (cbs.create) call (fail, cbs.create, to, false /* pre */); - for (const auto& de: dir_iterator (from, false /* ignore_dangling */)) + for (const auto& de: dir_iterator (from, dir_iterator::no_follow)) { path f (from / de.path ()); path t (to / de.path ()); @@ -853,6 +816,314 @@ namespace butl return builtin (r = 0); } + // find <start-path>... [-name <pattern>] + // [-type <type>] + // [-mindepth <depth>] + // [-maxdepth <depth>] + // + // Note: must be executed asynchronously. + // + static uint8_t + find (const strings& args, + auto_fd in, auto_fd out, auto_fd err, + const dir_path& cwd, + const builtin_callbacks& cbs) noexcept + try + { + uint8_t r (1); + ofdstream cerr (err != nullfd ? 
move (err) : fddup (stderr_fd ())); + + // Note that on some errors we will issue diagnostics but continue the + // search and return with non-zero code at the end. This is consistent + // with how major implementations behave (see below). + // + bool error_occured (false); + auto error = [&cerr, &error_occured] (bool fail = false) + { + error_occured = true; + return error_record (cerr, fail, "find"); + }; + + auto fail = [&error] () {return error (true /* fail */);}; + + try + { + in.close (); + ofdstream cout (out != nullfd ? move (out) : fddup (stdout_fd ())); + + // Parse arguments. + // + cli::vector_scanner scan (args); + + // Currently, we don't expect any options. + // + parse<find_options> (scan, args, cbs.parse_option, fail); + + // Parse path arguments until the first primary (starts with '-') is + // encountered. + // + small_vector<path, 1> paths; + + while (scan.more ()) + { + if (*scan.peek () == '-') + break; + + try + { + paths.emplace_back (scan.next ()); + } + catch (const invalid_path& e) + { + fail () << "invalid path '" << e.path << "'"; + } + } + + // Note that POSIX doesn't explicitly describe the behavior if no paths + // are specified on the command line. On Linux the current directory is + // assumed in this case. We, however, will follow the FreeBSD behavior + // and fail since this seems to be less error-prone. + // + if (paths.empty ()) + fail () << "missing start path"; + + // Parse primaries. + // + optional<string> name; + optional<entry_type> type; + optional<uint64_t> min_depth; + optional<uint64_t> max_depth; + + while (scan.more ()) + { + const char* p (scan.next ()); + + // Return the string value of the current primary. Fail if absent or + // empty, unless empty value is allowed. + // + auto str = [p, &scan, &fail] (bool allow_empty = false) + { + if (!scan.more ()) + { + fail () << "missing value for primary '" << p << "'"; + } + + string n (p); // Save for diagnostics. 
+ string r (scan.next ()); + + if (r.empty () && !allow_empty) + fail () << "empty value for primary '" << n << "'"; + + return r; + }; + + // Return the unsigned numeric value of the current primary. Fail if + // absent or is not a valid number. + // + auto num = [p, &str, &fail] () + { + string n (p); // Save for diagnostics. + string s (str ()); + + const char* b (s.c_str ()); + char* e (nullptr); + errno = 0; // We must clear it according to POSIX. + uint64_t r (strtoull (b, &e, 10)); // Can't throw. + + if (errno == ERANGE || e != b + s.size ()) + fail () << "invalid value '" << s << "' for primary '" << n << "'"; + + return r; + }; + + if (strcmp (p, "-name") == 0) + { + // Note that the empty never-matching pattern is allowed. + // + name = str (true /* allow_empty */); + } + else if (strcmp (p, "-type") == 0) + { + string s (str ()); + char t (s.size () == 1 ? s[0] : '\0'); + + switch (t) + { + case 'f': type = entry_type::regular; break; + case 'd': type = entry_type::directory; break; + case 'l': type = entry_type::symlink; break; + default: fail () << "invalid value '" << s << "' for primary '-type'"; + } + } + else if (strcmp (p, "-mindepth") == 0) + { + min_depth = num (); + } + else if (strcmp (p, "-maxdepth") == 0) + { + max_depth = num (); + } + else + fail () << "unknown primary '" << p << "'"; + } + + // Print the path if the expression evaluates to true for it. Traverse + // further down if the path refers to a directory and the maximum depth + // is not specified or is not reached. + // + // Note that paths for evaluating/printing (pp) and for + // stating/traversing (ap) are passed separately. The former is + // potentially relative and the latter is absolute. Also note that + // for optimization we separately pass the base name simple path. 
+ // + auto find = [&cout, + &name, + &type, + &min_depth, + &max_depth, + &fail] (const path& pp, + const path& ap, + const path& bp, + entry_type t, + uint64_t level, + const auto& find) -> void + { + // Print the path if no primary evaluates to false. + // + if ((!type || *type == t) && + (!min_depth || level >= *min_depth) && + (!name || path_match (bp.string (), *name))) + { + // Print the trailing directory separator, if present. + // + if (pp.to_directory ()) + { + // The trailing directory separator can only be present for + // paths specified on the command line. + // + assert (level == 0); + + cout << pp.representation () << '\n'; + } + else + cout << pp << '\n'; + } + + // Traverse the directory, unless the max depth is specified and + // reached. + // + if (t == entry_type::directory && (!max_depth || level < *max_depth)) + try + { + for (const auto& de: dir_iterator (path_cast<dir_path> (ap), + dir_iterator::no_follow)) + { + find (pp / de.path (), + ap / de.path (), + de.path (), + de.ltype (), + level + 1, + find); + } + } + catch (const system_error& e) + { + fail () << "unable to scan directory '" << pp << "': " << e; + } + }; + + dir_path wd; + + for (const path& p: paths) + { + // Complete the path if it is relative, so that we can properly stat + // it and, potentially, traverse. Note that we don't normalize it + // since POSIX requires that the paths should be evaluated (by + // primaries) and printed unaltered. + // + path ap; + + if (p.relative ()) + { + if (wd.empty () && cwd.relative ()) + wd = current_directory (cwd, fail); + + ap = (!wd.empty () ? wd : cwd) / p; + } + + // Issue an error if the path is empty, doesn't exist, or has the + // trailing directory separator but refers to a non-directory. + // + // Note that POSIX doesn't explicitly describe the behavior if any of + // the above happens. 
We will follow the behavior which is common for + // both Linux and FreeBSD by issuing the diagnostics, proceeding to + // the subsequent paths, and returning with non-zero code at the end. + // + if (p.empty ()) + { + error () << "empty path"; + continue; + } + + const path& fp (!ap.empty () ? ap : p); + pair<bool, entry_stat> pe; + + try + { + pe = path_entry (fp); + } + catch (const system_error& e) + { + fail () << "unable to stat '" << p << "': " << e; + } + + if (!pe.first) + { + error () << "'" << p << "' doesn't exists"; + continue; + } + + entry_type t (pe.second.type); + + if (p.to_directory () && t != entry_type::directory) + { + error () << "'" << p << "' is not a directory"; + continue; + } + + find (p, fp, p.leaf (), t, 0 /* level */, find); + } + + cout.close (); + r = !error_occured ? 0 : 1; + } + // Can be thrown while closing cin or creating, writing to, or closing + // cout or writing to cerr. + // + catch (const io_error& e) + { + error () << e; + } + catch (const failed&) + { + // Diagnostics has already been issued. + } + catch (const cli::exception& e) + { + error () << e; + } + + cerr.close (); + return r; + } + // In particular, handles io_error exception potentially thrown while + // creating, writing to, or closing cerr. + // + catch (const std::exception&) + { + return 1; + } + // Create a symlink to a file or directory at the specified path and calling // the hook for the created filesystem entries. The paths must be absolute // and normalized. Fall back to creating a hardlink, if symlink creation is @@ -1632,15 +1903,6 @@ namespace butl string replacement; bool global; bool print; - - subst (const string& re, bool ic, string rp, bool gl, bool pr) - // - // Note that ECMAScript is implied if no grammar flag is specified. - // - : regex (re, ic ? 
regex::icase : regex::ECMAScript), - replacement (move (rp)), - global (gl), - print (pr) {} }; small_vector<subst, 1> substs; @@ -1663,57 +1925,59 @@ namespace butl if (delim == '\\' || delim == '\n') fail () << "invalid delimiter for 's' command in '" << v << "'"; - size_t p (v.find (delim, 2)); - if (p == string::npos) - fail () << "unterminated 's' command regex in '" << v << "'"; - - string regex (v, 2, p - 2); - - // Empty regex matches nothing, so not of much use. - // - if (regex.empty ()) - fail () << "empty regex in 's' command in '" << v << "'"; - - size_t b (p + 1); - p = v.find (delim, b); - if (p == string::npos) - fail () << "unterminated 's' command replacement in '" << v << "'"; - - string replacement (v, b, p - b); - - // Parse the substitute command flags. + // Parse the substitute command regex (as string), replacement, and + // flags. // + pair<string, string> rf; bool icase (false); bool global (false); bool print (false); - char c; - for (++p; (c = v[p]) != '\0'; ++p) + try { - switch (c) + size_t e; + rf = regex_replace_parse (v.c_str () + 1, v.size () - 1, e); + + char c; + for (size_t i (e + 1); (c = v[i]) != '\0'; ++i) { - case 'i': icase = true; break; - case 'g': global = true; break; - case 'p': print = true; break; - default: + switch (c) { - fail () << "invalid 's' command flag '" << c << "' in '" << v - << "'"; + case 'i': icase = true; break; + case 'g': global = true; break; + case 'p': print = true; break; + default: + { + fail () << "invalid 's' command flag '" << c << "' in '" << v + << "'"; + } } } } + catch (const invalid_argument& e) + { + fail () << "invalid 's' command '" << v << "': " << e; + } + // Parse the regex and add the substitution to the list. + // try { - substs.emplace_back (regex, icase, - move (replacement), - global, print); + // Note that ECMAScript is implied if no grammar flag is specified. + // + regex re (rf.first, icase ? 
regex::icase : regex::ECMAScript); + + substs.push_back ({move (re), + move (rf.second), + global, + print}); } catch (const regex_error& e) { // Print regex_error description if meaningful (no space). // - fail () << "invalid regex '" << regex << "' in '" << v << "'" << e; + fail () << "invalid regex '" << rf.first << "' in '" << v << "'" + << e; } } @@ -1936,6 +2200,7 @@ namespace butl if (!a.empty () && a[0] != '-' && a[0] != '+') { char* e (nullptr); + errno = 0; // We must clear it according to POSIX. n = strtoull (a.c_str (), &e, 10); // Can't throw. if (errno != ERANGE && e == a.c_str () + a.size ()) @@ -2220,17 +2485,17 @@ namespace butl { unique_ptr<builtin::async_state> s ( new builtin::async_state ( + r, [fn, - &r, &args, in = move (in), out = move (out), err = move (err), &cwd, - &cbs] () mutable noexcept + &cbs] () mutable noexcept -> uint8_t { - r = fn (args, - move (in), move (out), move (err), - cwd, - cbs); + return fn (args, + move (in), move (out), move (err), + cwd, + cbs); })); return builtin (r, move (s)); @@ -2270,6 +2535,7 @@ namespace butl {"diff", {nullptr, 2}}, {"echo", {&async_impl<&echo>, 2}}, {"false", {&false_, 0}}, + {"find", {&async_impl<&find>, 2}}, {"ln", {&sync_impl<&ln>, 2}}, {"mkdir", {&sync_impl<&mkdir>, 2}}, {"mv", {&sync_impl<&mv>, 2}}, @@ -2289,7 +2555,7 @@ namespace butl { if (state_ != nullptr) { - unique_lock<mutex> l (state_->mutex); + unique_lock l (state_->mutex); if (!state_->finished) state_->condv.wait (l, [this] {return state_->finished;}); @@ -2304,7 +2570,7 @@ namespace butl { if (state_ != nullptr) { - unique_lock<mutex> l (state_->mutex); + unique_lock l (state_->mutex); if (!state_->finished && !state_->condv.wait_for (l, tm, [this] {return state_->finished;})) diff --git a/libbutl/builtin.mxx b/libbutl/builtin.hxx index a99d6f4..b301f8a 100644 --- a/libbutl/builtin.mxx +++ b/libbutl/builtin.hxx @@ -1,47 +1,35 @@ -// file : libbutl/builtin.mxx -*- C++ -*- +// file : libbutl/builtin.hxx -*- C++ -*- // license 
: MIT; see accompanying LICENSE file -#ifndef __cpp_modules_ts #pragma once -#endif - -// C includes. -#ifndef __cpp_lib_modules_ts #include <map> -#include <mutex> #include <string> #include <vector> -#include <thread> #include <chrono> #include <memory> // unique_ptr #include <cstddef> // size_t #include <utility> // move() #include <cstdint> // uint8_t #include <functional> -#include <condition_variable> -#endif -// Other includes. - -#ifdef __cpp_modules_ts -export module butl.builtin; -#ifdef __cpp_lib_modules_ts -import std.core; -import std.threading; -#endif -import butl.path; -import butl.fdstream; -import butl.timestamp; +#ifndef LIBBUTL_MINGW_STDTHREAD +# include <mutex> +# include <thread> +# include <condition_variable> #else -#include <libbutl/path.mxx> -#include <libbutl/fdstream.mxx> -#include <libbutl/timestamp.mxx> +# include <libbutl/mingw-mutex.hxx> +# include <libbutl/mingw-thread.hxx> +# include <libbutl/mingw-condition_variable.hxx> #endif +#include <libbutl/path.hxx> +#include <libbutl/fdstream.hxx> +#include <libbutl/timestamp.hxx> + #include <libbutl/export.hxx> -LIBBUTL_MODEXPORT namespace butl +namespace butl { // A process/thread-like object representing a running builtin. 
// @@ -75,12 +63,26 @@ LIBBUTL_MODEXPORT namespace butl ~builtin () {if (state_ != nullptr) state_->thread.join ();} public: +#ifndef LIBBUTL_MINGW_STDTHREAD + using mutex_type = std::mutex; + using condition_variable_type = std::condition_variable; + using thread_type = std::thread; + + using unique_lock = std::unique_lock<mutex_type>; +#else + using mutex_type = mingw_stdthread::mutex; + using condition_variable_type = mingw_stdthread::condition_variable; + using thread_type = mingw_stdthread::thread; + + using unique_lock = mingw_stdthread::unique_lock<mutex_type>; +#endif + struct async_state { bool finished = false; - std::mutex mutex; - std::condition_variable condv; - std::thread thread; + mutex_type mutex; + condition_variable_type condv; + thread_type thread; // Note that we can't use std::function as an argument type to get rid // of the template since std::function can only be instantiated with a @@ -88,8 +90,7 @@ LIBBUTL_MODEXPORT namespace butl // be able to capture auto_fd by value in a lambda, etc). 
// template <typename F> - explicit - async_state (F); + async_state (uint8_t&, F); }; builtin (std::uint8_t& r, std::unique_ptr<async_state>&& s = nullptr) diff --git a/libbutl/builtin.ixx b/libbutl/builtin.ixx index 0356f8b..d77590b 100644 --- a/libbutl/builtin.ixx +++ b/libbutl/builtin.ixx @@ -25,7 +25,7 @@ namespace butl { if (state_ != nullptr) { - std::unique_lock<std::mutex> l (state_->mutex); + unique_lock l (state_->mutex); if (!state_->finished) return nullopt; @@ -47,13 +47,14 @@ namespace butl // template <typename F> inline builtin::async_state:: - async_state (F f) - : thread ([f = std::move (f), this] () mutable noexcept + async_state (uint8_t& r, F f) + : thread ([this, &r, f = std::move (f)] () mutable noexcept { - f (); + uint8_t t (f ()); { - std::unique_lock<std::mutex> l (this->mutex); + unique_lock l (this->mutex); + r = t; finished = true; } @@ -68,9 +69,10 @@ namespace butl { std::unique_ptr<builtin::async_state> s ( new builtin::async_state ( - [f = std::move (f), &r] () mutable noexcept + r, + [f = std::move (f)] () mutable noexcept -> uint8_t { - r = f (); + return f (); })); return builtin (r, move (s)); diff --git a/libbutl/char-scanner.mxx b/libbutl/char-scanner.hxx index 60994cf..24865b7 100644 --- a/libbutl/char-scanner.mxx +++ b/libbutl/char-scanner.hxx @@ -1,37 +1,21 @@ -// file : libbutl/char-scanner.mxx -*- C++ -*- +// file : libbutl/char-scanner.hxx -*- C++ -*- // license : MIT; see accompanying LICENSE file -#ifndef __cpp_modules_ts #pragma once -#endif -#include <cassert> - -#ifndef __cpp_lib_modules_ts #include <string> // char_traits +#include <cassert> #include <cstddef> // size_t #include <cstdint> // uint64_t #include <climits> // INT_* #include <utility> // pair, make_pair() #include <istream> -#endif - -// Other includes. 
-#ifdef __cpp_modules_ts -export module butl.char_scanner; -#ifdef __cpp_lib_modules_ts -import std.core; -import std.io; -#endif -import butl.fdstream; -#else -#include <libbutl/fdstream.mxx> -#endif +#include <libbutl/bufstreambuf.hxx> #include <libbutl/export.hxx> -LIBBUTL_MODEXPORT namespace butl +namespace butl { // Refer to utf8_validator for details. // @@ -59,23 +43,25 @@ LIBBUTL_MODEXPORT namespace butl // 0x0D is treated "as if" it was followed by 0x0A and multiple 0x0D // are treated as one. // - // Note also that if the stream happens to be ifdstream, then it includes - // a number of optimizations that assume nobody else is messing with the - // stream. + // Note also that if the stream happens to be bufstreambuf-based, then it + // includes a number of optimizations that assume nobody else is messing + // with the stream. // - // The line and position arguments can be used to override the start line - // and position in the stream (useful when re-scanning data saved with the - // save_* facility). + // The line, column, and position arguments can be used to override the + // start line, column, and position in the stream (useful when re-scanning + // data saved with the save_* facility). // char_scanner (std::istream&, bool crlf = true, std::uint64_t line = 1, + std::uint64_t column = 1, std::uint64_t position = 0); char_scanner (std::istream&, validator_type, bool crlf = true, std::uint64_t line = 1, + std::uint64_t column = 1, std::uint64_t position = 0); char_scanner (const char_scanner&) = delete; @@ -106,8 +92,9 @@ LIBBUTL_MODEXPORT namespace butl std::uint64_t line; std::uint64_t column; - // Logical character position (see ifdstream for details on the logical - // part) if the scanned stream is ifdstream and always zero otherwise. + // Logical character position (see bufstreambuf for details on the + // logical part) if the scanned stream is bufstreambuf-based and always + // zero otherwise. 
// std::uint64_t position; @@ -240,7 +227,7 @@ LIBBUTL_MODEXPORT namespace butl // the hairy details; realistically, you would probably only direct-scan // ASCII fragments). // - fdbuf* buf_; // NULL if not ifdstream. + bufstreambuf* buf_; // NULL if not bufstreambuf-based. const char_type* gptr_; const char_type* egptr_; diff --git a/libbutl/char-scanner.ixx b/libbutl/char-scanner.ixx index 57aefc2..2dc41de 100644 --- a/libbutl/char-scanner.ixx +++ b/libbutl/char-scanner.ixx @@ -5,8 +5,10 @@ namespace butl { template <typename V, std::size_t N> inline char_scanner<V, N>:: - char_scanner (std::istream& is, bool crlf, std::uint64_t l, std::uint64_t p) - : char_scanner (is, validator_type (), crlf, l, p) + char_scanner (std::istream& is, + bool crlf, + std::uint64_t l, std::uint64_t c, std::uint64_t p) + : char_scanner (is, validator_type (), crlf, l, c, p) { } diff --git a/libbutl/char-scanner.txx b/libbutl/char-scanner.txx index 35edf42..75ea189 100644 --- a/libbutl/char-scanner.txx +++ b/libbutl/char-scanner.txx @@ -1,9 +1,7 @@ // file : libbutl/char-scanner.txx -*- C++ -*- // license : MIT; see accompanying LICENSE file -#ifndef __cpp_lib_modules_ts #include <utility> // move -#endif namespace butl { @@ -13,13 +11,14 @@ namespace butl validator_type v, bool crlf, std::uint64_t l, + std::uint64_t c, std::uint64_t p) : line (l), - column (1), + column (c), position (p), is_ (is), val_ (std::move (v)), - buf_ (dynamic_cast<fdbuf*> (is.rdbuf ())), + buf_ (dynamic_cast<bufstreambuf*> (is.rdbuf ())), gptr_ (nullptr), egptr_ (nullptr), crlf_ (crlf) diff --git a/libbutl/command.cxx b/libbutl/command.cxx index c23dfd5..2df52dd 100644 --- a/libbutl/command.cxx +++ b/libbutl/command.cxx @@ -1,48 +1,18 @@ // file : libbutl/command.cxx -*- C++ -*- // license : MIT; see accompanying LICENSE file -#ifndef __cpp_modules_ts -#include <libbutl/command.mxx> -#endif - -#include <cassert> - -#ifndef __cpp_lib_modules_ts -#include <map> -#include <string> -#include <cstddef> -#include 
<functional> +#include <libbutl/command.hxx> #include <ios> // ios::failure #include <vector> +#include <cassert> #include <utility> // move() #include <stdexcept> // invalid_argument #include <system_error> -#endif - -// Other includes. - -#ifdef __cpp_modules_ts -module butl.command; - -// Only imports additional to interface. -#ifdef __clang__ -#ifdef __cpp_lib_modules_ts -import std.core; -import std.io; -#endif -import butl.process; -import butl.optional; -#endif - -import butl.builtin; -import butl.fdstream; -import butl.string_parser; -#else -#include <libbutl/builtin.mxx> -#include <libbutl/fdstream.mxx> -#include <libbutl/string-parser.mxx> -#endif + +#include <libbutl/builtin.hxx> +#include <libbutl/fdstream.hxx> +#include <libbutl/string-parser.hxx> using namespace std; @@ -81,7 +51,7 @@ namespace butl // if (p == string::npos) throw invalid_argument (string ("unmatched substitution character '") + - open + "'"); + open + '\''); if (p == sp) throw invalid_argument ("empty substitution variable"); @@ -90,12 +60,12 @@ namespace butl if (vn.find_first_of (" \t") != string::npos) throw invalid_argument ("whitespace in substitution variable '" + - vn + "'"); + vn + '\''); // Find the variable and append its value or fail if it's unknown. // if (!sc (vn, r)) - throw invalid_argument ("unknown substitution variable '" + vn + "'"); + throw invalid_argument ("unknown substitution variable '" + vn + '\''); } // Append the source string tail following the last substitution. 
@@ -198,7 +168,7 @@ namespace butl catch (const invalid_path& e) { throw invalid_argument ("invalid stdout redirect file path '" + - e.path + "'"); + e.path + '\''); } if (redir->empty ()) diff --git a/libbutl/command.mxx b/libbutl/command.hxx index 143d406..fb7258f 100644 --- a/libbutl/command.mxx +++ b/libbutl/command.hxx @@ -1,34 +1,19 @@ -// file : libbutl/command.mxx -*- C++ -*- +// file : libbutl/command.hxx -*- C++ -*- // license : MIT; see accompanying LICENSE file -#ifndef __cpp_modules_ts #pragma once -#endif -#ifndef __cpp_lib_modules_ts #include <map> #include <string> #include <cstddef> // size_t #include <functional> -#endif -// Other includes. - -#ifdef __cpp_modules_ts -export module butl.command; -#ifdef __cpp_lib_modules_ts -import std.core; -#endif -import butl.process; -import butl.optional; -#else -#include <libbutl/process.mxx> -#include <libbutl/optional.mxx> -#endif +#include <libbutl/process.hxx> +#include <libbutl/optional.hxx> #include <libbutl/export.hxx> -LIBBUTL_MODEXPORT namespace butl +namespace butl { // Run a process or a builtin, interpreting the command line as // whitespace-separated, potentially quoted program path/builtin name, diff --git a/libbutl/const-ptr.mxx b/libbutl/const-ptr.hxx index 343ecf6..1474e17 100644 --- a/libbutl/const-ptr.mxx +++ b/libbutl/const-ptr.hxx @@ -1,28 +1,11 @@ -// file : libbutl/const-ptr.mxx -*- C++ -*- +// file : libbutl/const-ptr.hxx -*- C++ -*- // license : MIT; see accompanying LICENSE file -#ifndef __cpp_modules_ts #pragma once -#endif -// C includes. - -#ifndef __cpp_lib_modules_ts #include <cstddef> // nullptr_t -#endif - -// Other includes. - -#ifdef __cpp_modules_ts -export module butl.const_ptr; -#ifdef __cpp_lib_modules_ts -import std.core; // @@ MOD std.fundamental. -#endif -#endif - -#include <libbutl/export.hxx> -LIBBUTL_MODEXPORT namespace butl +namespace butl { // Const-propagating pointer. 
// diff --git a/libbutl/curl.cxx b/libbutl/curl.cxx index ac3d0cb..5649965 100644 --- a/libbutl/curl.cxx +++ b/libbutl/curl.cxx @@ -1,41 +1,14 @@ // file : libbutl/curl.cxx -*- C++ -*- // license : MIT; see accompanying LICENSE file -#ifndef __cpp_modules_ts -#include <libbutl/curl.mxx> -#endif - -// C includes. +#include <libbutl/curl.hxx> #include <cassert> - -#ifndef __cpp_lib_modules_ts -#include <string> - #include <utility> // move() +#include <cstdlib> // strtoul(), size_t #include <exception> // invalid_argument -#endif - -// Other includes. - -#ifdef __cpp_modules_ts -module butl.curl; - -// Only imports additional to interface. -#ifdef __clang__ -#ifdef __cpp_lib_modules_ts -import std.core; -#endif -import butl.path; -import butl.process; -import butl.fdstream; -import butl.small_vector; -#endif -import butl.utility; // icasecmp() -#else -#include <libbutl/utility.mxx> -#endif +#include <libbutl/utility.hxx> using namespace std; @@ -49,7 +22,17 @@ namespace butl case ftp_put: throw invalid_argument ("no input specified for PUT method"); case http_post: - throw invalid_argument ("no input specified for POST method"); + { + // Post the empty data. + // + // Note that while it's tempting to specify the --request POST option + // instead, that can potentially overwrite the request methods for the + // HTTP 30X response code redirects. + // + d.options.push_back ("--data-raw"); + d.options.push_back (""); + } + // Fall through. case ftp_get: case http_get: { @@ -170,7 +153,7 @@ namespace butl } curl::method_proto curl:: - translate (method_type m, const string& u, method_proto_options& o) + translate (method_type m, const string& u, method_proto_options& o, flags fs) { size_t n (u.find ("://")); @@ -189,8 +172,11 @@ namespace butl } else if (icasecmp (u, "http", n) == 0 || icasecmp (u, "https", n) == 0) { - o.push_back ("--fail"); // Fail on HTTP errors (e.g., 404). - o.push_back ("--location"); // Follow redirects. 
+ if ((fs & flags::no_fail) == flags::none) + o.push_back ("--fail"); // Fail on HTTP errors (e.g., 404). + + if ((fs & flags::no_location) == flags::none) + o.push_back ("--location"); // Follow redirects. switch (m) { @@ -203,4 +189,123 @@ namespace butl throw invalid_argument ("unsupported protocol"); } + + uint16_t curl:: + parse_http_status_code (const string& s) + { + char* e (nullptr); + unsigned long c (strtoul (s.c_str (), &e, 10)); // Can't throw. + assert (e != nullptr); + + return *e == '\0' && c >= 100 && c < 600 + ? static_cast<uint16_t> (c) + : 0; + } + + string curl:: + read_http_response_line (ifdstream& is) + { + string r; + getline (is, r); // Strips the trailing LF (0xA). + + // Note that on POSIX CRLF is not automatically translated into LF, so we + // need to strip CR (0xD) manually. + // + if (!r.empty () && r.back () == '\r') + r.pop_back (); + + return r; + } + + curl::http_status curl:: + read_http_status (ifdstream& is, bool skip_headers) + { + // After getting the status line, if requested, we will read until the + // empty line (containing just CRLF). Not being able to reach such a line + // is an error, which is the reason for the exception mask choice. When + // done, we will restore the original exception mask. + // + ifdstream::iostate es (is.exceptions ()); + is.exceptions (ifdstream::badbit | ifdstream::failbit | ifdstream::eofbit); + + auto read_status = [&is, es] () + { + string l (read_http_response_line (is)); + + for (;;) // Breakout loop. + { + if (l.compare (0, 5, "HTTP/") != 0) + break; + + size_t p (l.find (' ', 5)); // The protocol end. + if (p == string::npos) + break; + + p = l.find_first_not_of (' ', p + 1); // The code start. + if (p == string::npos) + break; + + size_t e (l.find (' ', p + 1)); // The code end. + if (e == string::npos) + break; + + uint16_t c (parse_http_status_code (string (l, p, e - p))); + if (c == 0) + break; + + string r; + p = l.find_first_not_of (' ', e + 1); // The reason start. 
+ if (p != string::npos) + { + e = l.find_last_not_of (' '); // The reason end. + assert (e != string::npos && e >= p); + + r = string (l, p, e - p + 1); + } + + return http_status {c, move (r)}; + } + + is.exceptions (es); // Restore the exception mask. + + throw invalid_argument ("invalid status line '" + l + "'"); + }; + + // The curl output for a successfull request looks like this: + // + // HTTP/1.1 100 Continue + // + // HTTP/1.1 200 OK + // Content-Length: 83 + // Content-Type: text/manifest;charset=utf-8 + // + // <response-body> + // + // curl normally sends the 'Expect: 100-continue' header for uploads, so + // we need to handle the interim HTTP server response with the continue + // (100) status code. + // + // Interestingly, Apache can respond with the continue (100) code and with + // the not found (404) code afterwords. + // + http_status rs (read_status ()); + + if (rs.code == 100) + { + // Skips the interim response. + // + while (!read_http_response_line (is).empty ()) ; + + rs = read_status (); // Reads the final status code. + } + + if (skip_headers) + { + while (!read_http_response_line (is).empty ()) ; // Skips headers. + } + + is.exceptions (es); + + return rs; + } } diff --git a/libbutl/curl.mxx b/libbutl/curl.hxx index 03aac99..ea91807 100644 --- a/libbutl/curl.mxx +++ b/libbutl/curl.hxx @@ -1,42 +1,20 @@ -// file : libbutl/curl.mxx -*- C++ -*- +// file : libbutl/curl.hxx -*- C++ -*- // license : MIT; see accompanying LICENSE file -#ifndef __cpp_modules_ts #pragma once -#endif -// C includes. - -#ifndef __cpp_lib_modules_ts #include <string> +#include <cstdint> // uint16_t #include <type_traits> -#include <cstddef> // size_t -#include <utility> // forward() -#include <exception> // invalid_argument -#endif - -// Other includes. - -#ifdef __cpp_modules_ts -export module butl.curl; -#ifdef __cpp_lib_modules_ts -import std.core; -#endif -import butl.path; -import butl.process; //@@ MOD TODO: should we re-export? 
-import butl.fdstream; -import butl.small_vector; -#else -#include <libbutl/path.mxx> -#include <libbutl/process.mxx> -#include <libbutl/fdstream.mxx> -#include <libbutl/small-vector.mxx> -#endif +#include <libbutl/path.hxx> +#include <libbutl/process.hxx> +#include <libbutl/fdstream.hxx> +#include <libbutl/small-vector.hxx> #include <libbutl/export.hxx> -LIBBUTL_MODEXPORT namespace butl +namespace butl { // Perform a method (GET, POST, PUT) on a URL using the curl(1) program. // Throw process_error and io_error (both derive from system_error) in case @@ -113,6 +91,19 @@ LIBBUTL_MODEXPORT namespace butl public: enum method_type {get, put, post}; + // By default the -sS and, for the HTTP protocol, --fail and --location + // options are passed to curl on the command line. Optionally, these + // options can be suppressed. + // + enum class flags: std::uint16_t + { + no_fail = 0x01, // Don't pass --fail. + no_location = 0x02, // Don't pass --location + no_sS = 0x04, // Don't pass -sS + + none = 0 // Default options set. + }; + ifdstream in; ofdstream out; @@ -143,12 +134,77 @@ LIBBUTL_MODEXPORT namespace butl const std::string& url, A&&... options); + // Similar to the above, but allows to adjust the curl's default command + // line. + // + template <typename I, + typename O, + typename E, + typename... A> + curl (I&& in, + O&& out, + E&& err, + method_type, + flags, + const std::string& url, + A&&... options); + + template <typename C, + typename I, + typename O, + typename E, + typename... A> + curl (const C&, + I&& in, + O&& out, + E&& err, + method_type, + flags, + const std::string& url, + A&&... options); + + // Read the HTTP response status from an input stream. + // + // Specifically, read and parse the HTTP status line, by default skip over + // the remaining headers (leaving the stream at the beginning of the + // response body), and return the status code and the reason phrase. Throw + // std::invalid_argument if the status line could not be parsed. 
Pass + // through the ios::failure exception on the stream error. + // + // Note that if ios::failure is thrown the stream's exception mask may not + // be preserved. + // + struct http_status + { + std::uint16_t code; + std::string reason; + }; + + static http_status + read_http_status (ifdstream&, bool skip_headers = true); + + // Parse and return the HTTP status code. Return 0 if the argument is + // invalid. + // + static std::uint16_t + parse_http_status_code (const std::string&); + + // Read the CRLF-terminated line from an input stream, stripping the + // trailing CRLF. Pass through the ios::failure exception on the stream + // error. + // + static std::string + read_http_response_line (ifdstream&); + private: enum method_proto {ftp_get, ftp_put, http_get, http_post}; using method_proto_options = small_vector<const char*, 2>; method_proto - translate (method_type, const std::string& url, method_proto_options&); + translate (method_type, + const std::string& url, + method_proto_options&, + flags); private: template <typename T> @@ -188,6 +244,11 @@ LIBBUTL_MODEXPORT namespace butl typename std::enable_if<is_other<O>::value, O>::type map_out (O&&, method_proto, io_data&); }; + + curl::flags operator& (curl::flags, curl::flags); + curl::flags operator| (curl::flags, curl::flags); + curl::flags operator&= (curl::flags&, curl::flags); + curl::flags operator|= (curl::flags&, curl::flags); } #include <libbutl/curl.ixx> diff --git a/libbutl/curl.ixx b/libbutl/curl.ixx index 61a4ff5..6dcfe13 100644 --- a/libbutl/curl.ixx +++ b/libbutl/curl.ixx @@ -1,7 +1,11 @@ // file : libbutl/curl.ixx -*- C++ -*- // license : MIT; see accompanying LICENSE file -LIBBUTL_MODEXPORT namespace butl //@@ MOD Clang needs this for some reason. 
+#include <cstddef> // size_t +#include <utility> // forward() +#include <exception> // invalid_argument + +namespace butl { template <typename I, typename O, @@ -12,6 +16,7 @@ LIBBUTL_MODEXPORT namespace butl //@@ MOD Clang needs this for some reason. O&& out, E&& err, method_type m, + flags fs, const std::string& url, A&&... options) : curl ([] (const char* [], std::size_t) {}, @@ -19,8 +24,80 @@ LIBBUTL_MODEXPORT namespace butl //@@ MOD Clang needs this for some reason. std::forward<O> (out), std::forward<E> (err), m, + fs, + url, + std::forward<A> (options)...) + { + } + + template <typename C, + typename I, + typename O, + typename E, + typename... A> + inline curl:: + curl (const C& cmdc, + I&& in, + O&& out, + E&& err, + method_type m, + const std::string& url, + A&&... options) + : curl (cmdc, + std::forward<I> (in), + std::forward<O> (out), + std::forward<E> (err), + m, + flags::none, + url, + std::forward<A> (options)...) + { + } + + template <typename I, + typename O, + typename E, + typename... A> + inline curl:: + curl (I&& in, + O&& out, + E&& err, + method_type m, + const std::string& url, + A&&... options) + : curl (std::forward<I> (in), + std::forward<O> (out), + std::forward<E> (err), + m, + flags::none, url, std::forward<A> (options)...) 
{ } + + inline curl::flags + operator&= (curl::flags& x, curl::flags y) + { + return x = static_cast<curl::flags> (static_cast<std::uint16_t> (x) & + static_cast<std::uint16_t> (y)); + } + + inline curl::flags + operator|= (curl::flags& x, curl::flags y) + { + return x = static_cast<curl::flags> (static_cast<std::uint16_t> (x) | + static_cast<std::uint16_t> (y)); + } + + inline curl::flags + operator& (curl::flags x, curl::flags y) + { + return x &= y; + } + + inline curl::flags + operator| (curl::flags x, curl::flags y) + { + return x |= y; + } } diff --git a/libbutl/curl.txx b/libbutl/curl.txx index 0c07d35..fc74470 100644 --- a/libbutl/curl.txx +++ b/libbutl/curl.txx @@ -1,7 +1,7 @@ // file : libbutl/curl.txx -*- C++ -*- // license : MIT; see accompanying LICENSE file -LIBBUTL_MODEXPORT namespace butl //@@ MOD Clang needs this for some reason. +namespace butl { template <typename I> typename std::enable_if<curl::is_other<I>::value, I>::type curl:: @@ -65,11 +65,12 @@ LIBBUTL_MODEXPORT namespace butl //@@ MOD Clang needs this for some reason. O&& out, E&& err, method_type m, + flags fs, const std::string& url, A&&... options) { method_proto_options mpo; - method_proto mp (translate (m, url, mpo)); + method_proto mp (translate (m, url, mpo, fs)); io_data in_data; io_data out_data; @@ -81,8 +82,9 @@ LIBBUTL_MODEXPORT namespace butl //@@ MOD Clang needs this for some reason. map_out (std::forward<O> (out), mp, out_data), std::forward<E> (err), "curl", - "-s", // Silent. - "-S", // But do show diagnostics. + ((fs & flags::no_sS) == flags::none + ? "-sS" // Silent but do show diagnostics. 
+ : nullptr), mpo, in_data.options, out_data.options, diff --git a/libbutl/default-options.cxx b/libbutl/default-options.cxx deleted file mode 100644 index 28f6fb7..0000000 --- a/libbutl/default-options.cxx +++ /dev/null @@ -1,73 +0,0 @@ -// file : libbutl/default-options.cxx -*- C++ -*- -// license : MIT; see accompanying LICENSE file - -#ifndef __cpp_modules_ts -#include <libbutl/default-options.mxx> -#endif - -#include <cassert> - -#ifndef __cpp_lib_modules_ts -#include <vector> -#endif - -// Other includes. - -#ifdef __cpp_modules_ts -module butl.default_options; - -// Only imports additional to interface. -#ifdef __clang__ -#ifdef __cpp_lib_modules_ts -import std.core; -#endif -import butl.path; -import butl.optional; -import butl.small_vector; -#endif - -#endif - -using namespace std; - -namespace butl -{ - optional<dir_path> - default_options_start (const optional<dir_path>& home, - const vector<dir_path>& dirs) - { - if (home) - assert (home->absolute () && home->normalized ()); - - if (dirs.empty ()) - return nullopt; - - // Use the first directory as a start. - // - auto i (dirs.begin ()); - dir_path d (*i); - - // Try to find a common prefix for each subsequent directory. 
- // - for (++i; i != dirs.end (); ++i) - { - bool p (false); - - for (; - !(d.root () || (home && d == *home)); - d = d.directory ()) - { - if (i->sub (d)) - { - p = true; - break; - } - } - - if (!p) - return nullopt; - } - - return d; - } -} diff --git a/libbutl/default-options.mxx b/libbutl/default-options.hxx index 11f7bb2..1d363b6 100644 --- a/libbutl/default-options.mxx +++ b/libbutl/default-options.hxx @@ -1,44 +1,18 @@ -// file : libbutl/default-options.mxx -*- C++ -*- +// file : libbutl/default-options.hxx -*- C++ -*- // license : MIT; see accompanying LICENSE file -#ifndef __cpp_modules_ts #pragma once -#endif -#ifndef __cpp_lib_modules_ts #include <string> #include <vector> -#include <utility> // move(), forward(), make_pair() -#include <algorithm> // reverse() -#include <system_error> -#endif - -// Other includes. - -#ifdef __cpp_modules_ts -export module butl.default_options; -#ifdef __cpp_lib_modules_ts -import std.core; -#endif -import butl.path; -import butl.optional; -import butl.small_vector; - -import butl.git; -import butl.filesystem; -#else -#include <libbutl/path.mxx> -#include <libbutl/optional.mxx> -#include <libbutl/small-vector.mxx> - -#include <libbutl/git.mxx> -#include <libbutl/filesystem.mxx> -#endif +#include <libbutl/path.hxx> +#include <libbutl/optional.hxx> +#include <libbutl/small-vector.hxx> #include <libbutl/export.hxx> -LIBBUTL_MODEXPORT namespace butl +namespace butl { // Default options files helper implementation. // @@ -107,6 +81,15 @@ LIBBUTL_MODEXPORT namespace butl // // Note that the extra directory options files are never considered remote. // + // For the convenience of implementation, the function parses the option + // files in the reverse order. Thus, to make sure that positions in the + // options list monotonically increase, it needs the maximum number of + // arguments, globally and per file, to be specified. 
This way the starting + // options position for each file will be less than for the previously + // parsed file by arg_max_file and equal to arg_max - arg_max_file for the + // first file. If the actual number of arguments exceeds the specified, then + // invalid_argument is thrown. + // template <typename O, typename S, typename U, typename F> default_options<O> load_default_options (const optional<dir_path>& sys_dir, @@ -115,6 +98,8 @@ LIBBUTL_MODEXPORT namespace butl const default_options_files&, F&&, const std::string& option, + std::size_t arg_max, + std::size_t arg_max_file, bool args = false); // Merge the default options/arguments and the command line @@ -152,12 +137,25 @@ LIBBUTL_MODEXPORT namespace butl AS merge_default_arguments (const default_options<O>&, const AS&, F&&); - // Find a common start (parent) directory stopping at home or root - // (excluding). + // Find a common start (parent) directory for directories specified as an + // iterator range, stopping at home or root (excluding). Optionally pass a + // function resolving an iterator into a directory in a way other than just + // dereferencing it. 
The function signature is: + // + // const dir_path& (I) // - LIBBUTL_SYMEXPORT optional<dir_path> - default_options_start (const optional<dir_path>& home_dir, - const std::vector<dir_path>&); + template <typename I, typename F> + optional<dir_path> + default_options_start (const optional<dir_path>& home, I, I, F&&); + + template <typename I> + inline optional<dir_path> + default_options_start (const optional<dir_path>& home, I b, I e) + { + return default_options_start (home, + b, e, + [] (I i) -> const dir_path& {return *i;}); + } } #include <libbutl/default-options.ixx> diff --git a/libbutl/default-options.ixx b/libbutl/default-options.ixx index 4a551ac..7248d7d 100644 --- a/libbutl/default-options.ixx +++ b/libbutl/default-options.ixx @@ -1,7 +1,7 @@ // file : libbutl/default-options.ixx -*- C++ -*- // license : MIT; see accompanying LICENSE file -LIBBUTL_MODEXPORT namespace butl //@@ MOD Clang needs this for some reason. +namespace butl { template <typename O> inline O diff --git a/libbutl/default-options.txx b/libbutl/default-options.txx index eaf4235..aa254b2 100644 --- a/libbutl/default-options.txx +++ b/libbutl/default-options.txx @@ -1,7 +1,15 @@ // file : libbutl/default-options.txx -*- C++ -*- // license : MIT; see accompanying LICENSE file -LIBBUTL_MODEXPORT namespace butl //@@ MOD Clang needs this for some reason. +#include <utility> // move(), forward(), make_pair() +#include <algorithm> // reverse() +#include <stdexcept> // invalid_argument +#include <system_error> + +#include <libbutl/git.hxx> +#include <libbutl/filesystem.hxx> + +namespace butl { inline bool options_dir_exists (const dir_path& d) @@ -14,10 +22,11 @@ LIBBUTL_MODEXPORT namespace butl //@@ MOD Clang needs this for some reason. throw std::make_pair (path_cast<path> (d), std::move (e)); } - // Search for and parse the options files in the specified directory and - // its local/ subdirectory, if exists, in the reverse order and append the - // options to the resulting list. 
Return false if --no-default-options is - // encountered. + // Search for and parse the options files in the specified directory and its + // local/ subdirectory, if exists, in the reverse order and append the + // options to the resulting list. Verify that the number of arguments + // doesn't exceed the limits and decrement arg_max by arg_max_file after + // parsing each file. Return false if --no-default-options is encountered. // // Note that by default we check for the local/ subdirectory even if we // don't think it belongs to the remote directory; the user may move things @@ -36,6 +45,8 @@ LIBBUTL_MODEXPORT namespace butl //@@ MOD Clang needs this for some reason. bool remote, const small_vector<path, 2>& fs, F&& fn, + std::size_t& arg_max, + std::size_t arg_max_file, default_options<O>& def_ops, bool load_sub = true, bool load_dir = true) @@ -44,7 +55,7 @@ LIBBUTL_MODEXPORT namespace butl //@@ MOD Clang needs this for some reason. bool r (true); - auto load = [&opt, args, &fs, &fn, &def_ops, &r] + auto load = [&opt, args, &fs, &fn, &def_ops, &arg_max, arg_max_file, &r] (const dir_path& d, bool rem) { using namespace std; @@ -57,9 +68,14 @@ LIBBUTL_MODEXPORT namespace butl //@@ MOD Clang needs this for some reason. { if (file_exists (p)) // Follows symlinks. { + if (arg_max < arg_max_file) + throw invalid_argument ("too many options files"); + + size_t start_pos (arg_max - arg_max_file); + fn (p, rem, false /* overwrite */); - S s (p.string (), opt); + S s (p.string (), opt, start_pos); // @@ Note that the potentially thrown exceptions (unknown option, // unexpected argument, etc) will not contain any location @@ -81,6 +97,15 @@ LIBBUTL_MODEXPORT namespace butl //@@ MOD Clang needs this for some reason. else o.parse (s, U::fail, U::fail); + if (s.position () > arg_max) + throw invalid_argument ("too many options in file " + + p.string ()); + + // Don't decrement arg_max for the empty option files. 
+ // + if (s.position () != start_pos) + arg_max = start_pos; + if (o.no_default_options ()) r = false; @@ -119,6 +144,8 @@ LIBBUTL_MODEXPORT namespace butl //@@ MOD Clang needs this for some reason. const default_options_files& ofs, F&& fn, const std::string& opt, + std::size_t arg_max, + std::size_t arg_max_file, bool args) { if (sys_dir) @@ -214,6 +241,8 @@ LIBBUTL_MODEXPORT namespace butl //@@ MOD Clang needs this for some reason. false /* remote */, ofs.files, std::forward<F> (fn), + arg_max, + arg_max_file, r); load_extra = false; @@ -228,6 +257,8 @@ LIBBUTL_MODEXPORT namespace butl //@@ MOD Clang needs this for some reason. remote, ofs.files, std::forward<F> (fn), + arg_max, + arg_max_file, r, load_build2_local, load_build2); @@ -245,6 +276,8 @@ LIBBUTL_MODEXPORT namespace butl //@@ MOD Clang needs this for some reason. false /* remote */, ofs.files, std::forward<F> (fn), + arg_max, + arg_max_file, r); if (load && home_dir) @@ -258,6 +291,8 @@ LIBBUTL_MODEXPORT namespace butl //@@ MOD Clang needs this for some reason. false /* remote */, ofs.files, std::forward<F> (fn), + arg_max, + arg_max_file, r); } @@ -268,6 +303,8 @@ LIBBUTL_MODEXPORT namespace butl //@@ MOD Clang needs this for some reason. false /* remote */, ofs.files, std::forward<F> (fn), + arg_max, + arg_max_file, r); std::reverse (r.begin (), r.end ()); @@ -318,4 +355,43 @@ LIBBUTL_MODEXPORT namespace butl //@@ MOD Clang needs this for some reason. r.insert (r.end (), cmd_args.begin (), cmd_args.end ()); return r; } + + template <typename I, typename F> + optional<dir_path> + default_options_start (const optional<dir_path>& home, I b, I e, F&& f) + { + if (home) + assert (home->absolute () && home->normalized ()); + + if (b == e) + return nullopt; + + // Use the first directory as a start. + // + I i (b); + dir_path d (f (i)); + + // Try to find a common prefix for each subsequent directory. 
+ // + for (++i; i != e; ++i) + { + bool p (false); + + for (; + !(d.root () || (home && d == *home)); + d = d.directory ()) + { + if (f (i).sub (d)) + { + p = true; + break; + } + } + + if (!p) + return nullopt; + } + + return d; + } } diff --git a/libbutl/diagnostics.cxx b/libbutl/diagnostics.cxx index b038e5d..6ac8192 100644 --- a/libbutl/diagnostics.cxx +++ b/libbutl/diagnostics.cxx @@ -1,9 +1,7 @@ // file : libbutl/diagnostics.cxx -*- C++ -*- // license : MIT; see accompanying LICENSE file -#ifndef __cpp_modules_ts -#include <libbutl/diagnostics.mxx> -#endif +#include <libbutl/diagnostics.hxx> #ifndef _WIN32 # include <unistd.h> // write() @@ -12,49 +10,36 @@ # include <io.h> //_write() #endif -#include <cassert> - -#ifndef __cpp_lib_modules_ts -#include <utility> -#include <exception> - #include <ios> // ios::failure #include <mutex> #include <string> +#include <cassert> #include <cstddef> // size_t #include <iostream> // cerr -#endif - -// Other includes. -#ifdef __cpp_modules_ts -module butl.diagnostics; - -// Only imports additional to interface. -#ifdef __clang__ -#ifdef __cpp_lib_modules_ts -import std.core; -import std.io; -#endif -#endif - -import std.threading; -import butl.utility; -import butl.optional; -import butl.fdstream; // stderr_fd(), fdterm() +#ifndef LIBBUTL_MINGW_STDTHREAD +# include <mutex> #else -#include <libbutl/utility.mxx> -#include <libbutl/optional.mxx> -#include <libbutl/fdstream.mxx> +# include <libbutl/mingw-mutex.hxx> #endif +#include <libbutl/ft/lang.hxx> // thread_local + +#include <libbutl/utility.hxx> +#include <libbutl/optional.hxx> +#include <libbutl/fdstream.hxx> + using namespace std; namespace butl { ostream* diag_stream = &cerr; - static mutex diag_mutex; +#ifndef LIBBUTL_MINGW_STDTHREAD + static std::mutex diag_mutex; +#else + static mingw_stdthread::mutex diag_mutex; +#endif string diag_progress; static string diag_progress_blank; // Being printed blanks out the line. 
@@ -158,28 +143,28 @@ namespace butl default_writer (const diag_record& r) { r.os.put ('\n'); - diag_stream_lock () << r.os.str (); + + diag_stream_lock l; + (*diag_stream) << r.os.str (); // We can end up flushing the result of several writes. The last one may // possibly be incomplete, but that's not a problem as it will also be // followed by the flush() call. // - // @@ Strange: why not just hold the lock for both write and flush? - // diag_stream->flush (); } - void (*diag_record::writer) (const diag_record&) = &default_writer; + diag_writer* diag_record::writer = &default_writer; void diag_record:: - flush () const + flush (void (*w) (const diag_record&)) const { if (!empty_) { if (epilogue_ == nullptr) { - if (writer != nullptr) - writer (*this); + if (w != nullptr || (w = writer) != nullptr) + w (*this); empty_ = true; } @@ -189,8 +174,8 @@ namespace butl // auto e (epilogue_); epilogue_ = nullptr; - e (*this); // Can throw. - flush (); // Call ourselves to write the data in case it returns. + e (*this, w); // Can throw. + flush (w); // Call ourselves to write the data in case it returns. } } } @@ -213,4 +198,28 @@ namespace butl flush (); #endif } + + // Diagnostics stack. 
+ // + static +#ifdef __cpp_thread_local + thread_local +#else + __thread +#endif + const diag_frame* diag_frame_stack = nullptr; + + const diag_frame* diag_frame:: + stack () noexcept + { + return diag_frame_stack; + } + + const diag_frame* diag_frame:: + stack (const diag_frame* f) noexcept + { + const diag_frame* r (diag_frame_stack); + diag_frame_stack = f; + return r; + } } diff --git a/libbutl/diagnostics.mxx b/libbutl/diagnostics.hxx index d41ba74..c6db34b 100644 --- a/libbutl/diagnostics.mxx +++ b/libbutl/diagnostics.hxx @@ -1,32 +1,19 @@ -// file : libbutl/diagnostics.mxx -*- C++ -*- +// file : libbutl/diagnostics.hxx -*- C++ -*- // license : MIT; see accompanying LICENSE file -#ifndef __cpp_modules_ts #pragma once -#endif #include <cassert> - -#ifndef __cpp_lib_modules_ts #include <ostream> #include <sstream> #include <utility> // move(), forward() #include <exception> // uncaught_exception[s]() -#endif #include <libbutl/ft/exception.hxx> // uncaught_exceptions -#ifdef __cpp_modules_ts -export module butl.diagnostics; -#ifdef __cpp_lib_modules_ts -import std.core; -import std.io; -#endif -#endif - #include <libbutl/export.hxx> -LIBBUTL_MODEXPORT namespace butl +namespace butl { // Diagnostic facility base infrastructure. // @@ -40,8 +27,11 @@ LIBBUTL_MODEXPORT namespace butl LIBBUTL_SYMEXPORT extern std::ostream* diag_stream; // Acquire the diagnostics exclusive access mutex in ctor, release in dtor. - // An object of the type must be created prior to writing to diag_stream (see - // above). + // An object of the type must be created prior to writing to diag_stream + // (see above). + // + // Note that this class also manages the interaction with the progress + // printing (see below). // struct LIBBUTL_SYMEXPORT diag_stream_lock { @@ -87,13 +77,26 @@ LIBBUTL_MODEXPORT namespace butl ~diag_progress_lock (); }; + // Diagnostic record and marks (error, warn, etc). 
// // There are two ways to use this facility in a project: simple, where we + // just alias the types in our namespace, and complex, where instead we + // derive from them and "override" (hide, really) operator<< (and a few + // other functions) in order to make ADL look in our namespace rather than + // butl. In the simple case we may have to resort to defining some + // operator<< overloads in namespace std in order to satisfy ADL. This is + // usually not an acceptable approach for libraries, which is where the + // complex case comes in (see libbuild2 for a "canonical" example of the + // complex case). Note also that it doesn't seem worth templatizing epilogue + // so the complex case may also need to do a few casts but those should be + // limited to the diagnostics infrastructure. // struct diag_record; template <typename> struct diag_prologue; template <typename> struct diag_mark; - using diag_epilogue = void (const diag_record&); + using diag_writer = void (const diag_record&); + using diag_epilogue = void (const diag_record&, diag_writer*); struct LIBBUTL_SYMEXPORT diag_record { @@ -130,7 +133,7 @@ LIBBUTL_MODEXPORT namespace butl full () const {return !empty_;} void - flush () const; + flush (diag_writer* = nullptr) const; void append (const char* indent, diag_epilogue* e) const @@ -163,7 +166,7 @@ LIBBUTL_MODEXPORT namespace butl #endif empty_ (r.empty_), epilogue_ (r.epilogue_), - os (std::move (r.os)) + os (std::move (r.os)) // Note: can throw. { if (!empty_) { @@ -181,7 +184,7 @@ LIBBUTL_MODEXPORT namespace butl // Diagnostics writer. The default implementation writes the record text // to diag_stream. If it is NULL, then the record text is ignored. // - static void (*writer) (const diag_record&); + static diag_writer* writer; protected: #ifdef __cpp_lib_uncaught_exceptions @@ -276,4 +279,97 @@ LIBBUTL_MODEXPORT namespace butl e.B::operator() (r); } }; + + // Diagnostics stack. Each frame is "applied" to the diag record. 
+ // + // Unfortunately most of our use-cases don't fit into the 2-pointer small + // object optimization of std::function. So we have to complicate things + // a bit here. + // + struct LIBBUTL_SYMEXPORT diag_frame + { + explicit + diag_frame (void (*f) (const diag_frame&, const diag_record&)) + : func_ (f) + { + if (func_ != nullptr) + prev_ = stack (this); + } + + diag_frame (diag_frame&& x) + : func_ (x.func_) + { + if (func_ != nullptr) + { + prev_ = x.prev_; + stack (this); + + x.func_ = nullptr; + } + } + + diag_frame& operator= (diag_frame&&) = delete; + + diag_frame (const diag_frame&) = delete; + diag_frame& operator= (const diag_frame&) = delete; + + ~diag_frame () + { + if (func_ != nullptr ) + stack (prev_); + } + + // Normally passed as an epilogue. Writer is not used. + // + static void + apply (const diag_record& r, diag_writer* = nullptr) + { + for (const diag_frame* f (stack ()); f != nullptr; f = f->prev_) + f->func_ (*f, r); + } + + // Tip of the stack. + // + static const diag_frame* + stack () noexcept; + + // Set the new and return the previous tip of the stack. 
+ // + static const diag_frame* + stack (const diag_frame*) noexcept; + + struct stack_guard + { + explicit stack_guard (const diag_frame* s): s_ (stack (s)) {} + ~stack_guard () {stack (s_);} + const diag_frame* s_; + }; + + private: + void (*func_) (const diag_frame&, const diag_record&); + const diag_frame* prev_; + }; + + template <typename F> + struct diag_frame_impl: diag_frame + { + explicit + diag_frame_impl (F f): diag_frame (&thunk), func_ (move (f)) {} + + private: + static void + thunk (const diag_frame& f, const diag_record& r) + { + static_cast<const diag_frame_impl&> (f).func_ (r); + } + + const F func_; + }; + + template <typename F> + inline diag_frame_impl<F> + make_diag_frame (F f) + { + return diag_frame_impl<F> (move (f)); + } } diff --git a/libbutl/export.hxx b/libbutl/export.hxx index 3353ca8..dc04f85 100644 --- a/libbutl/export.hxx +++ b/libbutl/export.hxx @@ -3,14 +3,6 @@ #pragma once -// If modules are available, setup the module export. -// -#ifdef __cpp_modules_ts -# define LIBBUTL_MODEXPORT export -#else -# define LIBBUTL_MODEXPORT -#endif - // Normally we don't export class templates (but do complete specializations), // inline functions, and classes with only inline member functions. 
Exporting // classes that inherit from non-exported/imported bases (e.g., std::string) diff --git a/libbutl/fdstream.cxx b/libbutl/fdstream.cxx index 4948052..df5b531 100644 --- a/libbutl/fdstream.cxx +++ b/libbutl/fdstream.cxx @@ -1,9 +1,7 @@ // file : libbutl/fdstream.cxx -*- C++ -*- // license : MIT; see accompanying LICENSE file -#ifndef __cpp_modules_ts -#include <libbutl/fdstream.mxx> -#endif +#include <libbutl/fdstream.hxx> #include <errno.h> // errno, E* @@ -12,72 +10,54 @@ # include <unistd.h> // close(), read(), write(), lseek(), dup(), pipe(), // ftruncate(), isatty(), ssize_t, STD*_FILENO # include <sys/uio.h> // writev(), iovec -# include <sys/stat.h> // stat(), S_I* +# include <sys/stat.h> // stat(), fstat(), S_I* # include <sys/time.h> // timeval # include <sys/types.h> // stat, off_t # include <sys/select.h> #else # include <libbutl/win32-utility.hxx> -# include <io.h> // _close(), _read(), _write(), _setmode(), _sopen(), - // _lseek(), _dup(), _pipe(), _chsize_s, - // _get_osfhandle() -# include <share.h> // _SH_DENYNO -# include <stdio.h> // _fileno(), stdin, stdout, stderr, SEEK_* -# include <fcntl.h> // _O_* -# include <sys/stat.h> // S_I* +# ifndef ENABLE_VIRTUAL_TERMINAL_PROCESSING +# define ENABLE_VIRTUAL_TERMINAL_PROCESSING 0x04 +# endif + +# include <io.h> // _close(), _read(), _write(), _setmode(), _sopen(), + // _lseek(), _dup(), _pipe(), _chsize_s, + // _get_osfhandle() +# include <share.h> // _SH_DENYNO +# include <stdio.h> // _fileno(), stdin, stdout, stderr, SEEK_* +# include <fcntl.h> // _O_* +# include <sys/types.h> // _stat +# include <sys/stat.h> // fstat(), S_I* + +# ifdef _MSC_VER // Unlikely to be fixed in newer versions. 
+# define S_ISREG(m) (((m) & S_IFMT) == S_IFREG) +# define S_ISDIR(m) (((m) & S_IFMT) == S_IFDIR) +# define S_ISCHR(m) (((m) & S_IFMT) == S_IFCHR) +# endif # include <wchar.h> // wcsncmp(), wcsstr() +# include <thread> // this_thread::yield() # include <algorithm> // count() #endif -#include <cassert> - -#ifndef __cpp_lib_modules_ts -#include <vector> -#include <string> -#include <chrono> -#include <istream> -#include <ostream> -#include <utility> -#include <cstdint> -#include <cstddef> - #include <ios> // ios_base::openmode, ios_base::failure #include <new> // bad_alloc #include <limits> // numeric_limits -#include <cstring> // memcpy(), memmove() +#include <cassert> +#include <cstring> // memcpy(), memmove(), memchr(), strcmp() +#include <cstdlib> // getenv() #include <iostream> // cin, cout #include <exception> // uncaught_exception[s]() #include <stdexcept> // invalid_argument #include <system_error> -#endif -#include <libbutl/ft/exception.hxx> // uncaught_exceptions +#include <libbutl/ft/exception.hxx> // uncaught_exceptions #include <libbutl/process-details.hxx> -#ifdef __cpp_modules_ts -module butl.fdstream; - -// Only imports additional to interface. -#ifdef __clang__ -#ifdef __cpp_lib_modules_ts -import std.core; -import std.io; -import std.threading; // Clang wants it in purview (see process-details.hxx). -#endif -import butl.path; -import butl.filesystem; -import butl.small_vector; -#endif - -import butl.utility; // throw_*_ios_failure(), function_cast() -import butl.timestamp; -#else -#include <libbutl/utility.mxx> -#include <libbutl/timestamp.mxx> -#endif +#include <libbutl/utility.hxx> // throw_*_ios_failure(), function_cast() +#include <libbutl/timestamp.hxx> using namespace std; @@ -167,7 +147,7 @@ namespace butl } #endif - // fdbuf + // fdstreambuf // // Return true if the file descriptor is in the non-blocking mode. Throw // ios::failure on the underlying OS error. 
@@ -188,7 +168,7 @@ namespace butl #endif } - void fdbuf:: + void fdstreambuf:: open (auto_fd&& fd, uint64_t pos) { close (); @@ -201,7 +181,7 @@ namespace butl fd_ = move (fd); } - bool fdbuf:: + bool fdstreambuf:: blocking (bool m) { // Verify that the file descriptor is open. @@ -225,7 +205,7 @@ namespace butl return !m; } - streamsize fdbuf:: + streamsize fdstreambuf:: showmanyc () { if (!is_open ()) @@ -260,7 +240,7 @@ namespace butl return 0; } - fdbuf::int_type fdbuf:: + fdstreambuf::int_type fdstreambuf:: underflow () { int_type r (traits_type::eof ()); @@ -282,7 +262,7 @@ namespace butl return r; } - bool fdbuf:: + bool fdstreambuf:: load () { // Doesn't handle blocking mode and so should not be called. @@ -299,7 +279,7 @@ namespace butl return n != 0; } - void fdbuf:: + void fdstreambuf:: seekg (uint64_t off) { // In the future we may implement the blocking behavior for a non-blocking @@ -334,7 +314,7 @@ namespace butl setg (buf_, buf_, buf_); } - fdbuf::int_type fdbuf:: + fdstreambuf::int_type fdstreambuf:: overflow (int_type c) { int_type r (traits_type::eof ()); @@ -362,7 +342,7 @@ namespace butl return r; } - int fdbuf:: + int fdstreambuf:: sync () { if (!is_open ()) @@ -379,15 +359,7 @@ namespace butl return save () ? 0 : -1; } -#ifdef _WIN32 - static inline int - write (int fd, const void* buf, size_t n) - { - return _write (fd, buf, static_cast<unsigned int> (n)); - } -#endif - - bool fdbuf:: + bool fdstreambuf:: save () { size_t n (pptr () - pbase ()); @@ -398,7 +370,7 @@ namespace butl // descriptor opened for read-only access (while -1 with errno EBADF is // expected). This is in contrast with VC's _write() and POSIX's write(). 
// - auto m (write (fd_.get (), buf_, n)); + auto m (fdwrite (fd_.get (), buf_, n)); if (m == -1) throw_generic_ios_failure (errno); @@ -414,7 +386,7 @@ namespace butl return true; } - streamsize fdbuf:: + streamsize fdstreambuf:: xsputn (const char_type* s, streamsize sn) { // The xsputn() function interface doesn't support the non-blocking @@ -513,7 +485,7 @@ namespace butl // Flush the buffer. // size_t wn (bn + an); - int r (wn > 0 ? write (fd_.get (), buf_, wn) : 0); + streamsize r (wn > 0 ? fdwrite (fd_.get (), buf_, wn) : 0); if (r == -1) throw_generic_ios_failure (errno); @@ -556,7 +528,7 @@ namespace butl // The data tail doesn't fit the buffer so write it to the file. // - r = write (fd_.get (), s, n); + r = fdwrite (fd_.get (), s, n); if (r == -1) throw_generic_ios_failure (errno); @@ -571,13 +543,13 @@ namespace butl // // - basic_ostream::seekp(pos) -> // basic_streambuf::pubseekpos(pos, ios::out) -> - // fdbuf::seekpos(pos, ios::out) + // fdstreambuf::seekpos(pos, ios::out) // // - basic_istream::seekg(pos) -> // basic_streambuf::pubseekpos(pos, ios::in) -> - // fdbuf::seekpos(pos, ios::in) + // fdstreambuf::seekpos(pos, ios::in) // - fdbuf::pos_type fdbuf:: + fdstreambuf::pos_type fdstreambuf:: seekpos (pos_type pos, ios_base::openmode which) { // Note that the position type provides an explicit conversion to the @@ -592,21 +564,21 @@ namespace butl // // - basic_ostream::seekp(off, dir) -> // basic_streambuf::pubseekoff(off, dir, ios::out) -> - // fdbuf::seekoff(off, dir, ios::out) + // fdstreambuf::seekoff(off, dir, ios::out) // // - basic_ostream::tellp() -> // basic_streambuf::pubseekoff(0, ios::cur, ios::out) -> - // fdbuf::seekoff(0, ios::cur, ios::out) + // fdstreambuf::seekoff(0, ios::cur, ios::out) // // - basic_istream::seekg(off, dir) -> // basic_streambuf::pubseekoff(off, dir, ios::in) -> - // fdbuf::seekoff(off, dir, ios::in) + // fdstreambuf::seekoff(off, dir, ios::in) // // - basic_istream::tellg() -> // basic_streambuf::pubseekoff(0, 
ios::cur, ios::in) -> - // fdbuf::seekoff(0, ios::cur, ios::in) + // fdstreambuf::seekoff(0, ios::cur, ios::in) // - fdbuf::pos_type fdbuf:: + fdstreambuf::pos_type fdstreambuf:: seekoff (off_type off, ios_base::seekdir dir, ios_base::openmode which) { // The seekoff() function interface doesn't support the non-blocking @@ -830,9 +802,8 @@ namespace butl catch (const ios_base::failure&) {} } - // Underlying file descriptor is closed by fdbuf dtor with errors (if any) - // being ignored. - // + // Underlying file descriptor is closed by fdstreambuf dtor with errors + // (if any) being ignored. } void ifdstream:: @@ -873,7 +844,7 @@ namespace butl } ifdstream& - getline (ifdstream& is, string& s, char delim) + getline (ifdstream& is, string& l, char delim) { ifdstream::iostate eb (is.exceptions ()); assert (eb & ifdstream::badbit); @@ -881,16 +852,16 @@ namespace butl // Amend the exception mask to prevent exceptions being thrown by the C++ // IO runtime to avoid incompatibility issues due to ios_base::failure ABI // fiasco (#66145). We will not restore the mask when ios_base::failure is - // thrown by fdbuf since there is no way to "silently" restore it if the - // corresponding bits are in the error state without the exceptions() call - // throwing ios_base::failure. Not restoring exception mask on throwing - // because of badbit should probably be ok since the stream is no longer - // usable. + // thrown by fdstreambuf since there is no way to "silently" restore it if + // the corresponding bits are in the error state without the exceptions() + // call throwing ios_base::failure. Not restoring exception mask on + // throwing because of badbit should probably be ok since the stream is no + // longer usable. // if (eb != ifdstream::badbit) is.exceptions (ifdstream::badbit); - std::getline (is, s, delim); + std::getline (is, l, delim); // Throw if any of the newly set bits are present in the exception mask. 
// @@ -903,6 +874,58 @@ namespace butl return is; } + bool + getline_non_blocking (ifdstream& is, string& l, char delim) + { + assert (!is.blocking () && (is.exceptions () & ifdstream::badbit) != 0); + + fdstreambuf& sb (*static_cast<fdstreambuf*> (is.rdbuf ())); + + // Read until blocked (0), EOF (-1) or encounter the delimiter. + // + // Note that here we reasonably assume that any failure in in_avail() + // will lead to badbit and thus an exception (see showmanyc()). + // + streamsize s; + while ((s = sb.in_avail ()) > 0) + { + const char* p (sb.gptr ()); + size_t n (sb.egptr () - p); + + const char* e (static_cast<const char*> (memchr (p, delim, n))); + if (e != nullptr) + n = e - p; + + l.append (p, n); + + // Note: consume the delimiter if found. + // + sb.gbump (static_cast<int> (n + (e != nullptr ? 1 : 0))); + + if (e != nullptr) + break; + } + + // Here s can be: + // + // -1 -- EOF. + // 0 -- blocked before encountering delimiter/EOF. + // >0 -- encountered the delimiter. + // + if (s == -1) + { + is.setstate (ifdstream::eofbit); + + // If we couldn't extract anything, not even the delimiter, then this is + // a failure per the getline() interface. + // + if (l.empty ()) + is.setstate (ifdstream::failbit); + } + + return s != 0; + } + // ofdstream // ofdstream:: @@ -1052,10 +1075,11 @@ namespace butl #endif // Unlike other platforms, *BSD allows opening a directory as a file which - // will cause all kinds of problems upstream (e.g., cpfile()). So we detect - // and diagnose this. + // will cause all kinds of problems upstream (e.g., cpfile()). So we + // detect and diagnose this. Note: not certain this is the case for NetBSD + // and OpenBSD. // -#if defined(__FreeBSD__) || defined(__NetBSD__) +#if defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__) { struct stat s; if (stat (f, &s) == 0 && S_ISDIR (s.st_mode)) @@ -1141,12 +1165,17 @@ namespace butl // underlying CreateFile() function call (see mventry() for details). 
If // that's the case, we will keep trying to open the file for two seconds. // - for (size_t i (0); i < 21; ++i) + // Also, it turns out, if someone memory-maps a file, it takes Windows + // some time to realize it's been unmapped and until then any attempt to + // open it results in EINVAL POSIX error, ERROR_USER_MAPPED_FILE system + // error. So we retry those as well. + // + for (size_t i (0); i < 41; ++i) { - // Sleep 100 milliseconds before the open retry. + // Sleep 50 milliseconds before the open retry. // if (i != 0) - Sleep (100); + Sleep (50); fd = pass_perm ? _sopen (f, of, _SH_DENYNO, pf) @@ -1160,10 +1189,11 @@ namespace butl // Note that MinGW's _sopen() is just a stub forwarding the call to the // (publicly available) MSVCRT's implementation. // - if (!(fd == -1 && - out && - errno == EACCES && - GetLastError () == ERROR_SHARING_VIOLATION)) + if (!(fd == -1 && + out && + (errno == EACCES || errno == EINVAL) && + (GetLastError () == ERROR_SHARING_VIOLATION || + GetLastError () == ERROR_USER_MAPPED_FILE))) break; } @@ -1372,6 +1402,28 @@ namespace butl throw_generic_ios_failure (errno); } + entry_stat + fdstat (int fd) + { + struct stat s; + if (fstat (fd, &s) != 0) + throw_generic_error (errno); + + auto m (s.st_mode); + entry_type t (entry_type::unknown); + + // Note: cannot be a symlink. + // + if (S_ISREG (m)) + t = entry_type::regular; + else if (S_ISDIR (m)) + t = entry_type::directory; + else if (S_ISBLK (m) || S_ISCHR (m) || S_ISFIFO (m) || S_ISSOCK (m)) + t = entry_type::other; + + return entry_stat {t, static_cast<uint64_t> (s.st_size)}; + } + bool fdterm (int fd) { @@ -1392,6 +1444,16 @@ namespace butl throw_generic_ios_failure (errno); } + bool + fdterm_color (int, bool) + { + const char* t (std::getenv ("TERM")); + + // This test was lifted from GCC (Emacs shell sets TERM=dumb). 
+ // + return t != nullptr && strcmp (t, "dumb") != 0; + } + static pair<size_t, size_t> fdselect (fdselect_set& read, fdselect_set& write, @@ -1410,6 +1472,8 @@ namespace butl for (fdselect_state& s: from) { + s.ready = false; + if (s.fd == nullfd) continue; @@ -1417,7 +1481,6 @@ namespace butl throw invalid_argument ("invalid file descriptor"); FD_SET (s.fd, &to); - s.ready = false; if (max_fd < s.fd) max_fd = s.fd; @@ -1524,6 +1587,12 @@ namespace butl return read (fd, buf, n); } + streamsize + fdwrite (int fd, const void* buf, size_t n) + { + return write (fd, buf, n); + } + #else auto_fd @@ -1779,9 +1848,34 @@ namespace butl throw_generic_ios_failure (e); } + entry_stat + fdstat (int fd) + { + // Since symlinks have been taken care of, we can just _fstat(). + // + struct __stat64 s; + if (_fstat64 (fd, &s) != 0) + throw_generic_error (errno); + + auto m (s.st_mode); + entry_type t (entry_type::unknown); + + if (S_ISREG (m)) + t = entry_type::regular; + else if (S_ISDIR (m)) + t = entry_type::directory; + else if (S_ISCHR (m)) + t = entry_type::other; + + return entry_stat {t, static_cast<uint64_t> (s.st_size)}; + } + bool fdterm (int fd) { + // @@ Both GCC and Clang simply call GetConsoleMode() for this check. I + // wonder why we don't do the same? See also fdterm_color() below. + // We don't need to close it (see fd_to_handle()). // HANDLE h (fd_to_handle (fd)); @@ -1795,7 +1889,13 @@ namespace butl throw_system_ios_failure (e); if (t == FILE_TYPE_CHAR) // Terminal. - return true; + { + // One notable special file that has this type is nul (as returned by + // fdopen_null()). So tighten this case with the GetConsoleMode() call. + // + DWORD m; + return GetConsoleMode (h, &m) != 0; + } if (t != FILE_TYPE_PIPE) // Pipe still can be a terminal (see below). return false; @@ -1867,6 +1967,42 @@ namespace butl return false; } + bool + fdterm_color (int fd, bool enable) + { + // We don't need to close it (see fd_to_handle()). 
+ // + HANDLE h (fd_to_handle (fd)); + + // See GH issue #312 for background on this logic. + // + DWORD m; + if (!GetConsoleMode (h, &m)) + throw_system_ios_failure (GetLastError ()); + + // Some terminals (e.g. Windows Terminal) enable VT processing by default. + // + if ((m & ENABLE_VIRTUAL_TERMINAL_PROCESSING) != 0) + return true; + + if (enable) + { + // If SetConsoleMode() fails, assume VT processing is unsupported (it + // is only supported from a certain build of Windows 10). + // + // Note that Wine pretends to support this but doesn't handle the escape + // sequences. See https://bugs.winehq.org/show_bug.cgi?id=49780. + // + if (SetConsoleMode (h, + (m | + ENABLE_PROCESSED_OUTPUT | + ENABLE_VIRTUAL_TERMINAL_PROCESSING))) + return true; + } + + return false; + } + static pair<size_t, size_t> fdselect (fdselect_set& read, fdselect_set& write, @@ -1883,13 +2019,14 @@ namespace butl for (fdselect_state& s: read) { + s.ready = false; + if (s.fd == nullfd) continue; if (s.fd < 0) throw invalid_argument ("invalid file descriptor"); - s.ready = false; ++n; } @@ -1910,7 +2047,7 @@ namespace butl // size_t r (0); - while (true) + for (size_t i (0);; ++i) { for (fdselect_state& s: read) { @@ -1983,7 +2120,11 @@ namespace butl if (r != 0) break; - DWORD t (50); + // Use exponential backoff but not too aggressive and with 25ms max. + // + DWORD t ( + static_cast<DWORD> (i <= 1000 ? 0 : + i >= 1000 + 100 ? 
25 : 1 + ((i - 1000) / 4))); if (timeout) { @@ -2000,7 +2141,10 @@ namespace butl break; } - Sleep (t); + if (t == 0) + this_thread::yield (); + else + Sleep (t); } return make_pair (r, 0); @@ -2043,6 +2187,12 @@ namespace butl return r; } + streamsize + fdwrite (int fd, const void* buf, size_t n) + { + return _write (fd, buf, static_cast<unsigned int> (n)); + } + #endif pair<size_t, size_t> diff --git a/libbutl/fdstream.mxx b/libbutl/fdstream.hxx index c863d2c..9c8f786 100644 --- a/libbutl/fdstream.mxx +++ b/libbutl/fdstream.hxx @@ -1,13 +1,8 @@ -// file : libbutl/fdstream.mxx -*- C++ -*- +// file : libbutl/fdstream.hxx -*- C++ -*- // license : MIT; see accompanying LICENSE file -#ifndef __cpp_modules_ts #pragma once -#endif - -#include <cassert> -#ifndef __cpp_lib_modules_ts #include <ios> // streamsize #include <vector> #include <string> @@ -18,29 +13,14 @@ #include <cstdint> // uint16_t, uint64_t #include <cstddef> // size_t -#include <iterator> -#endif - -// Other includes. - -#ifdef __cpp_modules_ts -export module butl.fdstream; -#ifdef __cpp_lib_modules_ts -import std.core; -import std.io; -#endif -import butl.path; -import butl.filesystem; // permissions -import butl.small_vector; -#else -#include <libbutl/path.mxx> -#include <libbutl/filesystem.mxx> -#include <libbutl/small-vector.mxx> -#endif +#include <libbutl/path.hxx> +#include <libbutl/filesystem.hxx> // permissions, entry_stat +#include <libbutl/small-vector.hxx> +#include <libbutl/bufstreambuf.hxx> #include <libbutl/export.hxx> -LIBBUTL_MODEXPORT namespace butl +namespace butl { // RAII type for file descriptors. Note that failure to close the descriptor // is silently ignored by both the destructor and reset(). @@ -54,9 +34,6 @@ LIBBUTL_MODEXPORT namespace butl constexpr operator int () const {return -1;} }; -#if defined(__cpp_modules_ts) && defined(__clang__) //@@ MOD Clang duplicate sym. 
- inline -#endif constexpr nullfd_t nullfd (-1); class LIBBUTL_SYMEXPORT auto_fd @@ -142,9 +119,9 @@ LIBBUTL_MODEXPORT namespace butl // - input or output but not both (can use a union of two streams for that) // - no support for put back // - use of tell[gp]() and seek[gp]() is discouraged on Windows for - // fdstreams opened in the text mode (see fdbuf::seekoff() implementation - // for reasoning and consider using non-standard tellg() and seekg() in - // fdbuf, instead) + // fdstreams opened in the text mode (see fdstreambuf::seekoff() + // implementation for reasoning and consider using non-standard tellg() + // and seekg() in fdstreambuf, instead) // - non-blocking file descriptor is supported only by showmanyc() function // and only for pipes on Windows, in contrast to POSIX systems // - throws ios::failure in case of open(), read(), write(), close(), @@ -157,20 +134,26 @@ LIBBUTL_MODEXPORT namespace butl // - passing to constructor auto_fd with a negative file descriptor is valid // and results in the creation of an unopened object // - class LIBBUTL_SYMEXPORT fdbuf: public std::basic_streambuf<char> + class LIBBUTL_SYMEXPORT fdstreambuf: public bufstreambuf { public: - fdbuf () = default; + // Reasonable (for stack allocation) buffer size that provides decent + // performance. + // + static const std::size_t buffer_size = 8192; + + fdstreambuf () = default; // Unless specified, the current read/write position is assumed to // be 0 (note: not queried). // - fdbuf (auto_fd&&, std::uint64_t pos = 0); + fdstreambuf (auto_fd&&, std::uint64_t pos = 0); - // Before we invented auto_fd into fdstreams we keept fdbuf opened on - // faulty close attempt. Now fdbuf is always closed by close() function. - // This semantics change seems to be the right one as there is no reason to - // expect fdclose() to succeed after it has already failed once. + // Before we invented auto_fd into fdstreams we keept fdstreambuf opened + // on faulty close attempt. 
Now fdstreambuf is always closed by close() + // function. This semantics change seems to be the right one as there is + // no reason to expect fdclose() to succeed after it has already failed + // once. // void close () {fd_.close ();} @@ -196,14 +179,11 @@ LIBBUTL_MODEXPORT namespace butl bool blocking (bool); - public: - using base = std::basic_streambuf<char>; - - using int_type = base::int_type; - using traits_type = base::traits_type; + bool + blocking () const {return !non_blocking_;} - using pos_type = base::pos_type; // std::streampos - using off_type = base::off_type; // std::streamoff + public: + using base = bufstreambuf; // basic_streambuf input interface. // @@ -222,13 +202,7 @@ LIBBUTL_MODEXPORT namespace butl // Return the (logical) position of the next byte to be read. // - // Note that on Windows when reading in the text mode the logical position - // may differ from the physical file descriptor position due to the CRLF - // character sequence translation. See the seekoff() implementation for - // more background on this issue. - // - std::uint64_t - tellg () const {return off_ - (egptr () - gptr ());} + using base::tellg; // Seek to the (logical) position as if by reading the specified number of // bytes from the beginning of the stream. Throw ios::failure on the @@ -255,8 +229,7 @@ LIBBUTL_MODEXPORT namespace butl // Return the (logical) position of the next byte to be written. // - std::uint64_t - tellp () const {return off_ + (pptr () - buf_);} + using base::tellp; // basic_streambuf positioning interface (both input/output). 
// @@ -273,8 +246,7 @@ LIBBUTL_MODEXPORT namespace butl private: auto_fd fd_; - std::uint64_t off_; - char buf_[8192]; + char buf_[buffer_size]; bool non_blocking_ = false; }; @@ -307,7 +279,9 @@ LIBBUTL_MODEXPORT namespace butl binary = 0x02, skip = 0x04, blocking = 0x08, - non_blocking = 0x10 + non_blocking = 0x10, + + none = 0 }; inline fdstream_mode operator& (fdstream_mode, fdstream_mode); @@ -347,8 +321,11 @@ LIBBUTL_MODEXPORT namespace butl int fd () const {return buf_.fd ();} + bool + blocking () const {return buf_.blocking ();} + protected: - fdbuf buf_; + fdstreambuf buf_; }; // iofdstream constructors and open() functions that take openmode as an @@ -677,6 +654,54 @@ LIBBUTL_MODEXPORT namespace butl LIBBUTL_SYMEXPORT ifdstream& getline (ifdstream&, std::string&, char delim = '\n'); + // The non-blocking getline() version that reads the line in potentially + // multiple calls. Key differences compared to getline(): + // + // - Stream must be in the non-blocking mode and exception mask must have + // at least badbit. + // + // - Return type is bool instead of stream. Return true if the line has been + // read or false if it should be called again once the stream has more + // data to read. Also return true on failure. + // + // - The string must be empty on the first call. + // + // - There could still be data to read in the stream's buffer (as opposed to + // file descriptor) after this function returns true and you should be + // careful not to block on fdselect() in this case. In fact, the + // recommended pattern is to call this function first and only call + // fdselect() if it returns false. 
+ // + // The typical usage in combination with the eof() helper: + // + // fdselect_set fds {is.fd (), ...}; + // fdselect_state& ist (fds[0]); + // fdselect_state& ...; + // + // for (string l; ist.fd != nullfd || ...; ) + // { + // if (ist.fd != nullfd && getline_non_blocking (is, l)) + // { + // if (eof (is)) + // ist.fd = nullfd; + // else + // { + // // Consume line. + // + // l.clear (); + // } + // + // continue; + // } + // + // ifdselect (fds); + // + // // Handle other ready fds. + // } + // + LIBBUTL_SYMEXPORT bool + getline_non_blocking (ifdstream&, std::string&, char delim = '\n'); + // Open a file returning an auto_fd that holds its file descriptor on // success and throwing ios::failure otherwise. // @@ -862,12 +887,28 @@ LIBBUTL_MODEXPORT namespace butl LIBBUTL_SYMEXPORT void fdtruncate (int, std::uint64_t); - // Test whether a file descriptor refers to a terminal. Throw ios::failure on + // Return filesystem entry stat from file descriptor. Throw ios::failure on // the underlying OS error. // + // See also path_entry() in filesystem. + // + LIBBUTL_SYMEXPORT entry_stat + fdstat (int); + + // Test whether a file descriptor refers to a terminal. Throw ios::failure + // on the underlying OS error. + // LIBBUTL_SYMEXPORT bool fdterm (int); + // Test whether a terminal file descriptor supports ANSI color output. If + // the enable argument is true, then also try to enable color output (only + // applicable on some platforms, such as Windows). Throw ios::failure on the + // underlying OS error. + // + LIBBUTL_SYMEXPORT bool + fdterm_color (int, bool enable); + // Wait until one or more file descriptors becomes ready for input (reading) // or output (writing). Return the pair of numbers of descriptors that are // ready. Throw std::invalid_argument if anything is wrong with arguments @@ -875,7 +916,7 @@ LIBBUTL_MODEXPORT namespace butl // underlying OS error. // // Note that the function clears all the previously-ready entries on each - // call. 
Entries with nullfd are ignored. + // call. Entries with nullfd are ignored (but cleared). // // On Windows only pipes and only their input (read) ends are supported. // @@ -883,11 +924,13 @@ LIBBUTL_MODEXPORT namespace butl { int fd; bool ready; + void* data; // Arbitrary data which can be associated with the descriptor. // Note: intentionally non-explicit to allow implicit initialization when // pushing to fdselect_set. // - fdselect_state (int fd): fd (fd), ready (false) {} + fdselect_state (int fd, void* d = nullptr) + : fd (fd), ready (false), data (d) {} }; using fdselect_set = small_vector<fdselect_state, 4>; @@ -940,6 +983,11 @@ LIBBUTL_MODEXPORT namespace butl // LIBBUTL_SYMEXPORT std::streamsize fdread (int, void*, std::size_t); + + // POSIX write() function wrapper, for uniformity. + // + LIBBUTL_SYMEXPORT std::streamsize + fdwrite (int, const void*, std::size_t); } #include <libbutl/fdstream.ixx> diff --git a/libbutl/fdstream.ixx b/libbutl/fdstream.ixx index 4ef5b1d..e024af9 100644 --- a/libbutl/fdstream.ixx +++ b/libbutl/fdstream.ixx @@ -1,6 +1,8 @@ // file : libbutl/fdstream.ixx -*- C++ -*- // license : MIT; see accompanying LICENSE file +#include <cassert> + namespace butl { // auto_fd @@ -27,16 +29,16 @@ namespace butl reset (); } - // fdbuf + // fdstreambuf // - inline fdbuf:: - fdbuf (auto_fd&& fd, std::uint64_t pos) + inline fdstreambuf:: + fdstreambuf (auto_fd&& fd, std::uint64_t pos) { if (fd.get () >= 0) open (std::move (fd), pos); } - inline auto_fd fdbuf:: + inline auto_fd fdstreambuf:: release () { return std::move (fd_); @@ -165,6 +167,8 @@ namespace butl inline std::vector<char> ifdstream:: read_binary () { + // @@ TODO: surely there is a more efficient way! See sha256! 
+ std::vector<char> v (std::istreambuf_iterator<char> (*this), std::istreambuf_iterator<char> ()); return v; diff --git a/libbutl/filesystem.cxx b/libbutl/filesystem.cxx index 18be8a9..28a0de8 100644 --- a/libbutl/filesystem.cxx +++ b/libbutl/filesystem.cxx @@ -1,9 +1,7 @@ // file : libbutl/filesystem.cxx -*- C++ -*- // license : MIT; see accompanying LICENSE file -#ifndef __cpp_modules_ts -#include <libbutl/filesystem.mxx> -#endif +#include <libbutl/filesystem.hxx> #include <errno.h> // errno, E* @@ -18,61 +16,34 @@ #else # include <libbutl/win32-utility.hxx> -# include <io.h> // _find*(), _unlink(), _chmod() +# include <io.h> // _unlink(), _chmod() # include <direct.h> // _mkdir(), _rmdir() # include <winioctl.h> // FSCTL_SET_REPARSE_POINT # include <sys/types.h> // _stat # include <sys/stat.h> // _stat(), S_I* -# include <cwchar> // mbsrtowcs(), wcsrtombs(), mbstate_t -# include <cstring> // strncmp() - # ifdef _MSC_VER // Unlikely to be fixed in newer versions. # define S_ISREG(m) (((m) & S_IFMT) == S_IFREG) # define S_ISDIR(m) (((m) & S_IFMT) == S_IFDIR) +# define S_ISCHR(m) (((m) & S_IFMT) == S_IFCHR) # endif -#endif - -#include <cassert> -#ifndef __cpp_lib_modules_ts -#include <string> -#include <cstddef> -#include <cstdint> -#include <utility> -#include <iterator> -#include <functional> +# include <cwchar> // mbsrtowcs(), wcsrtombs(), mbstate_t +# include <cstring> // strncmp() +# include <type_traits> // is_same +#endif +#include <chrono> #include <vector> #include <memory> // unique_ptr +#include <cassert> #include <algorithm> // find(), copy() #include <system_error> -#endif - -// Other includes. - -#ifdef __cpp_modules_ts -module butl.filesystem; -// Only imports additional to interface. 
-#ifdef __clang__ -#ifdef __cpp_lib_modules_ts -import std.core; -#endif -import butl.path; -import butl.timestamp; -import butl.path_pattern; -#endif - -import butl.utility; // throw_generic_error() -import butl.fdstream; -import butl.small_vector; -#else -#include <libbutl/path.mxx> -#include <libbutl/utility.mxx> -#include <libbutl/fdstream.mxx> -#include <libbutl/small-vector.mxx> -#endif +#include <libbutl/path.hxx> +#include <libbutl/utility.hxx> // throw_generic_error() +#include <libbutl/fdstream.hxx> +#include <libbutl/small-vector.hxx> #ifndef _WIN32 # ifndef PATH_MAX @@ -213,6 +184,19 @@ namespace butl // static inline constexpr int // ansec (...) {return 0;} + static inline entry_time + entry_tm (const struct stat& s) noexcept + { + auto tm = [] (time_t sec, auto nsec) -> timestamp + { + return system_clock::from_time_t (sec) + + chrono::duration_cast<duration> (chrono::nanoseconds (nsec)); + }; + + return {tm (s.st_mtime, mnsec<struct stat> (&s, true)), + tm (s.st_atime, ansec<struct stat> (&s, true))}; + } + // Return the modification and access times of a regular file or directory. // static entry_time @@ -230,14 +214,7 @@ namespace butl if (dir ? !S_ISDIR (s.st_mode) : !S_ISREG (s.st_mode)) return {timestamp_nonexistent, timestamp_nonexistent}; - auto tm = [] (time_t sec, auto nsec) -> timestamp - { - return system_clock::from_time_t (sec) + - chrono::duration_cast<duration> (chrono::nanoseconds (nsec)); - }; - - return {tm (s.st_mtime, mnsec<struct stat> (&s, true)), - tm (s.st_atime, ansec<struct stat> (&s, true))}; + return entry_tm (s); } // Set the modification and access times for a regular file or directory. @@ -339,16 +316,15 @@ namespace butl // Open a filesystem entry for reading and optionally writing its // meta-information and return the entry handle and meta-information if the - // path refers to an existing entry and nullhandle otherwise. Follow reparse - // points by default. 
Underlying OS errors are reported by throwing - // std::system_error, unless ignore_error is true in which case nullhandle - // is returned. In the latter case the error code can be obtained by calling - // GetLastError(). + // path refers to an existing entry and nullhandle otherwise. Underlying OS + // errors are reported by throwing std::system_error, unless ignore_error is + // true in which case nullhandle is returned. In the latter case the error + // code can be obtained by calling GetLastError(). // static inline pair<win32::auto_handle, BY_HANDLE_FILE_INFORMATION> entry_info_handle (const char* p, bool write, - bool fr = true, + bool follow_reparse_points, bool ie = false) { // Open the entry for reading/writing its meta-information. Follow reparse @@ -363,7 +339,7 @@ namespace butl nullptr, OPEN_EXISTING, FILE_FLAG_BACKUP_SEMANTICS | // Required for a directory. - (fr ? 0 : FILE_FLAG_OPEN_REPARSE_POINT), + (follow_reparse_points ? 0 : FILE_FLAG_OPEN_REPARSE_POINT), nullptr)); if (h == nullhandle) @@ -388,13 +364,15 @@ namespace butl } // Return a flag indicating whether the path is to an existing filesystem - // entry and its meta-information if so. Follow reparse points by default. + // entry and its meta-information if so. 
// static inline pair<bool, BY_HANDLE_FILE_INFORMATION> - path_entry_info (const char* p, bool fr = true, bool ie = false) + path_entry_handle_info (const char* p, + bool follow_reparse_points, + bool ie = false) { pair<auto_handle, BY_HANDLE_FILE_INFORMATION> hi ( - entry_info_handle (p, false /* write */, fr, ie)); + entry_info_handle (p, false /* write */, follow_reparse_points, ie)); if (hi.first == nullhandle) return make_pair (false, BY_HANDLE_FILE_INFORMATION ()); @@ -406,9 +384,34 @@ namespace butl } static inline pair<bool, BY_HANDLE_FILE_INFORMATION> - path_entry_info (const path& p, bool fr = true, bool ie = false) + path_entry_handle_info (const path& p, bool fr, bool ie = false) { - return path_entry_info (p.string ().c_str (), fr, ie); + return path_entry_handle_info (p.string ().c_str (), fr, ie); + } + + // Return a flag indicating whether the path is to an existing filesystem + // entry and its extended attributes if so. Don't follow reparse points. + // + static inline pair<bool, WIN32_FILE_ATTRIBUTE_DATA> + path_entry_info (const char* p, bool ie = false) + { + WIN32_FILE_ATTRIBUTE_DATA r; + if (!GetFileAttributesExA (p, GetFileExInfoStandard, &r)) + { + DWORD ec; + if (ie || error_file_not_found (ec = GetLastError ())) + return make_pair (false, WIN32_FILE_ATTRIBUTE_DATA ()); + + throw_system_error (ec); + } + + return make_pair (true, r); + } + + static inline pair<bool, WIN32_FILE_ATTRIBUTE_DATA> + path_entry_info (const path& p, bool ie = false) + { + return path_entry_info (p.string ().c_str (), ie); } // Reparse point data. @@ -644,8 +647,48 @@ namespace butl return reparse_point_entry (p.string ().c_str (), ie); } - pair<bool, entry_stat> - path_entry (const char* p, bool fl, bool ie) + static inline timestamp + to_timestamp (const FILETIME& t) + { + // Time in FILETIME is in 100 nanosecond "ticks" since "Windows epoch" + // (1601-01-01T00:00:00Z). 
To convert it to "UNIX epoch" + // (1970-01-01T00:00:00Z) we need to subtract 11644473600 seconds. + // + uint64_t nsec ((static_cast<uint64_t> (t.dwHighDateTime) << 32) | + t.dwLowDateTime); + + nsec -= 11644473600ULL * 10000000; // Now in UNIX epoch. + nsec *= 100; // Now in nanoseconds. + + return timestamp ( + chrono::duration_cast<duration> (chrono::nanoseconds (nsec))); + } + + static inline FILETIME + to_filetime (timestamp t) + { + // Time in FILETIME is in 100 nanosecond "ticks" since "Windows epoch" + // (1601-01-01T00:00:00Z). To convert "UNIX epoch" (1970-01-01T00:00:00Z) + // to it we need to add 11644473600 seconds. + // + uint64_t ticks (chrono::duration_cast<chrono::nanoseconds> ( + t.time_since_epoch ()).count ()); + + ticks /= 100; // Now in 100 nanosecond "ticks". + ticks += 11644473600ULL * 10000000; // Now in "Windows epoch". + + FILETIME r; + r.dwHighDateTime = (ticks >> 32) & 0xFFFFFFFF; + r.dwLowDateTime = ticks & 0xFFFFFFFF; + return r; + } + + // If the being returned entry type is regular or directory and et is not + // NULL, then also save the entry modification and access times into the + // referenced variable. + // + static inline pair<bool, entry_stat> + path_entry (const char* p, bool fl, bool ie, entry_time* et) { // A path like 'C:', while being a root path in our terminology, is not as // such for Windows, that maintains current directory for each drive, and @@ -656,73 +699,105 @@ namespace butl string d; if (path::traits_type::root (p)) { - d = p; + d = string (p); // GCC bug #105329. d += path::traits_type::directory_separator; p = d.c_str (); } // Stat the entry not following reparse points. 
// - pair<bool, BY_HANDLE_FILE_INFORMATION> pi ( - path_entry_info (p, false /* follow_reparse_points */, ie)); + pair<bool, WIN32_FILE_ATTRIBUTE_DATA> pi (path_entry_info (p, ie)); if (!pi.first) return make_pair (false, entry_stat {entry_type::unknown, 0}); - if (reparse_point (pi.second.dwFileAttributes)) + auto entry_info = [et] (const auto& ei) { - pair<entry_type, path> rp (reparse_point_entry (p, ie)); + if (et != nullptr) + { + et->modification = to_timestamp (ei.ftLastWriteTime); + et->access = to_timestamp (ei.ftLastAccessTime); + } + + if (directory (ei.dwFileAttributes)) + return make_pair (true, entry_stat {entry_type::directory, 0}); + else + return make_pair ( + true, + entry_stat {entry_type::regular, + ((uint64_t (ei.nFileSizeHigh) << 32) | ei.nFileSizeLow)}); + }; + + if (!reparse_point (pi.second.dwFileAttributes)) + return entry_info (pi.second); - if (rp.first == entry_type::symlink) + pair<entry_type, path> rp (reparse_point_entry (p, ie)); + + if (rp.first == entry_type::symlink) + { + // If following symlinks is requested, then follow the reparse point and + // return its target information. Otherwise, return the symlink entry + // type. + // + if (fl) { - // If following symlinks is requested, then follow the reparse point, - // overwrite its own information with the resolved target information, - // and fall through. Otherwise, return the symlink entry type. - // - if (fl) - { - pi = path_entry_info (p, true /* follow_reparse_points */, ie); + pair<bool, BY_HANDLE_FILE_INFORMATION> pi ( + path_entry_handle_info (p, true /* follow_reparse_points */, ie)); - if (!pi.first) - return make_pair (false, entry_stat {entry_type::unknown, 0}); - } - else - return make_pair (true, entry_stat {entry_type::symlink, 0}); + return pi.first + ? 
entry_info (pi.second) + : make_pair (false, entry_stat {entry_type::unknown, 0}); } - else if (rp.first == entry_type::unknown) - return make_pair (false, entry_stat {entry_type::unknown, 0}); - else // entry_type::other - return make_pair (true, entry_stat {entry_type::other, 0}); + else + return make_pair (true, entry_stat {entry_type::symlink, 0}); } + else if (rp.first == entry_type::unknown) + return make_pair (false, entry_stat {entry_type::unknown, 0}); + else // entry_type::other + return make_pair (true, entry_stat {entry_type::other, 0}); + } - if (directory (pi.second.dwFileAttributes)) - return make_pair (true, entry_stat {entry_type::directory, 0}); - else - return make_pair ( - true, - entry_stat {entry_type::regular, - ((uint64_t (pi.second.nFileSizeHigh) << 32) | - pi.second.nFileSizeLow)}); + static inline pair<bool, entry_stat> + path_entry (const path& p, bool fl, bool ie, entry_time* et) + { + return path_entry (p.string ().c_str (), fl, ie, et); + } + + pair<bool, entry_stat> + path_entry (const char* p, bool fl, bool ie) + { + return path_entry (p, fl, ie, nullptr /* entry_time */); } permissions path_permissions (const path& p) { - pair<bool, BY_HANDLE_FILE_INFORMATION> pi (path_entry_info (p)); + // Let's optimize for the common case when the entry is not a reparse + // point. + // + auto attr_to_perm = [] (const auto& pi) -> permissions + { + if (!pi.first) + throw_generic_error (ENOENT); - if (!pi.first) - throw_generic_error (ENOENT); + // On Windows a filesystem entry is always readable. Also there is no + // notion of group/other permissions at OS level, so we extrapolate user + // permissions to group/other permissions (as the _stat() function + // does). + // + permissions r (permissions::ru | permissions::rg | permissions::ro); - // On Windows a filesystem entry is always readable. 
Also there is no - // notion of group/other permissions at OS level, so we extrapolate user - // permissions to group/other permissions (as the _stat() function does). - // - permissions r (permissions::ru | permissions::rg | permissions::ro); + if (!readonly (pi.second.dwFileAttributes)) + r |= permissions::wu | permissions::wg | permissions::wo; - if (!readonly (pi.second.dwFileAttributes)) - r |= permissions::wu | permissions::wg | permissions::wo; + return r; + }; - return r; + pair<bool, WIN32_FILE_ATTRIBUTE_DATA> pi (path_entry_info (p)); + return !pi.first || !reparse_point (pi.second.dwFileAttributes) + ? attr_to_perm (pi) + : attr_to_perm ( + path_entry_handle_info (p, true /* follow_reparse_points */)); } void @@ -748,50 +823,26 @@ namespace butl static entry_time entry_tm (const char* p, bool dir) { - pair<bool, BY_HANDLE_FILE_INFORMATION> pi (path_entry_info (p)); - - // If the entry is of the wrong type, then let's pretend that it doesn't - // exists. + // Let's optimize for the common case when the entry is not a reparse + // point. // - if (!pi.first || directory (pi.second.dwFileAttributes) != dir) - return {timestamp_nonexistent, timestamp_nonexistent}; - - auto tm = [] (const FILETIME& t) -> timestamp + auto attr_to_time = [dir] (const auto& pi) -> entry_time { - // Time in FILETIME is in 100 nanosecond "ticks" since "Windows epoch" - // (1601-01-01T00:00:00Z). To convert it to "UNIX epoch" - // (1970-01-01T00:00:00Z) we need to subtract 11644473600 seconds. + // If the entry is of the wrong type, then let's pretend that it doesn't + // exists. // - uint64_t nsec ((static_cast<uint64_t> (t.dwHighDateTime) << 32) | - t.dwLowDateTime); - - nsec -= 11644473600ULL * 10000000; // Now in UNIX epoch. - nsec *= 100; // Now in nanoseconds. 
+ if (!pi.first || directory (pi.second.dwFileAttributes) != dir) + return entry_time {timestamp_nonexistent, timestamp_nonexistent}; - return timestamp ( - chrono::duration_cast<duration> (chrono::nanoseconds (nsec))); + return entry_time {to_timestamp (pi.second.ftLastWriteTime), + to_timestamp (pi.second.ftLastAccessTime)}; }; - return {tm (pi.second.ftLastWriteTime), tm (pi.second.ftLastAccessTime)}; - } - - static inline FILETIME - to_filetime (timestamp t) - { - // Time in FILETIME is in 100 nanosecond "ticks" since "Windows epoch" - // (1601-01-01T00:00:00Z). To convert "UNIX epoch" - // (1970-01-01T00:00:00Z) to it we need to add 11644473600 seconds. - // - uint64_t ticks (chrono::duration_cast<chrono::nanoseconds> ( - t.time_since_epoch ()).count ()); - - ticks /= 100; // Now in 100 nanosecond "ticks". - ticks += 11644473600ULL * 10000000; // Now in "Windows epoch". - - FILETIME r; - r.dwHighDateTime = (ticks >> 32) & 0xFFFFFFFF; - r.dwLowDateTime = ticks & 0xFFFFFFFF; - return r; + pair<bool, WIN32_FILE_ATTRIBUTE_DATA> pi (path_entry_info (p)); + return !pi.first || !reparse_point (pi.second.dwFileAttributes) + ? attr_to_time (pi) + : attr_to_time ( + path_entry_handle_info (p, true /* follow_reparse_points */)); } // Set the modification and access times for a regular file or directory. @@ -802,7 +853,9 @@ namespace butl // See also touch_file() below. // pair<auto_handle, BY_HANDLE_FILE_INFORMATION> hi ( - entry_info_handle (p, true /* write */)); + entry_info_handle (p, + true /* write */, + true /* follow_reparse_points */)); // If the entry is of the wrong type, then let's pretend that it doesn't // exist. @@ -887,7 +940,9 @@ namespace butl // implicitly. 
// pair<auto_handle, BY_HANDLE_FILE_INFORMATION> hi ( - entry_info_handle (p.string ().c_str (), true /* write */)); + entry_info_handle (p.string ().c_str (), + true /* write */, + true /* follow_reparse_points */)); if (hi.first != nullhandle) { @@ -1036,7 +1091,7 @@ namespace butl // try { - for (const dir_entry& de: dir_iterator (p, false /* ignore_dangling */)) + for (const dir_entry& de: dir_iterator (p, dir_iterator::no_follow)) { path ep (p / de.path ()); //@@ Would be good to reuse the buffer. @@ -1063,8 +1118,8 @@ namespace butl } } - rmfile_status - try_rmfile (const path& p, bool ignore_error) + optional<rmfile_status> + try_rmfile_maybe_ignore_error (const path& p, bool ignore_error) { rmfile_status r (rmfile_status::success); const char* f (p.string ().c_str ()); @@ -1087,12 +1142,12 @@ namespace butl // failure (see mventry() for details). If that's the case, we will keep // trying to move the file for two seconds. // - for (size_t i (0); i < 21; ++i) + for (size_t i (0); i < 41; ++i) { - // Sleep 100 milliseconds before the removal retry. + // Sleep 50 milliseconds before the removal retry. // if (i != 0) - Sleep (100); + Sleep (50); ur = _unlink (f); @@ -1143,6 +1198,8 @@ namespace butl r = rmfile_status::not_exist; else if (!ignore_error) throw_generic_error (errno); + else + return nullopt; } return r; @@ -1596,7 +1653,7 @@ namespace butl rm = auto_rmfile (to); - // Throws ios::failure on fdbuf read/write failures. + // Throws ios::failure on fdstreambuf read/write failures. // // Note that the eof check is important: if the stream is at eof (empty // file) then this write will fail. @@ -1638,9 +1695,12 @@ namespace butl } void - cpfile (const path& from, const path& to, cpflags fl) + cpfile (const path& from, + const path& to, + cpflags fl, + optional<permissions> cperm) { - permissions perm (path_permissions (from)); + permissions perm (cperm ? 
*cperm : path_permissions (from)); auto_rmfile rm; cpfile<is_base_of<system_error, ios_base::failure>::value> ( @@ -1732,12 +1792,12 @@ namespace butl // fdopen(). // DWORD ec; - for (size_t i (0); i < 21; ++i) + for (size_t i (0); i < 41; ++i) { // Sleep 100 milliseconds before the move retry. // if (i != 0) - Sleep (100); + Sleep (50); if (MoveFileExA (f, t, mfl)) return; @@ -1839,7 +1899,7 @@ namespace butl h_ = x.h_; x.h_ = nullptr; - ignore_dangling_ = x.ignore_dangling_; + mode_ = x.mode_; } return *this; } @@ -1860,6 +1920,11 @@ namespace butl entry_type dir_entry:: type (bool follow_symlinks) const { + // Note that this function can only be used for resolving an entry type + // lazily and thus can't be used with the detect_dangling dir_iterator + // mode (see dir_iterator::next () implementation for details). Thus, we + // always throw on the stat()/lstat() failure. + // path_type p (b_ / p_); struct stat s; if ((follow_symlinks @@ -1867,7 +1932,18 @@ namespace butl : lstat (p.string ().c_str (), &s)) != 0) throw_generic_error (errno); - return butl::type (s); + entry_type r (butl::type (s)); + + // While at it, also save the entry modification and access times. 
+ // + if (r != entry_type::symlink) + { + entry_time t (entry_tm (s)); + mtime_ = t.modification; + atime_ = t.access; + } + + return r; } // dir_iterator @@ -1878,8 +1954,8 @@ namespace butl }; dir_iterator:: - dir_iterator (const dir_path& d, bool ignore_dangling) - : ignore_dangling_ (ignore_dangling) + dir_iterator (const dir_path& d, mode m) + : mode_ (m) { unique_ptr<DIR, dir_deleter> h (opendir (d.string ().c_str ())); h_ = h.get (); @@ -1895,7 +1971,7 @@ namespace butl } template <typename D> - static inline /*constexpr*/ entry_type + static inline /*constexpr*/ optional<entry_type> d_type (const D* d, decltype(d->d_type)*) { switch (d->d_type) @@ -1923,13 +1999,13 @@ namespace butl #endif return entry_type::other; - default: return entry_type::unknown; + default: return nullopt; } } template <typename D> - static inline constexpr entry_type - d_type (...) {return entry_type::unknown;} + static inline constexpr optional<entry_type> + d_type (...) {return nullopt;} void dir_iterator:: next () @@ -1951,25 +2027,43 @@ namespace butl e_.p_ = move (p); e_.t_ = d_type<struct dirent> (de, nullptr); - e_.lt_ = entry_type::unknown; + e_.lt_ = nullopt; + + e_.mtime_ = timestamp_unknown; + e_.atime_ = timestamp_unknown; // If requested, we ignore dangling symlinks, skipping ones with - // non-existing or inaccessible targets. + // non-existing or inaccessible targets (ignore_dangling mode), or set + // the entry_type::unknown type for them (detect_dangling mode). // - if (ignore_dangling_) + if (mode_ != no_follow) { - // Note that ltype () can potentially lstat() (see d_type() for + bool dd (mode_ == detect_dangling); + + // Note that ltype () can potentially lstat() (see type() for // details) and so throw. We, however, need to skip the entry if it // is already removed (due to a race) and throw on any other error. 
// path fp (e_.base () / e_.path ()); const char* p (fp.string ().c_str ()); - if (e_.t_ == entry_type::unknown) + if (!e_.t_) { struct stat s; if (lstat (p, &s) != 0) { + // Given that we have already enumerated the filesystem entry, + // these error codes can only mean that the entry doesn't exist + // anymore and so we always skip it. + // + // If errno is EACCES, then the permission to search a directory + // we currently iterate over has been revoked. Throwing in this + // case sounds like the best choice. + // + // Note that according to POSIX the filesystem entry we call + // lstat() on doesn't require any specific permissions to be + // granted. + // if (errno == ENOENT || errno == ENOTDIR) continue; @@ -1977,21 +2071,53 @@ namespace butl } e_.t_ = type (s); + + if (*e_.t_ != entry_type::symlink) + { + entry_time t (entry_tm (s)); + e_.mtime_ = t.modification; + e_.atime_ = t.access; + } } - if (e_.t_ == entry_type::symlink) + // The entry type should be present and may not be + // entry_type::unknown. + // + //assert (e_.t_ && *e_.t_ != entry_type::unknown); + + // Check if the symlink target exists and is accessible and set the + // target type. + // + if (*e_.t_ == entry_type::symlink) { struct stat s; if (stat (p, &s) != 0) { if (errno == ENOENT || errno == ENOTDIR || errno == EACCES) - continue; - - throw_generic_error (errno); + { + if (dd) + e_.lt_ = entry_type::unknown; + else + continue; + } + else + throw_generic_error (errno); } + else + { + e_.lt_ = type (s); - e_.lt_ = type (s); // While at it, set the target type. + entry_time t (entry_tm (s)); + e_.mtime_ = t.modification; + e_.atime_ = t.access; + } } + + // The symlink target type should be present and in the + // ignore_dangling mode it may not be entry_type::unknown. 
+ // + //assert (*e_.t_ != entry_type::symlink || + // (e_.lt_ && (dd || *e_.lt_ != entry_type::unknown))); } } else if (errno == 0) @@ -2012,11 +2138,49 @@ namespace butl // dir_entry // + entry_type dir_entry:: + type (bool follow_symlinks) const + { + // Note that this function can only be used for resolving an entry type + // lazily and thus can't be used with the detect_dangling dir_iterator + // mode (see dir_iterator::next () implementation for details). Thus, we + // always throw if the entry info can't be retrieved. + // + // While at it, also save the entry modification and access times. + // + path_type p (base () / path ()); + entry_time et; + pair<bool, entry_stat> e ( + path_entry (p, follow_symlinks, false /* ignore_error */, &et)); + + if (!e.first) + throw_generic_error (ENOENT); + + if (e.second.type == entry_type::regular || + e.second.type == entry_type::directory) + { + mtime_ = et.modification; + atime_ = et.access; + } + + return e.second.type; + } + + // dir_iterator + // + static_assert(is_same<HANDLE, void*>::value, "HANDLE is not void*"); + + static inline HANDLE + to_handle (intptr_t h) + { + return reinterpret_cast<HANDLE> (h); + } + dir_iterator:: ~dir_iterator () { if (h_ != -1) - _findclose (h_); // Ignore any errors. + FindClose (to_handle (h_)); // Ignore any errors. 
} dir_iterator& dir_iterator:: @@ -2026,56 +2190,32 @@ namespace butl { e_ = move (x.e_); - if (h_ != -1 && _findclose (h_) == -1) - throw_generic_error (errno); + if (h_ != -1 && !FindClose (to_handle (h_))) + throw_system_error (GetLastError ()); h_ = x.h_; x.h_ = -1; - ignore_dangling_ = x.ignore_dangling_; + mode_ = x.mode_; } return *this; } - entry_type dir_entry:: - type (bool follow_symlinks) const - { - path_type p (base () / path ()); - pair<bool, entry_stat> e (path_entry (p, follow_symlinks)); - - if (!e.first) - throw_generic_error (ENOENT); - - return e.second.type; - } - - // dir_iterator - // - struct auto_dir + dir_iterator:: + dir_iterator (const dir_path& d, mode m) + : mode_ (m) { - explicit - auto_dir (intptr_t& h): h_ (&h) {} - - auto_dir (const auto_dir&) = delete; - auto_dir& operator= (const auto_dir&) = delete; - - ~auto_dir () + struct deleter { - if (h_ != nullptr && *h_ != -1) - _findclose (*h_); - } - - void release () {h_ = nullptr;} + void operator() (intptr_t* p) const + { + if (p != nullptr && *p != -1) + FindClose (to_handle (*p)); + } + }; - private: - intptr_t* h_; - }; + unique_ptr<intptr_t, deleter> h (&h_); - dir_iterator:: - dir_iterator (const dir_path& d, bool ignore_dangling) - : ignore_dangling_ (ignore_dangling) - { - auto_dir h (h_); e_.b_ = d; // Used by next(). next (); @@ -2088,31 +2228,37 @@ namespace butl for (;;) { bool r; - _finddata_t fi; + WIN32_FIND_DATA fi; if (h_ == -1) { // The call is made from the constructor. Any other call with h_ == -1 // is illegal. // - - // Check to distinguish non-existent vs empty directories. + // Note that we used to check for the directory existence before + // iterating over it. However, let's not pessimize things and only + // check for the directory existence if FindFirstFileExA() fails. 
// - if (!dir_exists (e_.base ())) - throw_generic_error (ENOENT); - h_ = _findfirst ((e_.base () / path ("*")).string ().c_str (), &fi); - r = h_ != -1; + h_ = reinterpret_cast<intptr_t> ( + FindFirstFileExA ((e_.base () / path ("*")).string ().c_str (), + FindExInfoBasic, + &fi, + FindExSearchNameMatch, + NULL, + 0)); + + r = (h_ != -1); } else - r = _findnext (h_, &fi) == 0; + r = FindNextFileA (to_handle (h_), &fi); if (r) { // We can accept some overhead for '.' and '..' (relying on short // string optimization) in favor of a more compact code. // - path p (fi.name); + path p (fi.cFileName); // Skip '.' and '..'. // @@ -2121,26 +2267,47 @@ namespace butl e_.p_ = move (p); - // Note that the entry type detection always requires to additionally - // query the entry information. Thus, we evaluate its type lazily. + DWORD a (fi.dwFileAttributes); + bool rp (reparse_point (a)); + + // Evaluate the entry type lazily if this is a reparse point since it + // requires to additionally query the entry information (see + // reparse_point_entry() for details). // - e_.t_ = entry_type::unknown; + e_.t_ = rp ? nullopt : + directory (a) ? optional<entry_type> (entry_type::directory) : + optional<entry_type> (entry_type::regular) ; + + e_.lt_ = nullopt; - e_.lt_ = entry_type::unknown; + e_.mtime_ = rp ? timestamp_unknown : to_timestamp (fi.ftLastWriteTime); + + // Note that according to MSDN for the FindFirstFile[Ex]() function + // "the NTFS file system delays updates to the last access time for a + // file by up to 1 hour after the last access" and "on the FAT file + // system access time has a resolution of 1 day". + // + e_.atime_ = timestamp_unknown; // If requested, we ignore dangling symlinks and junctions, skipping - // ones with non-existing or inaccessible targets. + // ones with non-existing or inaccessible targets (ignore_dangling + // mode), or set the entry_type::unknown type for them + // (detect_dangling mode). 
// - if (ignore_dangling_) + if (rp && mode_ != no_follow) { + bool dd (mode_ == detect_dangling); + // Check the last error code throwing for codes other than "path not - // found" and "access denied". + // found" and "access denied" and returning this error code + // otherwise. // auto verify_error = [] () { DWORD ec (GetLastError ()); if (!error_file_not_found (ec) && ec != ERROR_ACCESS_DENIED) throw_system_error (ec); + return ec; }; // Note that ltype() queries the entry information due to the type @@ -2151,48 +2318,50 @@ namespace butl path fp (e_.base () / e_.path ()); const char* p (fp.string ().c_str ()); - DWORD a (GetFileAttributesA (p)); - if (a == INVALID_FILE_ATTRIBUTES) - { - // Note that sometimes trying to obtain attributes for a - // filesystem entry that was potentially removed ends up with - // ERROR_ACCESS_DENIED. One can argue that there can be another - // reason for this error (antivirus, indexer, etc). However, given - // that the entry is seen by a _find*() function and normally you - // can retrieve attributes for a read-only entry and for an entry - // opened in the non-shared mode (see the CreateFile() function - // documentation for details) the only meaningful explanation for - // ERROR_ACCESS_DENIED is that the entry is being removed. Also - // the DeleteFile() documentation mentions such a possibility. - // - verify_error (); - continue; - } + pair<entry_type, path> rpe ( + reparse_point_entry (p, true /* ignore_error */)); - if (reparse_point (a)) + if (rpe.first == entry_type::unknown) { - pair<entry_type, path> rp ( - reparse_point_entry (p, true /* ignore_error */)); + DWORD ec (verify_error ()); - if (rp.first == entry_type::unknown) - { - verify_error (); + // Silently skip the entry if it is not found (being already + // deleted) or we are in the ignore dangling mode. Otherwise, set + // the entry type to unknown. 
+ // + // Note that sometimes trying to obtain information for a being + // removed filesystem entry ends up with ERROR_ACCESS_DENIED (see + // DeleteFile() and CreateFile() for details). Probably getting + // this error code while trying to obtain the reparse point + // information (involves calling CreateFile(FILE_READ_EA) and + // DeviceIoControl()) can also be interpreted differently. We, + // however, always treat it as "access denied" in the detect + // dangling mode for good measure. Let's see if that won't be too + // noisy. + // + if (ec != ERROR_ACCESS_DENIED || !dd) continue; - } - e_.t_ = rp.first; + // Fall through. } - else - e_.t_ = directory (a) - ? entry_type::directory - : entry_type::regular; - if (e_.t_ == entry_type::symlink) + e_.t_ = rpe.first; + + // In this mode the entry type should be present and in the + // ignore_dangling mode it may not be entry_type::unknown. + // + //assert (e_.t_ && (dd || *e_.t_ != entry_type::unknown)); + + // Check if the symlink target exists and is accessible and set the + // target type. + // + if (*e_.t_ == entry_type::symlink) { // Query the target info. // // Note that we use entry_info_handle() rather than - // path_entry_info() to be able to verify an error on failure. + // path_entry_handle_info() to be able to verify an error on + // failure. // pair<auto_handle, BY_HANDLE_FILE_INFORMATION> ti ( entry_info_handle (p, @@ -2203,31 +2372,59 @@ namespace butl if (ti.first == nullhandle) { verify_error (); - continue; + + if (dd) + e_.lt_ = entry_type::unknown; + else + continue; } + else + { + ti.first.close (); // Checks for error. - ti.first.close (); // Checks for error. + e_.lt_ = directory (ti.second.dwFileAttributes) + ? entry_type::directory + : entry_type::regular; - // While at it, set the target type. - // - e_.lt_ = directory (ti.second.dwFileAttributes) - ? 
entry_type::directory - : entry_type::regular; + e_.mtime_ = to_timestamp (ti.second.ftLastWriteTime); + e_.atime_ = to_timestamp (ti.second.ftLastAccessTime); + } } + + // In this mode the symlink target type should be present and in the + // ignore_dangling mode it may not be entry_type::unknown. + // + //assert (*e_.t_ != entry_type::symlink || + // (e_.lt_ && (dd || *e_.lt_ != entry_type::unknown))); } } - else if (errno == ENOENT) + else { - // End of stream. + DWORD ec (GetLastError ()); + bool first (h_ == -1); + + // Check to distinguish non-existent vs empty directories. + // + // Note that dir_exists() handles not only the "filesystem entry does + // not exist" case but also the case when the entry exists but is not + // a directory. // - if (h_ != -1) + if (first && !dir_exists (e_.base ())) + throw_generic_error (ENOENT); + + if (ec == (first ? ERROR_FILE_NOT_FOUND : ERROR_NO_MORE_FILES)) { - _findclose (h_); - h_ = -1; + // End of stream. + // + if (h_ != -1) + { + FindClose (to_handle (h_)); + h_ = -1; + } } + else + throw_system_error (ec); } - else - throw_generic_error (errno); break; } @@ -2235,14 +2432,27 @@ namespace butl #endif // Search for paths matching the pattern and call the specified function for - // each matching path. Return false if the underlying func() call returns - // false. Otherwise the function conforms to the path_search() description. + // each matching path. Return false if the underlying func() or + // dangling_func() call returns false. Otherwise the function conforms to + // the path_search() description. // // Note that the access to the traversed directory tree (real or virtual) is // performed through the provided filesystem object. // static const string any_dir ("*/"); + // Filesystem traversal callbacks. + // + // Called before entering a directory for the recursive traversal. If + // returns false, then the directory is not entered. 
+ // + using preopen = function<bool (const dir_path&)>; + + // Called before skipping a dangling link. If returns false, then the + // traversal is stopped. + // + using preskip = function<bool (const dir_entry&)>; + template <typename FS> static bool search ( @@ -2250,11 +2460,14 @@ namespace butl dir_path pattern_dir, path_match_flags fl, const function<bool (path&&, const string& pattern, bool interm)>& func, + const function<bool (const dir_entry&)>& dangling_func, FS& filesystem) { bool follow_symlinks ((fl & path_match_flags::follow_symlinks) != path_match_flags::none); + assert (follow_symlinks || dangling_func == nullptr); + // Fast-forward the leftmost pattern non-wildcard components. So, for // example, search for foo/f* in /bar/ becomes search for f* in /bar/foo/. // @@ -2301,17 +2514,47 @@ namespace butl // bool simple (pattern.simple ()); - // Note that we rely on "small function object" optimization here. + // If symlinks need to be followed, then pass the preskip callback for the + // filesystem iterator. + // + bool fs (follow_symlinks || !simple); + preskip ps; + bool dangling_stop (false); + + if (fs) + { + if (dangling_func != nullptr) + { + // Note that we rely on the "small function object" optimization here. + // + ps = [&dangling_func, &dangling_stop] (const dir_entry& de) -> bool + { + dangling_stop = !dangling_func (de); + return !dangling_stop; + }; + } + else + { + ps = [] (const dir_entry& de) -> bool + { + throw_generic_error ( + de.ltype () == entry_type::symlink ? ENOENT : EACCES); + }; + } + } + + // Note that we rely on the "small function object" optimization here. // typename FS::iterator_type i (filesystem.iterator ( pattern_dir, path_pattern_recursive (pcr), path_pattern_self_matching (pcr), - follow_symlinks || !simple, + fs, [&pattern_dir, &func] (const dir_path& p) -> bool // Preopen. 
{ return func (pattern_dir / p, any_dir, true); - })); + }, + move (ps))); // Canonicalize the pattern component collapsing consecutive stars (used to // express that it is recursive) into a single one. @@ -2357,7 +2600,7 @@ namespace butl // represented by the iterator as an empty path, and so we need to // compute it (the leaf would actually be enough) for matching. This // leaf can be acquired from the pattern_dir (if not empty) or - // start_dir. We don't expect the start_dir to be empty, as the + // start_dir. We don't expect the start_dir to be empty, as the // filesystem object must replace an empty start directory with the // current one. This is the case when we search in the current directory // (start_dir is empty) with a pattern that starts with a *** wildcard @@ -2396,10 +2639,14 @@ namespace butl pattern_dir / path_cast<dir_path> (move (p)), fl, func, + dangling_func, filesystem)) return false; } + if (dangling_stop) + return false; + // If requested, also search with the absent-matching pattern path // component omitted, unless this is the only pattern component. // @@ -2407,8 +2654,15 @@ namespace butl pc.to_directory () && (!pattern_dir.empty () || !simple) && pc.string ().find_first_not_of ('*') == string::npos && - !search (pattern.leaf (pc), pattern_dir, fl, func, filesystem)) + !search (pattern.leaf (pc), + pattern_dir, + fl, + func, + dangling_func, + filesystem)) + { return false; + } return true; } @@ -2417,8 +2671,6 @@ namespace butl // static const dir_path empty_dir; - using preopen = function<bool (const dir_path&)>; - // Base for filesystem (see above) implementations. // // Don't copy start directory. 
It is expected to exist till the end of the @@ -2468,13 +2720,17 @@ namespace butl bool recursive, bool self, bool fs, - preopen po) + preopen po, + preskip ps) : start_ (move (p)), recursive_ (recursive), self_ (self), follow_symlinks_ (fs), - preopen_ (move (po)) + preopen_ (move (po)), + preskip_ (move (ps)) { + assert (fs || ps == nullptr); + open (dir_path (), self_); } @@ -2484,12 +2740,16 @@ namespace butl recursive_dir_iterator& operator= (const recursive_dir_iterator&) = delete; recursive_dir_iterator (recursive_dir_iterator&&) = default; - // Return false if no more entries left. Otherwise save the next entry path - // and return true. The path is relative to the directory being + // Return false if no more entries left. Otherwise save the next entry + // path and return true. The path is relative to the directory being // traversed and contains a trailing separator for sub-directories. Throw // std::system_error in case of a failure (insufficient permissions, // dangling symlink encountered, etc). // + // If symlinks need to be followed, then skip inaccessible/dangling + // entries or, if the preskip callback is specified and returns false for + // such an entry, stop the entire traversal. + // bool next (path& p) { @@ -2498,44 +2758,64 @@ namespace butl auto& i (iters_.back ()); - // If we got to the end of directory sub-entries, then go one level up - // and return this directory path. - // - if (i.first == dir_iterator ()) + for (;;) // Skip inaccessible/dangling entries. { - path d (move (i.second)); - iters_.pop_back (); + // If we got to the end of directory sub-entries, then go one level up + // and return this directory path. + // + if (i.first == dir_iterator ()) + { + path d (move (i.second)); + iters_.pop_back (); + + // Return the path unless it is the last one (the directory we + // started to iterate from) and the self flag is not set. 
+ // + if (iters_.empty () && !self_) + return false; + + p = move (d); + return true; + } + + const dir_entry& de (*i.first); - // Return the path unless it is the last one (the directory we started - // to iterate from) and the self flag is not set. + // Append separator if a directory. Note that dir_entry::type() can + // throw. // - if (iters_.empty () && !self_) - return false; + entry_type et (follow_symlinks_ ? de.type () : de.ltype ()); - p = move (d); - return true; - } + // If the entry turned out to be inaccessible/dangling, then skip it + // if the preskip function is not specified or returns true and stop + // the entire traversal otherwise. + // + if (et == entry_type::unknown) + { + if (preskip_ != nullptr && !preskip_ (de)) + { + iters_.clear (); + return false; + } - const dir_entry& de (*i.first); + ++i.first; + continue; + } - // Append separator if a directory. Note that dir_entry::type() can - // throw. - // - entry_type et (follow_symlinks_ ? de.type () : de.ltype ()); - path pe (et == entry_type::directory - ? path_cast<dir_path> (i.second / de.path ()) - : i.second / de.path ()); + path pe (et == entry_type::directory + ? path_cast<dir_path> (i.second / de.path ()) + : i.second / de.path ()); - ++i.first; + ++i.first; - if (recursive_ && pe.to_directory ()) - { - open (path_cast<dir_path> (move (pe)), true); - return next (p); - } + if (recursive_ && pe.to_directory ()) + { + open (path_cast<dir_path> (move (pe)), true); + return next (p); + } - p = move (pe); - return true; + p = move (pe); + return true; + } } private: @@ -2557,10 +2837,15 @@ namespace butl { dir_path d (start_ / p); - // If we follow symlinks, then we ignore the dangling ones. + // If we follow symlinks, then we may need to skip the dangling + // ones. Note, however, that we will be skipping them not at the + // dir_iterator level but ourselves, after calling the preskip + // callback function (see next() for details). // i = dir_iterator (!d.empty () ? 
d : dir_path ("."), - follow_symlinks_); + follow_symlinks_ + ? dir_iterator::detect_dangling + : dir_iterator::no_follow); } iters_.emplace_back (move (i), move (p)); @@ -2590,6 +2875,7 @@ namespace butl bool self_; bool follow_symlinks_; preopen preopen_; + preskip preskip_; small_vector<pair<dir_iterator, dir_path>, 1> iters_; }; @@ -2613,13 +2899,15 @@ namespace butl bool recursive, bool self, bool follow_symlinks, - preopen po) const + preopen po, + preskip ps) const { return iterator_type (start_ / p, recursive, self, follow_symlinks, - move (po)); + move (po), + move (ps)); } }; @@ -2628,10 +2916,11 @@ namespace butl const path& pattern, const function<bool (path&&, const string& pattern, bool interm)>& func, const dir_path& start, - path_match_flags flags) + path_match_flags flags, + const function<bool (const dir_entry&)>& dangling_func) { real_filesystem fs (pattern.relative () ? start : empty_dir); - search (pattern, dir_path (), flags, func, fs); + search (pattern, dir_path (), flags, func, dangling_func, fs); } // Search path in the directory tree represented by a path. 
@@ -2789,7 +3078,8 @@ namespace butl bool recursive, bool self, bool /*follow_symlinks*/, - preopen po) + preopen po, + preskip) { // If path and sub-path are non-empty, and both are absolute or relative, // then no extra effort is required (prior to checking if one is a @@ -2848,6 +3138,6 @@ namespace butl path_match_flags flags) { path_filesystem fs (start, entry); - search (pattern, dir_path (), flags, func, fs); + search (pattern, dir_path (), flags, func, nullptr /* dangle_func */, fs); } } diff --git a/libbutl/filesystem.mxx b/libbutl/filesystem.hxx index 935fc3f..0f5fb0b 100644 --- a/libbutl/filesystem.mxx +++ b/libbutl/filesystem.hxx @@ -1,9 +1,7 @@ -// file : libbutl/filesystem.mxx -*- C++ -*- +// file : libbutl/filesystem.hxx -*- C++ -*- // license : MIT; see accompanying LICENSE file -#ifndef __cpp_modules_ts #pragma once -#endif #include <errno.h> // E* @@ -22,7 +20,6 @@ using mode_t = int; #endif -#ifndef __cpp_lib_modules_ts #include <string> #include <cstddef> // ptrdiff_t #include <cstdint> // uint16_t, etc @@ -30,37 +27,45 @@ #include <iterator> // input_iterator_tag #include <functional> -#include <chrono> //@@ MOD needed by timestamp module (no re-export). -#endif +#include <libbutl/path.hxx> +#include <libbutl/optional.hxx> +#include <libbutl/timestamp.hxx> +#include <libbutl/path-pattern.hxx> // path_match_flags -// Other includes. -#ifdef __cpp_modules_ts -export module butl.filesystem; +#include <libbutl/export.hxx> -#ifdef __cpp_lib_modules_ts -import std.core; -#endif +namespace butl +{ + // Path permissions. + // + enum class permissions: std::uint16_t + { + // Note: matching POSIX values. 
+ // + xo = 0001, + wo = 0002, + ro = 0004, -import butl.path; -import butl.timestamp; -import butl.path_pattern; // path_match_flags + xg = 0010, + wg = 0020, + rg = 0040, -import butl.utility; // operator<<(ostream,exception), throw_generic_error() -#else -#include <libbutl/path.mxx> -#include <libbutl/timestamp.mxx> -#include <libbutl/path-pattern.mxx> + xu = 0100, + wu = 0200, + ru = 0400, -#include <libbutl/utility.mxx> -#endif + none = 0 + }; -#include <libbutl/export.hxx> + inline permissions operator& (permissions, permissions); + inline permissions operator| (permissions, permissions); + inline permissions operator&= (permissions&, permissions); + inline permissions operator|= (permissions&, permissions); -LIBBUTL_MODEXPORT namespace butl -{ // Return true if the path is to an existing regular file. Note that by // default this function follows symlinks. Underlying OS errors are reported - // by throwing std::system_error, unless ignore_error is true. + // by throwing std::system_error, unless ignore_error is true (in which case + // erroneous entries are treated as non-existent). // LIBBUTL_SYMEXPORT bool file_exists (const char*, @@ -73,7 +78,8 @@ LIBBUTL_MODEXPORT namespace butl // Return true if the path is to an existing directory. Note that this // function follows symlinks. Underlying OS errors are reported by throwing - // std::system_error, unless ignore_error is true. + // std::system_error, unless ignore_error is true (in which case erroneous + // entries are treated as non-existent). // LIBBUTL_SYMEXPORT bool dir_exists (const char*, bool ignore_error = false); @@ -84,7 +90,8 @@ LIBBUTL_MODEXPORT namespace butl // Return true if the path is to an existing file system entry. Note that by // default this function doesn't follow symlinks. Underlying OS errors are - // reported by throwing std::system_error, unless ignore_error is true. 
+ // reported by throwing std::system_error, unless ignore_error is true (in + // which case erroneous entries are treated as non-existent). // LIBBUTL_SYMEXPORT bool entry_exists (const char*, @@ -117,7 +124,10 @@ LIBBUTL_MODEXPORT namespace butl // Return a flag indicating if the path is to an existing filesystem entry // and its info if so. Note that by default this function doesn't follow // symlinks. Underlying OS errors are reported by throwing - // std::system_error, unless ignore_error is true. + // std::system_error, unless ignore_error is true (in which case erroneous + // entries are treated as non-existent). + // + // See also fdstat() in fdstream. // LIBBUTL_SYMEXPORT std::pair<bool, entry_stat> path_entry (const char*, @@ -206,9 +216,12 @@ LIBBUTL_MODEXPORT namespace butl // is not atomic. It is also not atomic for the directory-type reparse point // removal. // - LIBBUTL_SYMEXPORT rmfile_status + rmfile_status try_rmfile (const path&, bool ignore_error = false); + optional<rmfile_status> + try_rmfile_ignore_error (const path&); + // Automatically try to remove a non-empty path on destruction unless // cancelled. Since the non-cancelled destruction will normally happen as a // result of an exception, the failure to remove the path is silently @@ -228,8 +241,8 @@ LIBBUTL_MODEXPORT namespace butl // Movable-only type. Move-assignment cancels the lhs object. // - auto_rm (auto_rm&&); - auto_rm& operator= (auto_rm&&); + auto_rm (auto_rm&&) noexcept; + auto_rm& operator= (auto_rm&&) noexcept; auto_rm (const auto_rm&) = delete; auto_rm& operator= (const auto_rm&) = delete; @@ -394,11 +407,13 @@ LIBBUTL_MODEXPORT namespace butl inline cpflags operator&= (cpflags&, cpflags); inline cpflags operator|= (cpflags&, cpflags); - // Copy a regular file, including its permissions, and optionally timestamps. - // Throw std::system_error on failure. Fail if the destination file exists - // and the overwrite_content flag is not set. 
Leave permissions of an - // existing destination file intact unless the overwrite_permissions flag is - // set. Delete incomplete copies before throwing. + // Copy a regular file, including its permissions (unless custom permissions + // are specified), and optionally timestamps. Throw std::system_error on + // failure. Fail if the destination file exists and the overwrite_content + // flag is not set. Leave permissions of an existing destination file intact + // (including if custom permissions are specified) unless the + // overwrite_permissions flag is set. Delete incomplete copies before + // throwing. // // Note that in case of overwriting, the existing destination file gets // truncated (not deleted) prior to being overwritten. As a side-effect, @@ -410,7 +425,10 @@ LIBBUTL_MODEXPORT namespace butl // fail. // LIBBUTL_SYMEXPORT void - cpfile (const path& from, const path& to, cpflags = cpflags::none); + cpfile (const path& from, + const path& to, + cpflags = cpflags::none, + optional<permissions> perm = nullopt); // Copy a regular file into (inside) an existing directory. // @@ -618,32 +636,6 @@ LIBBUTL_MODEXPORT namespace butl return dir_atime (p.string ().c_str (), t); } - // Path permissions. - // - enum class permissions: std::uint16_t - { - // Note: matching POSIX values. - // - xo = 0001, - wo = 0002, - ro = 0004, - - xg = 0010, - wg = 0020, - rg = 0040, - - xu = 0100, - wu = 0200, - ru = 0400, - - none = 0 - }; - - inline permissions operator& (permissions, permissions); - inline permissions operator| (permissions, permissions); - inline permissions operator&= (permissions&, permissions); - inline permissions operator|= (permissions&, permissions); - // Get path permissions. Throw std::system_error on failure. Note that this // function resolves symlinks. // @@ -665,12 +657,45 @@ LIBBUTL_MODEXPORT namespace butl // Symlink target type in case of the symlink, ltype() otherwise. 
// + // If type() returns entry_type::unknown then this entry is inaccessible + // (ltype() also returns entry_type::unknown) or is a dangling symlink + // (ltype() returns entry_type::symlink). Used with the detect_dangling + // dir_iterator mode. Note that on POSIX ltype() can never return unknown + // (because it is part of the directory iteration result). + // entry_type type () const; entry_type ltype () const; + // Modification and access times of the filesystem entry if it is not a + // symlink and of the symlink target otherwise. + // + // These are provided as an optimization if they can be obtained as a + // byproduct of work that is already being done anyway (iteration itself, + // calls to [l]type(), etc). If (not yet) available, timestamp_unknown is + // returned. + // + // Specifically: + // + // - On Windows mtime is always set by dir_iterator for entries other than + // reparse points. + // + // - On all platforms mtime and atime are always set for symlink targets + // by dir_iterator in the {detect,ignore}_dangling modes. + // + // - On all platforms mtime and atime can potentially be set by [l]type() + // if the stat() call is required to retrieve the type information (the + // native directory entry iterating API doesn't provide it, the type of + // the symlink target is queried, etc). + // + timestamp + mtime () const {return mtime_;} + + timestamp + atime () const {return atime_;} + // Entry path (excluding the base). To get the full path, do // base () / path (). 
// @@ -681,8 +706,17 @@ LIBBUTL_MODEXPORT namespace butl base () const {return b_;} dir_entry () = default; - dir_entry (entry_type t, path_type p, dir_path b) - : t_ (t), p_ (std::move (p)), b_ (std::move (b)) {} + + dir_entry (entry_type t, + path_type p, + dir_path b, + timestamp mt = timestamp_unknown, + timestamp at = timestamp_unknown) + : t_ (t), + mtime_ (mt), + atime_ (at), + p_ (std::move (p)), + b_ (std::move (b)) {} private: entry_type @@ -691,8 +725,14 @@ LIBBUTL_MODEXPORT namespace butl private: friend class dir_iterator; - mutable entry_type t_ = entry_type::unknown; // Lazy evaluation. - mutable entry_type lt_ = entry_type::unknown; // Lazy evaluation. + // Note: lazy evaluation. + // + mutable optional<entry_type> t_; // Entry type. + mutable optional<entry_type> lt_; // Symlink target type. + + mutable timestamp mtime_ = timestamp_unknown; + mutable timestamp atime_ = timestamp_unknown; + path_type p_; dir_path b_; }; @@ -709,12 +749,15 @@ LIBBUTL_MODEXPORT namespace butl ~dir_iterator (); dir_iterator () = default; - // If it is requested to ignore dangling symlinks, then the increment - // operator will skip symlinks that refer to non-existing or inaccessible - // targets. That implies that it will always try to stat() symlinks. + // If the mode is either ignore_dangling or detect_dangling, then stat() + // the entry and either ignore inaccessible/dangling entry or return it + // with the corresponding dir_entry type set to unknown (see dir_entry + // type()/ltype() for details). 
// + enum mode {no_follow, detect_dangling, ignore_dangling}; + explicit - dir_iterator (const dir_path&, bool ignore_dangling); + dir_iterator (const dir_path&, mode); dir_iterator (const dir_iterator&) = delete; dir_iterator& operator= (const dir_iterator&) = delete; @@ -740,10 +783,10 @@ LIBBUTL_MODEXPORT namespace butl #ifndef _WIN32 DIR* h_ = nullptr; #else - intptr_t h_ = -1; + intptr_t h_ = -1; // INVALID_HANDLE_VALUE #endif - bool ignore_dangling_ = false; + mode mode_ = no_follow; }; // Range-based for loop support. @@ -769,7 +812,7 @@ LIBBUTL_MODEXPORT namespace butl // Wildcard pattern search (aka glob). // - // For details on the wildcard patterns see <libbutl/path-pattern.mxx> + // For details on the wildcard patterns see <libbutl/path-pattern.hxx> // Search for paths matching the pattern calling the specified function for // each matching path (see below for details). @@ -834,9 +877,20 @@ LIBBUTL_MODEXPORT namespace butl // (a/b/, b*/, true) // (a/b/c/, c*/, false) // - // Note that recursive iterating through directories currently goes - // depth-first which make sense for the cleanup use cases. In future we may - // want to make it controllable. + // Note that recursive iterating through directories currently goes depth- + // first which make sense for the cleanup use cases. In the future we may + // want to make this controllable. + // + // If the match flags contain follow_symlinks, then call the dangling + // callback function for inaccessible/dangling entries if specified, and + // throw appropriate std::system_error otherwise. If the callback function + // returns true, then inaccessible/dangling entry is ignored. Otherwise, + // the entire search is stopped. + // + // Note also that if pattern is not simple (that is, contains directory + // components), then some symlinks (those that are matched against the + // directory components) may still be followed and thus the dangling + // function called. 
// LIBBUTL_SYMEXPORT void path_search (const path& pattern, @@ -844,7 +898,8 @@ LIBBUTL_MODEXPORT namespace butl const std::string& pattern, bool interm)>&, const dir_path& start = dir_path (), - path_match_flags = path_match_flags::follow_symlinks); + path_match_flags = path_match_flags::follow_symlinks, + const std::function<bool (const dir_entry&)>& dangling = nullptr); // Same as above, but behaves as if the directory tree being searched // through contains only the specified entry. The start directory is used if diff --git a/libbutl/filesystem.ixx b/libbutl/filesystem.ixx index f7c3777..b3f9224 100644 --- a/libbutl/filesystem.ixx +++ b/libbutl/filesystem.ixx @@ -1,6 +1,9 @@ // file : libbutl/filesystem.ixx -*- C++ -*- // license : MIT; see accompanying LICENSE file +#include <libbutl/utility.hxx> // operator<<(ostream,exception), + // throw_generic_error() + namespace butl { inline bool @@ -8,7 +11,7 @@ namespace butl { // @@ Could 0 size be a valid and faster way? // - return dir_iterator (d, false /* ignore_dangling */) == dir_iterator (); + return dir_iterator (d, dir_iterator::no_follow) == dir_iterator (); } inline bool @@ -38,6 +41,23 @@ namespace butl return e ? rmdir_status::success : rmdir_status::not_exist; } + LIBBUTL_SYMEXPORT optional<rmfile_status> + try_rmfile_maybe_ignore_error (const path&, bool ignore_error); + + inline rmfile_status + try_rmfile (const path& p, bool ignore_error) + { + auto r (try_rmfile_maybe_ignore_error (p, ignore_error)); + return r ? 
*r : rmfile_status::success; + } + + inline optional<rmfile_status> + try_rmfile_ignore_error (const path& p) + { + return try_rmfile_maybe_ignore_error (p, true); + } + + inline path followsymlink (const path& p) { @@ -53,7 +73,7 @@ namespace butl // template <typename P> inline auto_rm<P>:: - auto_rm (auto_rm&& x) + auto_rm (auto_rm&& x) noexcept : path (std::move (x.path)), active (x.active) { x.active = false; @@ -61,7 +81,7 @@ namespace butl template <typename P> inline auto_rm<P>& auto_rm<P>:: - operator= (auto_rm&& x) + operator= (auto_rm&& x) noexcept { if (this != &x) { @@ -117,54 +137,28 @@ namespace butl static_cast<std::uint16_t> (y)); } - // path_match_flags - // - inline path_match_flags operator& (path_match_flags x, path_match_flags y) - { - return x &= y; - } - - inline path_match_flags operator| (path_match_flags x, path_match_flags y) - { - return x |= y; - } - - inline path_match_flags operator&= (path_match_flags& x, path_match_flags y) - { - return x = static_cast<path_match_flags> ( - static_cast<std::uint16_t> (x) & - static_cast<std::uint16_t> (y)); - } - - inline path_match_flags operator|= (path_match_flags& x, path_match_flags y) - { - return x = static_cast<path_match_flags> ( - static_cast<std::uint16_t> (x) | - static_cast<std::uint16_t> (y)); - } - // dir_entry // inline entry_type dir_entry:: ltype () const { - return t_ != entry_type::unknown ? t_ : (t_ = type (false)); + return t_ ? *t_ : *(t_ = type (false /* follow_symlinks */)); } inline entry_type dir_entry:: type () const { entry_type t (ltype ()); - return t != entry_type::symlink - ? t - : lt_ != entry_type::unknown ? lt_ : (lt_ = type (true)); + return t != entry_type::symlink ? t : + lt_ ? 
*lt_ : + *(lt_ = type (true /* follow_symlinks */)); } // dir_iterator // inline dir_iterator:: dir_iterator (dir_iterator&& x) noexcept - : e_ (std::move (x.e_)), h_ (x.h_), ignore_dangling_ (x.ignore_dangling_) + : e_ (std::move (x.e_)), h_ (x.h_), mode_ (x.mode_) { #ifndef _WIN32 x.h_ = nullptr; diff --git a/libbutl/ft/lang.hxx b/libbutl/ft/lang.hxx index 567f5a4..82971d2 100644 --- a/libbutl/ft/lang.hxx +++ b/libbutl/ft/lang.hxx @@ -7,9 +7,14 @@ // __cpp_thread_local (extension) // // If this macro is undefined then one may choose to fallback to __thread. -// Note, however, that it only for values that do not require dynamic +// Note, however, that it only works for values that do not require dynamic // (runtime) initialization. // +// Note that thread_local with dynamic allocation/destruction appears to be +// broken when we use our own implementation of C++14 threads on MinGW. So +// we restrict ourselves to __thread which appears to be functioning, at +// least in the POSIX threads GCC configuration. +// #ifndef __cpp_thread_local // // Apparently Apple's Clang "temporarily disabled" C++11 thread_local until @@ -20,7 +25,7 @@ # if __apple_build_version__ >= 8000000 # define __cpp_thread_local 201103 # endif -# else +# elif !defined(LIBBUTL_MINGW_STDTHREAD) # define __cpp_thread_local 201103 # endif #endif diff --git a/libbutl/git.cxx b/libbutl/git.cxx index b9dd9bc..f37e16a 100644 --- a/libbutl/git.cxx +++ b/libbutl/git.cxx @@ -1,43 +1,11 @@ // file : libbutl/git.cxx -*- C++ -*- // license : MIT; see accompanying LICENSE file -#ifndef __cpp_modules_ts -#include <libbutl/git.mxx> -#endif +#include <libbutl/git.hxx> -// C includes. - -#include <cassert> - -#ifndef __cpp_lib_modules_ts -#include <string> - -#include <cstddef> // size_t -#endif - -// Other includes. - -#ifdef __cpp_modules_ts -module butl.git; - -// Only imports additional to interface. 
-#ifdef __clang__ -#ifdef __cpp_lib_modules_ts -import std.core; -#endif -import butl.path; -import butl.optional; -import butl.semantic_version -#endif - -import butl.utility; // digit() -import butl.filesystem; // entry_exists() -#else -#include <libbutl/utility.mxx> -#include <libbutl/optional.mxx> -#include <libbutl/filesystem.mxx> -#include <libbutl/semantic-version.mxx> -#endif +#include <libbutl/optional.hxx> +#include <libbutl/filesystem.hxx> // entry_exists() +#include <libbutl/semantic-version.hxx> using namespace std; @@ -68,7 +36,9 @@ namespace butl // MinGit: git version 2.16.1.windows.1 // if (s.compare (0, 12, "git version ") == 0) - return parse_semantic_version (s, 12, "" /* build_separators */); + return parse_semantic_version (s, 12, + semantic_version::allow_build, + "" /* build_separators */); return nullopt; } diff --git a/libbutl/git.mxx b/libbutl/git.hxx index 3f003be..add721e 100644 --- a/libbutl/git.mxx +++ b/libbutl/git.hxx @@ -1,35 +1,17 @@ -// file : libbutl/git.mxx -*- C++ -*- +// file : libbutl/git.hxx -*- C++ -*- // license : MIT; see accompanying LICENSE file -#ifndef __cpp_modules_ts #pragma once -#endif -// C includes. - -#ifndef __cpp_lib_modules_ts #include <string> -#endif - -// Other includes. -#ifdef __cpp_modules_ts -export module butl.git; -#ifdef __cpp_lib_modules_ts -import std.core; -#endif -import butl.path; -import butl.optional; -import butl.semantic_version; -#else -#include <libbutl/path.mxx> -#include <libbutl/optional.mxx> -#include <libbutl/semantic-version.mxx> -#endif +#include <libbutl/path.hxx> +#include <libbutl/optional.hxx> +#include <libbutl/semantic-version.hxx> #include <libbutl/export.hxx> -LIBBUTL_MODEXPORT namespace butl +namespace butl { // Return true if the specified directory is a git repository root (contains // the .git filesystem entry). 
diff --git a/libbutl/host-os-release.cxx b/libbutl/host-os-release.cxx new file mode 100644 index 0000000..f13f62c --- /dev/null +++ b/libbutl/host-os-release.cxx @@ -0,0 +1,323 @@ +// file : libbutl/host-os-release.cxx -*- C++ -*- +// license : MIT; see accompanying LICENSE file + +#include <libbutl/host-os-release.hxx> + +#include <sstream> +#include <stdexcept> // runtime_error + +#include <libbutl/path.hxx> +#include <libbutl/path-io.hxx> +#include <libbutl/utility.hxx> +#include <libbutl/process.hxx> +#include <libbutl/fdstream.hxx> +#include <libbutl/filesystem.hxx> // file_exists() +#include <libbutl/string-parser.hxx> // parse_quoted() + +#ifdef _WIN32 +# include <libbutl/win32-utility.hxx> +#endif + +using namespace std; + +namespace butl +{ + // Note: exported for access from the test. + // + LIBBUTL_SYMEXPORT os_release + host_os_release_linux (path f = {}) + { + os_release r; + + // According to os-release(5), we should use /etc/os-release and fallback + // to /usr/lib/os-release if the former does not exist. It also lists the + // fallback values for individual variables, in case some are not present. + // + auto exists = [] (const path& f) + { + try + { + return file_exists (f); + } + catch (const system_error& e) + { + ostringstream os; + os << "unable to stat path " << f << ": " << e; + throw runtime_error (os.str ()); + } + }; + + if (!f.empty () + ? exists (f) + : (exists (f = path ("/etc/os-release")) || + exists (f = path ("/usr/lib/os-release")))) + { + try + { + ifdstream ifs (f, ifdstream::badbit); + + string l; + for (uint64_t ln (1); !eof (getline (ifs, l)); ++ln) + { + trim (l); + + // Skip blank lines and comments. + // + if (l.empty () || l[0] == '#') + continue; + + // The variable assignments are in the "shell style" and so can be + // quoted/escaped. For now we only handle quoting, which is what all + // the instances seen in the wild seem to use. 
+ // + size_t p (l.find ('=')); + if (p == string::npos) + continue; + + string n (l, 0, p); + l.erase (0, p + 1); + + using string_parser::parse_quoted; + using string_parser::invalid_string; + + try + { + if (n == "ID_LIKE") + { + r.like_ids.clear (); + + vector<string> vs (parse_quoted (l, true /* unquote */)); + for (const string& v: vs) + { + for (size_t b (0), e (0); next_word (v, b, e); ) + { + r.like_ids.push_back (string (v, b, e - b)); + } + } + } + else if (string* p = (n == "ID" ? &r.name_id : + n == "VERSION_ID" ? &r.version_id : + n == "VARIANT_ID" ? &r.variant_id : + n == "NAME" ? &r.name : + n == "VERSION_CODENAME" ? &r.version_codename : + n == "VARIANT" ? &r.variant : + nullptr)) + { + vector<string> vs (parse_quoted (l, true /* unquote */)); + switch (vs.size ()) + { + case 0: *p = ""; break; + case 1: *p = move (vs.front ()); break; + default: throw invalid_string (0, "multiple values"); + } + } + } + catch (const invalid_string& e) + { + ostringstream os; + os << "invalid " << n << " value in " << f << ':' << ln << ": " + << e; + throw runtime_error (os.str ()); + } + } + + ifs.close (); + } + catch (const ios::failure& e) + { + ostringstream os; + os << "unable to read from " << f << ": " << e; + throw runtime_error (os.str ()); + } + } + + // Assign fallback values. + // + if (r.name_id.empty ()) r.name_id = "linux"; + if (r.name.empty ()) r.name = "Linux"; + + return r; + } + + static os_release + host_os_release_macos () + { + // Run sw_vers -productVersion to get Mac OS version. + // + try + { + process pr; + try + { + fdpipe pipe (fdopen_pipe ()); + + pr = process_start (0, pipe, 2, "sw_vers", "-productVersion"); + + pipe.out.close (); + ifdstream is (move (pipe.in), fdstream_mode::skip, ifdstream::badbit); + + // The output should be one line containing the version. 
+ // + optional<string> v; + for (string l; !eof (getline (is, l)); ) + { + if (l.empty () || v) + { + v = nullopt; + break; + } + + v = move (l); + } + + is.close (); // Detect errors. + + if (pr.wait ()) + { + if (!v) + throw runtime_error ("unexpected sw_vers -productVersion output"); + + return os_release {"macos", {}, move (*v), "", "Mac OS", "", ""}; + } + + } + catch (const ios::failure& e) + { + if (pr.wait ()) + { + ostringstream os; + os << "error reading sw_vers output: " << e; + throw runtime_error (os.str ()); + } + + // Fall through. + } + + // We should only get here if the child exited with an error status. + // + assert (!pr.wait ()); + throw runtime_error ("process sw_vers exited with non-zero code"); + } + catch (const process_error& e) + { + ostringstream os; + os << "unable to execute sw_vers: " << e; + throw runtime_error (os.str ()); + } + } + + static os_release + host_os_release_windows () + { +#ifdef _WIN32 + // The straightforward way to get the version would be the GetVersionEx() + // Win32 function. However, if the application is built with a certain + // assembly manifest, this function will return the version the + // application was built for rather than what's actually running. + // + // The other plausible options are to call the `ver` program and parse it + // output (of questionable regularity) or to call RtlGetVersion(). The + // latter combined with GetProcAddress() seems to be a widely-used + // approach, so we are going with that (seeing that we employ a similar + // technique in quite a few places). 
+ // + HMODULE nh (GetModuleHandle ("ntdll.dll")); + if (nh == nullptr) + throw runtime_error ("unable to get handle to ntdll.dll"); + + using RtlGetVersion = LONG /*NTSTATUS*/ (WINAPI*)(PRTL_OSVERSIONINFOW); + + RtlGetVersion gv ( + function_cast<RtlGetVersion> ( + GetProcAddress (nh, "RtlGetVersion"))); + + // RtlGetVersion() is available from Windows 2000 which is way before + // anything we might possibly care about (e.g., XP or 7). + // + if (gv == nullptr) + throw runtime_error ("unable to get address of RtlGetVersion()"); + + RTL_OSVERSIONINFOW vi; + vi.dwOSVersionInfoSize = sizeof (vi); + gv (&vi); // Always succeeds, according to documentation. + + // Ok, the real mess starts here. Here is how the commonly known Windows + // versions correspond to the major/minor/build numbers and how we will + // map them (note that there are also Server versions in the mix; see the + // OSVERSIONINFOEXW struct documentation for the complete picture): + // + // major minor build mapped + // Windows 11 10 0 >=22000 11 + // Windows 10 10 0 <22000 10 + // Windows 8.1 6 3 8.1 + // Windows 8 6 2 8 + // Windows 7 6 1 7 + // Windows Vista 6 0 6 + // Windows XP Pro/64-bit 5 2 5.2 + // Windows XP 5 1 5.1 + // Windows 2000 5 0 5 + // + // Based on this it's probably not wise to try to map any future versions + // automatically. + // + string v; + if (vi.dwMajorVersion == 10 && vi.dwMinorVersion == 0) + { + v = vi.dwBuildNumber >= 22000 ? 
"11" : "10"; + } + else if (vi.dwMajorVersion == 6 && vi.dwMinorVersion == 3) v = "8.1"; + else if (vi.dwMajorVersion == 6 && vi.dwMinorVersion == 2) v = "8"; + else if (vi.dwMajorVersion == 6 && vi.dwMinorVersion == 1) v = "7"; + else if (vi.dwMajorVersion == 6 && vi.dwMinorVersion == 0) v = "6"; + else if (vi.dwMajorVersion == 5 && vi.dwMinorVersion == 2) v = "5.2"; + else if (vi.dwMajorVersion == 5 && vi.dwMinorVersion == 1) v = "5.1"; + else if (vi.dwMajorVersion == 5 && vi.dwMinorVersion == 0) v = "5"; + else throw ("unknown windows version " + + std::to_string (vi.dwMajorVersion) + '.' + + std::to_string (vi.dwMinorVersion) + '.' + + std::to_string (vi.dwBuildNumber)); + + return os_release {"windows", {}, move (v), "", "Windows", "", ""}; +#else + throw runtime_error ("unexpected host operating system"); +#endif + } + + optional<os_release> + host_os_release (const target_triplet& h) + { + const string& c (h.class_); + const string& s (h.system); + + if (c == "linux") + return host_os_release_linux (); + + if (c == "macos") + return host_os_release_macos (); + + if (c == "windows") + return host_os_release_windows (); + + if (c == "bsd") + { + // @@ TODO: ideally we would want to run uname and obtain the actual + // version we are runnig on rather than what we've been built for. + // (Think also how this will affect tests). + // + if (s == "freebsd") + return os_release {"freebsd", {}, h.version, "", "FreeBSD", "", ""}; + + if (s == "netbsd") + return os_release {"netbsd", {}, h.version, "", "NetBSD", "", ""}; + + if (s == "openbsd") + return os_release {"openbsd", {}, h.version, "", "OpenBSD", "", ""}; + + // Assume some other BSD. 
+ // + return os_release {s, {}, h.version, "", s, "", ""}; + } + + return nullopt; + } +} diff --git a/libbutl/host-os-release.hxx b/libbutl/host-os-release.hxx new file mode 100644 index 0000000..058afdc --- /dev/null +++ b/libbutl/host-os-release.hxx @@ -0,0 +1,86 @@ +// file : libbutl/host-os-release.hxx -*- C++ -*- +// license : MIT; see accompanying LICENSE file + +#pragma once + +#include <string> +#include <vector> + +#include <libbutl/optional.hxx> +#include <libbutl/target-triplet.hxx> + +#include <libbutl/export.hxx> + +namespace butl +{ + // Information extracted from /etc/os-release on Linux. See os-release(5) + // for background. For other platforms we derive the equivalent information + // from other sources. Some examples: + // + // {"debian", {}, "10", "", + // "Debian GNU/Linux", "buster", ""} + // + // {"fedora", {}, "35", "workstation", + // "Fedora Linux", "", "Workstation Edition"} + // + // {"ubuntu", {"debian"}, "20.04", "", + // "Ubuntu", "focal", ""} + // + // {"macos", {}, "12.5", "", + // "Mac OS", "", ""} + // + // {"freebsd", {}, "13.1", "", + // "FreeBSD", "", ""} + // + // {"windows", {}, "10", "", + // "Windows", "", ""} + // + // Note that for Mac OS, the version is the Mac OS version (as printed by + // sw_vers) rather than Darwin version (as printed by uname). + // + // For Windows we currently do not distinguish the Server edition and the + // version mapping is as follows: + // + // Windows 11 11 + // Windows 10 10 + // Windows 8.1 8.1 + // Windows 8 8 + // Windows 7 7 + // Windows Vista 6 + // Windows XP Pro/64-bit 5.2 + // Windows XP 5.1 + // Windows 2000 5 + // + // Note that version_id may be empty, for example, on Debian testing: + // + // {"debian", {}, "", "", + // "Debian GNU/Linux", "", ""} + // + // Note also that we don't extract PRETTY_NAME because its content is + // unpredictable. For example, it may include variant, as in "Fedora Linux + // 35 (Workstation Edition)". 
Instead, construct it from the individual + // components as appropriate, normally "$name $version ($version_codename)". + // + struct os_release + { + std::string name_id; // ID + std::vector<std::string> like_ids; // ID_LIKE + std::string version_id; // VERSION_ID + std::string variant_id; // VARIANT_ID + + std::string name; // NAME + std::string version_codename; // VERSION_CODENAME + std::string variant; // VARIANT + }; + + // Return the release information for the specified host or nullopt if the + // specific host is unknown/unsupported. Throw std::runtime_error if + // anything goes wrong. + // + // Note that "host" here implies that we may be running programs, reading + // files, examining environment variables, etc., of the machine we are + // running on. + // + LIBBUTL_SYMEXPORT optional<os_release> + host_os_release (const target_triplet& host); +} diff --git a/libbutl/json/event.hxx b/libbutl/json/event.hxx new file mode 100644 index 0000000..77185cc --- /dev/null +++ b/libbutl/json/event.hxx @@ -0,0 +1,27 @@ +#pragma once + +#include <cstddef> +#include <cstdint> + +namespace butl +{ + namespace json + { + // Parsing/serialization event. + // + enum class event: std::uint8_t + { + begin_object = 1, + end_object, + begin_array, + end_array, + name, + string, + number, + boolean, + null + }; + + constexpr std::size_t event_count = 9; + } +} diff --git a/libbutl/json/parser.cxx b/libbutl/json/parser.cxx new file mode 100644 index 0000000..8ef7422 --- /dev/null +++ b/libbutl/json/parser.cxx @@ -0,0 +1,645 @@ +#define PDJSON_SYMEXPORT static // See below. + +#include <libbutl/json/parser.hxx> + +#include <istream> + +// There is an issue (segfault) with using std::current_exception() and +// std::rethrow_exception() with older versions of libc++ on Linux. 
While the +// exact root cause hasn't been determined, the suspicion is that something +// gets messed up if we "smuggle" std::exception_ptr through extern "C" call +// frames (we cannot even destroy such an exception without a segfault). We +// also could not determine in which version exactly this has been fixed but +// we know that libc++ 6.0.0 doesn't appear to have this issue (though we are +// not entirely sure the issue is (only) in libc++; libgcc_s could also be +// involved). +// +// The workaround is to just catch (and note) the exception and then throw a +// new instance of generic std::istream::failure. In order not to drag the +// below test into the header, we wrap exception_ptr with optional<> and use +// NULL to indicate the presence of the exception when the workaround is +// required. +// +// Note that if/when we drop this workaround, we should also get rid of +// optional<> in stream::exception member. +// +#undef LIBBUTL_JSON_NO_EXCEPTION_PTR + +#if defined (__linux__) && defined(__clang__) +# if __has_include(<__config>) +# include <__config> // _LIBCPP_VERSION +# if _LIBCPP_VERSION < 6000 +# define LIBBUTL_JSON_NO_EXCEPTION_PTR 1 +# endif +# endif +#endif + +namespace butl +{ + namespace json + { + using namespace std; + + parser:: + ~parser () + { + json_close (impl_); + } + + static int + stream_get (void* x) + { + auto& s (*static_cast<parser::stream*> (x)); + + // In the multi-value mode reading of whitespaces/separators is split + // between our code and pdjson's. As a result, these functions may end + // up being called more than once after EOF is reached. Which is + // something iostream does not handle gracefully. + // + if (!s.is->eof ()) + { + try + { + // We first peek not to trip failbit on EOF. + // + if (s.is->peek () != istream::traits_type::eof ()) + return static_cast<char> (s.is->get ()); + } + catch (...) 
+ { +#ifndef LIBBUTL_JSON_NO_EXCEPTION_PTR + s.exception = current_exception (); +#else + s.exception = nullptr; +#endif + } + } + + return EOF; + } + + static int + stream_peek (void* x) + { + auto& s (*static_cast<parser::stream*> (x)); + + if (!s.is->eof ()) + { + try + { + auto c (s.is->peek ()); + if (c != istream::traits_type::eof ()) + return static_cast<char> (c); + } + catch (...) + { +#ifndef LIBBUTL_JSON_NO_EXCEPTION_PTR + s.exception = current_exception (); +#else + s.exception = nullptr; +#endif + } + } + + return EOF; + } + + // NOTE: watch out for exception safety (specifically, doing anything that + // might throw after opening the stream). + // + parser:: + parser (istream& is, const char* n, bool mv, const char* sep) noexcept + : input_name (n), + stream_ {&is, nullopt}, + multi_value_ (mv), + separators_ (sep), + raw_s_ (nullptr), + raw_n_ (0) + { + json_open_user (impl_, &stream_get, &stream_peek, &stream_); + json_set_streaming (impl_, multi_value_); + } + + parser:: + parser (const void* t, + size_t s, + const char* n, + bool mv, + const char* sep) noexcept + : input_name (n), + stream_ {nullptr, nullopt}, + multi_value_ (mv), + separators_ (sep), + raw_s_ (nullptr), + raw_n_ (0) + { + json_open_buffer (impl_, t, s); + json_set_streaming (impl_, multi_value_); + } + + optional<event> parser:: + next () + { + name_p_ = value_p_ = location_p_ = false; + + // Note that for now we don't worry about the state of the parser if + // next_impl() throws assuming it is not going to be reused. 
+ // + if (peeked_) + { + parsed_ = peeked_; + peeked_ = nullopt; + } + else + parsed_ = next_impl (); + + return translate (*parsed_); + } + + optional<event> parser:: + peek () + { + if (!peeked_) + { + if (parsed_) + { + cache_parsed_data (); + cache_parsed_location (); + } + peeked_ = next_impl (); + } + return translate (*peeked_); + } + + static inline const char* + event_name (event e) + { + switch (e) + { + case event::begin_object: return "beginning of object"; + case event::end_object: return "end of object"; + case event::begin_array: return "beginning of array"; + case event::end_array: return "end of array"; + case event::name: return "member name"; + case event::string: return "string value"; + case event::number: return "numeric value"; + case event::boolean: return "boolean value"; + case event::null: return "null value"; + } + + return ""; + } + + bool parser:: + next_expect (event p, optional<event> s) + { + optional<event> e (next ()); + bool r; + if (e && ((r = *e == p) || (s && *e == *s))) + return r; + + string d ("expected "); + d += event_name (p); + + if (s) + { + d += " or "; + d += event_name (*s); + } + + if (e) + { + d += " instead of "; + d += event_name (*e); + } + + throw invalid_json_input (input_name != nullptr ? input_name : "", + line (), + column (), + position (), + move (d)); + } + + void parser:: + next_expect_name (const char* n, bool su) + { + for (;;) + { + next_expect (event::name); + + if (name () == n) + return; + + if (!su) + break; + + next_expect_value_skip (); + } + + string d ("expected object member name '"); + d += n; + d += "' instead of '"; + d += name (); + d += '\''; + + throw invalid_json_input (input_name != nullptr ? 
input_name : "", + line (), + column (), + position (), + move (d)); + } + + void parser:: + next_expect_value_skip () + { + optional<event> e (next ()); + + if (e) + { + switch (*e) + { + case event::begin_object: + case event::begin_array: + { + // Skip until matching end_object/array keeping track of nesting. + // We are going to rely on the fact that we should either get such + // an event or next() should throw. + // + event be (*e); + event ee (be == event::begin_object + ? event::end_object + : event::end_array); + + for (size_t n (0);; ) + { + event e (*next ()); + + if (e == ee) + { + if (n == 0) + break; + + --n; + } + else if (e == be) + ++n; + } + + return; + } + case event::string: + case event::number: + case event::boolean: + case event::null: + return; + case event::name: + case event::end_object: + case event::end_array: + break; + } + } + + string d ("expected value"); + + if (e) + { + d += " instead of "; + d += event_name (*e); + } + + throw invalid_json_input (input_name != nullptr ? 
input_name : "", + line (), + column (), + position (), + move (d)); + } + + std::uint64_t parser:: + line () const noexcept + { + if (!location_p_) + { + if (!parsed_) + return 0; + + assert (!peeked_); + + return static_cast<uint64_t> ( + json_get_lineno (const_cast<json_stream*> (impl_))); + } + + return line_; + } + + std::uint64_t parser:: + column () const noexcept + { + if (!location_p_) + { + if (!parsed_) + return 0; + + assert (!peeked_); + + return static_cast<uint64_t> ( + json_get_column (const_cast<json_stream*> (impl_))); + } + + return column_; + } + + std::uint64_t parser:: + position () const noexcept + { + if (!location_p_) + { + if (!parsed_) + return 0; + + assert (!peeked_); + + return static_cast<uint64_t> ( + json_get_position (const_cast<json_stream*> (impl_))); + } + + return position_; + } + + json_type parser:: + next_impl () + { + raw_s_ = nullptr; + raw_n_ = 0; + json_type e; + + // Read characters between values skipping required separators and JSON + // whitespaces. Return whether a required separator was encountered as + // well as the first non-separator/whitespace character (which, if EOF, + // should trigger a check for input/output errors). + // + // Note that the returned non-separator will not have been extracted + // from the input (so position, column, etc. will still refer to its + // predecessor). + // + auto skip_separators = [this] () -> pair<bool, int> + { + bool r (separators_ == nullptr); + + int c; + for (; (c = json_source_peek (impl_)) != EOF; json_source_get (impl_)) + { + // User separator. + // + if (separators_ != nullptr && *separators_ != '\0') + { + if (strchr (separators_, c) != nullptr) + { + r = true; + continue; + } + } + + // JSON separator. 
+ // + if (json_isspace (c)) + { + if (separators_ != nullptr && *separators_ == '\0') + r = true; + + continue; + } + + break; + } + + return make_pair (r, c); + }; + + // In the multi-value mode skip any instances of required separators + // (and any other JSON whitespace) preceding the first JSON value. + // + if (multi_value_ && !parsed_ && !peeked_) + { + if (skip_separators ().second == EOF && stream_.is != nullptr) + { + if (stream_.exception) goto fail_rethrow; + if (stream_.is->fail ()) goto fail_stream; + } + } + + e = json_next (impl_); + + // First check for a pending input/output error. + // + if (stream_.is != nullptr) + { + if (stream_.exception) goto fail_rethrow; + if (stream_.is->fail ()) goto fail_stream; + } + + // There are two ways to view separation between two values: as following + // the first value or as preceding the second value. And one aspect that + // is determined by this is whether a separation violation is a problem + // with the first value or with the second, which becomes important if + // the user bails out before parsing the second value. + // + // Consider these two unseparated values (yes, in JSON they are two + // values, leading zeros are not allowed in JSON numbers): + // + // 01 + // + // If the user bails out after parsing 0 in a stream that should have + // been newline-delimited, they most likely would want to get an error + // since this is most definitely an invalid value rather than two + // values that are not properly separated. So in this light we handle + // separators at the end of the first value. + // + switch (e) + { + case JSON_DONE: + { + // Deal with the following value separators. + // + // Note that we must not do this for the second JSON_DONE (or the + // first one in case there are no values) that signals the end of + // input. + // + if (multi_value_ && + (parsed_ || peeked_) && + (peeked_ ? 
*peeked_ : *parsed_) != JSON_DONE) + { + auto p (skip_separators ()); + + if (p.second == EOF && stream_.is != nullptr) + { + if (stream_.exception) goto fail_rethrow; + if (stream_.is->fail ()) goto fail_stream; + } + + // Note that we don't require separators after the last value. + // + if (!p.first && p.second != EOF) + { + json_source_get (impl_); // Consume to update column number. + goto fail_separation; + } + + json_reset (impl_); + } + break; + } + case JSON_ERROR: goto fail_json; + case JSON_STRING: + case JSON_NUMBER: + raw_s_ = json_get_string (impl_, &raw_n_); + raw_n_--; // Includes terminating `\0`. + break; + case JSON_TRUE: raw_s_ = "true"; raw_n_ = 4; break; + case JSON_FALSE: raw_s_ = "false"; raw_n_ = 5; break; + case JSON_NULL: raw_s_ = "null"; raw_n_ = 4; break; + default: break; + } + + return e; + + fail_json: + throw invalid_json_input ( + input_name != nullptr ? input_name : "", + static_cast<uint64_t> (json_get_lineno (impl_)), + static_cast<uint64_t> (json_get_column (impl_)), + static_cast<uint64_t> (json_get_position (impl_)), + json_get_error (impl_)); + + fail_separation: + throw invalid_json_input ( + input_name != nullptr ? input_name : "", + static_cast<uint64_t> (json_get_lineno (impl_)), + static_cast<uint64_t> (json_get_column (impl_)), + static_cast<uint64_t> (json_get_position (impl_)), + "missing separator between JSON values"); + + fail_stream: + throw invalid_json_input ( + input_name != nullptr ? 
input_name : "", + static_cast<uint64_t> (json_get_lineno (impl_)), + static_cast<uint64_t> (json_get_column (impl_)), + static_cast<uint64_t> (json_get_position (impl_)), + "unable to read JSON input text"); + + fail_rethrow: +#ifndef LIBBUTL_JSON_NO_EXCEPTION_PTR + rethrow_exception (move (*stream_.exception)); +#else + throw istream::failure ("unable to read"); +#endif + } + + optional<event> parser:: + translate (json_type e) const noexcept + { + switch (e) + { + case JSON_DONE: return nullopt; + case JSON_OBJECT: return event::begin_object; + case JSON_OBJECT_END: return event::end_object; + case JSON_ARRAY: return event::begin_array; + case JSON_ARRAY_END: return event::end_array; + case JSON_STRING: + { + // This can be a value or, inside an object, a name from the + // name/value pair. + // + size_t n; + return json_get_context (const_cast<json_stream*> (impl_), &n) == + JSON_OBJECT && + n % 2 == 1 + ? event::name + : event::string; + } + case JSON_NUMBER: return event::number; + case JSON_TRUE: return event::boolean; + case JSON_FALSE: return event::boolean; + case JSON_NULL: return event::null; + case JSON_ERROR: assert (false); // Should've been handled by caller. + } + + return nullopt; // Should never reach. 
+ } + + void parser:: + cache_parsed_data () + { + name_p_ = value_p_ = false; + if (const optional<event> e = translate (*parsed_)) + { + if (e == event::name) + { + name_.assign (raw_s_, raw_n_); + name_p_ = true; + } + else if (value_event (e)) + { + value_.assign (raw_s_, raw_n_); + value_p_ = true; + } + } + } + + void parser:: + cache_parsed_location () noexcept + { + line_ = static_cast<uint64_t> (json_get_lineno (impl_)); + column_ = static_cast<uint64_t> (json_get_column (impl_)); + position_ = static_cast<uint64_t> (json_get_position (impl_)); + location_p_ = true; + } + + bool parser:: + value_event (optional<event> e) noexcept + { + if (!e) + return false; + + switch (*e) + { + case event::string: + case event::number: + case event::boolean: + case event::null: + return true; + default: + return false; + } + } + + [[noreturn]] void parser:: + throw_invalid_value (const char* type, const char* v, size_t n) const + { + string d (string ("invalid ") + type + " value: '"); + d.append (v, n); + d += '\''; + + throw invalid_json_input (input_name != nullptr ? input_name : "", + line (), + column (), + position (), + move (d)); + } + } // namespace json +} // namespace butl + +// Include the implementation into our translation unit (instead of compiling +// it separately) to (hopefully) get function inlining without LTO. +// +// Let's keep it last since the implementation defines a couple of macros. 
+// +#if defined(__clang__) || defined(__GNUC__) +# pragma GCC diagnostic ignored "-Wunused-function" +#endif + +extern "C" +{ +#define PDJSON_STACK_INC 16 +#define PDJSON_STACK_MAX 2048 +#include "pdjson.c" +} diff --git a/libbutl/json/parser.hxx b/libbutl/json/parser.hxx new file mode 100644 index 0000000..95d9c4e --- /dev/null +++ b/libbutl/json/parser.hxx @@ -0,0 +1,705 @@ +#pragma once + +#ifdef BUILD2_BOOTSTRAP +# error JSON parser not available during bootstrap +#endif + +#include <iosfwd> +#include <string> +#include <cstddef> // size_t +#include <cstdint> // uint64_t +#include <utility> // pair +#include <exception> // exception_ptr +#include <stdexcept> // invalid_argument + +#include <libbutl/optional.hxx> // butl::optional is std::optional or similar. + +#include <libbutl/json/event.hxx> + +#include <libbutl/json/pdjson.h> // Implementation details. + +#include <libbutl/export.hxx> + +namespace butl +{ + // Using the RFC8259 terminology: JSON (input) text, JSON value, object + // member. + // + namespace json + { + class invalid_json_input: public std::invalid_argument + { + public: + std::string name; + std::uint64_t line; + std::uint64_t column; + std::uint64_t position; + + invalid_json_input (std::string name, + std::uint64_t line, + std::uint64_t column, + std::uint64_t position, + const std::string& description); + + invalid_json_input (std::string name, + std::uint64_t line, + std::uint64_t column, + std::uint64_t position, + const char* description); + }; + + class LIBBUTL_SYMEXPORT parser + { + public: + const char* input_name; + + // Construction. + // + + // Parse JSON input text from std::istream. + // + // The name argument is used to identify the input being parsed. Note + // that the stream, name, and separators are kept as references so they + // must outlive the parser instance. + // + // If stream exceptions are enabled then the std::ios_base::failure + // exception is used to report input/output errors (badbit and failbit). 
+ // Otherwise, those are reported as the invalid_json_input exception. + // + // If multi_value is true, enable the multi-value mode in which case the + // input stream may contain multiple JSON values (more precisely, zero + // or more). If false (the default), parsing will fail unless there is + // exactly one JSON value in the input stream. + // + // If multi_value is true, the separators argument specifies the + // required separator characters between JSON values. At least one of + // them must be present between every pair of JSON values (in addition + // to any number of JSON whitespaces). No separators are required after + // the last JSON value (but any found will be skipped). + // + // Specifically, if it is NULL, then no separation is required (that is, + // both `{...}{...}` and `{...} {...}` would be valid). If it is empty, + // then at least one JSON whitespace is required. And if it is non- + // empty, then at least one of its characters must be present (for + // example, "\n\t" would require at least one newline or TAB character + // between JSON values). + // + // Note that a separator need not be valid JSON whitespace: any + // character is acceptable (though it probably shouldn't be an object, + // array, or string delimiter and should not occur within a non-self- + // delimited top-level value, such as `true`, `false`, `null`, or a + // number). All instances of required separators before and after a + // value are skipped. Therefore JSON Text Sequences (RFC 7464; AKA + // Record Separator-delimited JSON), which requires the RS (0x1E) + // character before each value, can be handled as well. 
+ // + parser (std::istream&, + const std::string& name, + bool multi_value = false, + const char* separators = nullptr) noexcept; + + parser (std::istream&, + const char* name, + bool multi_value = false, + const char* separators = nullptr) noexcept; + + parser (std::istream&, + std::string&&, + bool = false, + const char* = nullptr) = delete; + + // Parse a memory buffer that contains the entire JSON input text. + // + // The name argument is used to identify the input being parsed. Note + // that the buffer, name, and separators are kept as references so they + // must outlive the parser instance. + // + parser (const void* text, + std::size_t size, + const std::string& name, + bool multi_value = false, + const char* separators = nullptr) noexcept; + + parser (const void* text, + std::size_t size, + const char* name, + bool multi_value = false, + const char* separators = nullptr) noexcept; + + parser (const void*, + std::size_t, + std::string&&, + bool = false, + const char* = nullptr) = delete; + + // Similar to the above but parse a string. + // + parser (const std::string& text, + const std::string& name, + bool multi_value = false, + const char* separators = nullptr) noexcept; + + parser (const std::string& text, + const char* name, + bool multi_value = false, + const char* separators = nullptr) noexcept; + + parser (const std::string&, + std::string&&, + bool = false, + const char* = nullptr) = delete; + + // Similar to the above but parse a C-string. 
+ // + parser (const char* text, + const std::string& name, + bool multi_value = false, + const char* separators = nullptr) noexcept; + + parser (const char* text, + const char* name, + bool multi_value = false, + const char* separators = nullptr) noexcept; + + parser (const char*, + std::string&&, + bool = false, + const char* = nullptr) = delete; + + parser (parser&&) = delete; + parser (const parser&) = delete; + + parser& operator= (parser&&) = delete; + parser& operator= (const parser&) = delete; + + // Event iteration. + // + + // Return the next event or nullopt if end of input is reached. + // + // In the single-value parsing mode (default) the parsing code could + // look like this: + // + // while (optional<event> e = p.next ()) + // { + // switch (*e) + // { + // // ... + // } + // } + // + // In the multi-value mode the parser additionally returns nullopt after + // every JSON value parsed (so there will be two nullopt's after the + // last JSON value, the second indicating the end of input). + // + // One way to perform multi-value parsing is with the help of the peek() + // function (see below): + // + // while (p.peek ()) + // { + // while (optional<event> e = p.next ()) + // { + // switch (*e) + // { + // //... + // } + // } + // } + // + // Note that while the single-value mode will always parse exactly one + // value, the multi-value mode will accept zero values in which case a + // single nullopt is returned. + // + optional<event> + next (); + + // The range-based for loop support. + // + // In the single-value parsing mode (default) the parsing code could + // look like this: + // + // for (event e: p) + // { + // switch (e) + // { + // //... + // } + // } + // + // And in the multi-value mode (see next() for more information) like + // this: + // + // while (p.peek ()) + // { + // for (event e: p) + // { + // switch (e) + // { + // //... 
+ // } + // } + // } + // + // Note that generally, the iterator interface doesn't make much sense + // for the parser so for now we have an implementation that is just + // enough for the range-based for. + // + struct iterator; + + iterator begin () {return iterator (this, next ());} + iterator end () {return iterator (nullptr, nullopt);} + + // Return the next event without considering it parsed. In other words, + // after this call, any subsequent calls to peek() and the next call to + // next() (if any) will all return the same event. + // + // Note that the name, value, and line corresponding to the peeked event + // are not accessible with name(), value() and line(); these functions + // will still return values corresponding to the most recent call to + // next(). The peeked values, however, can be accessed in the raw form + // using data(). + // + optional<event> + peek (); + + + // Event data access. + // + + // Return the object member name. + // + const std::string& + name (); + + // Any value (string, number, boolean, and null) can be retrieved as a + // string. Calling this function after any non-value events is illegal. + // + // Note that the value is returned as a non-const string reference and + // you are allowed to move the value out of it. However, this should not + // be done unnecessarily or in cases where the small string optimization + // is likely since the string's buffer is reused to store subsequent + // values. + // + std::string& + value (); + + // Convert the value to an integer, floating point, or bool. Throw + // invalid_json_input if the conversion is impossible without a loss. + // + template <typename T> + T + value () const; + + // Return the value or object member name in the raw form. + // + // Calling this function on non-value/name events is legal in which case + // NULL is returned. Note also that the returned data corresponds to the + // most recent event, whether peeked or parsed. 
+ // + std::pair<const char*, std::size_t> + data () const {return std::make_pair (raw_s_, raw_n_);} + + + // Higher-level API suitable for parsing specific JSON vocabularies. + // + // The API summary: + // + // void next_expect (event); + // bool next_expect (event primary, event secondary); + // + // void next_expect_name (string name, bool skip_unknown = false); + // + // std::string& next_expect_string (); + // T next_expect_string<T> (); + // std::string& next_expect_number (); + // T next_expect_number<T> (); + // std::string& next_expect_boolean (); + // T next_expect_boolean<T>(); + // + // std::string* next_expect_string_null (); + // optional<T> next_expect_string_null<T> (); + // std::string* next_expect_number_null (); + // optional<T> next_expect_number_null<T> (); + // std::string* next_expect_boolean_null (); + // optional<T> next_expect_boolean_null<T>(); + // + // std::string& next_expect_member_string (string name, bool = false); + // T next_expect_member_string<T> (string name, bool = false); + // std::string& next_expect_member_number (string name, bool = false); + // T next_expect_member_number<T> (string name, bool = false); + // std::string& next_expect_member_boolean (string name, bool = false); + // T next_expect_member_boolean<T>(string name, bool = false); + // + // std::string* next_expect_member_string_null (string, bool = false); + // optional<T> next_expect_member_string_null<T> (string, bool = false); + // std::string* next_expect_member_number_null (string, bool = false); + // optional<T> next_expect_member_number_null<T> (string, bool = false); + // std::string* next_expect_member_boolean_null (string, bool = false); + // optional<T> next_expect_member_boolean_null<T>(string, bool = false); + // + // void next_expect_member_object (string name, bool = false); + // bool next_expect_member_object_null(string name, bool = false); + // + // void next_expect_member_array (string name, bool = false); + // bool 
next_expect_member_array_null(string name, bool = false); + // + // void next_expect_value_skip(); + + // Get the next event and make sure that it's what's expected: primary + // or, if specified, secondary event. If it is not either, then throw + // invalid_json_input with appropriate description. Return true if it is + // primary. + // + // The secondary expected event is primarily useful for handling + // optional members. For example: + // + // while (p.next_expect (event::name, event::end_object)) + // { + // // Handle object member. + // } + // + // Or homogeneous arrays: + // + // while (p.next_expect (event::string, event::end_array)) + // { + // // Handle array element. + // } + // + // Or values that can be null: + // + // if (p.next_expect (event::begin_object, event::null)) + // { + // // Parse object. + // } + // + bool + next_expect (event primary, optional<event> secondary = nullopt); + + // Get the next event and make sure it is event::name and the object + // member matches the specified name. If either is not, then throw + // invalid_json_input with appropriate description. If skip_unknown is + // true, then skip over unknown member names until a match is found. + // + void + next_expect_name (const char* name, bool skip_unknown = false); + + void + next_expect_name (const std::string&, bool = false); + + // Get the next event and make sure it is event::<type> returning its + // value similar to the value() functions. If it is not, then throw + // invalid_json_input with appropriate description. + // + std::string& + next_expect_string (); + + template <typename T> + T + next_expect_string (); + + std::string& + next_expect_number (); + + template <typename T> + T + next_expect_number (); + + std::string& + next_expect_boolean (); + + template <typename T> + T + next_expect_boolean (); + + // Similar to next_expect_<type>() but in addition to event::<type> also + // allow event::null, in which case returning no value. 
+ // + std::string* + next_expect_string_null (); + + template <typename T> + optional<T> + next_expect_string_null (); + + std::string* + next_expect_number_null (); + + template <typename T> + optional<T> + next_expect_number_null (); + + std::string* + next_expect_boolean_null (); + + template <typename T> + optional<T> + next_expect_boolean_null (); + + // Call next_expect_name() followed by next_expect_<type>[_null]() + // returning its result. In other words, parse the entire object member + // with the specified name and of type <type>, returning its value. + + // next_expect_member_string() + // + std::string& + next_expect_member_string (const char* name, bool skip_unknown = false); + + std::string& + next_expect_member_string (const std::string&, bool = false); + + template <typename T> + T + next_expect_member_string (const char*, bool = false); + + template <typename T> + T + next_expect_member_string (const std::string&, bool = false); + + // next_expect_member_number() + // + std::string& + next_expect_member_number (const char* name, bool skip_unknown = false); + + std::string& + next_expect_member_number (const std::string&, bool = false); + + template <typename T> + T + next_expect_member_number (const char*, bool = false); + + template <typename T> + T + next_expect_member_number (const std::string&, bool = false); + + // next_expect_member_boolean() + // + std::string& + next_expect_member_boolean (const char* name, bool skip_unknown = false); + + std::string& + next_expect_member_boolean (const std::string&, bool = false); + + template <typename T> + T + next_expect_member_boolean (const char*, bool = false); + + template <typename T> + T + next_expect_member_boolean (const std::string&, bool = false); + + // next_expect_member_string_null() + // + std::string* + next_expect_member_string_null (const char*, bool = false); + + std::string* + next_expect_member_string_null (const std::string&, bool = false); + + template <typename T> + optional<T> 
+ next_expect_member_string_null (const char*, bool = false); + + template <typename T> + optional<T> + next_expect_member_string_null (const std::string&, bool = false); + + // next_expect_member_number_null() + // + std::string* + next_expect_member_number_null (const char*, bool = false); + + std::string* + next_expect_member_number_null (const std::string&, bool = false); + + template <typename T> + optional<T> + next_expect_member_number_null (const char*, bool = false); + + template <typename T> + optional<T> + next_expect_member_number_null (const std::string&, bool = false); + + // next_expect_member_boolean_null() + // + std::string* + next_expect_member_boolean_null (const char*, bool = false); + + std::string* + next_expect_member_boolean_null (const std::string&, bool = false); + + template <typename T> + optional<T> + next_expect_member_boolean_null (const char*, bool = false); + + template <typename T> + optional<T> + next_expect_member_boolean_null (const std::string&, bool = false); + + // Call next_expect_name() followed by next_expect(event::begin_object). + // In the _null version also allow event::null, in which case return + // false. + // + void + next_expect_member_object (const char* name, bool skip_unknown = false); + + void + next_expect_member_object (const std::string&, bool = false); + + bool + next_expect_member_object_null (const char*, bool = false); + + bool + next_expect_member_object_null (const std::string&, bool = false); + + // Call next_expect_name() followed by next_expect(event::begin_array). + // In the _null version also allow event::null, in which case return + // false. 
+ // + void + next_expect_member_array (const char* name, bool skip_unknown = false); + + void + next_expect_member_array (const std::string&, bool = false); + + bool + next_expect_member_array_null (const char*, bool = false); + + bool + next_expect_member_array_null (const std::string&, bool = false); + + // Get the next event and make sure it is the beginning of a value + // (begin_object, begin_array, string, number, boolean, null). If it is + // not, then throw invalid_json_input with appropriate description. + // Otherwise, skip until the end of the value, recursively in case of + // object and array. + // + // This function is primarily useful for skipping unknown object + // members, for example: + // + // while (p.next_expect (event::name, event::end_object)) + // { + // if (p.name () == "known") + // { + // // Handle known member. + // } + // else + // p.next_expect_value_skip (); + // } + // + void + next_expect_value_skip (); + + // Parsing location. + // + + // Return the line number (1-based) corresponding to the most recently + // parsed event or 0 if nothing has been parsed yet. + // + std::uint64_t + line () const noexcept; + + // Return the column number (1-based) corresponding to the beginning of + // the most recently parsed event or 0 if nothing has been parsed yet. + // + std::uint64_t + column () const noexcept; + + // Return the position (byte offset) pointing immediately after the most + // recently parsed event or 0 if nothing has been parsed yet. + // + std::uint64_t + position () const noexcept; + + // Implementation details. + // + public: + struct iterator + { + using value_type = event; + + explicit + iterator (parser* p = nullptr, optional<event> e = nullopt) + : p_ (p), e_ (e) {} + + event operator* () const {return *e_;} + iterator& operator++ () {e_ = p_->next (); return *this;} + + // Comparison only makes sense when comparing to end (eof). 
+ // + bool operator== (iterator y) const {return !e_ && !y.e_;} + bool operator!= (iterator y) const {return !(*this == y);} + + private: + parser* p_; + optional<event> e_; + }; + + struct stream + { + std::istream* is; + optional<std::exception_ptr> exception; + }; + + [[noreturn]] void + throw_invalid_value (const char* type, const char*, std::size_t) const; + + ~parser (); + + private: + // Functionality shared by next() and peek(). + // + json_type + next_impl (); + + // Translate the event produced by the most recent call to next_impl(). + // + // Note that the underlying parser state determines whether name or + // value is returned when translating JSON_STRING. + // + optional<event> + translate (json_type) const noexcept; + + // Cache state (name/value) produced by the most recent call to + // next_impl(). + // + void + cache_parsed_data (); + + // Cache the location numbers as determined by the most recent call to + // next_impl(). + // + void + cache_parsed_location () noexcept; + + // Return true if this is a value event (string, number, boolean, or + // null). + // + static bool + value_event (optional<event>) noexcept; + + stream stream_; + + bool multi_value_; + const char* separators_; + + // The *_p_ members indicate whether the value is present (cached). + // Note: not using optional not to reallocate the string's buffer. + // + std::string name_; bool name_p_ = false; + std::string value_; bool value_p_ = false; + std::uint64_t line_, column_, position_; bool location_p_ = false; + + optional<json_type> parsed_; // Current parsed event if any. + optional<json_type> peeked_; // Current peeked event if any. + + ::json_stream impl_[1]; + + // Cached raw value. 
+ // + const char* raw_s_; + std::size_t raw_n_; + }; + } +} + +#include <libbutl/json/parser.ixx> diff --git a/libbutl/json/parser.ixx b/libbutl/json/parser.ixx new file mode 100644 index 0000000..cf6dca3 --- /dev/null +++ b/libbutl/json/parser.ixx @@ -0,0 +1,552 @@ +#include <cerrno> +#include <limits> // numeric_limits +#include <utility> // move() +#include <cassert> +#include <cstdlib> // strto*() +#include <type_traits> // enable_if, is_* +#include <cstring> // strlen() + +namespace butl +{ + namespace json + { + inline invalid_json_input:: + invalid_json_input (std::string n, + std::uint64_t l, + std::uint64_t c, + std::uint64_t p, + const std::string& d) + : invalid_json_input (move (n), l, c, p, d.c_str ()) + { + } + + inline invalid_json_input:: + invalid_json_input (std::string n, + std::uint64_t l, + std::uint64_t c, + std::uint64_t p, + const char* d) + : invalid_argument (d), + name (std::move (n)), + line (l), column (c), position (p) + { + } + + inline parser:: + parser (std::istream& is, + const std::string& n, + bool mv, + const char* sep) noexcept + : parser (is, n.c_str (), mv, sep) + { + } + + inline parser:: + parser (const void* t, + std::size_t s, + const std::string& n, + bool mv, + const char* sep) noexcept + : parser (t, s, n.c_str (), mv, sep) + { + } + + inline parser:: + parser (const std::string& t, + const std::string& n, + bool mv, + const char* sep) noexcept + : parser (t.data (), t.size (), n.c_str (), mv, sep) + { + } + + inline parser:: + parser (const std::string& t, + const char* n, + bool mv, + const char* sep) noexcept + : parser (t.data (), t.size (), n, mv, sep) + { + } + + inline parser:: + parser (const char* t, + const std::string& n, + bool mv, + const char* sep) noexcept + : parser (t, std::strlen (t), n.c_str (), mv, sep) + { + } + + inline parser:: + parser (const char* t, + const char* n, + bool mv, + const char* sep) noexcept + : parser (t, std::strlen (t), n, mv, sep) + { + } + + inline const std::string& 
parser:: + name () + { + if (!name_p_) + { + assert (parsed_ && !peeked_ && !value_p_); + cache_parsed_data (); + assert (name_p_); + } + return name_; + } + + inline std::string& parser:: + value () + { + if (!value_p_) + { + assert (parsed_ && !peeked_ && !name_p_); + cache_parsed_data (); + assert (value_p_); + } + return value_; + } + + // Note: one day we will be able to use C++17 from_chars() which was made + // exactly for this. + // + template <typename T> + inline typename std::enable_if<std::is_same<T, bool>::value, T>::type + parse_value (const char* b, size_t, const parser&) + { + return *b == 't'; + } + + template <typename T> + inline typename std::enable_if< + std::is_integral<T>::value && + std::is_signed<T>::value && + !std::is_same<T, bool>::value, T>::type + parse_value (const char* b, size_t n, const parser& p) + { + char* e (nullptr); + errno = 0; // We must clear it according to POSIX. + std::int64_t v (strtoll (b, &e, 10)); // Can't throw. + + if (e == b || e != b + n || errno == ERANGE || + v < std::numeric_limits<T>::min () || + v > std::numeric_limits<T>::max ()) + p.throw_invalid_value ("signed integer", b, n); + + return static_cast<T> (v); + } + + template <typename T> + inline typename std::enable_if< + std::is_integral<T>::value && + std::is_unsigned<T>::value && + !std::is_same<T, bool>::value, T>::type + parse_value (const char* b, size_t n, const parser& p) + { + char* e (nullptr); + errno = 0; // We must clear it according to POSIX. + std::uint64_t v (strtoull (b, &e, 10)); // Can't throw but negates on '-'. + + if (std::memchr (b, '-', n) != nullptr || e == b || e != b + n || errno == ERANGE || + v > std::numeric_limits<T>::max ()) + p.throw_invalid_value ("unsigned integer", b, n); + + return static_cast<T> (v); + } + + template <typename T> + inline typename std::enable_if<std::is_same<T, float>::value, T>::type + parse_value (const char* b, size_t n, const parser& p) + { + char* e (nullptr); + errno = 0; // We must clear it according to POSIX. 
+ T r (std::strtof (b, &e)); + + if (e == b || e != b + n || errno == ERANGE) + p.throw_invalid_value ("float", b, n); + + return r; + } + + template <typename T> + inline typename std::enable_if<std::is_same<T, double>::value, T>::type + parse_value (const char* b, size_t n, const parser& p) + { + char* e (nullptr); + errno = 0; // We must clear it according to POSIX. + T r (std::strtod (b, &e)); + + if (e == b || e != b + n || errno == ERANGE) + p.throw_invalid_value ("double", b, n); + + return r; + } + + template <typename T> + inline typename std::enable_if<std::is_same<T, long double>::value, T>::type + parse_value (const char* b, size_t n, const parser& p) + { + char* e (nullptr); + errno = 0; // We must clear it according to POSIX. + T r (std::strtold (b, &e)); + + if (e == b || e != b + n || errno == ERANGE) + p.throw_invalid_value ("long double", b, n); + + return r; + } + + template <typename T> + inline T parser:: + value () const + { + if (!value_p_) + { + assert (parsed_ && !peeked_ && value_event (translate (*parsed_))); + return parse_value<T> (raw_s_, raw_n_, *this); + } + + return parse_value<T> (value_.data (), value_.size (), *this); + } + + inline void parser:: + next_expect_name (const std::string& n, bool su) + { + next_expect_name (n.c_str (), su); + } + + // next_expect_<type>() + // + inline std::string& parser:: + next_expect_string () + { + next_expect (event::string); + return value (); + } + + template <typename T> + inline T parser:: + next_expect_string () + { + next_expect (event::string); + return value<T> (); + } + + inline std::string& parser:: + next_expect_number () + { + next_expect (event::number); + return value (); + } + + template <typename T> + inline T parser:: + next_expect_number () + { + next_expect (event::number); + return value<T> (); + } + + inline std::string& parser:: + next_expect_boolean () + { + next_expect (event::boolean); + return value (); + } + + template <typename T> + inline T parser:: + 
next_expect_boolean () + { + next_expect (event::boolean); + return value<T> (); + } + + // next_expect_<type>_null() + // + inline std::string* parser:: + next_expect_string_null () + { + return next_expect (event::string, event::null) ? &value () : nullptr; + } + + template <typename T> + inline optional<T> parser:: + next_expect_string_null () + { + return next_expect (event::string, event::null) + ? optional<T> (value<T> ()) + : nullopt; + } + + inline std::string* parser:: + next_expect_number_null () + { + return next_expect (event::number, event::null) ? &value () : nullptr; + } + + template <typename T> + inline optional<T> parser:: + next_expect_number_null () + { + return next_expect (event::number, event::null) + ? optional<T> (value<T> ()) + : nullopt; + } + + inline std::string* parser:: + next_expect_boolean_null () + { + return next_expect (event::boolean, event::null) ? &value () : nullptr; + } + + template <typename T> + inline optional<T> parser:: + next_expect_boolean_null () + { + return next_expect (event::boolean, event::null) + ? 
optional<T> (value<T> ()) + : nullopt; + } + + // next_expect_member_string() + // + inline std::string& parser:: + next_expect_member_string (const char* n, bool su) + { + next_expect_name (n, su); + return next_expect_string (); + } + + inline std::string& parser:: + next_expect_member_string (const std::string& n, bool su) + { + return next_expect_member_string (n.c_str (), su); + } + + template <typename T> + inline T parser:: + next_expect_member_string (const char* n, bool su) + { + next_expect_name (n, su); + return next_expect_string<T> (); + } + + template <typename T> + inline T parser:: + next_expect_member_string (const std::string& n, bool su) + { + return next_expect_member_string<T> (n.c_str (), su); + } + + // next_expect_member_number() + // + inline std::string& parser:: + next_expect_member_number (const char* n, bool su) + { + next_expect_name (n, su); + return next_expect_number (); + } + + inline std::string& parser:: + next_expect_member_number (const std::string& n, bool su) + { + return next_expect_member_number (n.c_str (), su); + } + + template <typename T> + inline T parser:: + next_expect_member_number (const char* n, bool su) + { + next_expect_name (n, su); + return next_expect_number<T> (); + } + + template <typename T> + inline T parser:: + next_expect_member_number (const std::string& n, bool su) + { + return next_expect_member_number<T> (n.c_str (), su); + } + + // next_expect_member_boolean() + // + inline std::string& parser:: + next_expect_member_boolean (const char* n, bool su) + { + next_expect_name (n, su); + return next_expect_boolean (); + } + + inline std::string& parser:: + next_expect_member_boolean (const std::string& n, bool su) + { + return next_expect_member_boolean (n.c_str (), su); + } + + template <typename T> + inline T parser:: + next_expect_member_boolean (const char* n, bool su) + { + next_expect_name (n, su); + return next_expect_boolean<T> (); + } + + template <typename T> + inline T parser:: + 
next_expect_member_boolean (const std::string& n, bool su) + { + return next_expect_member_boolean<T> (n.c_str (), su); + } + + // next_expect_member_string_null() + // + inline std::string* parser:: + next_expect_member_string_null (const char* n, bool su) + { + next_expect_name (n, su); + return next_expect_string_null (); + } + + inline std::string* parser:: + next_expect_member_string_null (const std::string& n, bool su) + { + return next_expect_member_string_null (n.c_str (), su); + } + + template <typename T> + inline optional<T> parser:: + next_expect_member_string_null (const char* n, bool su) + { + next_expect_name (n, su); + return next_expect_string_null<T> (); + } + + template <typename T> + inline optional<T> parser:: + next_expect_member_string_null (const std::string& n, bool su) + { + return next_expect_member_string_null<T> (n.c_str (), su); + } + + // next_expect_member_number_null() + // + inline std::string* parser:: + next_expect_member_number_null (const char* n, bool su) + { + next_expect_name (n, su); + return next_expect_number_null (); + } + + inline std::string* parser:: + next_expect_member_number_null (const std::string& n, bool su) + { + return next_expect_member_number_null (n.c_str (), su); + } + + template <typename T> + inline optional<T> parser:: + next_expect_member_number_null (const char* n, bool su) + { + next_expect_name (n, su); + return next_expect_number_null<T> (); + } + + template <typename T> + inline optional<T> parser:: + next_expect_member_number_null (const std::string& n, bool su) + { + return next_expect_member_number_null<T> (n.c_str (), su); + } + + // next_expect_member_boolean_null() + // + inline std::string* parser:: + next_expect_member_boolean_null (const char* n, bool su) + { + next_expect_name (n, su); + return next_expect_boolean_null (); + } + + inline std::string* parser:: + next_expect_member_boolean_null (const std::string& n, bool su) + { + return next_expect_member_boolean_null (n.c_str (), su); + 
} + + template <typename T> + inline optional<T> parser:: + next_expect_member_boolean_null (const char* n, bool su) + { + next_expect_name (n, su); + return next_expect_boolean_null<T> (); + } + + template <typename T> + inline optional<T> parser:: + next_expect_member_boolean_null (const std::string& n, bool su) + { + return next_expect_member_boolean_null<T> (n.c_str (), su); + } + + // next_expect_member_object[_null]() + // + inline void parser:: + next_expect_member_object (const char* n, bool su) + { + next_expect_name (n, su); + next_expect (event::begin_object); + } + + inline void parser:: + next_expect_member_object (const std::string& n, bool su) + { + next_expect_member_object (n.c_str (), su); + } + + inline bool parser:: + next_expect_member_object_null (const char* n, bool su) + { + next_expect_name (n, su); + return next_expect (event::begin_object, event::null); + } + + inline bool parser:: + next_expect_member_object_null (const std::string& n, bool su) + { + return next_expect_member_object_null (n.c_str (), su); + } + + // next_expect_member_array[_null]() + // + inline void parser:: + next_expect_member_array (const char* n, bool su) + { + next_expect_name (n, su); + next_expect (event::begin_array); + } + + inline void parser:: + next_expect_member_array (const std::string& n, bool su) + { + next_expect_member_array (n.c_str (), su); + } + + inline bool parser:: + next_expect_member_array_null (const char* n, bool su) + { + next_expect_name (n, su); + return next_expect (event::begin_array, event::null); + } + + inline bool parser:: + next_expect_member_array_null (const std::string& n, bool su) + { + return next_expect_member_array_null (n.c_str (), su); + } + } +} diff --git a/libbutl/json/pdjson.c b/libbutl/json/pdjson.c new file mode 100644 index 0000000..ae10c95 --- /dev/null +++ b/libbutl/json/pdjson.c @@ -0,0 +1,1044 @@ +#ifndef _POSIX_C_SOURCE +# define _POSIX_C_SOURCE 200112L +#elif _POSIX_C_SOURCE < 200112L +# error incompatible 
_POSIX_C_SOURCE level +#endif + +#include <stdlib.h> +#include <string.h> +#include <ctype.h> + +#ifndef PDJSON_H +# include "pdjson.h" +#endif + +#define JSON_FLAG_ERROR (1u << 0) +#define JSON_FLAG_STREAMING (1u << 1) + +#if defined(_MSC_VER) && (_MSC_VER < 1900) + +#define json_error(json, format, ...) \ + if (!(json->flags & JSON_FLAG_ERROR)) { \ + json->flags |= JSON_FLAG_ERROR; \ + _snprintf_s(json->errmsg, sizeof(json->errmsg), \ + _TRUNCATE, \ + format, \ + __VA_ARGS__); \ + } \ + +#else + +#define json_error(json, format, ...) \ + if (!(json->flags & JSON_FLAG_ERROR)) { \ + json->flags |= JSON_FLAG_ERROR; \ + snprintf(json->errmsg, sizeof(json->errmsg), \ + format, \ + __VA_ARGS__); \ + } \ + +#endif /* _MSC_VER */ + +/* See also PDJSON_STACK_MAX below. */ +#ifndef PDJSON_STACK_INC +# define PDJSON_STACK_INC 4 +#endif + +struct json_stack { + enum json_type type; + long count; +}; + +static enum json_type +push(json_stream *json, enum json_type type) +{ + json->stack_top++; + +#ifdef PDJSON_STACK_MAX + if (json->stack_top > PDJSON_STACK_MAX) { + json_error(json, "%s", "maximum depth of nesting reached"); + return JSON_ERROR; + } +#endif + + if (json->stack_top >= json->stack_size) { + struct json_stack *stack; + size_t size = (json->stack_size + PDJSON_STACK_INC) * sizeof(*json->stack); + stack = (struct json_stack *)json->alloc.realloc(json->stack, size); + if (stack == NULL) { + json_error(json, "%s", "out of memory"); + return JSON_ERROR; + } + + json->stack_size += PDJSON_STACK_INC; + json->stack = stack; + } + + json->stack[json->stack_top].type = type; + json->stack[json->stack_top].count = 0; + + return type; +} + +/* Note: c is assumed not to be EOF. */ +static enum json_type +pop(json_stream *json, int c, enum json_type expected) +{ + if (json->stack == NULL || json->stack[json->stack_top].type != expected) { + json_error(json, "unexpected byte '%c'", c); + return JSON_ERROR; + } + json->stack_top--; + return expected == JSON_ARRAY ? 
JSON_ARRAY_END : JSON_OBJECT_END; +} + +static int buffer_peek(struct json_source *source) +{ + if (source->position < source->source.buffer.length) + return source->source.buffer.buffer[source->position]; + else + return EOF; +} + +static int buffer_get(struct json_source *source) +{ + int c = source->peek(source); + if (c != EOF) + source->position++; + return c; +} + +static int stream_get(struct json_source *source) +{ + int c = fgetc(source->source.stream.stream); + if (c != EOF) + source->position++; + return c; +} + +static int stream_peek(struct json_source *source) +{ + int c = fgetc(source->source.stream.stream); + ungetc(c, source->source.stream.stream); + return c; +} + +static void init(json_stream *json) +{ + json->lineno = 1; + json->linepos = 0; + json->lineadj = 0; + json->linecon = 0; + json->colno = 0; + json->flags = JSON_FLAG_STREAMING; + json->errmsg[0] = '\0'; + json->ntokens = 0; + json->next = (enum json_type)0; + + json->stack = NULL; + json->stack_top = -1; + json->stack_size = 0; + + json->data.string = NULL; + json->data.string_size = 0; + json->data.string_fill = 0; + json->source.position = 0; + + json->alloc.malloc = malloc; + json->alloc.realloc = realloc; + json->alloc.free = free; +} + +static enum json_type +is_match(json_stream *json, const char *pattern, enum json_type type) +{ + int c; + for (const char *p = pattern; *p; p++) { + if (*p != (c = json->source.get(&json->source))) { + if (c != EOF) { + json_error(json, "expected '%c' instead of byte '%c'", *p, c); + } else { + json_error(json, "expected '%c' instead of end of text", *p); + } + return JSON_ERROR; + } + } + return type; +} + +static int pushchar(json_stream *json, int c) +{ + if (json->data.string_fill == json->data.string_size) { + size_t size = json->data.string_size * 2; + char *buffer = (char *)json->alloc.realloc(json->data.string, size); + if (buffer == NULL) { + json_error(json, "%s", "out of memory"); + return -1; + } else { + json->data.string_size = size; 
+ json->data.string = buffer; + } + } + json->data.string[json->data.string_fill++] = c; + return 0; +} + +static int init_string(json_stream *json) +{ + json->data.string_fill = 0; + if (json->data.string == NULL) { + json->data.string_size = 1024; + json->data.string = (char *)json->alloc.malloc(json->data.string_size); + if (json->data.string == NULL) { + json_error(json, "%s", "out of memory"); + return -1; + } + } + json->data.string[0] = '\0'; + return 0; +} + +static int encode_utf8(json_stream *json, unsigned long c) +{ + if (c < 0x80UL) { + return pushchar(json, c); + } else if (c < 0x0800UL) { + return !((pushchar(json, (c >> 6 & 0x1F) | 0xC0) == 0) && + (pushchar(json, (c >> 0 & 0x3F) | 0x80) == 0)); + } else if (c < 0x010000UL) { + if (c >= 0xd800 && c <= 0xdfff) { + json_error(json, "invalid codepoint %06lx", c); + return -1; + } + return !((pushchar(json, (c >> 12 & 0x0F) | 0xE0) == 0) && + (pushchar(json, (c >> 6 & 0x3F) | 0x80) == 0) && + (pushchar(json, (c >> 0 & 0x3F) | 0x80) == 0)); + } else if (c < 0x110000UL) { + return !((pushchar(json, (c >> 18 & 0x07) | 0xF0) == 0) && + (pushchar(json, (c >> 12 & 0x3F) | 0x80) == 0) && + (pushchar(json, (c >> 6 & 0x3F) | 0x80) == 0) && + (pushchar(json, (c >> 0 & 0x3F) | 0x80) == 0)); + } else { + json_error(json, "unable to encode %06lx as UTF-8", c); + return -1; + } +} + +static int hexchar(int c) +{ + switch (c) { + case '0': return 0; + case '1': return 1; + case '2': return 2; + case '3': return 3; + case '4': return 4; + case '5': return 5; + case '6': return 6; + case '7': return 7; + case '8': return 8; + case '9': return 9; + case 'a': + case 'A': return 10; + case 'b': + case 'B': return 11; + case 'c': + case 'C': return 12; + case 'd': + case 'D': return 13; + case 'e': + case 'E': return 14; + case 'f': + case 'F': return 15; + default: + return -1; + } +} + +static long +read_unicode_cp(json_stream *json) +{ + long cp = 0; + int shift = 12; + + for (size_t i = 0; i < 4; i++) { + int c = 
json->source.get(&json->source); + int hc; + + if (c == EOF) { + json_error(json, "%s", "unterminated string literal in Unicode"); + return -1; + } else if ((hc = hexchar(c)) == -1) { + json_error(json, "invalid escape Unicode byte '%c'", c); + return -1; + } + + cp += hc * (1 << shift); + shift -= 4; + } + + + return cp; +} + +static int read_unicode(json_stream *json) +{ + long cp, h, l; + + if ((cp = read_unicode_cp(json)) == -1) { + return -1; + } + + if (cp >= 0xd800 && cp <= 0xdbff) { + /* This is the high portion of a surrogate pair; we need to read the + * lower portion to get the codepoint + */ + h = cp; + + int c = json->source.get(&json->source); + if (c == EOF) { + json_error(json, "%s", "unterminated string literal in Unicode"); + return -1; + } else if (c != '\\') { + json_error(json, "invalid continuation for surrogate pair '%c', " + "expected '\\'", c); + return -1; + } + + c = json->source.get(&json->source); + if (c == EOF) { + json_error(json, "%s", "unterminated string literal in Unicode"); + return -1; + } else if (c != 'u') { + json_error(json, "invalid continuation for surrogate pair '%c', " + "expected 'u'", c); + return -1; + } + + if ((l = read_unicode_cp(json)) == -1) { + return -1; + } + + if (l < 0xdc00 || l > 0xdfff) { + json_error(json, "surrogate pair continuation \\u%04lx out " + "of range (dc00-dfff)", l); + return -1; + } + + cp = ((h - 0xd800) * 0x400) + ((l - 0xdc00) + 0x10000); + } else if (cp >= 0xdc00 && cp <= 0xdfff) { + json_error(json, "dangling surrogate \\u%04lx", cp); + return -1; + } + + return encode_utf8(json, cp); +} + +static int +read_escaped(json_stream *json) +{ + int c = json->source.get(&json->source); + if (c == EOF) { + json_error(json, "%s", "unterminated string literal in escape"); + return -1; + } else if (c == 'u') { + if (read_unicode(json) != 0) + return -1; + } else { + switch (c) { + case '\\': + case 'b': + case 'f': + case 'n': + case 'r': + case 't': + case '/': + case '"': + { + const char *codes 
= "\\bfnrt/\""; + const char *p = strchr(codes, c); + if (pushchar(json, "\\\b\f\n\r\t/\""[p - codes]) != 0) + return -1; + } + break; + default: + json_error(json, "invalid escaped byte '%c'", c); + return -1; + } + } + return 0; +} + +static int +char_needs_escaping(int c) +{ + if ((c >= 0) && (c < 0x20 || c == 0x22 || c == 0x5c)) { + return 1; + } + + return 0; +} + +static int +utf8_seq_length(char byte) +{ + unsigned char u = (unsigned char) byte; + if (u < 0x80) return 1; + + if (0x80 <= u && u <= 0xBF) + { + // second, third or fourth byte of a multi-byte + // sequence, i.e. a "continuation byte" + return 0; + } + else if (u == 0xC0 || u == 0xC1) + { + // overlong encoding of an ASCII byte + return 0; + } + else if (0xC2 <= u && u <= 0xDF) + { + // 2-byte sequence + return 2; + } + else if (0xE0 <= u && u <= 0xEF) + { + // 3-byte sequence + return 3; + } + else if (0xF0 <= u && u <= 0xF4) + { + // 4-byte sequence + return 4; + } + else + { + // u >= 0xF5 + // Restricted (start of 4-, 5- or 6-byte sequence) or invalid UTF-8 + return 0; + } +} + +static int +is_legal_utf8(const unsigned char *bytes, int length) +{ + if (0 == bytes || 0 == length) return 0; + + unsigned char a; + const unsigned char* srcptr = bytes + length; + switch (length) + { + default: + return 0; + // Everything else falls through when true. 
+ case 4: + if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return 0; + /* FALLTHRU */ + case 3: + if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return 0; + /* FALLTHRU */ + case 2: + a = (*--srcptr); + switch (*bytes) + { + case 0xE0: + if (a < 0xA0 || a > 0xBF) return 0; + break; + case 0xED: + if (a < 0x80 || a > 0x9F) return 0; + break; + case 0xF0: + if (a < 0x90 || a > 0xBF) return 0; + break; + case 0xF4: + if (a < 0x80 || a > 0x8F) return 0; + break; + default: + if (a < 0x80 || a > 0xBF) return 0; + break; + } + /* FALLTHRU */ + case 1: + if (*bytes >= 0x80 && *bytes < 0xC2) return 0; + } + return *bytes <= 0xF4; +} + +static int +read_utf8(json_stream* json, int next_char) +{ + int count = utf8_seq_length(next_char); + if (!count) + { + json_error(json, "%s", "invalid UTF-8 character"); + return -1; + } + + char buffer[4]; + buffer[0] = next_char; + int i; + for (i = 1; i < count; ++i) + { + if ((next_char = json->source.get(&json->source)) == EOF) + break; + + buffer[i] = next_char; + json->lineadj++; + } + + if (i != count || !is_legal_utf8((unsigned char*) buffer, count)) + { + json_error(json, "%s", "invalid UTF-8 text"); + return -1; + } + + for (i = 0; i < count; ++i) + { + if (pushchar(json, buffer[i]) != 0) + return -1; + } + return 0; +} + +static enum json_type +read_string(json_stream *json) +{ + if (init_string(json) != 0) + return JSON_ERROR; + while (1) { + int c = json->source.get(&json->source); + if (c == EOF) { + json_error(json, "%s", "unterminated string literal"); + return JSON_ERROR; + } else if (c == '"') { + if (pushchar(json, '\0') == 0) + return JSON_STRING; + else + return JSON_ERROR; + } else if (c == '\\') { + if (read_escaped(json) != 0) + return JSON_ERROR; + } else if ((unsigned) c >= 0x80) { + if (read_utf8(json, c) != 0) + return JSON_ERROR; + } else { + if (char_needs_escaping(c)) { + json_error(json, "%s", "unescaped control character in string"); + return JSON_ERROR; + } + + if (pushchar(json, c) != 0) + return JSON_ERROR; + } + 
} + return JSON_ERROR; +} + +static int +is_digit(int c) +{ + return c >= 48 /*0*/ && c <= 57 /*9*/; +} + +static int +read_digits(json_stream *json) +{ + int c; + unsigned nread = 0; + while (is_digit(c = json->source.peek(&json->source))) { + if (pushchar(json, json->source.get(&json->source)) != 0) + return -1; + + nread++; + } + + if (nread == 0) { + if (c != EOF) { + json_error(json, "expected digit instead of byte '%c'", c); + } else { + json_error(json, "%s", "expected digit instead of end of text"); + } + return -1; + } + + return 0; +} + +static enum json_type +read_number(json_stream *json, int c) +{ + if (pushchar(json, c) != 0) + return JSON_ERROR; + if (c == '-') { + c = json->source.get(&json->source); + if (is_digit(c)) { + return read_number(json, c); + } else { + if (c != EOF) { + json_error(json, "unexpected byte '%c' in number", c); + } else { + json_error(json, "%s", "unexpected end of text in number"); + } + return JSON_ERROR; + } + } else if (strchr("123456789", c) != NULL) { + c = json->source.peek(&json->source); + if (is_digit(c)) { + if (read_digits(json) != 0) + return JSON_ERROR; + } + } + /* Up to decimal or exponent has been read. */ + c = json->source.peek(&json->source); + if (strchr(".eE", c) == NULL) { + if (pushchar(json, '\0') != 0) + return JSON_ERROR; + else + return JSON_NUMBER; + } + if (c == '.') { + json->source.get(&json->source); // consume . + if (pushchar(json, c) != 0) + return JSON_ERROR; + if (read_digits(json) != 0) + return JSON_ERROR; + } + /* Check for exponent. 
*/ + c = json->source.peek(&json->source); + if (c == 'e' || c == 'E') { + json->source.get(&json->source); // consume e/E + if (pushchar(json, c) != 0) + return JSON_ERROR; + c = json->source.peek(&json->source); + if (c == '+' || c == '-') { + json->source.get(&json->source); // consume + if (pushchar(json, c) != 0) + return JSON_ERROR; + if (read_digits(json) != 0) + return JSON_ERROR; + } else if (is_digit(c)) { + if (read_digits(json) != 0) + return JSON_ERROR; + } else { + json->source.get(&json->source); // consume (for column) + if (c != EOF) { + json_error(json, "unexpected byte '%c' in number", c); + } else { + json_error(json, "%s", "unexpected end of text in number"); + } + return JSON_ERROR; + } + } + if (pushchar(json, '\0') != 0) + return JSON_ERROR; + else + return JSON_NUMBER; +} + +bool +json_isspace(int c) +{ + switch (c) { + case 0x09: + case 0x0a: + case 0x0d: + case 0x20: + return true; + } + + return false; +} + +static void newline(json_stream *json) +{ + json->lineno++; + json->linepos = json->source.position; + json->lineadj = 0; + json->linecon = 0; +} + +/* Returns the next non-whitespace character in the stream. + * + * Note that this is the only function (besides user-facing json_source_get()) + * that needs to worry about newline housekeeping. 
+ */ +static int next(json_stream *json) +{ + int c; + while (json_isspace(c = json->source.get(&json->source))) + if (c == '\n') + newline(json); + return c; +} + +static enum json_type +read_value(json_stream *json, int c) +{ + enum json_type type; + size_t colno = json_get_column(json); + + json->ntokens++; + + switch (c) { + case EOF: + json_error(json, "%s", "unexpected end of text"); + type = JSON_ERROR; + break; + case '{': + type = push(json, JSON_OBJECT); + break; + case '[': + type = push(json, JSON_ARRAY); + break; + case '"': + type = read_string(json); + break; + case 'n': + type = is_match(json, "ull", JSON_NULL); + break; + case 'f': + type = is_match(json, "alse", JSON_FALSE); + break; + case 't': + type = is_match(json, "rue", JSON_TRUE); + break; + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + case '-': + type = init_string(json) == 0 ? read_number(json, c) : JSON_ERROR; + break; + default: + type = JSON_ERROR; + json_error(json, "unexpected byte '%c' in value", c); + break; + } + + if (type != JSON_ERROR) + json->colno = colno; + + return type; +} + +enum json_type json_peek(json_stream *json) +{ + enum json_type next; + if (json->next) + next = json->next; + else + next = json->next = json_next(json); + return next; +} + +enum json_type json_next(json_stream *json) +{ + if (json->flags & JSON_FLAG_ERROR) + return JSON_ERROR; + if (json->next != 0) { + enum json_type next = json->next; + json->next = (enum json_type)0; + return next; + } + + json->colno = 0; + + if (json->ntokens > 0 && json->stack_top == (size_t)-1) { + + /* In the streaming mode leave any trailing whitespaces in the stream. + * This allows the user to validate any desired separation between + * values (such as newlines) using json_source_get/peek() with any + * remaining whitespaces ignored as leading when we parse the next + * value. 
*/ + if (!(json->flags & JSON_FLAG_STREAMING)) { + int c = next(json); + if (c != EOF) { + json_error(json, "expected end of text instead of byte '%c'", c); + return JSON_ERROR; + } + } + + return JSON_DONE; + } + int c = next(json); + if (json->stack_top == (size_t)-1) { + if (c == EOF && (json->flags & JSON_FLAG_STREAMING)) + return JSON_DONE; + + return read_value(json, c); + } + if (json->stack[json->stack_top].type == JSON_ARRAY) { + if (json->stack[json->stack_top].count == 0) { + if (c == ']') { + return pop(json, c, JSON_ARRAY); + } + json->stack[json->stack_top].count++; + return read_value(json, c); + } else if (c == ',') { + json->stack[json->stack_top].count++; + return read_value(json, next(json)); + } else if (c == ']') { + return pop(json, c, JSON_ARRAY); + } else { + if (c != EOF) { + json_error(json, "unexpected byte '%c'", c); + } else { + json_error(json, "%s", "unexpected end of text"); + } + return JSON_ERROR; + } + } else if (json->stack[json->stack_top].type == JSON_OBJECT) { + if (json->stack[json->stack_top].count == 0) { + if (c == '}') { + return pop(json, c, JSON_OBJECT); + } + + /* No member name/value pairs yet. */ + enum json_type value = read_value(json, c); + if (value != JSON_STRING) { + if (value != JSON_ERROR) + json_error(json, "%s", "expected member name or '}'"); + return JSON_ERROR; + } else { + json->stack[json->stack_top].count++; + return value; + } + } else if ((json->stack[json->stack_top].count % 2) == 0) { + /* Expecting comma followed by member name. 
*/ + if (c != ',' && c != '}') { + json_error(json, "%s", "expected ',' or '}' after member value"); + return JSON_ERROR; + } else if (c == '}') { + return pop(json, c, JSON_OBJECT); + } else { + enum json_type value = read_value(json, next(json)); + if (value != JSON_STRING) { + if (value != JSON_ERROR) + json_error(json, "%s", "expected member name"); + return JSON_ERROR; + } else { + json->stack[json->stack_top].count++; + return value; + } + } + } else if ((json->stack[json->stack_top].count % 2) == 1) { + /* Expecting colon followed by value. */ + if (c != ':') { + json_error(json, "%s", "expected ':' after member name"); + return JSON_ERROR; + } else { + json->stack[json->stack_top].count++; + return read_value(json, next(json)); + } + } + } + json_error(json, "%s", "invalid parser state"); + return JSON_ERROR; +} + +void json_reset(json_stream *json) +{ + json->stack_top = -1; + json->ntokens = 0; + json->flags &= ~JSON_FLAG_ERROR; + json->errmsg[0] = '\0'; +} + +enum json_type json_skip(json_stream *json) +{ + enum json_type type = json_next(json); + size_t cnt_arr = 0; + size_t cnt_obj = 0; + + for (enum json_type skip = type; ; skip = json_next(json)) { + if (skip == JSON_ERROR || skip == JSON_DONE) + return skip; + + if (skip == JSON_ARRAY) { + ++cnt_arr; + } else if (skip == JSON_ARRAY_END && cnt_arr > 0) { + --cnt_arr; + } else if (skip == JSON_OBJECT) { + ++cnt_obj; + } else if (skip == JSON_OBJECT_END && cnt_obj > 0) { + --cnt_obj; + } + + if (!cnt_arr && !cnt_obj) + break; + } + + return type; +} + +enum json_type json_skip_until(json_stream *json, enum json_type type) +{ + while (1) { + enum json_type skip = json_skip(json); + + if (skip == JSON_ERROR || skip == JSON_DONE) + return skip; + + if (skip == type) + break; + } + + return type; +} + +const char *json_get_string(json_stream *json, size_t *length) +{ + if (length != NULL) + *length = json->data.string_fill; + if (json->data.string == NULL) + return ""; + else + return json->data.string; +} 
+ +double json_get_number(json_stream *json) +{ + char *p = json->data.string; + return p == NULL ? 0 : strtod(p, NULL); +} + +const char *json_get_error(json_stream *json) +{ + return json->flags & JSON_FLAG_ERROR ? json->errmsg : NULL; +} + +size_t json_get_lineno(json_stream *json) +{ + return json->lineno; +} + +size_t json_get_position(json_stream *json) +{ + return json->source.position; +} + +size_t json_get_column(json_stream *json) +{ + return json->colno == 0 + ? json->source.position == 0 ? 1 : json->source.position - json->linepos - json->lineadj + : json->colno; +} + +size_t json_get_depth(json_stream *json) +{ + return json->stack_top + 1; +} + +/* Return the current parsing context, that is, JSON_OBJECT if we are inside + an object, JSON_ARRAY if we are inside an array, and JSON_DONE if we are + not yet/anymore in either. + + Additionally, for the first two cases, also return the number of parsing + events that have already been observed at this level with json_next/peek(). + In particular, inside an object, an odd number would indicate that the just + observed JSON_STRING event is a member name. +*/ +enum json_type json_get_context(json_stream *json, size_t *count) +{ + if (json->stack_top == (size_t)-1) + return JSON_DONE; + + if (count != NULL) + *count = json->stack[json->stack_top].count; + + return json->stack[json->stack_top].type; +} + +int json_source_get(json_stream *json) +{ + /* If the caller reads a multi-byte UTF-8 sequence, we expect them to read + * it in its entirety. We also assume that any invalid bytes within such a + * sequence belong to the same column (as opposed to starting a new column + * or some such). */ + + int c = json->source.get(&json->source); + if (json->linecon > 0) { + /* Expecting a continuation byte within a multi-byte UTF-8 sequence. */ + json->linecon--; + if (c != EOF) + json->lineadj++; + } else if (c == '\n') + newline(json); + else if (c >= 0xC2 && c <= 0xF4) /* First in multi-byte UTF-8 sequence. 
*/ + json->linecon = utf8_seq_length(c) - 1; + + return c; +} + +int json_source_peek(json_stream *json) +{ + return json->source.peek(&json->source); +} + +void json_open_buffer(json_stream *json, const void *buffer, size_t size) +{ + init(json); + json->source.get = buffer_get; + json->source.peek = buffer_peek; + json->source.source.buffer.buffer = (const char *)buffer; + json->source.source.buffer.length = size; +} + +void json_open_string(json_stream *json, const char *string) +{ + json_open_buffer(json, string, strlen(string)); +} + +void json_open_stream(json_stream *json, FILE * stream) +{ + init(json); + json->source.get = stream_get; + json->source.peek = stream_peek; + json->source.source.stream.stream = stream; +} + +static int user_get(struct json_source *json) +{ + int c = json->source.user.get(json->source.user.ptr); + if (c != EOF) + json->position++; + return c; +} + +static int user_peek(struct json_source *json) +{ + return json->source.user.peek(json->source.user.ptr); +} + +void json_open_user(json_stream *json, json_user_io get, json_user_io peek, void *user) +{ + init(json); + json->source.get = user_get; + json->source.peek = user_peek; + json->source.source.user.ptr = user; + json->source.source.user.get = get; + json->source.source.user.peek = peek; +} + +void json_set_allocator(json_stream *json, json_allocator *a) +{ + json->alloc = *a; +} + +void json_set_streaming(json_stream *json, bool streaming) +{ + if (streaming) + json->flags |= JSON_FLAG_STREAMING; + else + json->flags &= ~JSON_FLAG_STREAMING; +} + +void json_close(json_stream *json) +{ + json->alloc.free(json->stack); + json->alloc.free(json->data.string); +} diff --git a/libbutl/json/pdjson.h b/libbutl/json/pdjson.h new file mode 100644 index 0000000..ac698e4 --- /dev/null +++ b/libbutl/json/pdjson.h @@ -0,0 +1,147 @@ +#ifndef PDJSON_H +#define PDJSON_H + +#ifndef PDJSON_SYMEXPORT +# define PDJSON_SYMEXPORT +#endif + +#ifdef __cplusplus +extern "C" { +#else +#if 
defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) + #include <stdbool.h> +#else + #ifndef bool + #define bool int + #define true 1 + #define false 0 + #endif /* bool */ +#endif /* __STDC_VERSION__ */ +#endif /* __cplusplus */ + +#include <stdio.h> + +enum json_type { + JSON_ERROR = 1, JSON_DONE, + JSON_OBJECT, JSON_OBJECT_END, JSON_ARRAY, JSON_ARRAY_END, + JSON_STRING, JSON_NUMBER, JSON_TRUE, JSON_FALSE, JSON_NULL +}; + +struct json_allocator { + void *(*malloc)(size_t); + void *(*realloc)(void *, size_t); + void (*free)(void *); +}; + +typedef int (*json_user_io)(void *user); + +typedef struct json_stream json_stream; +typedef struct json_allocator json_allocator; + +PDJSON_SYMEXPORT void json_open_buffer(json_stream *json, const void *buffer, size_t size); +PDJSON_SYMEXPORT void json_open_string(json_stream *json, const char *string); +PDJSON_SYMEXPORT void json_open_stream(json_stream *json, FILE *stream); +PDJSON_SYMEXPORT void json_open_user(json_stream *json, json_user_io get, json_user_io peek, void *user); +PDJSON_SYMEXPORT void json_close(json_stream *json); + +PDJSON_SYMEXPORT void json_set_allocator(json_stream *json, json_allocator *a); +PDJSON_SYMEXPORT void json_set_streaming(json_stream *json, bool mode); + +PDJSON_SYMEXPORT enum json_type json_next(json_stream *json); +PDJSON_SYMEXPORT enum json_type json_peek(json_stream *json); +PDJSON_SYMEXPORT void json_reset(json_stream *json); +PDJSON_SYMEXPORT const char *json_get_string(json_stream *json, size_t *length); +PDJSON_SYMEXPORT double json_get_number(json_stream *json); + +PDJSON_SYMEXPORT enum json_type json_skip(json_stream *json); +PDJSON_SYMEXPORT enum json_type json_skip_until(json_stream *json, enum json_type type); + +PDJSON_SYMEXPORT size_t json_get_lineno(json_stream *json); +PDJSON_SYMEXPORT size_t json_get_position(json_stream *json); +PDJSON_SYMEXPORT size_t json_get_column(json_stream *json); +PDJSON_SYMEXPORT size_t json_get_depth(json_stream *json); +PDJSON_SYMEXPORT enum 
json_type json_get_context(json_stream *json, size_t *count); +PDJSON_SYMEXPORT const char *json_get_error(json_stream *json); + +PDJSON_SYMEXPORT int json_source_get(json_stream *json); +PDJSON_SYMEXPORT int json_source_peek(json_stream *json); +PDJSON_SYMEXPORT bool json_isspace(int c); + +/* internal */ + +struct json_source { + int (*get)(struct json_source *); + int (*peek)(struct json_source *); + size_t position; + union { + struct { + FILE *stream; + } stream; + struct { + const char *buffer; + size_t length; + } buffer; + struct { + void *ptr; + json_user_io get; + json_user_io peek; + } user; + } source; +}; + +struct json_stream { + size_t lineno; + + /* While counting lines is straightforward, columns are tricky because we + * have to count codepoints, not bytes. We could have peppered the code + * with increments in all the relevant places but that seems inelegant. + * So instead we calculate the column dynamically, based on the current + * position. + * + * Specifically, we will remember the position at the beginning of each + * line (linepos) and, assuming only the ASCII characters on the line, the + * column will be the difference between the current position and linepos. + * Of course there could also be multi-byte UTF-8 sequences which we will + * handle by keeping an adjustment (lineadj) -- the number of continuation + * bytes encountered on this line so far. Finally, for json_source_get() + * we also have to keep the number of remaining continuation bytes in the + * current multi-byte UTF-8 sequence (linecon). 
+ * + * This is not the end of the story, however: with only the just described + * approach we will always end up with the column of the latest character + * read which is not what we want when returning potentially multi- + * character value events (string, number, etc); in these cases we want to + * return the column of the first character (note that if the value itself + * is invalid and we are returning JSON_ERROR, we still want the current + * column). So to handle this we will cache the start column (colno) for + * such events. + */ + size_t linepos; /* Position at the beginning of the current line. */ + size_t lineadj; /* Adjustment for multi-byte UTF-8 sequences. */ + size_t linecon; /* Number of remaining UTF-8 continuation bytes. */ + size_t colno; /* Start column for value events or 0. */ + + struct json_stack *stack; + size_t stack_top; + size_t stack_size; + enum json_type next; + unsigned flags; + + struct { + char *string; + size_t string_fill; + size_t string_size; + } data; + + size_t ntokens; + + struct json_source source; + struct json_allocator alloc; + char errmsg[128]; +}; + +#ifdef __cplusplus +} /* extern "C" */ +#endif /* __cplusplus */ + +#endif diff --git a/libbutl/json/serializer.cxx b/libbutl/json/serializer.cxx new file mode 100644 index 0000000..fbd569a --- /dev/null +++ b/libbutl/json/serializer.cxx @@ -0,0 +1,671 @@ +#include <cstdio> // snprintf +#include <cstdarg> // va_list +#include <cstring> // memcpy +#include <ostream> + +#include <libbutl/json/serializer.hxx> + +using namespace std; + +namespace butl +{ + namespace json + { + using buffer = buffer_serializer::buffer; + using error_code = invalid_json_output::error_code; + + template <typename T> + static void + dynarray_overflow (void* d, event, buffer& b, size_t ex) + { + T& v (*static_cast<T*> (d)); + v.resize (b.capacity + ex); + v.resize (v.capacity ()); + // const_cast is required for std::string pre C++17. 
+ // + b.data = const_cast<typename T::value_type*> (v.data ()); + b.capacity = v.size (); + } + + template <typename T> + static void + dynarray_flush (void* d, event, buffer& b) + { + T& v (*static_cast<T*> (d)); + v.resize (b.size); + b.data = const_cast<typename T::value_type*> (v.data ()); + b.capacity = b.size; + } + + buffer_serializer:: + buffer_serializer (string& s, size_t i) + : buffer_serializer (const_cast<char*> (s.data ()), size_, s.size (), + dynarray_overflow<string>, + dynarray_flush<string>, + &s, + i) + { + size_ = s.size (); + } + + buffer_serializer:: + buffer_serializer (vector<char>& v, size_t i) + : buffer_serializer (v.data (), size_, v.size (), + dynarray_overflow<vector<char>>, + dynarray_flush<vector<char>>, + &v, + i) + { + size_ = v.size (); + } + + static void + ostream_overflow (void* d, event e, buffer& b, size_t) + { + ostream& s (*static_cast<ostream*> (d)); + s.write (static_cast<char*> (b.data), b.size); + if (s.fail ()) + throw invalid_json_output ( + e, error_code::buffer_overflow, "unable to write JSON output text"); + b.size = 0; + } + + static void + ostream_flush (void* d, event e, buffer& b) + { + ostream_overflow (d, e, b, 0); + + ostream& s (*static_cast<ostream*> (d)); + s.flush (); + if (s.fail ()) + throw invalid_json_output ( + e, error_code::buffer_overflow, "unable to write JSON output text"); + } + + stream_serializer:: + stream_serializer (ostream& os, size_t i) + : buffer_serializer (tmp_, sizeof (tmp_), + ostream_overflow, + ostream_flush, + &os, + i) + { + } + + bool buffer_serializer:: + next (optional<event> e, pair<const char*, size_t> val, bool check) + { + if (absent_ == 2) + goto fail_complete; + + if (e == nullopt) + { + if (!state_.empty ()) + goto fail_incomplete; + + absent_++; + return false; + } + + absent_ = 0; // Clear inter-value absent event. + + { + state* st (state_.empty () ? 
nullptr : &state_.back ()); + + auto name_expected = [] (const state& s) + { + return s.type == event::begin_object && s.count % 2 == 0; + }; + + auto make_str = [] (const char* s, size_t n) + { + return make_pair (s, n); + }; + + // When it comes to pretty-printing, the common way to do it is along + // these lines: + // + // { + // "str": "value", + // "obj": { + // "arr": [ + // 1, + // 2, + // 3 + // ] + // }, + // "num": 123 + // } + // + // Empty objects and arrays are printed without a newline: + // + // { + // "obj": {}, + // "arr": [] + // } + // + // There are two types of separators: between name and value, which is + // always ": ", and before/after value inside an object or array which + // is either newline followed by indentation, or comma followed by + // newline followed by indentation (we also have separation between + // top-level values but that's orthogonal to pretty-printing). + // + // Based on this observation, we are going to handle the latter case by + // starting with the ",\n" string (in this->sep_) and pushing/popping + // indentation spaces as we enter/leave objects and arrays. We handle + // the cases where we don't need the comma by simply skipping it in the + // C-string pointer. + // + bool pp (indent_ != 0); + + pair<const char*, size_t> sep; + if (st != nullptr) + { + // The name-value separator. + // + if (st->type == event::begin_object && st->count % 2 == 1) + { + sep = !pp ? make_str (":", 1) : make_str (": ", 2); + } + // We don't need the comma if we are closing the object or array. + // + else if (e == event::end_array || e == event::end_object) + { + // But in this case we need to unindent one level prior to writing + // the brace. Also handle the empty object/array as a special case. + // + sep = !pp || st->count == 0 + ? make_str (nullptr, 0) + : make_str (sep_.c_str () + 1, sep_.size () - 1 - indent_); + } + // Or if this is the first value (note: must come after end_*). 
+ // + else if (st->count == 0) + { + sep = !pp + ? make_str (nullptr, 0) + : make_str (sep_.c_str () + 1, sep_.size () - 1); + } + else + { + sep = !pp + ? make_str (",", 1) + : make_str (sep_.c_str (), sep_.size ()); + } + } + else if (values_ != 0) // Subsequent top-level value. + { + // Top-level value separation. For now we always separate them with + // newlines, which is the most common/sensible way. + // + sep = make_str ("\n", 1); + } + + switch (*e) + { + case event::begin_array: + case event::begin_object: + { + if (st != nullptr && name_expected (*st)) + goto fail_unexpected_event; + + write (*e, + sep, + make_str (e == event::begin_array ? "[" : "{", 1), + false); + + if (st != nullptr) + st->count++; + + if (pp) + sep_.append (indent_, ' '); + + state_.push_back (state {*e, 0}); + break; + } + case event::end_array: + case event::end_object: + { + if (st == nullptr || (e == event::end_array + ? st->type != event::begin_array + : !name_expected (*st))) + goto fail_unexpected_event; + + write (*e, + sep, + make_str (e == event::end_array ? "]" : "}", 1), + false); + + if (pp) + sep_.erase (sep_.size () - indent_); + + state_.pop_back (); + break; + } + case event::name: + case event::string: + { + if (e == event::name + ? (st == nullptr || !name_expected (*st)) + : (st != nullptr && name_expected (*st))) + goto fail_unexpected_event; + + write (*e, sep, val, check, '"'); + + if (st != nullptr) + st->count++; + break; + } + case event::null: + case event::boolean: + { + if (e == event::null && val.first == nullptr) + val = {"null", 4}; + else if (check) + { + auto eq = [&val] (const char* v, size_t n) + { + return val.second == n && memcmp (val.first, v, n) == 0; + }; + + if (e == event::null) + { + if (!eq ("null", 4)) + goto fail_null; + } + else + { + if (!eq ("true", 4) && !eq ("false", 5)) + goto fail_bool; + } + } + } + // Fall through. + case event::number: + { + // Note: this event is also used by value_json_text(). 
+ + if (st != nullptr && name_expected (*st)) + goto fail_unexpected_event; + + write (*e, sep, val, check); + + if (st != nullptr) + st->count++; + break; + } + } + } + + if (state_.empty ()) + { + values_++; + if (flush_ != nullptr) + flush_ (data_, *e, buf_); + + return false; + } + + return true; + + fail_complete: + throw invalid_json_output ( + e, error_code::invalid_value, "value sequence is complete"); + fail_incomplete: + throw invalid_json_output ( + e, error_code::invalid_value, "value is incomplete"); + fail_null: + throw invalid_json_output ( + e, error_code::invalid_value, "invalid null value"); + fail_bool: + throw invalid_json_output ( + e, error_code::invalid_value, "invalid boolean value"); + fail_unexpected_event: + throw invalid_json_output ( + e, error_code::unexpected_event, "unexpected event"); + } + + // JSON escape sequences for control characters <= 0x1F. + // + static const char* json_escapes[] = + {"\\u0000", "\\u0001", "\\u0002", "\\u0003", "\\u0004", "\\u0005", + "\\u0006", "\\u0007", "\\b", "\\t", "\\n", "\\u000B", + "\\f", "\\r", "\\u000E", "\\u000F", "\\u0010", "\\u0011", + "\\u0012", "\\u0013", "\\u0014", "\\u0015", "\\u0016", "\\u0017", + "\\u0018", "\\u0019", "\\u001A", "\\u001B", "\\u001C", "\\u001D", + "\\u001E", "\\u001F"}; + + void buffer_serializer:: + write (event e, + pair<const char*, size_t> sep, + pair<const char*, size_t> val, + bool check, + char q) + { + // Assumptions: + // + // 1. A call to overflow should be able to provide enough capacity to + // write the entire separator (in other words, we are not going to + // bother with chunking the separator). + // + // 2. Similarly, a call to overflow should be able to provide enough + // capacity to write an entire UTF-8 multi-byte sequence. + // + // 3. Performance-wise, we do not expect very long contiguous sequences + // of characters that require escaping. + + // Total number of bytes remaining to be written and the capacity + // currently available. 
+ // + size_t size (sep.second + val.second + (q != '\0' ? 2 : 0)); + size_t cap (buf_.capacity - buf_.size); + + auto grow = [this, e, &size, &cap] (size_t min, size_t extra = 0) + { + if (overflow_ == nullptr) + return false; + + extra += size; + extra -= cap; + overflow_ (data_, e, buf_, extra > min ? extra : min); + cap = buf_.capacity - buf_.size; + + return cap >= min; + }; + + auto append = [this, &cap, &size] (const char* d, size_t s) + { + memcpy (static_cast<char*> (buf_.data) + buf_.size, d, s); + buf_.size += s; + cap -= s; + size -= s; + }; + + // Return the longest chunk of input that fits into the buffer and does + // not end in the middle of a multi-byte UTF-8 sequence. Assume value + // size and capacity are not 0. Return NULL in first if no chunk could + // be found that fits into the remaining space. In this case, second is + // the additional (to size) required space (used to handle escapes in + // the checked version). + // + // The basic idea is to seek in the input buffer to the capacity of the + // output buffer (unless the input is shorter than the output). If we + // ended up in the middle of a multi-byte UTF-8 sequence, then seek back + // until we end up at the UTF-8 sequence boundary. Note that this + // implementation assumes valid UTF-8. + // + auto chunk = [&cap, &val] () -> pair<const char*, size_t> + { + pair<const char*, size_t> r (nullptr, 0); + + if (cap >= val.second) + r = val; + else + { + // Start from the character past capacity and search for a UTF-8 + // sequence boundary. + // + for (const char* p (val.first + cap); p != val.first; --p) + { + const auto u (static_cast<uint8_t> (*p)); + if (u < 0x80 || u > 0xBF) // Not a continuation byte + { + r = {val.first, p - val.first}; + break; + } + } + } + + val.first += r.second; + val.second -= r.second; + + return r; + }; + + // Escaping and UTF-8-validating version of chunk(). 
+ // + // There are three classes of mandatory escapes in a JSON string: + // + // - \\ and \" + // + // - \b \f \n \r \t for popular control characters + // + // - \u00NN for other control characters <= 0x1F + // + // If the input begins with a character that must be escaped, return + // only its escape sequence. Otherwise validate and return everything up + // to the end of input or buffer capacity, but cutting it short before + // the next character that must be escaped or the first UTF-8 sequence + // that would not fit. + // + // Return string::npos in second in case of a stray continuation byte or + // any byte in an invalid UTF-8 range (for example, an "overlong" 2-byte + // encoding of a 7-bit/ASCII character or a 4-, 5-, or 6-byte sequence + // that would encode a codepoint beyond the U+10FFFF Unicode limit). + // + auto chunk_checked = [&cap, &size, &val] () -> pair<const char*, size_t> + { + pair<const char*, size_t> r (nullptr, 0); + + // Check whether the first character needs to be escaped. + // + const uint8_t c (val.first[0]); + if (c == '"') + r = {"\\\"", 2}; + else if (c == '\\') + r = {"\\\\", 2}; + else if (c <= 0x1F) + { + auto s (json_escapes[c]); + r = {s, s[1] == 'u' ? 6 : 2}; + } + + if (r.first != nullptr) + { + // Return in second the additional (to size) space required. + // + if (r.second > cap) + return {nullptr, r.second - 1}; + + // If we had to escape the character then adjust size accordingly + // (see append() above). + // + size += r.second - 1; + + val.first += 1; + val.second -= 1; + return r; + } + + // First character doesn't need to be escaped. Return as much of the + // rest of the input as possible. + // + size_t i (0); + for (size_t n (min (cap, val.second)); i != n; i++) + { + const uint8_t c1 (val.first[i]); + + if (c1 == '"' || c1 == '\\' || c1 <= 0x1F) // Needs to be escaped. + break; + else if (c1 >= 0x80) // Not ASCII, so validate as a UTF-8 sequence. + { + size_t i1 (i); // Position of the first byte. 
+ + // The control flow here is to continue if valid and to fall + // through to return on error. + // + if (c1 >= 0xC2 && c1 <= 0xDF) // 2-byte sequence. + { + if (i + 2 <= val.second) // Sequence is complete in JSON value. + { + if (i + 2 > cap) // Sequence won't fit. + break; + + const uint8_t c2 (val.first[++i]); + + if (c2 >= 0x80 && c2 <= 0xBF) + continue; + } + } + else if (c1 >= 0xE0 && c1 <= 0xEF) // 3-byte sequence. + { + if (i + 3 <= val.second) + { + if (i + 3 > cap) + break; + + const uint8_t c2 (val.first[++i]), c3 (val.first[++i]); + + if (c3 >= 0x80 && c3 <= 0xBF) + { + switch (c1) + { + case 0xE0: if (c2 >= 0xA0 && c2 <= 0xBF) continue; break; + case 0xED: if (c2 >= 0x80 && c2 <= 0x9F) continue; break; + default: if (c2 >= 0x80 && c2 <= 0xBF) continue; break; + } + } + } + } + else if (c1 >= 0xF0 && c1 <= 0xF4) // 4-byte sequence. + { + if (i + 4 <= val.second) + { + if (i + 4 > cap) + break; + + const uint8_t c2 (val.first[++i]), + c3 (val.first[++i]), + c4 (val.first[++i]); + + if (c3 >= 0x80 && c3 <= 0xBF && + c4 >= 0x80 && c4 <= 0xBF) + { + switch (c1) + { + case 0xF0: if (c2 >= 0x90 && c2 <= 0xBF) continue; break; + case 0xF4: if (c2 >= 0x80 && c2 <= 0x8F) continue; break; + default: if (c2 >= 0x80 && c2 <= 0xBF) continue; break; + } + } + } + } + + r = {val.first, string::npos}; + + // Update val to point to the beginning of the invalid sequence. + // + val.first += i1; + val.second -= i1; + + return r; + } + } + + if (i != 0) // We have a chunk. + { + r = {val.first, i}; + + val.first += i; + val.second -= i; + } + + return r; + }; + + // Value's original size (used to calculate the offset of the errant + // character in case of a validation failure). + // + const size_t vn (val.second); + + // Write the separator, if any. + // + if (sep.second != 0) + { + if (cap < sep.second && !grow (sep.second)) + goto fail_nospace; + + append (sep.first, sep.second); + } + + // Write the value's opening quote, if requested. 
+ // + if (q != '\0') + { + if (cap == 0 && !grow (1)) + goto fail_nospace; + + append ("\"", 1); + } + + // Write the value, unless empty. + // + while (val.second != 0) + { + pair<const char*, size_t> ch (nullptr, 0); + + if (cap != 0) + ch = check ? chunk_checked () : chunk (); + + if (ch.first == nullptr) + { + // The minimum extra bytes we need the overflow function to be able + // to provide is based on these sequences that we do not break: + // + // - 4 bytes for a UTF-8 sequence + // - 6 bytes for an escaped Unicode sequence (\uXXXX). + // + if (!grow (6, ch.second)) + goto fail_nospace; + } + else if (ch.second != string::npos) + append (ch.first, ch.second); + else + goto fail_utf8; + } + + // Write the value's closing quote, if requested. + // + if (q != '\0') + { + if (cap == 0 && !grow (1)) + goto fail_nospace; + + append ("\"", 1); + } + + return; + + // Note: keep descriptions consistent with the parser. + // + fail_utf8: + throw invalid_json_output (e, + e == event::name ? error_code::invalid_name + : error_code::invalid_value, + "invalid UTF-8 text", + vn - val.second); + + fail_nospace: + throw invalid_json_output ( + e, error_code::buffer_overflow, "insufficient space in buffer"); + } + + size_t buffer_serializer:: + to_chars_impl (char* b, size_t n, const char* f, ...) 
+ { + va_list a; + va_start (a, f); + const int r (vsnprintf (b, n, f, a)); + va_end (a); + + if (r < 0 || r >= static_cast<int> (n)) + { + throw invalid_json_output (event::number, + error_code::invalid_value, + "unable to convert number to string"); + } + + return static_cast<size_t> (r); + } + } +} diff --git a/libbutl/json/serializer.hxx b/libbutl/json/serializer.hxx new file mode 100644 index 0000000..5192cb4 --- /dev/null +++ b/libbutl/json/serializer.hxx @@ -0,0 +1,413 @@ +#pragma once + +#ifdef BUILD2_BOOTSTRAP +# error JSON serializer not available during bootstrap +#endif + +#include <array> +#include <iosfwd> +#include <string> +#include <vector> +#include <cstddef> // size_t, nullptr_t +#include <utility> // pair +#include <stdexcept> // invalid_argument +#include <type_traits> // enable_if, is_* + +#include <libbutl/optional.hxx> // butl::optional is std::optional or similar. + +#include <libbutl/json/event.hxx> + +#include <libbutl/export.hxx> + +namespace butl +{ + // Using the RFC8259 terminology: JSON (output) text, JSON value, object + // member. + // + namespace json + { + class invalid_json_output: public std::invalid_argument + { + public: + using event_type = json::event; + + enum class error_code + { + buffer_overflow, + unexpected_event, + invalid_name, + invalid_value + }; + + invalid_json_output (optional<event_type> event, + error_code code, + const char* description, + std::size_t offset = std::string::npos); + + invalid_json_output (optional<event_type> event, + error_code code, + const std::string& description, + std::size_t offset = std::string::npos); + + // Event that triggered the error. If the error is in the value, then + // offset points to the offending byte (for example, the beginning of an + // invalid UTF-8 byte sequence). Otherwise, offset is string::npos. 
+ // + optional<event_type> event; + error_code code; + std::size_t offset; + }; + + // The serializer makes sure the resulting JSON is syntactically but not + // necessarily semantically correct. For example, it's possible to + // serialize a number event with non-numeric data. + // + // Note that unlike the parser, the serializer is always in the multi- + // value mode allowing the serialization of zero or more values. Note also + // that while values are separated with newlines, there is no trailing + // newline after the last (or only) value and the user is expected to add + // it manually if needed. + // + // Also note that while RFC8259 recommends object members to have unique + // names, the serializer does not enforce this. + // + class LIBBUTL_SYMEXPORT buffer_serializer + { + public: + // Serialize to string growing it as necessary. + // + // The indentation argument specifies the number of indentation spaces + // that should be used for pretty-printing. If 0 is passed, no + // pretty-printing is performed. + // + explicit + buffer_serializer (std::string&, std::size_t indentation = 2); + + // Serialize to vector of characters growing it as necessary. + // + explicit + buffer_serializer (std::vector<char>&, std::size_t indentation = 2); + + // Serialize to a fixed array. + // + // The length of the output text written is tracked in the size + // argument. + // + // If the array is not big enough to store the entire output text, the + // next() call that reaches the limit will throw invalid_json_output. + // + template <std::size_t N> + buffer_serializer (std::array<char, N>&, std::size_t& size, + std::size_t indentation = 2); + + // Serialize to a fixed buffer. + // + // The length of the output text written is tracked in the size + // argument. + // + // If the buffer is not big enough to store the entire output text, the + // next() call that reaches the limit will throw invalid_json_output. 
+ // + buffer_serializer (void* buf, std::size_t& size, std::size_t capacity, + std::size_t indentation = 2); + + // The overflow function is called when the output buffer is out of + // space. The extra argument is a hint indicating the extra space likely + // to be required. + // + // Possible strategies include re-allocating a larger buffer or flushing + // the contents of the original buffer to the output destination. In + // case of a reallocation, the implementation is responsible for copying + // the contents of the original buffer over. + // + // The flush function is called when the complete JSON value has been + // serialized to the buffer. It can be used to write the contents of the + // buffer to the output destination. Note that flush is not called after + // the second absent (nullopt) event (or the only absent event; see + // next() for details). + // + // Both functions are passed the original buffer, its size (the amount + // of output text), and its capacity. They return (by modifying the + // argument) the replacement buffer and its size and capacity (these may + // refer to the original buffer). If space cannot be made available, the + // implementation can throw an appropriate exception (for example, + // std::bad_alloc or std::ios_base::failure). Any exception thrown is + // propagated to the user. + // + struct buffer + { + void* data; + std::size_t& size; + std::size_t capacity; + }; + + using overflow_function = void (void* data, + event, + buffer&, + std::size_t extra); + using flush_function = void (void* data, event, buffer&); + + // Serialize using a custom buffer and overflow/flush functions (both + // are optional). + // + buffer_serializer (void* buf, std::size_t capacity, + overflow_function*, + flush_function*, + void* data, + std::size_t indentation = 2); + + // As above but the length of the output text written is tracked in the + // size argument. 
+ // + buffer_serializer (void* buf, std::size_t& size, std::size_t capacity, + overflow_function*, + flush_function*, + void* data, + std::size_t indentation = 2); + + // Begin/end an object. + // + // The member_begin_object() version is a shortcut for: + // + // member_name (name, check); + // begin_object (); + // + void + begin_object (); + + void + member_begin_object (const char*, bool check = true); + + void + member_begin_object (const std::string&, bool check = true); + + void + end_object (); + + // Serialize an object member (name and value). + // + // If check is false, then don't check whether the name (or value, if + // it's a string) is valid UTF-8 and don't escape any characters. + // + template <typename T> + void + member (const char* name, const T& value, bool check = true); + + template <typename T> + void + member (const std::string& name, const T& value, bool check = true); + + // Serialize an object member name. + // + // If check is false, then don't check whether the name is valid UTF-8 + // and don't escape any characters. + // + void + member_name (const char*, bool check = true); + + void + member_name (const std::string&, bool check = true); + + // Begin/end an array. + // + // The member_begin_array() version is a shortcut for: + // + // member_name (name, check); + // begin_array (); + // + void + begin_array (); + + void + member_begin_array (const char*, bool check = true); + + void + member_begin_array (const std::string&, bool check = true); + + void + end_array (); + + // Serialize a string. + // + // If check is false, then don't check whether the value is valid UTF-8 + // and don't escape any characters. + // + // Note that a NULL C-string pointer is serialized as a null value. + // + void + value (const char*, bool check = true); + + void + value (const std::string&, bool check = true); + + // Serialize a number. 
+ // + template <typename T> + typename std::enable_if<std::is_integral<T>::value || + std::is_floating_point<T>::value>::type + value (T); + + // Serialize a boolean value. + // + void + value (bool); + + // Serialize a null value. + // + void + value (std::nullptr_t); + + // Serialize value as a pre-serialized JSON value. + // + // Note that the value is expected to be a valid (and suitable) UTF-8- + // encoded JSON text. Note also that if pretty-printing is enabled, + // the resulting output may not be correctly indented. + // + void + value_json_text (const char*); + + void + value_json_text (const std::string&); + + // Serialize next JSON event. + // + // If check is false, then don't check whether the value is valid UTF-8 + // and don't escape any characters. + // + // Return true if more events are required to complete the (top-level) + // value (that is, it is currently incomplete) and false otherwise. + // Throw invalid_json_output exception in case of an invalid event or + // value. + // + // At the end of the value an optional absent (nullopt) event can be + // serialized to verify the value is complete. If it is incomplete an + // invalid_json_output exception is thrown. An optional followup absent + // event can be serialized to indicate the completion of a multi-value + // sequence (one and only absent event indicates a zero value sequence). + // If anything is serialized to a complete value sequence an + // invalid_json_output exception is thrown. + // + // Note that this function was designed to be easily invoked with the + // output from parser::next() and parser::data(). 
For example, for a + // single-value mode: + // + // optional<event> e; + // do + // { + // e = p.next (); + // s.next (e, p.data ()); + // } + // while (e); + // + // For a multi-value mode: + // + // while (p.peek ()) + // { + // optional<event> e; + // do + // { + // e = p.next (); + // s.next (e, p.data ()); + // } + // while (e); + // } + // s.next (nullopt); // End of value sequence. + // + bool + next (optional<event> event, + std::pair<const char*, std::size_t> value = {}, + bool check = true); + + private: + void + write (event, + std::pair<const char*, std::size_t> sep, + std::pair<const char*, std::size_t> val, + bool check, char quote = '\0'); + + // Forward a value(v, check) call to value(v) ignoring the check + // argument. Used in the member() implementation. + // + template <typename T> + void + value (const T& v, bool /*check*/) + { + value (v); + } + + // Convert numbers to string. + // + static std::size_t to_chars (char*, std::size_t, int); + static std::size_t to_chars (char*, std::size_t, long); + static std::size_t to_chars (char*, std::size_t, long long); + static std::size_t to_chars (char*, std::size_t, unsigned int); + static std::size_t to_chars (char*, std::size_t, unsigned long); + static std::size_t to_chars (char*, std::size_t, unsigned long long); + static std::size_t to_chars (char*, std::size_t, double); + static std::size_t to_chars (char*, std::size_t, long double); + + static std::size_t to_chars_impl (char*, size_t, const char* fmt, ...); + + buffer buf_; + std::size_t size_; + overflow_function* overflow_; + flush_function* flush_; + void* data_; + + // State of a "structured type" (array or object; as per the RFC + // terminology). + // + struct state + { + const event type; // Type kind (begin_array or begin_object). + std::size_t count; // Number of events serialized inside this type. + }; + + // Stack of nested structured type states. + // + // @@ TODO: would have been nice to use small_vector. 
+ // + std::vector<state> state_; + + // The number of consecutive absent events (nullopt) serialized thus + // far. + // + // Note: initialized to 1 to naturally handle a single absent event + // (declares an empty value sequence complete). + // + std::size_t absent_ = 1; + + // The number of spaces with which to indent (once for each level of + // nesting). If zero, pretty-printing is disabled. + // + std::size_t indent_; + + // Separator and indentation before/after value inside an object or + // array (see pretty-printing implementation for details). + // + std::string sep_; + + // The number of complete top-level values serialized thus far. + // + std::size_t values_ = 0; + }; + + class LIBBUTL_SYMEXPORT stream_serializer: public buffer_serializer + { + public: + // Serialize to std::ostream. + // + // If stream exceptions are enabled then the std::ios_base::failure + // exception is used to report input/output errors (badbit and failbit). + // Otherwise, those are reported as the invalid_json_output exception. 
+ // + explicit + stream_serializer (std::ostream&, std::size_t indentation = 2); + + protected: + char tmp_[4096]; + }; + } +} + +#include <libbutl/json/serializer.ixx> diff --git a/libbutl/json/serializer.ixx b/libbutl/json/serializer.ixx new file mode 100644 index 0000000..a719ef6 --- /dev/null +++ b/libbutl/json/serializer.ixx @@ -0,0 +1,247 @@ +#include <cstring> // strlen() + +namespace butl +{ + namespace json + { + inline invalid_json_output:: + invalid_json_output (optional<event_type> e, + error_code c, + const char* d, + std::size_t o) + : std::invalid_argument (d), event (e), code (c), offset (o) + { + } + + inline invalid_json_output:: + invalid_json_output (optional<event_type> e, + error_code c, + const std::string& d, + std::size_t o) + : invalid_json_output (e, c, d.c_str (), o) + { + } + + inline buffer_serializer:: + buffer_serializer (void* b, std::size_t& s, std::size_t c, + overflow_function* o, flush_function* f, void* d, + std::size_t i) + : buf_ {b, s, c}, + overflow_ (o), + flush_ (f), + data_ (d), + indent_ (i), + sep_ (indent_ != 0 ? 
",\n" : "") + { + } + + template <std::size_t N> + inline buffer_serializer:: + buffer_serializer (std::array<char, N>& a, std::size_t& s, std::size_t i) + : buffer_serializer (a.data (), s, a.size (), + nullptr, nullptr, nullptr, + i) + { + } + + inline buffer_serializer:: + buffer_serializer (void* b, std::size_t& s, std::size_t c, std::size_t i) + : buffer_serializer (b, s, c, nullptr, nullptr, nullptr, i) + { + } + + inline buffer_serializer:: + buffer_serializer (void* b, std::size_t c, + overflow_function* o, flush_function* f, void* d, + std::size_t i) + : buffer_serializer (b, size_, c, o, f, d, i) + { + size_ = 0; + } + + inline void buffer_serializer:: + begin_object () + { + next (event::begin_object); + } + + inline void buffer_serializer:: + end_object () + { + next (event::end_object); + } + + inline void buffer_serializer:: + member_name (const char* n, bool c) + { + next (event::name, {n, n != nullptr ? std::strlen (n) : 0}, c); + } + + inline void buffer_serializer:: + member_name (const std::string& n, bool c) + { + next (event::name, {n.c_str (), n.size ()}, c); + } + + inline void buffer_serializer:: + member_begin_object (const char* n, bool c) + { + member_name (n, c); + begin_object (); + } + + inline void buffer_serializer:: + member_begin_object (const std::string& n, bool c) + { + member_name (n, c); + begin_object (); + } + + template <typename T> + inline void buffer_serializer:: + member (const char* n, const T& v, bool c) + { + member_name (n, c); + value (v, c); + } + + template <typename T> + inline void buffer_serializer:: + member (const std::string& n, const T& v, bool c) + { + member_name (n, c); + value (v, c); + } + + inline void buffer_serializer:: + begin_array () + { + next (event::begin_array); + } + + inline void buffer_serializer:: + member_begin_array (const char* n, bool c) + { + member_name (n, c); + begin_array (); + } + + inline void buffer_serializer:: + member_begin_array (const std::string& n, bool c) + { + 
member_name (n, c); + begin_array (); + } + + inline void buffer_serializer:: + end_array () + { + next (event::end_array); + } + + inline void buffer_serializer:: + value (const char* v, bool c) + { + if (v != nullptr) + next (event::string, {v, std::strlen (v)}, c); + else + next (event::null); + } + + inline void buffer_serializer:: + value (const std::string& v, bool c) + { + next (event::string, {v.c_str (), v.size ()}, c); + } + + template <typename T> + typename std::enable_if<std::is_integral<T>::value || + std::is_floating_point<T>::value>::type + buffer_serializer:: + value (T v) + { + // The largest 128-bit integer has 39 digits, and long floating point + // numbers will fit because they are output in scientific notation. + // + char b[40]; + const std::size_t n (to_chars (b, sizeof (b), v)); + next (event::number, {b, n}); + } + + inline void buffer_serializer:: + value (bool b) + { + next (event::boolean, + b ? std::make_pair ("true", 4) : std::make_pair ("false", 5)); + } + + inline void buffer_serializer:: + value (std::nullptr_t) + { + next (event::null); + } + + inline void buffer_serializer:: + value_json_text (const char* v) + { + // Use event::number (which doesn't involve any quoting) with a disabled + // check. 
+ // + next (event::number, {v, std::strlen (v)}, false /* check */); + } + + inline void buffer_serializer:: + value_json_text (const std::string& v) + { + next (event::number, {v.c_str (), v.size ()}, false /* check */); + } + + inline size_t buffer_serializer:: + to_chars (char* b, size_t s, int v) + { + return to_chars_impl (b, s, "%d", v); + } + + inline size_t buffer_serializer:: + to_chars (char* b, size_t s, long v) + { + return to_chars_impl (b, s, "%ld", v); + } + + inline size_t buffer_serializer:: + to_chars (char* b, size_t s, long long v) + { + return to_chars_impl (b, s, "%lld", v); + } + + inline size_t buffer_serializer:: + to_chars (char* b, size_t s, unsigned v) + { + return to_chars_impl (b, s, "%u", v); + } + + inline size_t buffer_serializer:: + to_chars (char* b, size_t s, unsigned long v) + { + return to_chars_impl (b, s, "%lu", v); + } + + inline size_t buffer_serializer:: + to_chars (char* b, size_t s, unsigned long long v) + { + return to_chars_impl (b, s, "%llu", v); + } + + inline size_t buffer_serializer:: + to_chars (char* b, size_t s, double v) + { + return to_chars_impl (b, s, "%.10g", v); + } + + inline size_t buffer_serializer:: + to_chars (char* b, size_t s, long double v) + { + return to_chars_impl (b, s, "%.10Lg", v); + } + } +} diff --git a/libbutl/lz4-stream.cxx b/libbutl/lz4-stream.cxx new file mode 100644 index 0000000..8001770 --- /dev/null +++ b/libbutl/lz4-stream.cxx @@ -0,0 +1,281 @@ +// file : libbutl/lz4-stream.cxx -*- C++ -*- +// license : MIT; see accompanying LICENSE file + +#include <libbutl/lz4-stream.hxx> + +#include <cstring> // memcpy() +#include <stdexcept> // invalid_argument + +#include <libbutl/utility.hxx> // eof() + +using namespace std; + +namespace butl +{ + namespace lz4 + { + // istream + // + + // Read into the specified buffer returning the number of bytes read and + // the eof flag. 
+ // + pair<size_t, bool> istreambuf:: + read (char* b, size_t c) + { + size_t n (0); + bool e (false); + + // @@ TODO: would it be faster to do a direct buffer copy if input + // stream is bufstreambuf-based (see sha*.cxx for code)? + do + { + e = eof (is_->read (b + n, c - n)); + n += static_cast<size_t> (is_->gcount ()); + } + while (!e && n != c); + + return make_pair (n, e); + } + + optional<uint64_t> istreambuf:: + open (std::istream& is, bool end) + { + assert (is.exceptions () == std::istream::badbit); + + is_ = &is; + end_ = end; + + // Read in the header and allocate the buffers. + // + // What if we hit EOF here? And could begin() return 0? Turns out the + // answer to both questions is yes: 0-byte content compresses to 15 + // bytes (with or without content size; 1-byte -- to 20/28 bytes). We + // can ignore EOF here since an attempt to read more will result in + // another EOF. And our load() is prepared to handle 0 hint. + // + // @@ We could end up leaving some of the input content from the header + // in the input buffer which the caller will have no way of using + // (e.g., in a stream of compressed contents). Doesn't look like + // there is much we can do (our streams don't support putback) other + // than document this limitation. + // + optional<uint64_t> r; + + d_.hn = read (d_.hb, sizeof (d_.hb)).first; + h_ = d_.begin (&r); + + ib_.reset ((d_.ib = new char[d_.ic])); + ob_.reset ((d_.ob = new char[d_.oc])); + + // Copy over whatever is left in the header buffer. 
+ // + memcpy (d_.ib, d_.hb, (d_.in = d_.hn)); + + setg (d_.ob, d_.ob, d_.ob); + return r; + } + + void istreambuf:: + close () + { + if (is_open ()) + { + is_ = nullptr; + } + } + + istreambuf::int_type istreambuf:: + underflow () + { + int_type r (traits_type::eof ()); + + if (is_open ()) + { + if (gptr () < egptr () || load ()) + r = traits_type::to_int_type (*gptr ()); + } + + return r; + } + + bool istreambuf:: + load () + { + // Note that the first call to this function may be with h_ == 0 (see + // open() for details). In which case we just need to verify there is + // no junk after the compressed content. + // + bool r; + + if (h_ == 0) + r = false; // EOF + else + { + // Note: next() may just buffer the data. + // + do + { + // Note that on the first call we may already have some data in the + // input buffer (leftover header data). + // + if (h_ > d_.in) + { + pair<size_t, bool> p (read (d_.ib + d_.in, h_ - d_.in)); + + d_.in += p.first; + + if (p.second && d_.in != h_) + throw invalid_argument ("incomplete LZ4 compressed content"); + } + + h_ = d_.next (); // Clears d_.in. + + } while (d_.on == 0 && h_ != 0); + + setg (d_.ob, d_.ob, d_.ob + d_.on); + off_ += d_.on; + r = (d_.on != 0); + } + + // If we don't expect any more compressed content and we were asked to + // end the underlying input stream, then verify there is no more input. 
+ // + if (h_ == 0 && end_) + { + end_ = false; + + if (d_.in != 0 || + (!is_->eof () && + is_->good () && + is_->peek () != istream::traits_type::eof ())) + throw invalid_argument ("junk after LZ4 compressed content"); + } + + return r; + } + + // ostream + // + + void ostreambuf:: + write (char* b, std::size_t n) + { + os_->write (b, static_cast<streamsize> (n)); + } + + void ostreambuf:: + open (std::ostream& os, + int level, + int block_id, + optional<std::uint64_t> content_size) + { + assert (os.exceptions () == (std::ostream::badbit | + std::ostream::failbit)); + + os_ = &os; + + // Determine required buffer capacities. + // + c_.begin (level, block_id, content_size); + + ib_.reset ((c_.ib = new char[c_.ic])); + ob_.reset ((c_.ob = new char[c_.oc])); + + setp (c_.ib, c_.ib + c_.ic - 1); // Keep space for overflow's char. + end_ = false; + } + + void ostreambuf:: + close () + { + if (is_open ()) + { + if (!end_) + save (); + + os_ = nullptr; + } + } + + ostreambuf:: + ~ostreambuf () + { + close (); + } + + ostreambuf::int_type ostreambuf:: + overflow (int_type c) + { + int_type r (traits_type::eof ()); + + if (is_open () && c != traits_type::eof ()) + { + // Store last character in the space we reserved in open(). Note + // that pbump() doesn't do any checks. + // + *pptr () = traits_type::to_char_type (c); + pbump (1); + + save (); + r = c; + } + + return r; + } + + void ostreambuf:: + save () + { + c_.in = pptr () - pbase (); + off_ += c_.in; + + // We assume this is the end if the input buffer is not full. + // + end_ = (c_.in != c_.ic); + c_.next (end_); + + if (c_.on != 0) // next() may just buffer the data. + write (c_.ob, c_.on); + + setp (c_.ib, c_.ib + c_.ic - 1); + } + + streamsize ostreambuf:: + xsputn (const char_type* s, streamsize sn) + { + if (!is_open () || end_) + return 0; + + // To avoid further 'signed/unsigned comparison' compiler warnings. 
+ // + size_t n (static_cast<size_t> (sn)); + + // The plan is to keep copying the data into the input buffer and + // calling save() (our compressor API currently has no way of avoiding + // the copy). + // + while (n != 0) + { + // Amount of free space in the buffer (including the extra byte + // we've reserved). + // + size_t an (epptr () - pptr () + 1); + + size_t m (n > an ? an : n); + memcpy (pptr (), s, m); + pbump (static_cast<int> (m)); + + if (n < an) + break; // All fitted with at least 1 byte left. + + save (); + + s += m; + n -= m; + } + + return sn; + } + } +} diff --git a/libbutl/lz4-stream.hxx b/libbutl/lz4-stream.hxx new file mode 100644 index 0000000..b11c0a2 --- /dev/null +++ b/libbutl/lz4-stream.hxx @@ -0,0 +1,280 @@ +// file : libbutl/lz4-stream.hxx -*- C++ -*- +// license : MIT; see accompanying LICENSE file + +#pragma once + +#include <memory> // unique_ptr +#include <cstddef> // size_t +#include <cstdint> // uint64_t +#include <utility> // move() +#include <istream> +#include <ostream> +#include <cassert> + +#include <libbutl/lz4.hxx> +#include <libbutl/optional.hxx> +#include <libbutl/bufstreambuf.hxx> + +#include <libbutl/export.hxx> + +namespace butl +{ + namespace lz4 + { + // istream + // + + class LIBBUTL_SYMEXPORT istreambuf: public bufstreambuf + { + public: + optional<std::uint64_t> + open (std::istream&, bool end); + + bool + is_open () const {return is_ != nullptr;} + + void + close (); + + public: + using base = bufstreambuf; + + // basic_streambuf input interface. + // + public: + virtual int_type + underflow () override; + + // Direct access to the get area. Use with caution. + // + using base::gptr; + using base::egptr; + using base::gbump; + + // Return the (logical) position of the next byte to be read. 
+ // + using base::tellg; + + private: + std::pair<std::size_t, bool> + read (char*, std::size_t); + + bool + load (); + + private: + std::istream* is_ = nullptr; + bool end_; + decompressor d_; + std::unique_ptr<char[]> ib_; // Decompressor input buffer. + std::unique_ptr<char[]> ob_; // Decompressor output buffer. + std::size_t h_; // Decompressor next chunk hint. + }; + + // Typical usage: + // + // try + // { + // ifdstream ifs (..., fdopen_mode::binary, ifdstream::badbit); + // lz4::istream izs (ifs, true /* end */); + // ... // Read from izs. + // } + // catch (const invalid_argument& e) + // { + // ... // Invalid compressed content, call e.what() for description. + // } + // catch (/* ifdstream exceptions */) + // { + // ... + // } + // + // See class decompressor for details on semantics and exceptions thrown. + // + // @@ TODO: get rid of badbit-only requirement. + // @@ TODO: re-opening support (will need compressor reset). + // + class LIBBUTL_SYMEXPORT istream: public std::istream + { + public: + explicit + istream (iostate e = badbit | failbit) + : std::istream (&buf_) + { + assert (e & badbit); + exceptions (e); + } + + // The underlying input stream is expected to throw on badbit but not + // failbit. If end is true, then on reaching the end of compressed data + // verify there is no more input. + // + // Note that this implementation does not support handling streams of + // compressed contents (end is false) that may include individual + // contents that uncompress to 0 bytes (see istreambuf::open() + // implementation for details). + // + istream (std::istream& is, bool end, iostate e = badbit | failbit) + : istream (e) + { + open (is, end); + } + + // Return decompressed content size, if available. + // + optional<std::uint64_t> + open (std::istream& is, bool end) + { + return buf_.open (is, end); + } + + bool + is_open () const + { + return buf_.is_open (); + } + + // Signal that no further uncompressed input will be read. 
+ // + void + close () + { + return buf_.close (); + } + + private: + istreambuf buf_; + }; + + // ostream + // + + class LIBBUTL_SYMEXPORT ostreambuf: public bufstreambuf + { + public: + void + open (std::ostream&, + int compression_level, + int block_size_id, + optional<std::uint64_t> content_size); + + bool + is_open () const {return os_ != nullptr;} + + void + close (); + + virtual + ~ostreambuf () override; + + public: + using base = bufstreambuf; + + // basic_streambuf output interface. + // + // Note that syncing the input buffer before the end doesn't make much + // sense (it will just get buffered in the compressor). In fact, it can + // break our single-shot compression arrangement (for compatibility with + // the lz4 utility). Thus we inherit noop sync() from the base. + // + public: + virtual int_type + overflow (int_type) override; + + virtual std::streamsize + xsputn (const char_type*, std::streamsize) override; + + // Return the (logical) position of the next byte to be written. + // + using base::tellp; + + private: + void + write (char*, std::size_t); + + void + save (); + + private: + std::ostream* os_ = nullptr; + bool end_; + compressor c_; + std::unique_ptr<char[]> ib_; // Compressor input buffer. + std::unique_ptr<char[]> ob_; // Compressor output buffer. + }; + + // Typical usage: + // + // try + // { + // ofdstream ofs (..., fdopen_mode::binary); + // lz4::ostream ozs (ofs, 9, 4 /* 64KB */, nullopt /* content_size */); + // + // ... // Write to ozs. + // + // ozs.close (); + // ofs.close (); + // } + // catch (/* ofdstream exceptions */) + // { + // ... + // } + // + // See class compressor for details on semantics and exceptions thrown. + // + // @@ TODO: re-opening support (will need compressor reset). 
+ // + class LIBBUTL_SYMEXPORT ostream: public std::ostream + { + public: + explicit + ostream (iostate e = badbit | failbit) + : std::ostream (&buf_) + { + assert (e & badbit); + exceptions (e); + } + + // The underlying output stream is expected to throw on badbit or + // failbit. + // + // See compress() for the description of the compression level, block + // size and content size arguments. + // + ostream (std::ostream& os, + int compression_level, + int block_size_id, + optional<std::uint64_t> content_size, + iostate e = badbit | failbit) + : ostream (e) + { + open (os, compression_level, block_size_id, content_size); + } + + void + open (std::ostream& os, + int compression_level, + int block_size_id, + optional<std::uint64_t> content_size) + { + buf_.open (os, compression_level, block_size_id, content_size); + } + + bool + is_open () const + { + return buf_.is_open (); + } + + // Signal that no further uncompressed output will be written. + // + void + close () + { + return buf_.close (); + } + + private: + ostreambuf buf_; + }; + } +} diff --git a/libbutl/lz4.c b/libbutl/lz4.c new file mode 100644 index 0000000..3f0e430 --- /dev/null +++ b/libbutl/lz4.c @@ -0,0 +1,2495 @@ +/* + LZ4 - Fast LZ compression algorithm + Copyright (C) 2011-present, Yann Collet. + + BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following disclaimer + in the documentation and/or other materials provided with the + distribution. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + You can contact the author at : + - LZ4 homepage : http://www.lz4.org + - LZ4 source repository : https://github.com/lz4/lz4 +*/ + +/*-************************************ +* Tuning parameters +**************************************/ +/* + * LZ4_HEAPMODE : + * Select how default compression functions will allocate memory for their hash table, + * in memory stack (0:default, fastest), or in memory heap (1:requires malloc()). + */ +#ifndef LZ4_HEAPMODE +# define LZ4_HEAPMODE 0 +#endif + +/* + * LZ4_ACCELERATION_DEFAULT : + * Select "acceleration" for LZ4_compress_fast() when parameter value <= 0 + */ +#define LZ4_ACCELERATION_DEFAULT 1 +/* + * LZ4_ACCELERATION_MAX : + * Any "acceleration" value higher than this threshold + * get treated as LZ4_ACCELERATION_MAX instead (fix #876) + */ +#define LZ4_ACCELERATION_MAX 65537 + + +/*-************************************ +* CPU Feature Detection +**************************************/ +/* LZ4_FORCE_MEMORY_ACCESS + * By default, access to unaligned memory is controlled by `memcpy()`, which is safe and portable. + * Unfortunately, on some target/compiler combinations, the generated assembly is sub-optimal. 
+ * The below switch allow to select different access method for improved performance. + * Method 0 (default) : use `memcpy()`. Safe and portable. + * Method 1 : `__packed` statement. It depends on compiler extension (ie, not portable). + * This method is safe if your compiler supports it, and *generally* as fast or faster than `memcpy`. + * Method 2 : direct access. This method is portable but violate C standard. + * It can generate buggy code on targets which assembly generation depends on alignment. + * But in some circumstances, it's the only known way to get the most performance (ie GCC + ARMv6) + * See https://fastcompression.blogspot.fr/2015/08/accessing-unaligned-memory.html for details. + * Prefer these methods in priority order (0 > 1 > 2) + */ +#ifndef LZ4_FORCE_MEMORY_ACCESS /* can be defined externally */ +# if defined(__GNUC__) && \ + ( defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) || defined(__ARM_ARCH_6K__) \ + || defined(__ARM_ARCH_6Z__) || defined(__ARM_ARCH_6ZK__) || defined(__ARM_ARCH_6T2__) ) +# define LZ4_FORCE_MEMORY_ACCESS 2 +# elif (defined(__INTEL_COMPILER) && !defined(_WIN32)) || defined(__GNUC__) +# define LZ4_FORCE_MEMORY_ACCESS 1 +# endif +#endif + +/* + * LZ4_FORCE_SW_BITCOUNT + * Define this parameter if your target system or compiler does not support hardware bit count + */ +#if defined(_MSC_VER) && defined(_WIN32_WCE) /* Visual Studio for WinCE doesn't support Hardware bit count */ +# undef LZ4_FORCE_SW_BITCOUNT /* avoid double def */ +# define LZ4_FORCE_SW_BITCOUNT +#endif + + + +/*-************************************ +* Dependency +**************************************/ +/* + * LZ4_SRC_INCLUDED: + * Amalgamation flag, whether lz4.c is included + */ +#ifndef LZ4_SRC_INCLUDED +# define LZ4_SRC_INCLUDED 1 +#endif + +#ifndef LZ4_STATIC_LINKING_ONLY +#define LZ4_STATIC_LINKING_ONLY +#endif + +#ifndef LZ4_DISABLE_DEPRECATE_WARNINGS +#define LZ4_DISABLE_DEPRECATE_WARNINGS /* due to LZ4_decompress_safe_withPrefix64k */ +#endif + 
+#define LZ4_STATIC_LINKING_ONLY /* LZ4_DISTANCE_MAX */ +#include "lz4.h" +/* see also "memory routines" below */ + + +/*-************************************ +* Compiler Options +**************************************/ +#if defined(_MSC_VER) && (_MSC_VER >= 1400) /* Visual Studio 2005+ */ +# include <intrin.h> /* only present in VS2005+ */ +# pragma warning(disable : 4127) /* disable: C4127: conditional expression is constant */ +#endif /* _MSC_VER */ + +#ifndef LZ4_FORCE_INLINE +# ifdef _MSC_VER /* Visual Studio */ +# define LZ4_FORCE_INLINE static __forceinline +# else +# if defined (__cplusplus) || defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L /* C99 */ +# ifdef __GNUC__ +# define LZ4_FORCE_INLINE static inline __attribute__((always_inline)) +# else +# define LZ4_FORCE_INLINE static inline +# endif +# else +# define LZ4_FORCE_INLINE static +# endif /* __STDC_VERSION__ */ +# endif /* _MSC_VER */ +#endif /* LZ4_FORCE_INLINE */ + +/* LZ4_FORCE_O2 and LZ4_FORCE_INLINE + * gcc on ppc64le generates an unrolled SIMDized loop for LZ4_wildCopy8, + * together with a simple 8-byte copy loop as a fall-back path. + * However, this optimization hurts the decompression speed by >30%, + * because the execution does not go to the optimized loop + * for typical compressible data, and all of the preamble checks + * before going to the fall-back path become useless overhead. + * This optimization happens only with the -O3 flag, and -O2 generates + * a simple 8-byte copy loop. + * With gcc on ppc64le, all of the LZ4_decompress_* and LZ4_wildCopy8 + * functions are annotated with __attribute__((optimize("O2"))), + * and also LZ4_wildCopy8 is forcibly inlined, so that the O2 attribute + * of LZ4_wildCopy8 does not affect the compression speed. 
+ */ +#if defined(__PPC64__) && defined(__LITTLE_ENDIAN__) && defined(__GNUC__) && !defined(__clang__) +# define LZ4_FORCE_O2 __attribute__((optimize("O2"))) +# undef LZ4_FORCE_INLINE +# define LZ4_FORCE_INLINE static __inline __attribute__((optimize("O2"),always_inline)) +#else +# define LZ4_FORCE_O2 +#endif + +#if (defined(__GNUC__) && (__GNUC__ >= 3)) || (defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 800)) || defined(__clang__) +# define expect(expr,value) (__builtin_expect ((expr),(value)) ) +#else +# define expect(expr,value) (expr) +#endif + +#ifndef likely +#define likely(expr) expect((expr) != 0, 1) +#endif +#ifndef unlikely +#define unlikely(expr) expect((expr) != 0, 0) +#endif + +/* Should the alignment test prove unreliable, for some reason, + * it can be disabled by setting LZ4_ALIGN_TEST to 0 */ +#ifndef LZ4_ALIGN_TEST /* can be externally provided */ +# define LZ4_ALIGN_TEST 1 +#endif + + +/*-************************************ +* Memory routines +**************************************/ +#ifdef LZ4_USER_MEMORY_FUNCTIONS +/* memory management functions can be customized by user project. 
+ * Below functions must exist somewhere in the Project + * and be available at link time */ +void* LZ4_malloc(size_t s); +void* LZ4_calloc(size_t n, size_t s); +void LZ4_free(void* p); +# define ALLOC(s) LZ4_malloc(s) +# define ALLOC_AND_ZERO(s) LZ4_calloc(1,s) +# define FREEMEM(p) LZ4_free(p) +#else +# include <stdlib.h> /* malloc, calloc, free */ +# define ALLOC(s) malloc(s) +# define ALLOC_AND_ZERO(s) calloc(1,s) +# define FREEMEM(p) free(p) +#endif + +#include <string.h> /* memset, memcpy */ +#define MEM_INIT(p,v,s) memset((p),(v),(s)) + + +/*-************************************ +* Common Constants +**************************************/ +#define MINMATCH 4 + +#define WILDCOPYLENGTH 8 +#define LASTLITERALS 5 /* see ../doc/lz4_Block_format.md#parsing-restrictions */ +#define MFLIMIT 12 /* see ../doc/lz4_Block_format.md#parsing-restrictions */ +#define MATCH_SAFEGUARD_DISTANCE ((2*WILDCOPYLENGTH) - MINMATCH) /* ensure it's possible to write 2 x wildcopyLength without overflowing output buffer */ +#define FASTLOOP_SAFE_DISTANCE 64 +static const int LZ4_minLength = (MFLIMIT+1); + +#define KB *(1 <<10) +#define MB *(1 <<20) +#define GB *(1U<<30) + +#define LZ4_DISTANCE_ABSOLUTE_MAX 65535 +#if (LZ4_DISTANCE_MAX > LZ4_DISTANCE_ABSOLUTE_MAX) /* max supported by LZ4 format */ +# error "LZ4_DISTANCE_MAX is too big : must be <= 65535" +#endif + +#define ML_BITS 4 +#define ML_MASK ((1U<<ML_BITS)-1) +#define RUN_BITS (8-ML_BITS) +#define RUN_MASK ((1U<<RUN_BITS)-1) + + +/*-************************************ +* Error detection +**************************************/ +#if defined(LZ4_DEBUG) && (LZ4_DEBUG>=1) +# include <assert.h> +#else +# ifndef assert +# define assert(condition) ((void)0) +# endif +#endif + +#define LZ4_STATIC_ASSERT(c) { enum { LZ4_static_assert = 1/(int)(!!(c)) }; } /* use after variable declarations */ + +#if defined(LZ4_DEBUG) && (LZ4_DEBUG>=2) +# include <stdio.h> + static int g_debuglog_enable = 1; +# define DEBUGLOG(l, ...) 
{ \ + if ((g_debuglog_enable) && (l<=LZ4_DEBUG)) { \ + fprintf(stderr, __FILE__ ": "); \ + fprintf(stderr, __VA_ARGS__); \ + fprintf(stderr, " \n"); \ + } } +#else +# define DEBUGLOG(l, ...) {} /* disabled */ +#endif + +static int LZ4_isAligned(const void* ptr, size_t alignment) +{ + return ((size_t)ptr & (alignment -1)) == 0; +} + + +/*-************************************ +* Types +**************************************/ +#include <limits.h> +#if defined(__cplusplus) || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) +# include <stdint.h> + typedef uint8_t BYTE; + typedef uint16_t U16; + typedef uint32_t U32; + typedef int32_t S32; + typedef uint64_t U64; + typedef uintptr_t uptrval; +#else +# if UINT_MAX != 4294967295UL +# error "LZ4 code (when not C++ or C99) assumes that sizeof(int) == 4" +# endif + typedef unsigned char BYTE; + typedef unsigned short U16; + typedef unsigned int U32; + typedef signed int S32; + typedef unsigned long long U64; + typedef size_t uptrval; /* generally true, except OpenVMS-64 */ +#endif + +#if defined(__x86_64__) + typedef U64 reg_t; /* 64-bits in x32 mode */ +#else + typedef size_t reg_t; /* 32-bits in x32 mode */ +#endif + +typedef enum { + notLimited = 0, + limitedOutput = 1, + fillOutput = 2 +} limitedOutput_directive; + + +/*-************************************ +* Reading and writing into memory +**************************************/ + +/** + * LZ4 relies on memcpy with a constant size being inlined. In freestanding + * environments, the compiler can't assume the implementation of memcpy() is + * standard compliant, so it can't apply its specialized memcpy() inlining + * logic. When possible, use __builtin_memcpy() to tell the compiler to analyze + * memcpy() as if it were standard compliant, so it can inline it in freestanding + * environments. This is needed when decompressing the Linux Kernel, for example. 
+ */ +#if defined(__GNUC__) && (__GNUC__ >= 4) +#define LZ4_memcpy(dst, src, size) __builtin_memcpy(dst, src, size) +#else +#define LZ4_memcpy(dst, src, size) memcpy(dst, src, size) +#endif + +static unsigned LZ4_isLittleEndian(void) +{ + const union { U32 u; BYTE c[4]; } one = { 1 }; /* don't use static : performance detrimental */ + return one.c[0]; +} + + +#if defined(LZ4_FORCE_MEMORY_ACCESS) && (LZ4_FORCE_MEMORY_ACCESS==2) +/* lie to the compiler about data alignment; use with caution */ + +static U16 LZ4_read16(const void* memPtr) { return *(const U16*) memPtr; } +static U32 LZ4_read32(const void* memPtr) { return *(const U32*) memPtr; } +static reg_t LZ4_read_ARCH(const void* memPtr) { return *(const reg_t*) memPtr; } + +static void LZ4_write16(void* memPtr, U16 value) { *(U16*)memPtr = value; } +static void LZ4_write32(void* memPtr, U32 value) { *(U32*)memPtr = value; } + +#elif defined(LZ4_FORCE_MEMORY_ACCESS) && (LZ4_FORCE_MEMORY_ACCESS==1) + +/* __pack instructions are safer, but compiler specific, hence potentially problematic for some compilers */ +/* currently only defined for gcc and icc */ +typedef union { U16 u16; U32 u32; reg_t uArch; } __attribute__((packed)) LZ4_unalign; + +static U16 LZ4_read16(const void* ptr) { return ((const LZ4_unalign*)ptr)->u16; } +static U32 LZ4_read32(const void* ptr) { return ((const LZ4_unalign*)ptr)->u32; } +static reg_t LZ4_read_ARCH(const void* ptr) { return ((const LZ4_unalign*)ptr)->uArch; } + +static void LZ4_write16(void* memPtr, U16 value) { ((LZ4_unalign*)memPtr)->u16 = value; } +static void LZ4_write32(void* memPtr, U32 value) { ((LZ4_unalign*)memPtr)->u32 = value; } + +#else /* safe and portable access using memcpy() */ + +static U16 LZ4_read16(const void* memPtr) +{ + U16 val; LZ4_memcpy(&val, memPtr, sizeof(val)); return val; +} + +static U32 LZ4_read32(const void* memPtr) +{ + U32 val; LZ4_memcpy(&val, memPtr, sizeof(val)); return val; +} + +static reg_t LZ4_read_ARCH(const void* memPtr) +{ + reg_t val; 
LZ4_memcpy(&val, memPtr, sizeof(val)); return val; +} + +static void LZ4_write16(void* memPtr, U16 value) +{ + LZ4_memcpy(memPtr, &value, sizeof(value)); +} + +static void LZ4_write32(void* memPtr, U32 value) +{ + LZ4_memcpy(memPtr, &value, sizeof(value)); +} + +#endif /* LZ4_FORCE_MEMORY_ACCESS */ + + +static U16 LZ4_readLE16(const void* memPtr) +{ + if (LZ4_isLittleEndian()) { + return LZ4_read16(memPtr); + } else { + const BYTE* p = (const BYTE*)memPtr; + return (U16)((U16)p[0] + (p[1]<<8)); + } +} + +static void LZ4_writeLE16(void* memPtr, U16 value) +{ + if (LZ4_isLittleEndian()) { + LZ4_write16(memPtr, value); + } else { + BYTE* p = (BYTE*)memPtr; + p[0] = (BYTE) value; + p[1] = (BYTE)(value>>8); + } +} + +/* customized variant of memcpy, which can overwrite up to 8 bytes beyond dstEnd */ +LZ4_FORCE_INLINE +void LZ4_wildCopy8(void* dstPtr, const void* srcPtr, void* dstEnd) +{ + BYTE* d = (BYTE*)dstPtr; + const BYTE* s = (const BYTE*)srcPtr; + BYTE* const e = (BYTE*)dstEnd; + + do { LZ4_memcpy(d,s,8); d+=8; s+=8; } while (d<e); +} + +static const unsigned inc32table[8] = {0, 1, 2, 1, 0, 4, 4, 4}; +static const int dec64table[8] = {0, 0, 0, -1, -4, 1, 2, 3}; + + +#ifndef LZ4_FAST_DEC_LOOP +# if defined __i386__ || defined _M_IX86 || defined __x86_64__ || defined _M_X64 +# define LZ4_FAST_DEC_LOOP 1 +# elif defined(__aarch64__) && !defined(__clang__) + /* On aarch64, we disable this optimization for clang because on certain + * mobile chipsets, performance is reduced with clang. 
For information + * refer to https://github.com/lz4/lz4/pull/707 */ +# define LZ4_FAST_DEC_LOOP 1 +# else +# define LZ4_FAST_DEC_LOOP 0 +# endif +#endif + +#if LZ4_FAST_DEC_LOOP + +LZ4_FORCE_INLINE void +LZ4_memcpy_using_offset_base(BYTE* dstPtr, const BYTE* srcPtr, BYTE* dstEnd, const size_t offset) +{ + assert(srcPtr + offset == dstPtr); + if (offset < 8) { + LZ4_write32(dstPtr, 0); /* silence an msan warning when offset==0 */ + dstPtr[0] = srcPtr[0]; + dstPtr[1] = srcPtr[1]; + dstPtr[2] = srcPtr[2]; + dstPtr[3] = srcPtr[3]; + srcPtr += inc32table[offset]; + LZ4_memcpy(dstPtr+4, srcPtr, 4); + srcPtr -= dec64table[offset]; + dstPtr += 8; + } else { + LZ4_memcpy(dstPtr, srcPtr, 8); + dstPtr += 8; + srcPtr += 8; + } + + LZ4_wildCopy8(dstPtr, srcPtr, dstEnd); +} + +/* customized variant of memcpy, which can overwrite up to 32 bytes beyond dstEnd + * this version copies two times 16 bytes (instead of one time 32 bytes) + * because it must be compatible with offsets >= 16. */ +LZ4_FORCE_INLINE void +LZ4_wildCopy32(void* dstPtr, const void* srcPtr, void* dstEnd) +{ + BYTE* d = (BYTE*)dstPtr; + const BYTE* s = (const BYTE*)srcPtr; + BYTE* const e = (BYTE*)dstEnd; + + do { LZ4_memcpy(d,s,16); LZ4_memcpy(d+16,s+16,16); d+=32; s+=32; } while (d<e); +} + +/* LZ4_memcpy_using_offset() presumes : + * - dstEnd >= dstPtr + MINMATCH + * - there is at least 8 bytes available to write after dstEnd */ +LZ4_FORCE_INLINE void +LZ4_memcpy_using_offset(BYTE* dstPtr, const BYTE* srcPtr, BYTE* dstEnd, const size_t offset) +{ + BYTE v[8]; + + assert(dstEnd >= dstPtr + MINMATCH); + + switch(offset) { + case 1: + MEM_INIT(v, *srcPtr, 8); + break; + case 2: + LZ4_memcpy(v, srcPtr, 2); + LZ4_memcpy(&v[2], srcPtr, 2); + LZ4_memcpy(&v[4], v, 4); + break; + case 4: + LZ4_memcpy(v, srcPtr, 4); + LZ4_memcpy(&v[4], srcPtr, 4); + break; + default: + LZ4_memcpy_using_offset_base(dstPtr, srcPtr, dstEnd, offset); + return; + } + + LZ4_memcpy(dstPtr, v, 8); + dstPtr += 8; + while (dstPtr < dstEnd) { + 
LZ4_memcpy(dstPtr, v, 8); + dstPtr += 8; + } +} +#endif + + +/*-************************************ +* Common functions +**************************************/ +static unsigned LZ4_NbCommonBytes (reg_t val) +{ + assert(val != 0); + if (LZ4_isLittleEndian()) { + if (sizeof(val) == 8) { +# if defined(_MSC_VER) && (_MSC_VER >= 1800) && defined(_M_AMD64) && !defined(LZ4_FORCE_SW_BITCOUNT) + /* x64 CPUS without BMI support interpret `TZCNT` as `REP BSF` */ + return (unsigned)_tzcnt_u64(val) >> 3; +# elif defined(_MSC_VER) && defined(_WIN64) && !defined(LZ4_FORCE_SW_BITCOUNT) + unsigned long r = 0; + _BitScanForward64(&r, (U64)val); + return (unsigned)r >> 3; +# elif (defined(__clang__) || (defined(__GNUC__) && ((__GNUC__ > 3) || \ + ((__GNUC__ == 3) && (__GNUC_MINOR__ >= 4))))) && \ + !defined(LZ4_FORCE_SW_BITCOUNT) + return (unsigned)__builtin_ctzll((U64)val) >> 3; +# else + const U64 m = 0x0101010101010101ULL; + val ^= val - 1; + return (unsigned)(((U64)((val & (m - 1)) * m)) >> 56); +# endif + } else /* 32 bits */ { +# if defined(_MSC_VER) && (_MSC_VER >= 1400) && !defined(LZ4_FORCE_SW_BITCOUNT) + unsigned long r; + _BitScanForward(&r, (U32)val); + return (unsigned)r >> 3; +# elif (defined(__clang__) || (defined(__GNUC__) && ((__GNUC__ > 3) || \ + ((__GNUC__ == 3) && (__GNUC_MINOR__ >= 4))))) && \ + !defined(__TINYC__) && !defined(LZ4_FORCE_SW_BITCOUNT) + return (unsigned)__builtin_ctz((U32)val) >> 3; +# else + const U32 m = 0x01010101; + return (unsigned)((((val - 1) ^ val) & (m - 1)) * m) >> 24; +# endif + } + } else /* Big Endian CPU */ { + if (sizeof(val)==8) { +# if (defined(__clang__) || (defined(__GNUC__) && ((__GNUC__ > 3) || \ + ((__GNUC__ == 3) && (__GNUC_MINOR__ >= 4))))) && \ + !defined(__TINYC__) && !defined(LZ4_FORCE_SW_BITCOUNT) + return (unsigned)__builtin_clzll((U64)val) >> 3; +# else +#if 1 + /* this method is probably faster, + * but adds a 128 bytes lookup table */ + static const unsigned char ctz7_tab[128] = { + 7, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 
0, 2, 0, 1, 0, + 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, + 5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, + 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, + 6, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, + 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, + 5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, + 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, + }; + U64 const mask = 0x0101010101010101ULL; + U64 const t = (((val >> 8) - mask) | val) & mask; + return ctz7_tab[(t * 0x0080402010080402ULL) >> 57]; +#else + /* this method doesn't consume memory space like the previous one, + * but it contains several branches, + * that may end up slowing execution */ + static const U32 by32 = sizeof(val)*4; /* 32 on 64 bits (goal), 16 on 32 bits. + Just to avoid some static analyzer complaining about shift by 32 on 32-bits target. + Note that this code path is never triggered in 32-bits mode. */ + unsigned r; + if (!(val>>by32)) { r=4; } else { r=0; val>>=by32; } + if (!(val>>16)) { r+=2; val>>=8; } else { val>>=24; } + r += (!val); + return r; +#endif +# endif + } else /* 32 bits */ { +# if (defined(__clang__) || (defined(__GNUC__) && ((__GNUC__ > 3) || \ + ((__GNUC__ == 3) && (__GNUC_MINOR__ >= 4))))) && \ + !defined(LZ4_FORCE_SW_BITCOUNT) + return (unsigned)__builtin_clz((U32)val) >> 3; +# else + val >>= 8; + val = ((((val + 0x00FFFF00) | 0x00FFFFFF) + val) | + (val + 0x00FF0000)) >> 24; + return (unsigned)val ^ 3; +# endif + } + } +} + + +#define STEPSIZE sizeof(reg_t) +LZ4_FORCE_INLINE +unsigned LZ4_count(const BYTE* pIn, const BYTE* pMatch, const BYTE* pInLimit) +{ + const BYTE* const pStart = pIn; + + if (likely(pIn < pInLimit-(STEPSIZE-1))) { + reg_t const diff = LZ4_read_ARCH(pMatch) ^ LZ4_read_ARCH(pIn); + if (!diff) { + pIn+=STEPSIZE; pMatch+=STEPSIZE; + } else { + return LZ4_NbCommonBytes(diff); + } } + + while (likely(pIn < pInLimit-(STEPSIZE-1))) { + reg_t const diff = LZ4_read_ARCH(pMatch) ^ LZ4_read_ARCH(pIn); + if (!diff) { pIn+=STEPSIZE; 
pMatch+=STEPSIZE; continue; } + pIn += LZ4_NbCommonBytes(diff); + return (unsigned)(pIn - pStart); + } + + if ((STEPSIZE==8) && (pIn<(pInLimit-3)) && (LZ4_read32(pMatch) == LZ4_read32(pIn))) { pIn+=4; pMatch+=4; } + if ((pIn<(pInLimit-1)) && (LZ4_read16(pMatch) == LZ4_read16(pIn))) { pIn+=2; pMatch+=2; } + if ((pIn<pInLimit) && (*pMatch == *pIn)) pIn++; + return (unsigned)(pIn - pStart); +} + + +#ifndef LZ4_COMMONDEFS_ONLY +/*-************************************ +* Local Constants +**************************************/ +static const int LZ4_64Klimit = ((64 KB) + (MFLIMIT-1)); +static const U32 LZ4_skipTrigger = 6; /* Increase this value ==> compression run slower on incompressible data */ + + +/*-************************************ +* Local Structures and types +**************************************/ +typedef enum { clearedTable = 0, byPtr, byU32, byU16 } tableType_t; + +/** + * This enum distinguishes several different modes of accessing previous + * content in the stream. + * + * - noDict : There is no preceding content. + * - withPrefix64k : Table entries up to ctx->dictSize before the current blob + * blob being compressed are valid and refer to the preceding + * content (of length ctx->dictSize), which is available + * contiguously preceding in memory the content currently + * being compressed. + * - usingExtDict : Like withPrefix64k, but the preceding content is somewhere + * else in memory, starting at ctx->dictionary with length + * ctx->dictSize. + * - usingDictCtx : Like usingExtDict, but everything concerning the preceding + * content is in a separate context, pointed to by + * ctx->dictCtx. ctx->dictionary, ctx->dictSize, and table + * entries in the current context that refer to positions + * preceding the beginning of the current compression are + * ignored. 
Instead, ctx->dictCtx->dictionary and ctx->dictCtx + * ->dictSize describe the location and size of the preceding + * content, and matches are found by looking in the ctx + * ->dictCtx->hashTable. + */ +typedef enum { noDict = 0, withPrefix64k, usingExtDict, usingDictCtx } dict_directive; +typedef enum { noDictIssue = 0, dictSmall } dictIssue_directive; + + +/*-************************************ +* Local Utils +**************************************/ +int LZ4_versionNumber (void) { return LZ4_VERSION_NUMBER; } +const char* LZ4_versionString(void) { return LZ4_VERSION_STRING; } +int LZ4_compressBound(int isize) { return LZ4_COMPRESSBOUND(isize); } +int LZ4_sizeofState(void) { return LZ4_STREAMSIZE; } + + +/*-************************************ +* Internal Definitions used in Tests +**************************************/ +#if defined (__cplusplus) +extern "C" { +#endif + +int LZ4_compress_forceExtDict (LZ4_stream_t* LZ4_dict, const char* source, char* dest, int srcSize); + +int LZ4_decompress_safe_forceExtDict(const char* source, char* dest, + int compressedSize, int maxOutputSize, + const void* dictStart, size_t dictSize); + +#if defined (__cplusplus) +} +#endif + +/*-****************************** +* Compression functions +********************************/ +LZ4_FORCE_INLINE U32 LZ4_hash4(U32 sequence, tableType_t const tableType) +{ + if (tableType == byU16) + return ((sequence * 2654435761U) >> ((MINMATCH*8)-(LZ4_HASHLOG+1))); + else + return ((sequence * 2654435761U) >> ((MINMATCH*8)-LZ4_HASHLOG)); +} + +LZ4_FORCE_INLINE U32 LZ4_hash5(U64 sequence, tableType_t const tableType) +{ + const U32 hashLog = (tableType == byU16) ? 
LZ4_HASHLOG+1 : LZ4_HASHLOG; + if (LZ4_isLittleEndian()) { + const U64 prime5bytes = 889523592379ULL; + return (U32)(((sequence << 24) * prime5bytes) >> (64 - hashLog)); + } else { + const U64 prime8bytes = 11400714785074694791ULL; + return (U32)(((sequence >> 24) * prime8bytes) >> (64 - hashLog)); + } +} + +LZ4_FORCE_INLINE U32 LZ4_hashPosition(const void* const p, tableType_t const tableType) +{ + if ((sizeof(reg_t)==8) && (tableType != byU16)) return LZ4_hash5(LZ4_read_ARCH(p), tableType); + return LZ4_hash4(LZ4_read32(p), tableType); +} + +LZ4_FORCE_INLINE void LZ4_clearHash(U32 h, void* tableBase, tableType_t const tableType) +{ + switch (tableType) + { + default: /* fallthrough */ + case clearedTable: { /* illegal! */ assert(0); return; } + case byPtr: { const BYTE** hashTable = (const BYTE**)tableBase; hashTable[h] = NULL; return; } + case byU32: { U32* hashTable = (U32*) tableBase; hashTable[h] = 0; return; } + case byU16: { U16* hashTable = (U16*) tableBase; hashTable[h] = 0; return; } + } +} + +LZ4_FORCE_INLINE void LZ4_putIndexOnHash(U32 idx, U32 h, void* tableBase, tableType_t const tableType) +{ + switch (tableType) + { + default: /* fallthrough */ + case clearedTable: /* fallthrough */ + case byPtr: { /* illegal! */ assert(0); return; } + case byU32: { U32* hashTable = (U32*) tableBase; hashTable[h] = idx; return; } + case byU16: { U16* hashTable = (U16*) tableBase; assert(idx < 65536); hashTable[h] = (U16)idx; return; } + } +} + +LZ4_FORCE_INLINE void LZ4_putPositionOnHash(const BYTE* p, U32 h, + void* tableBase, tableType_t const tableType, + const BYTE* srcBase) +{ + switch (tableType) + { + case clearedTable: { /* illegal! 
*/ assert(0); return; } + case byPtr: { const BYTE** hashTable = (const BYTE**)tableBase; hashTable[h] = p; return; } + case byU32: { U32* hashTable = (U32*) tableBase; hashTable[h] = (U32)(p-srcBase); return; } + case byU16: { U16* hashTable = (U16*) tableBase; hashTable[h] = (U16)(p-srcBase); return; } + } +} + +LZ4_FORCE_INLINE void LZ4_putPosition(const BYTE* p, void* tableBase, tableType_t tableType, const BYTE* srcBase) +{ + U32 const h = LZ4_hashPosition(p, tableType); + LZ4_putPositionOnHash(p, h, tableBase, tableType, srcBase); +} + +/* LZ4_getIndexOnHash() : + * Index of match position registered in hash table. + * hash position must be calculated by using base+index, or dictBase+index. + * Assumption 1 : only valid if tableType == byU32 or byU16. + * Assumption 2 : h is presumed valid (within limits of hash table) + */ +LZ4_FORCE_INLINE U32 LZ4_getIndexOnHash(U32 h, const void* tableBase, tableType_t tableType) +{ + LZ4_STATIC_ASSERT(LZ4_MEMORY_USAGE > 2); + if (tableType == byU32) { + const U32* const hashTable = (const U32*) tableBase; + assert(h < (1U << (LZ4_MEMORY_USAGE-2))); + return hashTable[h]; + } + if (tableType == byU16) { + const U16* const hashTable = (const U16*) tableBase; + assert(h < (1U << (LZ4_MEMORY_USAGE-1))); + return hashTable[h]; + } + assert(0); return 0; /* forbidden case */ +} + +static const BYTE* LZ4_getPositionOnHash(U32 h, const void* tableBase, tableType_t tableType, const BYTE* srcBase) +{ + if (tableType == byPtr) { const BYTE* const* hashTable = (const BYTE* const*) tableBase; return hashTable[h]; } + if (tableType == byU32) { const U32* const hashTable = (const U32*) tableBase; return hashTable[h] + srcBase; } + { const U16* const hashTable = (const U16*) tableBase; return hashTable[h] + srcBase; } /* default, to ensure a return */ +} + +LZ4_FORCE_INLINE const BYTE* +LZ4_getPosition(const BYTE* p, + const void* tableBase, tableType_t tableType, + const BYTE* srcBase) +{ + U32 const h = LZ4_hashPosition(p, tableType); 
+ return LZ4_getPositionOnHash(h, tableBase, tableType, srcBase); +} + +LZ4_FORCE_INLINE void +LZ4_prepareTable(LZ4_stream_t_internal* const cctx, + const int inputSize, + const tableType_t tableType) { + /* If the table hasn't been used, it's guaranteed to be zeroed out, and is + * therefore safe to use no matter what mode we're in. Otherwise, we figure + * out if it's safe to leave as is or whether it needs to be reset. + */ + if ((tableType_t)cctx->tableType != clearedTable) { + assert(inputSize >= 0); + if ((tableType_t)cctx->tableType != tableType + || ((tableType == byU16) && cctx->currentOffset + (unsigned)inputSize >= 0xFFFFU) + || ((tableType == byU32) && cctx->currentOffset > 1 GB) + || tableType == byPtr + || inputSize >= 4 KB) + { + DEBUGLOG(4, "LZ4_prepareTable: Resetting table in %p", cctx); + MEM_INIT(cctx->hashTable, 0, LZ4_HASHTABLESIZE); + cctx->currentOffset = 0; + cctx->tableType = (U32)clearedTable; + } else { + DEBUGLOG(4, "LZ4_prepareTable: Re-use hash table (no reset)"); + } + } + + /* Adding a gap, so all previous entries are > LZ4_DISTANCE_MAX back, is faster + * than compressing without a gap. However, compressing with + * currentOffset == 0 is faster still, so we preserve that case. + */ + if (cctx->currentOffset != 0 && tableType == byU32) { + DEBUGLOG(5, "LZ4_prepareTable: adding 64KB to currentOffset"); + cctx->currentOffset += 64 KB; + } + + /* Finally, clear history */ + cctx->dictCtx = NULL; + cctx->dictionary = NULL; + cctx->dictSize = 0; +} + +/** LZ4_compress_generic() : + * inlined, to ensure branches are decided at compilation time. 
+ * Presumed already validated at this stage: + * - source != NULL + * - inputSize > 0 + */ +LZ4_FORCE_INLINE int LZ4_compress_generic_validated( + LZ4_stream_t_internal* const cctx, + const char* const source, + char* const dest, + const int inputSize, + int *inputConsumed, /* only written when outputDirective == fillOutput */ + const int maxOutputSize, + const limitedOutput_directive outputDirective, + const tableType_t tableType, + const dict_directive dictDirective, + const dictIssue_directive dictIssue, + const int acceleration) +{ + int result; + const BYTE* ip = (const BYTE*) source; + + U32 const startIndex = cctx->currentOffset; + const BYTE* base = (const BYTE*) source - startIndex; + const BYTE* lowLimit; + + const LZ4_stream_t_internal* dictCtx = (const LZ4_stream_t_internal*) cctx->dictCtx; + const BYTE* const dictionary = + dictDirective == usingDictCtx ? dictCtx->dictionary : cctx->dictionary; + const U32 dictSize = + dictDirective == usingDictCtx ? dictCtx->dictSize : cctx->dictSize; + const U32 dictDelta = (dictDirective == usingDictCtx) ? startIndex - dictCtx->currentOffset : 0; /* make indexes in dictCtx comparable with index in current context */ + + int const maybe_extMem = (dictDirective == usingExtDict) || (dictDirective == usingDictCtx); + U32 const prefixIdxLimit = startIndex - dictSize; /* used when dictDirective == dictSmall */ + const BYTE* const dictEnd = dictionary ? dictionary + dictSize : dictionary; + const BYTE* anchor = (const BYTE*) source; + const BYTE* const iend = ip + inputSize; + const BYTE* const mflimitPlusOne = iend - MFLIMIT + 1; + const BYTE* const matchlimit = iend - LASTLITERALS; + + /* the dictCtx currentOffset is indexed on the start of the dictionary, + * while a dictionary in the current context precedes the currentOffset */ + const BYTE* dictBase = !dictionary ? NULL : (dictDirective == usingDictCtx) ? 
+ dictionary + dictSize - dictCtx->currentOffset : + dictionary + dictSize - startIndex; + + BYTE* op = (BYTE*) dest; + BYTE* const olimit = op + maxOutputSize; + + U32 offset = 0; + U32 forwardH; + + DEBUGLOG(5, "LZ4_compress_generic_validated: srcSize=%i, tableType=%u", inputSize, tableType); + assert(ip != NULL); + /* If init conditions are not met, we don't have to mark stream + * as having dirty context, since no action was taken yet */ + if (outputDirective == fillOutput && maxOutputSize < 1) { return 0; } /* Impossible to store anything */ + if ((tableType == byU16) && (inputSize>=LZ4_64Klimit)) { return 0; } /* Size too large (not within 64K limit) */ + if (tableType==byPtr) assert(dictDirective==noDict); /* only supported use case with byPtr */ + assert(acceleration >= 1); + + lowLimit = (const BYTE*)source - (dictDirective == withPrefix64k ? dictSize : 0); + + /* Update context state */ + if (dictDirective == usingDictCtx) { + /* Subsequent linked blocks can't use the dictionary. */ + /* Instead, they use the block we just compressed. 
*/ + cctx->dictCtx = NULL; + cctx->dictSize = (U32)inputSize; + } else { + cctx->dictSize += (U32)inputSize; + } + cctx->currentOffset += (U32)inputSize; + cctx->tableType = (U32)tableType; + + if (inputSize<LZ4_minLength) goto _last_literals; /* Input too small, no compression (all literals) */ + + /* First Byte */ + LZ4_putPosition(ip, cctx->hashTable, tableType, base); + ip++; forwardH = LZ4_hashPosition(ip, tableType); + + /* Main Loop */ + for ( ; ; ) { + const BYTE* match; + BYTE* token; + const BYTE* filledIp; + + /* Find a match */ + if (tableType == byPtr) { + const BYTE* forwardIp = ip; + int step = 1; + int searchMatchNb = acceleration << LZ4_skipTrigger; + do { + U32 const h = forwardH; + ip = forwardIp; + forwardIp += step; + step = (searchMatchNb++ >> LZ4_skipTrigger); + + if (unlikely(forwardIp > mflimitPlusOne)) goto _last_literals; + assert(ip < mflimitPlusOne); + + match = LZ4_getPositionOnHash(h, cctx->hashTable, tableType, base); + forwardH = LZ4_hashPosition(forwardIp, tableType); + LZ4_putPositionOnHash(ip, h, cctx->hashTable, tableType, base); + + } while ( (match+LZ4_DISTANCE_MAX < ip) + || (LZ4_read32(match) != LZ4_read32(ip)) ); + + } else { /* byU32, byU16 */ + + const BYTE* forwardIp = ip; + int step = 1; + int searchMatchNb = acceleration << LZ4_skipTrigger; + do { + U32 const h = forwardH; + U32 const current = (U32)(forwardIp - base); + U32 matchIndex = LZ4_getIndexOnHash(h, cctx->hashTable, tableType); + assert(matchIndex <= current); + assert(forwardIp - base < (ptrdiff_t)(2 GB - 1)); + ip = forwardIp; + forwardIp += step; + step = (searchMatchNb++ >> LZ4_skipTrigger); + + if (unlikely(forwardIp > mflimitPlusOne)) goto _last_literals; + assert(ip < mflimitPlusOne); + + if (dictDirective == usingDictCtx) { + if (matchIndex < startIndex) { + /* there was no match, try the dictionary */ + assert(tableType == byU32); + matchIndex = LZ4_getIndexOnHash(h, dictCtx->hashTable, byU32); + match = dictBase + matchIndex; + matchIndex += 
dictDelta; /* make dictCtx index comparable with current context */ + lowLimit = dictionary; + } else { + match = base + matchIndex; + lowLimit = (const BYTE*)source; + } + } else if (dictDirective==usingExtDict) { + if (matchIndex < startIndex) { + DEBUGLOG(7, "extDict candidate: matchIndex=%5u < startIndex=%5u", matchIndex, startIndex); + assert(startIndex - matchIndex >= MINMATCH); + match = dictBase + matchIndex; + lowLimit = dictionary; + } else { + match = base + matchIndex; + lowLimit = (const BYTE*)source; + } + } else { /* single continuous memory segment */ + match = base + matchIndex; + } + forwardH = LZ4_hashPosition(forwardIp, tableType); + LZ4_putIndexOnHash(current, h, cctx->hashTable, tableType); + + DEBUGLOG(7, "candidate at pos=%u (offset=%u \n", matchIndex, current - matchIndex); + if ((dictIssue == dictSmall) && (matchIndex < prefixIdxLimit)) { continue; } /* match outside of valid area */ + assert(matchIndex < current); + if ( ((tableType != byU16) || (LZ4_DISTANCE_MAX < LZ4_DISTANCE_ABSOLUTE_MAX)) + && (matchIndex+LZ4_DISTANCE_MAX < current)) { + continue; + } /* too far */ + assert((current - matchIndex) <= LZ4_DISTANCE_MAX); /* match now expected within distance */ + + if (LZ4_read32(match) == LZ4_read32(ip)) { + if (maybe_extMem) offset = current - matchIndex; + break; /* match found */ + } + + } while(1); + } + + /* Catch up */ + filledIp = ip; + while (((ip>anchor) & (match > lowLimit)) && (unlikely(ip[-1]==match[-1]))) { ip--; match--; } + + /* Encode Literals */ + { unsigned const litLength = (unsigned)(ip - anchor); + token = op++; + if ((outputDirective == limitedOutput) && /* Check output buffer overflow */ + (unlikely(op + litLength + (2 + 1 + LASTLITERALS) + (litLength/255) > olimit)) ) { + return 0; /* cannot compress within `dst` budget. 
Stored indexes in hash table are nonetheless fine */ + } + if ((outputDirective == fillOutput) && + (unlikely(op + (litLength+240)/255 /* litlen */ + litLength /* literals */ + 2 /* offset */ + 1 /* token */ + MFLIMIT - MINMATCH /* min last literals so last match is <= end - MFLIMIT */ > olimit))) { + op--; + goto _last_literals; + } + if (litLength >= RUN_MASK) { + int len = (int)(litLength - RUN_MASK); + *token = (RUN_MASK<<ML_BITS); + for(; len >= 255 ; len-=255) *op++ = 255; + *op++ = (BYTE)len; + } + else *token = (BYTE)(litLength<<ML_BITS); + + /* Copy Literals */ + LZ4_wildCopy8(op, anchor, op+litLength); + op+=litLength; + DEBUGLOG(6, "seq.start:%i, literals=%u, match.start:%i", + (int)(anchor-(const BYTE*)source), litLength, (int)(ip-(const BYTE*)source)); + } + +_next_match: + /* at this stage, the following variables must be correctly set : + * - ip : at start of LZ operation + * - match : at start of previous pattern occurence; can be within current prefix, or within extDict + * - offset : if maybe_ext_memSegment==1 (constant) + * - lowLimit : must be == dictionary to mean "match is within extDict"; must be == source otherwise + * - token and *token : position to write 4-bits for match length; higher 4-bits for literal length supposed already written + */ + + if ((outputDirective == fillOutput) && + (op + 2 /* offset */ + 1 /* token */ + MFLIMIT - MINMATCH /* min last literals so last match is <= end - MFLIMIT */ > olimit)) { + /* the match was too close to the end, rewind and go to last literals */ + op = token; + goto _last_literals; + } + + /* Encode Offset */ + if (maybe_extMem) { /* static test */ + DEBUGLOG(6, " with offset=%u (ext if > %i)", offset, (int)(ip - (const BYTE*)source)); + assert(offset <= LZ4_DISTANCE_MAX && offset > 0); + LZ4_writeLE16(op, (U16)offset); op+=2; + } else { + DEBUGLOG(6, " with offset=%u (same segment)", (U32)(ip - match)); + assert(ip-match <= LZ4_DISTANCE_MAX); + LZ4_writeLE16(op, (U16)(ip - match)); op+=2; + } + + 
/* Encode MatchLength */ + { unsigned matchCode; + + if ( (dictDirective==usingExtDict || dictDirective==usingDictCtx) + && (lowLimit==dictionary) /* match within extDict */ ) { + const BYTE* limit = ip + (dictEnd-match); + assert(dictEnd > match); + if (limit > matchlimit) limit = matchlimit; + matchCode = LZ4_count(ip+MINMATCH, match+MINMATCH, limit); + ip += (size_t)matchCode + MINMATCH; + if (ip==limit) { + unsigned const more = LZ4_count(limit, (const BYTE*)source, matchlimit); + matchCode += more; + ip += more; + } + DEBUGLOG(6, " with matchLength=%u starting in extDict", matchCode+MINMATCH); + } else { + matchCode = LZ4_count(ip+MINMATCH, match+MINMATCH, matchlimit); + ip += (size_t)matchCode + MINMATCH; + DEBUGLOG(6, " with matchLength=%u", matchCode+MINMATCH); + } + + if ((outputDirective) && /* Check output buffer overflow */ + (unlikely(op + (1 + LASTLITERALS) + (matchCode+240)/255 > olimit)) ) { + if (outputDirective == fillOutput) { + /* Match description too long : reduce it */ + U32 newMatchCode = 15 /* in token */ - 1 /* to avoid needing a zero byte */ + ((U32)(olimit - op) - 1 - LASTLITERALS) * 255; + ip -= matchCode - newMatchCode; + assert(newMatchCode < matchCode); + matchCode = newMatchCode; + if (unlikely(ip <= filledIp)) { + /* We have already filled up to filledIp so if ip ends up less than filledIp + * we have positions in the hash table beyond the current position. This is + * a problem if we reuse the hash table. So we have to remove these positions + * from the hash table. + */ + const BYTE* ptr; + DEBUGLOG(5, "Clearing %u positions", (U32)(filledIp - ip)); + for (ptr = ip; ptr <= filledIp; ++ptr) { + U32 const h = LZ4_hashPosition(ptr, tableType); + LZ4_clearHash(h, cctx->hashTable, tableType); + } + } + } else { + assert(outputDirective == limitedOutput); + return 0; /* cannot compress within `dst` budget. 
Stored indexes in hash table are nonetheless fine */ + } + } + if (matchCode >= ML_MASK) { + *token += ML_MASK; + matchCode -= ML_MASK; + LZ4_write32(op, 0xFFFFFFFF); + while (matchCode >= 4*255) { + op+=4; + LZ4_write32(op, 0xFFFFFFFF); + matchCode -= 4*255; + } + op += matchCode / 255; + *op++ = (BYTE)(matchCode % 255); + } else + *token += (BYTE)(matchCode); + } + /* Ensure we have enough space for the last literals. */ + assert(!(outputDirective == fillOutput && op + 1 + LASTLITERALS > olimit)); + + anchor = ip; + + /* Test end of chunk */ + if (ip >= mflimitPlusOne) break; + + /* Fill table */ + LZ4_putPosition(ip-2, cctx->hashTable, tableType, base); + + /* Test next position */ + if (tableType == byPtr) { + + match = LZ4_getPosition(ip, cctx->hashTable, tableType, base); + LZ4_putPosition(ip, cctx->hashTable, tableType, base); + if ( (match+LZ4_DISTANCE_MAX >= ip) + && (LZ4_read32(match) == LZ4_read32(ip)) ) + { token=op++; *token=0; goto _next_match; } + + } else { /* byU32, byU16 */ + + U32 const h = LZ4_hashPosition(ip, tableType); + U32 const current = (U32)(ip-base); + U32 matchIndex = LZ4_getIndexOnHash(h, cctx->hashTable, tableType); + assert(matchIndex < current); + if (dictDirective == usingDictCtx) { + if (matchIndex < startIndex) { + /* there was no match, try the dictionary */ + matchIndex = LZ4_getIndexOnHash(h, dictCtx->hashTable, byU32); + match = dictBase + matchIndex; + lowLimit = dictionary; /* required for match length counter */ + matchIndex += dictDelta; + } else { + match = base + matchIndex; + lowLimit = (const BYTE*)source; /* required for match length counter */ + } + } else if (dictDirective==usingExtDict) { + if (matchIndex < startIndex) { + match = dictBase + matchIndex; + lowLimit = dictionary; /* required for match length counter */ + } else { + match = base + matchIndex; + lowLimit = (const BYTE*)source; /* required for match length counter */ + } + } else { /* single memory segment */ + match = base + matchIndex; + } + 
LZ4_putIndexOnHash(current, h, cctx->hashTable, tableType); + assert(matchIndex < current); + if ( ((dictIssue==dictSmall) ? (matchIndex >= prefixIdxLimit) : 1) + && (((tableType==byU16) && (LZ4_DISTANCE_MAX == LZ4_DISTANCE_ABSOLUTE_MAX)) ? 1 : (matchIndex+LZ4_DISTANCE_MAX >= current)) + && (LZ4_read32(match) == LZ4_read32(ip)) ) { + token=op++; + *token=0; + if (maybe_extMem) offset = current - matchIndex; + DEBUGLOG(6, "seq.start:%i, literals=%u, match.start:%i", + (int)(anchor-(const BYTE*)source), 0, (int)(ip-(const BYTE*)source)); + goto _next_match; + } + } + + /* Prepare next loop */ + forwardH = LZ4_hashPosition(++ip, tableType); + + } + +_last_literals: + /* Encode Last Literals */ + { size_t lastRun = (size_t)(iend - anchor); + if ( (outputDirective) && /* Check output buffer overflow */ + (op + lastRun + 1 + ((lastRun+255-RUN_MASK)/255) > olimit)) { + if (outputDirective == fillOutput) { + /* adapt lastRun to fill 'dst' */ + assert(olimit >= op); + lastRun = (size_t)(olimit-op) - 1/*token*/; + lastRun -= (lastRun + 256 - RUN_MASK) / 256; /*additional length tokens*/ + } else { + assert(outputDirective == limitedOutput); + return 0; /* cannot compress within `dst` budget. 
Stored indexes in hash table are nonetheless fine */ + } + } + DEBUGLOG(6, "Final literal run : %i literals", (int)lastRun); + if (lastRun >= RUN_MASK) { + size_t accumulator = lastRun - RUN_MASK; + *op++ = RUN_MASK << ML_BITS; + for(; accumulator >= 255 ; accumulator-=255) *op++ = 255; + *op++ = (BYTE) accumulator; + } else { + *op++ = (BYTE)(lastRun<<ML_BITS); + } + LZ4_memcpy(op, anchor, lastRun); + ip = anchor + lastRun; + op += lastRun; + } + + if (outputDirective == fillOutput) { + *inputConsumed = (int) (((const char*)ip)-source); + } + result = (int)(((char*)op) - dest); + assert(result > 0); + DEBUGLOG(5, "LZ4_compress_generic: compressed %i bytes into %i bytes", inputSize, result); + return result; +} + +/** LZ4_compress_generic() : + * inlined, to ensure branches are decided at compilation time; + * takes care of src == (NULL, 0) + * and forward the rest to LZ4_compress_generic_validated */ +LZ4_FORCE_INLINE int LZ4_compress_generic( + LZ4_stream_t_internal* const cctx, + const char* const src, + char* const dst, + const int srcSize, + int *inputConsumed, /* only written when outputDirective == fillOutput */ + const int dstCapacity, + const limitedOutput_directive outputDirective, + const tableType_t tableType, + const dict_directive dictDirective, + const dictIssue_directive dictIssue, + const int acceleration) +{ + DEBUGLOG(5, "LZ4_compress_generic: srcSize=%i, dstCapacity=%i", + srcSize, dstCapacity); + + if ((U32)srcSize > (U32)LZ4_MAX_INPUT_SIZE) { return 0; } /* Unsupported srcSize, too large (or negative) */ + if (srcSize == 0) { /* src == NULL supported if srcSize == 0 */ + if (outputDirective != notLimited && dstCapacity <= 0) return 0; /* no output, can't write anything */ + DEBUGLOG(5, "Generating an empty block"); + assert(outputDirective == notLimited || dstCapacity >= 1); + assert(dst != NULL); + dst[0] = 0; + if (outputDirective == fillOutput) { + assert (inputConsumed != NULL); + *inputConsumed = 0; + } + return 1; + } + assert(src != 
NULL); + + return LZ4_compress_generic_validated(cctx, src, dst, srcSize, + inputConsumed, /* only written into if outputDirective == fillOutput */ + dstCapacity, outputDirective, + tableType, dictDirective, dictIssue, acceleration); +} + + +int LZ4_compress_fast_extState(void* state, const char* source, char* dest, int inputSize, int maxOutputSize, int acceleration) +{ + LZ4_stream_t_internal* const ctx = & LZ4_initStream(state, sizeof(LZ4_stream_t)) -> internal_donotuse; + assert(ctx != NULL); + if (acceleration < 1) acceleration = LZ4_ACCELERATION_DEFAULT; + if (acceleration > LZ4_ACCELERATION_MAX) acceleration = LZ4_ACCELERATION_MAX; + if (maxOutputSize >= LZ4_compressBound(inputSize)) { + if (inputSize < LZ4_64Klimit) { + return LZ4_compress_generic(ctx, source, dest, inputSize, NULL, 0, notLimited, byU16, noDict, noDictIssue, acceleration); + } else { + const tableType_t tableType = ((sizeof(void*)==4) && ((uptrval)source > LZ4_DISTANCE_MAX)) ? byPtr : byU32; + return LZ4_compress_generic(ctx, source, dest, inputSize, NULL, 0, notLimited, tableType, noDict, noDictIssue, acceleration); + } + } else { + if (inputSize < LZ4_64Klimit) { + return LZ4_compress_generic(ctx, source, dest, inputSize, NULL, maxOutputSize, limitedOutput, byU16, noDict, noDictIssue, acceleration); + } else { + const tableType_t tableType = ((sizeof(void*)==4) && ((uptrval)source > LZ4_DISTANCE_MAX)) ? byPtr : byU32; + return LZ4_compress_generic(ctx, source, dest, inputSize, NULL, maxOutputSize, limitedOutput, tableType, noDict, noDictIssue, acceleration); + } + } +} + +/** + * LZ4_compress_fast_extState_fastReset() : + * A variant of LZ4_compress_fast_extState(). + * + * Using this variant avoids an expensive initialization step. It is only safe + * to call if the state buffer is known to be correctly initialized already + * (see comment in lz4.h on LZ4_resetStream_fast() for a definition of + * "correctly initialized"). 
+ */ +int LZ4_compress_fast_extState_fastReset(void* state, const char* src, char* dst, int srcSize, int dstCapacity, int acceleration) +{ + LZ4_stream_t_internal* ctx = &((LZ4_stream_t*)state)->internal_donotuse; + if (acceleration < 1) acceleration = LZ4_ACCELERATION_DEFAULT; + if (acceleration > LZ4_ACCELERATION_MAX) acceleration = LZ4_ACCELERATION_MAX; + + if (dstCapacity >= LZ4_compressBound(srcSize)) { + if (srcSize < LZ4_64Klimit) { + const tableType_t tableType = byU16; + LZ4_prepareTable(ctx, srcSize, tableType); + if (ctx->currentOffset) { + return LZ4_compress_generic(ctx, src, dst, srcSize, NULL, 0, notLimited, tableType, noDict, dictSmall, acceleration); + } else { + return LZ4_compress_generic(ctx, src, dst, srcSize, NULL, 0, notLimited, tableType, noDict, noDictIssue, acceleration); + } + } else { + const tableType_t tableType = ((sizeof(void*)==4) && ((uptrval)src > LZ4_DISTANCE_MAX)) ? byPtr : byU32; + LZ4_prepareTable(ctx, srcSize, tableType); + return LZ4_compress_generic(ctx, src, dst, srcSize, NULL, 0, notLimited, tableType, noDict, noDictIssue, acceleration); + } + } else { + if (srcSize < LZ4_64Klimit) { + const tableType_t tableType = byU16; + LZ4_prepareTable(ctx, srcSize, tableType); + if (ctx->currentOffset) { + return LZ4_compress_generic(ctx, src, dst, srcSize, NULL, dstCapacity, limitedOutput, tableType, noDict, dictSmall, acceleration); + } else { + return LZ4_compress_generic(ctx, src, dst, srcSize, NULL, dstCapacity, limitedOutput, tableType, noDict, noDictIssue, acceleration); + } + } else { + const tableType_t tableType = ((sizeof(void*)==4) && ((uptrval)src > LZ4_DISTANCE_MAX)) ? 
byPtr : byU32; + LZ4_prepareTable(ctx, srcSize, tableType); + return LZ4_compress_generic(ctx, src, dst, srcSize, NULL, dstCapacity, limitedOutput, tableType, noDict, noDictIssue, acceleration); + } + } +} + + +int LZ4_compress_fast(const char* source, char* dest, int inputSize, int maxOutputSize, int acceleration) +{ + int result; +#if (LZ4_HEAPMODE) + LZ4_stream_t* ctxPtr = ALLOC(sizeof(LZ4_stream_t)); /* malloc-calloc always properly aligned */ + if (ctxPtr == NULL) return 0; +#else + LZ4_stream_t ctx; + LZ4_stream_t* const ctxPtr = &ctx; +#endif + result = LZ4_compress_fast_extState(ctxPtr, source, dest, inputSize, maxOutputSize, acceleration); + +#if (LZ4_HEAPMODE) + FREEMEM(ctxPtr); +#endif + return result; +} + + +int LZ4_compress_default(const char* src, char* dst, int srcSize, int maxOutputSize) +{ + return LZ4_compress_fast(src, dst, srcSize, maxOutputSize, 1); +} + + +/* Note!: This function leaves the stream in an unclean/broken state! + * It is not safe to subsequently use the same state with a _fastReset() or + * _continue() call without resetting it. */ +static int LZ4_compress_destSize_extState (LZ4_stream_t* state, const char* src, char* dst, int* srcSizePtr, int targetDstSize) +{ + void* const s = LZ4_initStream(state, sizeof (*state)); + assert(s != NULL); (void)s; + + if (targetDstSize >= LZ4_compressBound(*srcSizePtr)) { /* compression success is guaranteed */ + return LZ4_compress_fast_extState(state, src, dst, *srcSizePtr, targetDstSize, 1); + } else { + if (*srcSizePtr < LZ4_64Klimit) { + return LZ4_compress_generic(&state->internal_donotuse, src, dst, *srcSizePtr, srcSizePtr, targetDstSize, fillOutput, byU16, noDict, noDictIssue, 1); + } else { + tableType_t const addrMode = ((sizeof(void*)==4) && ((uptrval)src > LZ4_DISTANCE_MAX)) ? 
byPtr : byU32; + return LZ4_compress_generic(&state->internal_donotuse, src, dst, *srcSizePtr, srcSizePtr, targetDstSize, fillOutput, addrMode, noDict, noDictIssue, 1); + } } +} + + +int LZ4_compress_destSize(const char* src, char* dst, int* srcSizePtr, int targetDstSize) +{ +#if (LZ4_HEAPMODE) + LZ4_stream_t* ctx = (LZ4_stream_t*)ALLOC(sizeof(LZ4_stream_t)); /* malloc-calloc always properly aligned */ + if (ctx == NULL) return 0; +#else + LZ4_stream_t ctxBody; + LZ4_stream_t* ctx = &ctxBody; +#endif + + int result = LZ4_compress_destSize_extState(ctx, src, dst, srcSizePtr, targetDstSize); + +#if (LZ4_HEAPMODE) + FREEMEM(ctx); +#endif + return result; +} + + + +/*-****************************** +* Streaming functions +********************************/ + +LZ4_stream_t* LZ4_createStream(void) +{ + LZ4_stream_t* const lz4s = (LZ4_stream_t*)ALLOC(sizeof(LZ4_stream_t)); + LZ4_STATIC_ASSERT(LZ4_STREAMSIZE >= sizeof(LZ4_stream_t_internal)); /* A compilation error here means LZ4_STREAMSIZE is not large enough */ + DEBUGLOG(4, "LZ4_createStream %p", lz4s); + if (lz4s == NULL) return NULL; + LZ4_initStream(lz4s, sizeof(*lz4s)); + return lz4s; +} + +static size_t LZ4_stream_t_alignment(void) +{ +#if LZ4_ALIGN_TEST + typedef struct { char c; LZ4_stream_t t; } t_a; + return sizeof(t_a) - sizeof(LZ4_stream_t); +#else + return 1; /* effectively disabled */ +#endif +} + +LZ4_stream_t* LZ4_initStream (void* buffer, size_t size) +{ + DEBUGLOG(5, "LZ4_initStream"); + if (buffer == NULL) { return NULL; } + if (size < sizeof(LZ4_stream_t)) { return NULL; } + if (!LZ4_isAligned(buffer, LZ4_stream_t_alignment())) return NULL; + MEM_INIT(buffer, 0, sizeof(LZ4_stream_t_internal)); + return (LZ4_stream_t*)buffer; +} + +/* resetStream is now deprecated, + * prefer initStream() which is more general */ +void LZ4_resetStream (LZ4_stream_t* LZ4_stream) +{ + DEBUGLOG(5, "LZ4_resetStream (ctx:%p)", LZ4_stream); + MEM_INIT(LZ4_stream, 0, sizeof(LZ4_stream_t_internal)); +} + +void 
LZ4_resetStream_fast(LZ4_stream_t* ctx) { + LZ4_prepareTable(&(ctx->internal_donotuse), 0, byU32); +} + +int LZ4_freeStream (LZ4_stream_t* LZ4_stream) +{ + if (!LZ4_stream) return 0; /* support free on NULL */ + DEBUGLOG(5, "LZ4_freeStream %p", LZ4_stream); + FREEMEM(LZ4_stream); + return (0); +} + + +#define HASH_UNIT sizeof(reg_t) +int LZ4_loadDict (LZ4_stream_t* LZ4_dict, const char* dictionary, int dictSize) +{ + LZ4_stream_t_internal* dict = &LZ4_dict->internal_donotuse; + const tableType_t tableType = byU32; + const BYTE* p = (const BYTE*)dictionary; + const BYTE* const dictEnd = p + dictSize; + const BYTE* base; + + DEBUGLOG(4, "LZ4_loadDict (%i bytes from %p into %p)", dictSize, dictionary, LZ4_dict); + + /* It's necessary to reset the context, + * and not just continue it with prepareTable() + * to avoid any risk of generating overflowing matchIndex + * when compressing using this dictionary */ + LZ4_resetStream(LZ4_dict); + + /* We always increment the offset by 64 KB, since, if the dict is longer, + * we truncate it to the last 64k, and if it's shorter, we still want to + * advance by a whole window length so we can provide the guarantee that + * there are only valid offsets in the window, which allows an optimization + * in LZ4_compress_fast_continue() where it uses noDictIssue even when the + * dictionary isn't a full 64k. */ + dict->currentOffset += 64 KB; + + if (dictSize < (int)HASH_UNIT) { + return 0; + } + + if ((dictEnd - p) > 64 KB) p = dictEnd - 64 KB; + base = dictEnd - dict->currentOffset; + dict->dictionary = p; + dict->dictSize = (U32)(dictEnd - p); + dict->tableType = (U32)tableType; + + while (p <= dictEnd-HASH_UNIT) { + LZ4_putPosition(p, dict->hashTable, tableType, base); + p+=3; + } + + return (int)dict->dictSize; +} + +void LZ4_attach_dictionary(LZ4_stream_t* workingStream, const LZ4_stream_t* dictionaryStream) { + const LZ4_stream_t_internal* dictCtx = dictionaryStream == NULL ? 
NULL : + &(dictionaryStream->internal_donotuse); + + DEBUGLOG(4, "LZ4_attach_dictionary (%p, %p, size %u)", + workingStream, dictionaryStream, + dictCtx != NULL ? dictCtx->dictSize : 0); + + if (dictCtx != NULL) { + /* If the current offset is zero, we will never look in the + * external dictionary context, since there is no value a table + * entry can take that indicate a miss. In that case, we need + * to bump the offset to something non-zero. + */ + if (workingStream->internal_donotuse.currentOffset == 0) { + workingStream->internal_donotuse.currentOffset = 64 KB; + } + + /* Don't actually attach an empty dictionary. + */ + if (dictCtx->dictSize == 0) { + dictCtx = NULL; + } + } + workingStream->internal_donotuse.dictCtx = dictCtx; +} + + +static void LZ4_renormDictT(LZ4_stream_t_internal* LZ4_dict, int nextSize) +{ + assert(nextSize >= 0); + if (LZ4_dict->currentOffset + (unsigned)nextSize > 0x80000000) { /* potential ptrdiff_t overflow (32-bits mode) */ + /* rescale hash table */ + U32 const delta = LZ4_dict->currentOffset - 64 KB; + const BYTE* dictEnd = LZ4_dict->dictionary + LZ4_dict->dictSize; + int i; + DEBUGLOG(4, "LZ4_renormDictT"); + for (i=0; i<LZ4_HASH_SIZE_U32; i++) { + if (LZ4_dict->hashTable[i] < delta) LZ4_dict->hashTable[i]=0; + else LZ4_dict->hashTable[i] -= delta; + } + LZ4_dict->currentOffset = 64 KB; + if (LZ4_dict->dictSize > 64 KB) LZ4_dict->dictSize = 64 KB; + LZ4_dict->dictionary = dictEnd - LZ4_dict->dictSize; + } +} + + +int LZ4_compress_fast_continue (LZ4_stream_t* LZ4_stream, + const char* source, char* dest, + int inputSize, int maxOutputSize, + int acceleration) +{ + const tableType_t tableType = byU32; + LZ4_stream_t_internal* streamPtr = &LZ4_stream->internal_donotuse; + const BYTE* dictEnd = streamPtr->dictionary + streamPtr->dictSize; + + DEBUGLOG(5, "LZ4_compress_fast_continue (inputSize=%i)", inputSize); + + LZ4_renormDictT(streamPtr, inputSize); /* avoid index overflow */ + if (acceleration < 1) acceleration = 
LZ4_ACCELERATION_DEFAULT; + if (acceleration > LZ4_ACCELERATION_MAX) acceleration = LZ4_ACCELERATION_MAX; + + /* invalidate tiny dictionaries */ + if ( (streamPtr->dictSize-1 < 4-1) /* intentional underflow */ + && (dictEnd != (const BYTE*)source) ) { + DEBUGLOG(5, "LZ4_compress_fast_continue: dictSize(%u) at addr:%p is too small", streamPtr->dictSize, streamPtr->dictionary); + streamPtr->dictSize = 0; + streamPtr->dictionary = (const BYTE*)source; + dictEnd = (const BYTE*)source; + } + + /* Check overlapping input/dictionary space */ + { const BYTE* sourceEnd = (const BYTE*) source + inputSize; + if ((sourceEnd > streamPtr->dictionary) && (sourceEnd < dictEnd)) { + streamPtr->dictSize = (U32)(dictEnd - sourceEnd); + if (streamPtr->dictSize > 64 KB) streamPtr->dictSize = 64 KB; + if (streamPtr->dictSize < 4) streamPtr->dictSize = 0; + streamPtr->dictionary = dictEnd - streamPtr->dictSize; + } + } + + /* prefix mode : source data follows dictionary */ + if (dictEnd == (const BYTE*)source) { + if ((streamPtr->dictSize < 64 KB) && (streamPtr->dictSize < streamPtr->currentOffset)) + return LZ4_compress_generic(streamPtr, source, dest, inputSize, NULL, maxOutputSize, limitedOutput, tableType, withPrefix64k, dictSmall, acceleration); + else + return LZ4_compress_generic(streamPtr, source, dest, inputSize, NULL, maxOutputSize, limitedOutput, tableType, withPrefix64k, noDictIssue, acceleration); + } + + /* external dictionary mode */ + { int result; + if (streamPtr->dictCtx) { + /* We depend here on the fact that dictCtx'es (produced by + * LZ4_loadDict) guarantee that their tables contain no references + * to offsets between dictCtx->currentOffset - 64 KB and + * dictCtx->currentOffset - dictCtx->dictSize. This makes it safe + * to use noDictIssue even when the dict isn't a full 64 KB. 
+ */ + if (inputSize > 4 KB) { + /* For compressing large blobs, it is faster to pay the setup + * cost to copy the dictionary's tables into the active context, + * so that the compression loop is only looking into one table. + */ + LZ4_memcpy(streamPtr, streamPtr->dictCtx, sizeof(*streamPtr)); + result = LZ4_compress_generic(streamPtr, source, dest, inputSize, NULL, maxOutputSize, limitedOutput, tableType, usingExtDict, noDictIssue, acceleration); + } else { + result = LZ4_compress_generic(streamPtr, source, dest, inputSize, NULL, maxOutputSize, limitedOutput, tableType, usingDictCtx, noDictIssue, acceleration); + } + } else { + if ((streamPtr->dictSize < 64 KB) && (streamPtr->dictSize < streamPtr->currentOffset)) { + result = LZ4_compress_generic(streamPtr, source, dest, inputSize, NULL, maxOutputSize, limitedOutput, tableType, usingExtDict, dictSmall, acceleration); + } else { + result = LZ4_compress_generic(streamPtr, source, dest, inputSize, NULL, maxOutputSize, limitedOutput, tableType, usingExtDict, noDictIssue, acceleration); + } + } + streamPtr->dictionary = (const BYTE*)source; + streamPtr->dictSize = (U32)inputSize; + return result; + } +} + + +/* Hidden debug function, to force-test external dictionary mode */ +int LZ4_compress_forceExtDict (LZ4_stream_t* LZ4_dict, const char* source, char* dest, int srcSize) +{ + LZ4_stream_t_internal* streamPtr = &LZ4_dict->internal_donotuse; + int result; + + LZ4_renormDictT(streamPtr, srcSize); + + if ((streamPtr->dictSize < 64 KB) && (streamPtr->dictSize < streamPtr->currentOffset)) { + result = LZ4_compress_generic(streamPtr, source, dest, srcSize, NULL, 0, notLimited, byU32, usingExtDict, dictSmall, 1); + } else { + result = LZ4_compress_generic(streamPtr, source, dest, srcSize, NULL, 0, notLimited, byU32, usingExtDict, noDictIssue, 1); + } + + streamPtr->dictionary = (const BYTE*)source; + streamPtr->dictSize = (U32)srcSize; + + return result; +} + + +/*! 
LZ4_saveDict() : + * If previously compressed data block is not guaranteed to remain available at its memory location, + * save it into a safer place (char* safeBuffer). + * Note : you don't need to call LZ4_loadDict() afterwards, + * dictionary is immediately usable, you can therefore call LZ4_compress_fast_continue(). + * Return : saved dictionary size in bytes (necessarily <= dictSize), or 0 if error. + */ +int LZ4_saveDict (LZ4_stream_t* LZ4_dict, char* safeBuffer, int dictSize) +{ + LZ4_stream_t_internal* const dict = &LZ4_dict->internal_donotuse; + const BYTE* const previousDictEnd = dict->dictionary + dict->dictSize; + + if ((U32)dictSize > 64 KB) { dictSize = 64 KB; } /* useless to define a dictionary > 64 KB */ + if ((U32)dictSize > dict->dictSize) { dictSize = (int)dict->dictSize; } + + if (safeBuffer == NULL) assert(dictSize == 0); + if (dictSize > 0) + memmove(safeBuffer, previousDictEnd - dictSize, dictSize); + + dict->dictionary = (const BYTE*)safeBuffer; + dict->dictSize = (U32)dictSize; + + return dictSize; +} + + + +/*-******************************* + * Decompression functions + ********************************/ + +typedef enum { endOnOutputSize = 0, endOnInputSize = 1 } endCondition_directive; +typedef enum { decode_full_block = 0, partial_decode = 1 } earlyEnd_directive; + +#undef MIN +#define MIN(a,b) ( (a) < (b) ? (a) : (b) ) + +/* Read the variable-length literal or match length. + * + * ip - pointer to use as input. + * lencheck - end ip. Return an error if ip advances >= lencheck. + * loop_check - check ip >= lencheck in body of loop. Returns loop_error if so. + * initial_check - check ip >= lencheck before start of loop. Returns initial_error if so. + * error (output) - error code. Should be set to 0 before call. 
+ */ +typedef enum { loop_error = -2, initial_error = -1, ok = 0 } variable_length_error; +LZ4_FORCE_INLINE unsigned +read_variable_length(const BYTE**ip, const BYTE* lencheck, + int loop_check, int initial_check, + variable_length_error* error) +{ + U32 length = 0; + U32 s; + if (initial_check && unlikely((*ip) >= lencheck)) { /* overflow detection */ + *error = initial_error; + return length; + } + do { + s = **ip; + (*ip)++; + length += s; + if (loop_check && unlikely((*ip) >= lencheck)) { /* overflow detection */ + *error = loop_error; + return length; + } + } while (s==255); + + return length; +} + +/*! LZ4_decompress_generic() : + * This generic decompression function covers all use cases. + * It shall be instantiated several times, using different sets of directives. + * Note that it is important for performance that this function really get inlined, + * in order to remove useless branches during compilation optimization. + */ +LZ4_FORCE_INLINE int +LZ4_decompress_generic( + const char* const src, + char* const dst, + int srcSize, + int outputSize, /* If endOnInput==endOnInputSize, this value is `dstCapacity` */ + + endCondition_directive endOnInput, /* endOnOutputSize, endOnInputSize */ + earlyEnd_directive partialDecoding, /* full, partial */ + dict_directive dict, /* noDict, withPrefix64k, usingExtDict */ + const BYTE* const lowPrefix, /* always <= dst, == dst when no prefix */ + const BYTE* const dictStart, /* only if dict==usingExtDict */ + const size_t dictSize /* note : = 0 if noDict */ + ) +{ + if ((src == NULL) || (outputSize < 0)) { return -1; } + + { const BYTE* ip = (const BYTE*) src; + const BYTE* const iend = ip + srcSize; + + BYTE* op = (BYTE*) dst; + BYTE* const oend = op + outputSize; + BYTE* cpy; + + const BYTE* const dictEnd = (dictStart == NULL) ? 
NULL : dictStart + dictSize; + + const int safeDecode = (endOnInput==endOnInputSize); + const int checkOffset = ((safeDecode) && (dictSize < (int)(64 KB))); + + + /* Set up the "end" pointers for the shortcut. */ + const BYTE* const shortiend = iend - (endOnInput ? 14 : 8) /*maxLL*/ - 2 /*offset*/; + const BYTE* const shortoend = oend - (endOnInput ? 14 : 8) /*maxLL*/ - 18 /*maxML*/; + + const BYTE* match; + size_t offset; + unsigned token; + size_t length; + + + DEBUGLOG(5, "LZ4_decompress_generic (srcSize:%i, dstSize:%i)", srcSize, outputSize); + + /* Special cases */ + assert(lowPrefix <= op); + if ((endOnInput) && (unlikely(outputSize==0))) { + /* Empty output buffer */ + if (partialDecoding) return 0; + return ((srcSize==1) && (*ip==0)) ? 0 : -1; + } + if ((!endOnInput) && (unlikely(outputSize==0))) { return (*ip==0 ? 1 : -1); } + if ((endOnInput) && unlikely(srcSize==0)) { return -1; } + + /* Currently the fast loop shows a regression on qualcomm arm chips. */ +#if LZ4_FAST_DEC_LOOP + if ((oend - op) < FASTLOOP_SAFE_DISTANCE) { + DEBUGLOG(6, "skip fast decode loop"); + goto safe_decode; + } + + /* Fast loop : decode sequences as long as output < iend-FASTLOOP_SAFE_DISTANCE */ + while (1) { + /* Main fastloop assertion: We can always wildcopy FASTLOOP_SAFE_DISTANCE */ + assert(oend - op >= FASTLOOP_SAFE_DISTANCE); + if (endOnInput) { assert(ip < iend); } + token = *ip++; + length = token >> ML_BITS; /* literal length */ + + assert(!endOnInput || ip <= iend); /* ip < iend before the increment */ + + /* decode literal length */ + if (length == RUN_MASK) { + variable_length_error error = ok; + length += read_variable_length(&ip, iend-RUN_MASK, (int)endOnInput, (int)endOnInput, &error); + if (error == initial_error) { goto _output_error; } + if ((safeDecode) && unlikely((uptrval)(op)+length<(uptrval)(op))) { goto _output_error; } /* overflow detection */ + if ((safeDecode) && unlikely((uptrval)(ip)+length<(uptrval)(ip))) { goto _output_error; } /* overflow 
detection */ + + /* copy literals */ + cpy = op+length; + LZ4_STATIC_ASSERT(MFLIMIT >= WILDCOPYLENGTH); + if (endOnInput) { /* LZ4_decompress_safe() */ + if ((cpy>oend-32) || (ip+length>iend-32)) { goto safe_literal_copy; } + LZ4_wildCopy32(op, ip, cpy); + } else { /* LZ4_decompress_fast() */ + if (cpy>oend-8) { goto safe_literal_copy; } + LZ4_wildCopy8(op, ip, cpy); /* LZ4_decompress_fast() cannot copy more than 8 bytes at a time : + * it doesn't know input length, and only relies on end-of-block properties */ + } + ip += length; op = cpy; + } else { + cpy = op+length; + if (endOnInput) { /* LZ4_decompress_safe() */ + DEBUGLOG(7, "copy %u bytes in a 16-bytes stripe", (unsigned)length); + /* We don't need to check oend, since we check it once for each loop below */ + if (ip > iend-(16 + 1/*max lit + offset + nextToken*/)) { goto safe_literal_copy; } + /* Literals can only be 14, but hope compilers optimize if we copy by a register size */ + LZ4_memcpy(op, ip, 16); + } else { /* LZ4_decompress_fast() */ + /* LZ4_decompress_fast() cannot copy more than 8 bytes at a time : + * it doesn't know input length, and relies on end-of-block properties */ + LZ4_memcpy(op, ip, 8); + if (length > 8) { LZ4_memcpy(op+8, ip+8, 8); } + } + ip += length; op = cpy; + } + + /* get offset */ + offset = LZ4_readLE16(ip); ip+=2; + match = op - offset; + assert(match <= op); + + /* get matchlength */ + length = token & ML_MASK; + + if (length == ML_MASK) { + variable_length_error error = ok; + if ((checkOffset) && (unlikely(match + dictSize < lowPrefix))) { goto _output_error; } /* Error : offset outside buffers */ + length += read_variable_length(&ip, iend - LASTLITERALS + 1, (int)endOnInput, 0, &error); + if (error != ok) { goto _output_error; } + if ((safeDecode) && unlikely((uptrval)(op)+length<(uptrval)op)) { goto _output_error; } /* overflow detection */ + length += MINMATCH; + if (op + length >= oend - FASTLOOP_SAFE_DISTANCE) { + goto safe_match_copy; + } + } else { + length += 
MINMATCH; + if (op + length >= oend - FASTLOOP_SAFE_DISTANCE) { + goto safe_match_copy; + } + + /* Fastpath check: Avoids a branch in LZ4_wildCopy32 if true */ + if ((dict == withPrefix64k) || (match >= lowPrefix)) { + if (offset >= 8) { + assert(match >= lowPrefix); + assert(match <= op); + assert(op + 18 <= oend); + + LZ4_memcpy(op, match, 8); + LZ4_memcpy(op+8, match+8, 8); + LZ4_memcpy(op+16, match+16, 2); + op += length; + continue; + } } } + + if (checkOffset && (unlikely(match + dictSize < lowPrefix))) { goto _output_error; } /* Error : offset outside buffers */ + /* match starting within external dictionary */ + if ((dict==usingExtDict) && (match < lowPrefix)) { + if (unlikely(op+length > oend-LASTLITERALS)) { + if (partialDecoding) { + DEBUGLOG(7, "partialDecoding: dictionary match, close to dstEnd"); + length = MIN(length, (size_t)(oend-op)); + } else { + goto _output_error; /* end-of-block condition violated */ + } } + + if (length <= (size_t)(lowPrefix-match)) { + /* match fits entirely within external dictionary : just copy */ + memmove(op, dictEnd - (lowPrefix-match), length); + op += length; + } else { + /* match stretches into both external dictionary and current block */ + size_t const copySize = (size_t)(lowPrefix - match); + size_t const restSize = length - copySize; + LZ4_memcpy(op, dictEnd - copySize, copySize); + op += copySize; + if (restSize > (size_t)(op - lowPrefix)) { /* overlap copy */ + BYTE* const endOfMatch = op + restSize; + const BYTE* copyFrom = lowPrefix; + while (op < endOfMatch) { *op++ = *copyFrom++; } + } else { + LZ4_memcpy(op, lowPrefix, restSize); + op += restSize; + } } + continue; + } + + /* copy match within block */ + cpy = op + length; + + assert((op <= oend) && (oend-op >= 32)); + if (unlikely(offset<16)) { + LZ4_memcpy_using_offset(op, match, cpy, offset); + } else { + LZ4_wildCopy32(op, match, cpy); + } + + op = cpy; /* wildcopy correction */ + } + safe_decode: +#endif + + /* Main Loop : decode remaining sequences 
where output < FASTLOOP_SAFE_DISTANCE */ + while (1) { + token = *ip++; + length = token >> ML_BITS; /* literal length */ + + assert(!endOnInput || ip <= iend); /* ip < iend before the increment */ + + /* A two-stage shortcut for the most common case: + * 1) If the literal length is 0..14, and there is enough space, + * enter the shortcut and copy 16 bytes on behalf of the literals + * (in the fast mode, only 8 bytes can be safely copied this way). + * 2) Further if the match length is 4..18, copy 18 bytes in a similar + * manner; but we ensure that there's enough space in the output for + * those 18 bytes earlier, upon entering the shortcut (in other words, + * there is a combined check for both stages). + */ + if ( (endOnInput ? length != RUN_MASK : length <= 8) + /* strictly "less than" on input, to re-enter the loop with at least one byte */ + && likely((endOnInput ? ip < shortiend : 1) & (op <= shortoend)) ) { + /* Copy the literals */ + LZ4_memcpy(op, ip, endOnInput ? 16 : 8); + op += length; ip += length; + + /* The second stage: prepare for match copying, decode full info. + * If it doesn't work out, the info won't be wasted. */ + length = token & ML_MASK; /* match length */ + offset = LZ4_readLE16(ip); ip += 2; + match = op - offset; + assert(match <= op); /* check overflow */ + + /* Do not deal with overlapping matches. */ + if ( (length != ML_MASK) + && (offset >= 8) + && (dict==withPrefix64k || match >= lowPrefix) ) { + /* Copy the match. */ + LZ4_memcpy(op + 0, match + 0, 8); + LZ4_memcpy(op + 8, match + 8, 8); + LZ4_memcpy(op +16, match +16, 2); + op += length + MINMATCH; + /* Both stages worked, load the next token. */ + continue; + } + + /* The second stage didn't work out, but the info is ready. + * Propel it right to the point of match copying. 
*/ + goto _copy_match; + } + + /* decode literal length */ + if (length == RUN_MASK) { + variable_length_error error = ok; + length += read_variable_length(&ip, iend-RUN_MASK, (int)endOnInput, (int)endOnInput, &error); + if (error == initial_error) { goto _output_error; } + if ((safeDecode) && unlikely((uptrval)(op)+length<(uptrval)(op))) { goto _output_error; } /* overflow detection */ + if ((safeDecode) && unlikely((uptrval)(ip)+length<(uptrval)(ip))) { goto _output_error; } /* overflow detection */ + } + + /* copy literals */ + cpy = op+length; +#if LZ4_FAST_DEC_LOOP + safe_literal_copy: +#endif + LZ4_STATIC_ASSERT(MFLIMIT >= WILDCOPYLENGTH); + if ( ((endOnInput) && ((cpy>oend-MFLIMIT) || (ip+length>iend-(2+1+LASTLITERALS))) ) + || ((!endOnInput) && (cpy>oend-WILDCOPYLENGTH)) ) + { + /* We've either hit the input parsing restriction or the output parsing restriction. + * In the normal scenario, decoding a full block, it must be the last sequence, + * otherwise it's an error (invalid input or dimensions). + * In partialDecoding scenario, it's necessary to ensure there is no buffer overflow. + */ + if (partialDecoding) { + /* Since we are partial decoding we may be in this block because of the output parsing + * restriction, which is not valid since the output buffer is allowed to be undersized. + */ + assert(endOnInput); + DEBUGLOG(7, "partialDecoding: copying literals, close to input or output end") + DEBUGLOG(7, "partialDecoding: literal length = %u", (unsigned)length); + DEBUGLOG(7, "partialDecoding: remaining space in dstBuffer : %i", (int)(oend - op)); + DEBUGLOG(7, "partialDecoding: remaining space in srcBuffer : %i", (int)(iend - ip)); + /* Finishing in the middle of a literals segment, + * due to lack of input. + */ + if (ip+length > iend) { + length = (size_t)(iend-ip); + cpy = op + length; + } + /* Finishing in the middle of a literals segment, + * due to lack of output space. 
+ */ + if (cpy > oend) { + cpy = oend; + assert(op<=oend); + length = (size_t)(oend-op); + } + } else { + /* We must be on the last sequence because of the parsing limitations so check + * that we exactly regenerate the original size (must be exact when !endOnInput). + */ + if ((!endOnInput) && (cpy != oend)) { goto _output_error; } + /* We must be on the last sequence (or invalid) because of the parsing limitations + * so check that we exactly consume the input and don't overrun the output buffer. + */ + if ((endOnInput) && ((ip+length != iend) || (cpy > oend))) { + DEBUGLOG(6, "should have been last run of literals") + DEBUGLOG(6, "ip(%p) + length(%i) = %p != iend (%p)", ip, (int)length, ip+length, iend); + DEBUGLOG(6, "or cpy(%p) > oend(%p)", cpy, oend); + goto _output_error; + } + } + memmove(op, ip, length); /* supports overlapping memory regions; only matters for in-place decompression scenarios */ + ip += length; + op += length; + /* Necessarily EOF when !partialDecoding. + * When partialDecoding, it is EOF if we've either + * filled the output buffer or + * can't proceed with reading an offset for following match. 
+ */ + if (!partialDecoding || (cpy == oend) || (ip >= (iend-2))) { + break; + } + } else { + LZ4_wildCopy8(op, ip, cpy); /* may overwrite up to WILDCOPYLENGTH beyond cpy */ + ip += length; op = cpy; + } + + /* get offset */ + offset = LZ4_readLE16(ip); ip+=2; + match = op - offset; + + /* get matchlength */ + length = token & ML_MASK; + + _copy_match: + if (length == ML_MASK) { + variable_length_error error = ok; + length += read_variable_length(&ip, iend - LASTLITERALS + 1, (int)endOnInput, 0, &error); + if (error != ok) goto _output_error; + if ((safeDecode) && unlikely((uptrval)(op)+length<(uptrval)op)) goto _output_error; /* overflow detection */ + } + length += MINMATCH; + +#if LZ4_FAST_DEC_LOOP + safe_match_copy: +#endif + if ((checkOffset) && (unlikely(match + dictSize < lowPrefix))) goto _output_error; /* Error : offset outside buffers */ + /* match starting within external dictionary */ + if ((dict==usingExtDict) && (match < lowPrefix)) { + if (unlikely(op+length > oend-LASTLITERALS)) { + if (partialDecoding) length = MIN(length, (size_t)(oend-op)); + else goto _output_error; /* doesn't respect parsing restriction */ + } + + if (length <= (size_t)(lowPrefix-match)) { + /* match fits entirely within external dictionary : just copy */ + memmove(op, dictEnd - (lowPrefix-match), length); + op += length; + } else { + /* match stretches into both external dictionary and current block */ + size_t const copySize = (size_t)(lowPrefix - match); + size_t const restSize = length - copySize; + LZ4_memcpy(op, dictEnd - copySize, copySize); + op += copySize; + if (restSize > (size_t)(op - lowPrefix)) { /* overlap copy */ + BYTE* const endOfMatch = op + restSize; + const BYTE* copyFrom = lowPrefix; + while (op < endOfMatch) *op++ = *copyFrom++; + } else { + LZ4_memcpy(op, lowPrefix, restSize); + op += restSize; + } } + continue; + } + assert(match >= lowPrefix); + + /* copy match within block */ + cpy = op + length; + + /* partialDecoding : may end anywhere within the 
block */ + assert(op<=oend); + if (partialDecoding && (cpy > oend-MATCH_SAFEGUARD_DISTANCE)) { + size_t const mlen = MIN(length, (size_t)(oend-op)); + const BYTE* const matchEnd = match + mlen; + BYTE* const copyEnd = op + mlen; + if (matchEnd > op) { /* overlap copy */ + while (op < copyEnd) { *op++ = *match++; } + } else { + LZ4_memcpy(op, match, mlen); + } + op = copyEnd; + if (op == oend) { break; } + continue; + } + + if (unlikely(offset<8)) { + LZ4_write32(op, 0); /* silence msan warning when offset==0 */ + op[0] = match[0]; + op[1] = match[1]; + op[2] = match[2]; + op[3] = match[3]; + match += inc32table[offset]; + LZ4_memcpy(op+4, match, 4); + match -= dec64table[offset]; + } else { + LZ4_memcpy(op, match, 8); + match += 8; + } + op += 8; + + if (unlikely(cpy > oend-MATCH_SAFEGUARD_DISTANCE)) { + BYTE* const oCopyLimit = oend - (WILDCOPYLENGTH-1); + if (cpy > oend-LASTLITERALS) { goto _output_error; } /* Error : last LASTLITERALS bytes must be literals (uncompressed) */ + if (op < oCopyLimit) { + LZ4_wildCopy8(op, match, oCopyLimit); + match += oCopyLimit - op; + op = oCopyLimit; + } + while (op < cpy) { *op++ = *match++; } + } else { + LZ4_memcpy(op, match, 8); + if (length > 16) { LZ4_wildCopy8(op+8, match+8, cpy); } + } + op = cpy; /* wildcopy correction */ + } + + /* end of decoding */ + if (endOnInput) { + DEBUGLOG(5, "decoded %i bytes", (int) (((char*)op)-dst)); + return (int) (((char*)op)-dst); /* Nb of output bytes decoded */ + } else { + return (int) (((const char*)ip)-src); /* Nb of input bytes read */ + } + + /* Overflow error detected */ + _output_error: + return (int) (-(((const char*)ip)-src))-1; + } +} + + +/*===== Instantiate the API decoding functions. 
=====*/ + +LZ4_FORCE_O2 +int LZ4_decompress_safe(const char* source, char* dest, int compressedSize, int maxDecompressedSize) +{ + return LZ4_decompress_generic(source, dest, compressedSize, maxDecompressedSize, + endOnInputSize, decode_full_block, noDict, + (BYTE*)dest, NULL, 0); +} + +LZ4_FORCE_O2 +int LZ4_decompress_safe_partial(const char* src, char* dst, int compressedSize, int targetOutputSize, int dstCapacity) +{ + dstCapacity = MIN(targetOutputSize, dstCapacity); + return LZ4_decompress_generic(src, dst, compressedSize, dstCapacity, + endOnInputSize, partial_decode, + noDict, (BYTE*)dst, NULL, 0); +} + +LZ4_FORCE_O2 +int LZ4_decompress_fast(const char* source, char* dest, int originalSize) +{ + return LZ4_decompress_generic(source, dest, 0, originalSize, + endOnOutputSize, decode_full_block, withPrefix64k, + (BYTE*)dest - 64 KB, NULL, 0); +} + +/*===== Instantiate a few more decoding cases, used more than once. =====*/ + +LZ4_FORCE_O2 /* Exported, an obsolete API function. */ +int LZ4_decompress_safe_withPrefix64k(const char* source, char* dest, int compressedSize, int maxOutputSize) +{ + return LZ4_decompress_generic(source, dest, compressedSize, maxOutputSize, + endOnInputSize, decode_full_block, withPrefix64k, + (BYTE*)dest - 64 KB, NULL, 0); +} + +/* Another obsolete API function, paired with the previous one. */ +int LZ4_decompress_fast_withPrefix64k(const char* source, char* dest, int originalSize) +{ + /* LZ4_decompress_fast doesn't validate match offsets, + * and thus serves well with any prefixed dictionary. 
*/ + return LZ4_decompress_fast(source, dest, originalSize); +} + +LZ4_FORCE_O2 +static int LZ4_decompress_safe_withSmallPrefix(const char* source, char* dest, int compressedSize, int maxOutputSize, + size_t prefixSize) +{ + return LZ4_decompress_generic(source, dest, compressedSize, maxOutputSize, + endOnInputSize, decode_full_block, noDict, + (BYTE*)dest-prefixSize, NULL, 0); +} + +LZ4_FORCE_O2 +int LZ4_decompress_safe_forceExtDict(const char* source, char* dest, + int compressedSize, int maxOutputSize, + const void* dictStart, size_t dictSize) +{ + return LZ4_decompress_generic(source, dest, compressedSize, maxOutputSize, + endOnInputSize, decode_full_block, usingExtDict, + (BYTE*)dest, (const BYTE*)dictStart, dictSize); +} + +LZ4_FORCE_O2 +static int LZ4_decompress_fast_extDict(const char* source, char* dest, int originalSize, + const void* dictStart, size_t dictSize) +{ + return LZ4_decompress_generic(source, dest, 0, originalSize, + endOnOutputSize, decode_full_block, usingExtDict, + (BYTE*)dest, (const BYTE*)dictStart, dictSize); +} + +/* The "double dictionary" mode, for use with e.g. ring buffers: the first part + * of the dictionary is passed as prefix, and the second via dictStart + dictSize. + * These routines are used only once, in LZ4_decompress_*_continue(). 
+ */ +LZ4_FORCE_INLINE +int LZ4_decompress_safe_doubleDict(const char* source, char* dest, int compressedSize, int maxOutputSize, + size_t prefixSize, const void* dictStart, size_t dictSize) +{ + return LZ4_decompress_generic(source, dest, compressedSize, maxOutputSize, + endOnInputSize, decode_full_block, usingExtDict, + (BYTE*)dest-prefixSize, (const BYTE*)dictStart, dictSize); +} + +LZ4_FORCE_INLINE +int LZ4_decompress_fast_doubleDict(const char* source, char* dest, int originalSize, + size_t prefixSize, const void* dictStart, size_t dictSize) +{ + return LZ4_decompress_generic(source, dest, 0, originalSize, + endOnOutputSize, decode_full_block, usingExtDict, + (BYTE*)dest-prefixSize, (const BYTE*)dictStart, dictSize); +} + +/*===== streaming decompression functions =====*/ + +LZ4_streamDecode_t* LZ4_createStreamDecode(void) +{ + LZ4_streamDecode_t* lz4s = (LZ4_streamDecode_t*) ALLOC_AND_ZERO(sizeof(LZ4_streamDecode_t)); + LZ4_STATIC_ASSERT(LZ4_STREAMDECODESIZE >= sizeof(LZ4_streamDecode_t_internal)); /* A compilation error here means LZ4_STREAMDECODESIZE is not large enough */ + return lz4s; +} + +int LZ4_freeStreamDecode (LZ4_streamDecode_t* LZ4_stream) +{ + if (LZ4_stream == NULL) { return 0; } /* support free on NULL */ + FREEMEM(LZ4_stream); + return 0; +} + +/*! LZ4_setStreamDecode() : + * Use this function to instruct where to find the dictionary. + * This function is not necessary if previous data is still available where it was decoded. + * Loading a size of 0 is allowed (same effect as no dictionary). + * @return : 1 if OK, 0 if error + */ +int LZ4_setStreamDecode (LZ4_streamDecode_t* LZ4_streamDecode, const char* dictionary, int dictSize) +{ + LZ4_streamDecode_t_internal* lz4sd = &LZ4_streamDecode->internal_donotuse; + lz4sd->prefixSize = (size_t) dictSize; + lz4sd->prefixEnd = (const BYTE*) dictionary + dictSize; + lz4sd->externalDict = NULL; + lz4sd->extDictSize = 0; + return 1; +} + +/*! 
LZ4_decoderRingBufferSize() : + * when setting a ring buffer for streaming decompression (optional scenario), + * provides the minimum size of this ring buffer + * to be compatible with any source respecting maxBlockSize condition. + * Note : in a ring buffer scenario, + * blocks are presumed decompressed next to each other. + * When not enough space remains for next block (remainingSize < maxBlockSize), + * decoding resumes from beginning of ring buffer. + * @return : minimum ring buffer size, + * or 0 if there is an error (invalid maxBlockSize). + */ +int LZ4_decoderRingBufferSize(int maxBlockSize) +{ + if (maxBlockSize < 0) return 0; + if (maxBlockSize > LZ4_MAX_INPUT_SIZE) return 0; + if (maxBlockSize < 16) maxBlockSize = 16; + return LZ4_DECODER_RING_BUFFER_SIZE(maxBlockSize); +} + +/* +*_continue() : + These decoding functions allow decompression of multiple blocks in "streaming" mode. + Previously decoded blocks must still be available at the memory position where they were decoded. + If it's not possible, save the relevant part of decoded data into a safe buffer, + and indicate where it stands using LZ4_setStreamDecode() +*/ +LZ4_FORCE_O2 +int LZ4_decompress_safe_continue (LZ4_streamDecode_t* LZ4_streamDecode, const char* source, char* dest, int compressedSize, int maxOutputSize) +{ + LZ4_streamDecode_t_internal* lz4sd = &LZ4_streamDecode->internal_donotuse; + int result; + + if (lz4sd->prefixSize == 0) { + /* The first call, no dictionary yet. */ + assert(lz4sd->extDictSize == 0); + result = LZ4_decompress_safe(source, dest, compressedSize, maxOutputSize); + if (result <= 0) return result; + lz4sd->prefixSize = (size_t)result; + lz4sd->prefixEnd = (BYTE*)dest + result; + } else if (lz4sd->prefixEnd == (BYTE*)dest) { + /* They're rolling the current segment. 
*/ + if (lz4sd->prefixSize >= 64 KB - 1) + result = LZ4_decompress_safe_withPrefix64k(source, dest, compressedSize, maxOutputSize); + else if (lz4sd->extDictSize == 0) + result = LZ4_decompress_safe_withSmallPrefix(source, dest, compressedSize, maxOutputSize, + lz4sd->prefixSize); + else + result = LZ4_decompress_safe_doubleDict(source, dest, compressedSize, maxOutputSize, + lz4sd->prefixSize, lz4sd->externalDict, lz4sd->extDictSize); + if (result <= 0) return result; + lz4sd->prefixSize += (size_t)result; + lz4sd->prefixEnd += result; + } else { + /* The buffer wraps around, or they're switching to another buffer. */ + lz4sd->extDictSize = lz4sd->prefixSize; + lz4sd->externalDict = lz4sd->prefixEnd - lz4sd->extDictSize; + result = LZ4_decompress_safe_forceExtDict(source, dest, compressedSize, maxOutputSize, + lz4sd->externalDict, lz4sd->extDictSize); + if (result <= 0) return result; + lz4sd->prefixSize = (size_t)result; + lz4sd->prefixEnd = (BYTE*)dest + result; + } + + return result; +} + +LZ4_FORCE_O2 +int LZ4_decompress_fast_continue (LZ4_streamDecode_t* LZ4_streamDecode, const char* source, char* dest, int originalSize) +{ + LZ4_streamDecode_t_internal* lz4sd = &LZ4_streamDecode->internal_donotuse; + int result; + assert(originalSize >= 0); + + if (lz4sd->prefixSize == 0) { + assert(lz4sd->extDictSize == 0); + result = LZ4_decompress_fast(source, dest, originalSize); + if (result <= 0) return result; + lz4sd->prefixSize = (size_t)originalSize; + lz4sd->prefixEnd = (BYTE*)dest + originalSize; + } else if (lz4sd->prefixEnd == (BYTE*)dest) { + if (lz4sd->prefixSize >= 64 KB - 1 || lz4sd->extDictSize == 0) + result = LZ4_decompress_fast(source, dest, originalSize); + else + result = LZ4_decompress_fast_doubleDict(source, dest, originalSize, + lz4sd->prefixSize, lz4sd->externalDict, lz4sd->extDictSize); + if (result <= 0) return result; + lz4sd->prefixSize += (size_t)originalSize; + lz4sd->prefixEnd += originalSize; + } else { + lz4sd->extDictSize = 
lz4sd->prefixSize; + lz4sd->externalDict = lz4sd->prefixEnd - lz4sd->extDictSize; + result = LZ4_decompress_fast_extDict(source, dest, originalSize, + lz4sd->externalDict, lz4sd->extDictSize); + if (result <= 0) return result; + lz4sd->prefixSize = (size_t)originalSize; + lz4sd->prefixEnd = (BYTE*)dest + originalSize; + } + + return result; +} + + +/* +Advanced decoding functions : +*_usingDict() : + These decoding functions work the same as "_continue" ones, + the dictionary must be explicitly provided within parameters +*/ + +int LZ4_decompress_safe_usingDict(const char* source, char* dest, int compressedSize, int maxOutputSize, const char* dictStart, int dictSize) +{ + if (dictSize==0) + return LZ4_decompress_safe(source, dest, compressedSize, maxOutputSize); + if (dictStart+dictSize == dest) { + if (dictSize >= 64 KB - 1) { + return LZ4_decompress_safe_withPrefix64k(source, dest, compressedSize, maxOutputSize); + } + assert(dictSize >= 0); + return LZ4_decompress_safe_withSmallPrefix(source, dest, compressedSize, maxOutputSize, (size_t)dictSize); + } + assert(dictSize >= 0); + return LZ4_decompress_safe_forceExtDict(source, dest, compressedSize, maxOutputSize, dictStart, (size_t)dictSize); +} + +int LZ4_decompress_fast_usingDict(const char* source, char* dest, int originalSize, const char* dictStart, int dictSize) +{ + if (dictSize==0 || dictStart+dictSize == dest) + return LZ4_decompress_fast(source, dest, originalSize); + assert(dictSize >= 0); + return LZ4_decompress_fast_extDict(source, dest, originalSize, dictStart, (size_t)dictSize); +} + + +/*=************************************************* +* Obsolete Functions +***************************************************/ +/* obsolete compression functions */ +int LZ4_compress_limitedOutput(const char* source, char* dest, int inputSize, int maxOutputSize) +{ + return LZ4_compress_default(source, dest, inputSize, maxOutputSize); +} +int LZ4_compress(const char* src, char* dest, int srcSize) +{ + return 
LZ4_compress_default(src, dest, srcSize, LZ4_compressBound(srcSize)); +} +int LZ4_compress_limitedOutput_withState (void* state, const char* src, char* dst, int srcSize, int dstSize) +{ + return LZ4_compress_fast_extState(state, src, dst, srcSize, dstSize, 1); +} +int LZ4_compress_withState (void* state, const char* src, char* dst, int srcSize) +{ + return LZ4_compress_fast_extState(state, src, dst, srcSize, LZ4_compressBound(srcSize), 1); +} +int LZ4_compress_limitedOutput_continue (LZ4_stream_t* LZ4_stream, const char* src, char* dst, int srcSize, int dstCapacity) +{ + return LZ4_compress_fast_continue(LZ4_stream, src, dst, srcSize, dstCapacity, 1); +} +int LZ4_compress_continue (LZ4_stream_t* LZ4_stream, const char* source, char* dest, int inputSize) +{ + return LZ4_compress_fast_continue(LZ4_stream, source, dest, inputSize, LZ4_compressBound(inputSize), 1); +} + +/* +These decompression functions are deprecated and should no longer be used. +They are only provided here for compatibility with older user programs. 
+- LZ4_uncompress is totally equivalent to LZ4_decompress_fast +- LZ4_uncompress_unknownOutputSize is totally equivalent to LZ4_decompress_safe +*/ +int LZ4_uncompress (const char* source, char* dest, int outputSize) +{ + return LZ4_decompress_fast(source, dest, outputSize); +} +int LZ4_uncompress_unknownOutputSize (const char* source, char* dest, int isize, int maxOutputSize) +{ + return LZ4_decompress_safe(source, dest, isize, maxOutputSize); +} + +/* Obsolete Streaming functions */ + +int LZ4_sizeofStreamState(void) { return LZ4_STREAMSIZE; } + +int LZ4_resetStreamState(void* state, char* inputBuffer) +{ + (void)inputBuffer; + LZ4_resetStream((LZ4_stream_t*)state); + return 0; +} + +void* LZ4_create (char* inputBuffer) +{ + (void)inputBuffer; + return LZ4_createStream(); +} + +char* LZ4_slideInputBuffer (void* state) +{ + /* avoid const char * -> char * conversion warning */ + return (char *)(uptrval)((LZ4_stream_t*)state)->internal_donotuse.dictionary; +} + +#endif /* LZ4_COMMONDEFS_ONLY */ diff --git a/libbutl/lz4.cxx b/libbutl/lz4.cxx new file mode 100644 index 0000000..2db7af2 --- /dev/null +++ b/libbutl/lz4.cxx @@ -0,0 +1,555 @@ +// file : libbutl/lz4.cxx -*- C++ -*- +// license : MIT; see accompanying LICENSE file + +#include <libbutl/lz4.hxx> + +// This careful macro dance makes sure that all the LZ4 C API functions are +// made static while making sure we include the headers in the same way as the +// implementation files that we include below. 
+// +#define LZ4LIB_VISIBILITY static +#define LZ4_STATIC_LINKING_ONLY +#define LZ4_PUBLISH_STATIC_FUNCTIONS +#define LZ4_DISABLE_DEPRECATE_WARNINGS +#include "lz4.h" +#include "lz4hc.h" + +#define LZ4FLIB_VISIBILITY static +#define LZ4F_STATIC_LINKING_ONLY +#define LZ4F_PUBLISH_STATIC_FUNCTIONS +#define LZ4F_DISABLE_DEPRECATE_WARNINGS +#include "lz4frame.h" + +#include <new> // bad_alloc +#include <memory> // unique_ptr +#include <cstring> // memcpy() +#include <cassert> +#include <stdexcept> // invalid_argument, logic_error + +#include <libbutl/utility.hxx> // eos() + +#if 0 +#include <libbutl/lz4-stream.hxx> +#endif + +using namespace std; + +namespace butl +{ + namespace lz4 + { + static inline size_t + block_size (LZ4F_blockSizeID_t id) + { + return (id == LZ4F_max4MB ? 4 * 1024 * 1024 : + id == LZ4F_max1MB ? 1 * 1024 * 1024 : + id == LZ4F_max256KB ? 256 * 1024 : + id == LZ4F_max64KB ? 64 * 1024 : 0); + } + + [[noreturn]] static void + throw_exception (LZ4F_errorCodes c) + { + using i = invalid_argument; + + switch (c) + { + case LZ4F_ERROR_GENERIC: throw i ("generic LZ4 error"); + case LZ4F_ERROR_maxBlockSize_invalid: throw i ("invalid LZ4 block size"); + case LZ4F_ERROR_blockMode_invalid: throw i ("invalid LZ4 block mode"); + case LZ4F_ERROR_contentChecksumFlag_invalid: throw i ("invalid LZ4 content checksum flag"); + case LZ4F_ERROR_compressionLevel_invalid: throw i ("invalid LZ4 compression level"); + case LZ4F_ERROR_headerVersion_wrong: throw i ("wrong LZ4 header version"); + case LZ4F_ERROR_blockChecksum_invalid: throw i ("invalid LZ4 block checksum"); + case LZ4F_ERROR_reservedFlag_set: throw i ("reserved LZ4 flag set"); + case LZ4F_ERROR_srcSize_tooLarge: throw i ("LZ4 input too large"); + case LZ4F_ERROR_dstMaxSize_tooSmall: throw i ("LZ4 output too small"); + case LZ4F_ERROR_frameHeader_incomplete: throw i ("incomplete LZ4 frame header"); + case LZ4F_ERROR_frameType_unknown: throw i ("unknown LZ4 frame type"); + case LZ4F_ERROR_frameSize_wrong: throw 
i ("wrong LZ4 frame size"); + case LZ4F_ERROR_decompressionFailed: throw i ("invalid LZ4 compressed content"); + case LZ4F_ERROR_headerChecksum_invalid: throw i ("invalid LZ4 header checksum"); + case LZ4F_ERROR_contentChecksum_invalid: throw i ("invalid LZ4 content checksum"); + + case LZ4F_ERROR_allocation_failed: throw bad_alloc (); + + // These seem to be programming errors. + // + case LZ4F_ERROR_srcPtr_wrong: // NULL pointer. + case LZ4F_ERROR_frameDecoding_alreadyStarted: // Incorrect call seq. + + // We should never get these. + // + case LZ4F_OK_NoError: + case LZ4F_ERROR_maxCode: + case _LZ4F_dummy_error_enum_for_c89_never_used: + break; + } + + assert (false); + throw logic_error (LZ4F_getErrorName ((LZ4F_errorCode_t)(-c))); + } + + // As above but for erroneous LZ4F_*() function result. + // + [[noreturn]] static inline void + throw_exception (size_t r) + { + throw_exception (LZ4F_getErrorCode (r)); + } + + // compression + // + + compressor:: + ~compressor () + { + if (LZ4F_cctx* ctx = static_cast<LZ4F_cctx*> (ctx_)) + { + LZ4F_errorCode_t e (LZ4F_freeCompressionContext (ctx)); + assert (!LZ4F_isError (e)); + } + } + + inline void compressor:: + init_preferences (void* vp) const + { + LZ4F_preferences_t* p (static_cast<LZ4F_preferences_t*> (vp)); + + p->autoFlush = 1; + p->favorDecSpeed = 0; + p->compressionLevel = level_; + p->frameInfo.blockMode = LZ4F_blockLinked; + p->frameInfo.blockSizeID = static_cast<LZ4F_blockSizeID_t> (block_id_); + p->frameInfo.blockChecksumFlag = LZ4F_noBlockChecksum; + p->frameInfo.contentChecksumFlag = LZ4F_contentChecksumEnabled; + p->frameInfo.contentSize = content_size_ + ? 
static_cast<unsigned long long> (*content_size_) + : 0; + } + + void compressor:: + begin (int level, + int block_id, + optional<uint64_t> content_size) + { + assert (block_id >= 4 && block_id <= 7); + + level_ = level; + block_id_ = block_id; + content_size_ = content_size; + + LZ4F_preferences_t prefs = LZ4F_INIT_PREFERENCES; + init_preferences (&prefs); + + // Input/output buffer capacities. + // + // To be binary compatible with the lz4 utility we have to compress + // files that fit into the block with a single *_compressFrame() call + // instead of *_compressBegin()/*_compressUpdate(). And to determine the + // output buffer capacity we must use *_compressFrameBound() instead of + // *_compressBound(). The problem is, at this stage (before filling the + // input buffer), we don't know which case it will be. + // + // However, in our case (autoFlush=1), *Bound() < *FrameBound() and so + // we can always use the latter at the cost of slight overhead. Also, + // using *FrameBound() allows us to call *Begin() and *Update() without + // flushing the buffer in between (this insight is based on studying the + // implementation of the *Bound() functions). + // + // Actually, we can use content_size (we can get away with much smaller + // buffers for small inputs). We just need to verify the caller is not + // lying to us (failing that, we may end up with a strange error like + // insufficient output buffer space). + // + ic = block_size (prefs.frameInfo.blockSizeID); + + if (content_size_ && *content_size_ < ic) + { + // This is nuanced: we need to add an extra byte in order to detect + // EOF. + // + ic = static_cast<size_t> (*content_size_) + 1; + } + + oc = LZ4F_compressFrameBound (ic, &prefs); + + begin_ = true; + } + + void compressor:: + next (bool end) + { + LZ4F_cctx* ctx; + + // Unlike the decompression case below, compression cannot fail due to + // invalid content. 
So any LZ4F_*() function failure is either due to a + // programming bug or argument inconsistencies (e.g., content size does + // not match actual). + + if (begin_) + { + begin_ = false; + + LZ4F_preferences_t prefs = LZ4F_INIT_PREFERENCES; + init_preferences (&prefs); + + // If we've allocated smaller buffers based on content_size_, then + // verify the input size matches what's promised. + // + // Note also that LZ4F_compressFrame() does not fail if it doesn't + // match instead replacing it with the actual value. + // + size_t bs (block_size (prefs.frameInfo.blockSizeID)); + if (content_size_ && *content_size_ < bs) + { + if (!end || in != *content_size_) + throw_exception (LZ4F_ERROR_frameSize_wrong); + } + + // Must be < for lz4 compatibility (see EOF nuance above for the + // likely reason). + // + if (end && in < bs) + { + on = LZ4F_compressFrame (ob, oc, ib, in, &prefs); + if (LZ4F_isError (on)) + throw_exception (on); + + in = 0; // All consumed. + return; + } + else + { + if (LZ4F_isError (LZ4F_createCompressionContext (&ctx, LZ4F_VERSION))) + throw bad_alloc (); + + ctx_ = ctx; + + // Write the header. + // + on = LZ4F_compressBegin (ctx, ob, oc, &prefs); + if (LZ4F_isError (on)) + throw_exception (on); + + // Fall through. + } + } + else + { + ctx = static_cast<LZ4F_cctx*> (ctx_); + on = 0; + } + + size_t n; + + if (in != 0) + { + n = LZ4F_compressUpdate (ctx, ob + on, oc - on, ib, in, nullptr); + if (LZ4F_isError (n)) + throw_exception (n); + + in = 0; // All consumed. + on += n; + } + + // Write the end marker. + // + if (end) + { + // Note that this call also verifies specified and actual content + // sizes match. 
+ // + n = LZ4F_compressEnd (ctx, ob + on, oc - on, nullptr); + if (LZ4F_isError (n)) + throw_exception (n); + + on += n; + } + } + + uint64_t + compress (ofdstream& os, ifdstream& is, + int level, + int block_id, + optional<uint64_t> content_size) + { +#if 0 + char buf[1024 * 3 + 7]; + ostream cos (os, level, block_id, content_size); + + for (bool e (false); !e; ) + { + e = eof (is.read (buf, sizeof (buf))); + cos.write (buf, is.gcount ()); + //for (streamsize i (0), n (is.gcount ()); i != n; ++i) + // cos.put (buf[i]); + } + + cos.close (); + return content_size ? *content_size : 0; +#else + compressor c; + + // Input/output buffer guards. + // + unique_ptr<char[]> ibg; + unique_ptr<char[]> obg; + + // First determine required buffer capacities. + // + c.begin (level, block_id, content_size); + + ibg.reset ((c.ib = new char[c.ic])); + obg.reset ((c.ob = new char[c.oc])); + + // Read into the input buffer updating the eof flag. + // + // Note that we could try to do direct fd read/write but that would + // complicate things quite a bit (error handling, stream state, etc). + // + bool eof (false); + auto read = [&is, &c, &eof] () + { + eof = butl::eof (is.read (c.ib, c.ic)); + c.in = static_cast<size_t> (is.gcount ()); + }; + + // Write from the output buffer updating the total written. + // + uint64_t ot (0); + auto write = [&os, &c, &ot] () + { + os.write (c.ob, static_cast<streamsize> (c.on)); + ot += c.on; + }; + + // Keep reading, compressing, and writing chunks of content. + // + while (!eof) + { + read (); + + c.next (eof); + + if (c.on != 0) // next() may just buffer the data. 
+ write (); + } + + return ot; +#endif + } + + // decompression + // + + static_assert (sizeof (decompressor::hb) == LZ4F_HEADER_SIZE_MAX, + "LZ4 header size mismatch"); + + decompressor:: + ~decompressor () + { + if (LZ4F_dctx* ctx = static_cast<LZ4F_dctx*> (ctx_)) + { + LZ4F_errorCode_t e (LZ4F_freeDecompressionContext (ctx)); + assert (!LZ4F_isError (e)); + } + } + + size_t decompressor:: + begin (optional<uint64_t>* content_size) + { + LZ4F_dctx* ctx; + + if (LZ4F_isError (LZ4F_createDecompressionContext (&ctx, LZ4F_VERSION))) + throw bad_alloc (); + + ctx_ = ctx; + + LZ4F_frameInfo_t info = LZ4F_INIT_FRAMEINFO; + + // Input hint and end as signalled by the LZ4F_*() functions. + // + size_t h, e; + + h = LZ4F_getFrameInfo (ctx, &info, hb, &(e = hn)); + if (LZ4F_isError (h)) + throw_exception (h); + + if (content_size != nullptr) + { + if (info.contentSize != 0) + *content_size = static_cast<uint64_t> (info.contentSize); + else + *content_size = nullopt; + } + + // Use the block size for the output buffer capacity and compressed + // bound plus the header size for the input. The expectation is that + // LZ4F_decompress() should never hint for more than that. + // + oc = block_size (info.blockSizeID); + ic = LZ4F_compressBound (oc, nullptr) + LZ4F_BLOCK_HEADER_SIZE; + + assert (h <= ic); + + // Move over whatever is left in the header buffer to the beginning. + // + hn -= e; + memmove (hb, hb + e, hn); + + return h; + } + + size_t decompressor:: + next () + { + LZ4F_dctx* ctx (static_cast<LZ4F_dctx*> (ctx_)); + + size_t h, e; + + // Note that LZ4F_decompress() verifies specified and actual content + // sizes match (similar to compression). + // + h = LZ4F_decompress (ctx, ob, &(on = oc), ib, &(e = in), nullptr); + if (LZ4F_isError (h)) + throw_exception (h); + + // We expect LZ4F_decompress() to consume what it asked for. + // + assert (e == in && h <= ic); + in = 0; // All consumed. 
+ + return h; + } + + uint64_t + decompress (ofdstream& os, ifdstream& is) + { + // Write the specified number of bytes from the output buffer updating + // the total written. + // + uint64_t ot (0); + auto write = [&os, &ot] (char* b, size_t n) + { + os.write (b, static_cast<streamsize> (n)); + ot += n; + }; + +#if 0 + char buf[1024 * 3 + 7]; + istream dis (is, true, istream::badbit); + + for (bool e (false); !e; ) + { + e = eof (dis.read (buf, sizeof (buf))); + write (buf, static_cast<size_t> (dis.gcount ())); + } +#else + // Read into the specified buffer returning the number of bytes read and + // updating the eof flag. + // + bool eof (false); + auto read = [&is, &eof] (char* b, size_t c) -> size_t + { + size_t n (0); + do + { + eof = butl::eof (is.read (b + n, c - n)); + n += static_cast<size_t> (is.gcount ()); + } + while (!eof && n != c); + + return n; + }; + + decompressor d; + + // Input/output buffer guards. + // + unique_ptr<char[]> ibg; + unique_ptr<char[]> obg; + + size_t h; // Input hint. + + // First read in the header and allocate the buffers. + // + // What if we hit EOF here? And could begin() return 0? Turns out the + // answer to both questions is yes: 0-byte content compresses to 15 + // bytes (with or without content size; 1-byte -- to 20/28 bytes). We + // can ignore EOF here since an attempt to read more will result in + // another EOF. And code below is prepared to handle 0 initial hint. + // + // @@ We could end up leaving some of the input content from the + // header in the input buffer which the caller will have no way + // of using/detecting. + // + d.hn = read (d.hb, sizeof (d.hb)); + h = d.begin (); + + ibg.reset ((d.ib = new char[d.ic])); + obg.reset ((d.ob = new char[d.oc])); + + // Copy over whatever is left in the header buffer and read up to + // the hinted size. 
+ // + memcpy (d.ib, d.hb, (d.in = d.hn)); + + if (h > d.in) + d.in += read (d.ib + d.in, h - d.in); + + // Keep decompressing, writing, and reading chunks of compressed + // content. + // + while (h != 0) + { + h = d.next (); + + if (d.on != 0) // next() may just buffer the data. + write (d.ob, d.on); + + if (h != 0) + { + if (eof) + throw invalid_argument ("incomplete LZ4 compressed content"); + + d.in = read (d.ib, h); + } + } +#endif + + return ot; + } + } +} + +// Include the implementation into our translation unit. Let's keep it last +// since the implementation defines a bunch of macros. +// +#if defined(__clang__) || defined(__GNUC__) +# pragma GCC diagnostic ignored "-Wunused-function" +#endif + +// This header is only included in the implementation so we can include it +// here instead of the above. +// +#define XXH_PRIVATE_API // Makes API static and includes xxhash.c. +#include "xxhash.h" + +// Clang targeting MSVC prior to version 10 has difficulty with _tzcnt_u64() +// (see Clang bug 47099 for a potentially related issue). Including relevant +// headers (<immintrin.h>, <intrin.h>) does not appear to help. So for now we +// just disable the use of _tzcnt_u64(). +// +#if defined(_MSC_VER) && defined(__clang__) && __clang_major__ < 10 +# define LZ4_FORCE_SW_BITCOUNT +#endif + +// Note that the order of inclusion is important (see *_SRC_INCLUDED macros). +// +extern "C" +{ +#include "lz4.c" +#include "lz4hc.c" +#include "lz4frame.c" +} diff --git a/libbutl/lz4.h b/libbutl/lz4.h new file mode 100644 index 0000000..7ab1e48 --- /dev/null +++ b/libbutl/lz4.h @@ -0,0 +1,774 @@ +/* + * LZ4 - Fast LZ compression algorithm + * Header File + * Copyright (C) 2011-present, Yann Collet. 
+ + BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following disclaimer + in the documentation and/or other materials provided with the + distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + You can contact the author at : + - LZ4 homepage : http://www.lz4.org + - LZ4 source repository : https://github.com/lz4/lz4 +*/ +#if defined (__cplusplus) +extern "C" { +#endif + +#ifndef LZ4_H_2983827168210 +#define LZ4_H_2983827168210 + +/* --- Dependency --- */ +#include <stddef.h> /* size_t */ + + +/** + Introduction + + LZ4 is lossless compression algorithm, providing compression speed >500 MB/s per core, + scalable with multi-cores CPU. It features an extremely fast decoder, with speed in + multiple GB/s per core, typically reaching RAM speed limits on multi-core systems. 
+ + The LZ4 compression library provides in-memory compression and decompression functions. + It gives full buffer control to the user. + Compression can be done in: + - a single step (described as Simple Functions) + - a single step, reusing a context (described in Advanced Functions) + - unbounded multiple steps (described as Streaming compression) + + lz4.h generates and decodes LZ4-compressed blocks (doc/lz4_Block_format.md). + Decompressing such a compressed block requires additional metadata. + Exact metadata depends on exact decompression function. + For the typical case of LZ4_decompress_safe(), + metadata includes block's compressed size, and maximum bound of decompressed size. + Each application is free to encode and pass such metadata in whichever way it wants. + + lz4.h only handles blocks; it cannot generate Frames. + + Blocks are different from Frames (doc/lz4_Frame_format.md). + Frames bundle both blocks and metadata in a specified manner. + Embedding metadata is required for compressed data to be self-contained and portable. + Frame format is delivered through a companion API, declared in lz4frame.h. + The `lz4` CLI can only manage frames. +*/ + +/*^*************************************************************** +* Export parameters +*****************************************************************/ +/* +* LZ4_DLL_EXPORT : +* Enable exporting of functions when building a Windows DLL +* LZ4LIB_VISIBILITY : +* Control library symbols visibility. 
+*/ +#ifndef LZ4LIB_VISIBILITY +# if defined(__GNUC__) && (__GNUC__ >= 4) +# define LZ4LIB_VISIBILITY __attribute__ ((visibility ("default"))) +# else +# define LZ4LIB_VISIBILITY +# endif +#endif +#if defined(LZ4_DLL_EXPORT) && (LZ4_DLL_EXPORT==1) +# define LZ4LIB_API __declspec(dllexport) LZ4LIB_VISIBILITY +#elif defined(LZ4_DLL_IMPORT) && (LZ4_DLL_IMPORT==1) +# define LZ4LIB_API __declspec(dllimport) LZ4LIB_VISIBILITY /* It isn't required but allows to generate better code, saving a function pointer load from the IAT and an indirect jump.*/ +#else +# define LZ4LIB_API LZ4LIB_VISIBILITY +#endif + +/*------ Version ------*/ +#define LZ4_VERSION_MAJOR 1 /* for breaking interface changes */ +#define LZ4_VERSION_MINOR 9 /* for new (non-breaking) interface capabilities */ +#define LZ4_VERSION_RELEASE 3 /* for tweaks, bug-fixes, or development */ + +#define LZ4_VERSION_NUMBER (LZ4_VERSION_MAJOR *100*100 + LZ4_VERSION_MINOR *100 + LZ4_VERSION_RELEASE) + +#define LZ4_LIB_VERSION LZ4_VERSION_MAJOR.LZ4_VERSION_MINOR.LZ4_VERSION_RELEASE +#define LZ4_QUOTE(str) #str +#define LZ4_EXPAND_AND_QUOTE(str) LZ4_QUOTE(str) +#define LZ4_VERSION_STRING LZ4_EXPAND_AND_QUOTE(LZ4_LIB_VERSION) + +LZ4LIB_API int LZ4_versionNumber (void); /**< library version number; useful to check dll version */ +LZ4LIB_API const char* LZ4_versionString (void); /**< library version string; useful to check dll version */ + + +/*-************************************ +* Tuning parameter +**************************************/ +/*! + * LZ4_MEMORY_USAGE : + * Memory usage formula : N->2^N Bytes (examples : 10 -> 1KB; 12 -> 4KB ; 16 -> 64KB; 20 -> 1MB; etc.) + * Increasing memory usage improves compression ratio. + * Reduced memory usage may improve speed, thanks to better cache locality. 
+ * Default value is 14, for 16KB, which nicely fits into Intel x86 L1 cache + */ +#ifndef LZ4_MEMORY_USAGE +# define LZ4_MEMORY_USAGE 14 +#endif + + +/*-************************************ +* Simple Functions +**************************************/ +/*! LZ4_compress_default() : + * Compresses 'srcSize' bytes from buffer 'src' + * into already allocated 'dst' buffer of size 'dstCapacity'. + * Compression is guaranteed to succeed if 'dstCapacity' >= LZ4_compressBound(srcSize). + * It also runs faster, so it's a recommended setting. + * If the function cannot compress 'src' into a more limited 'dst' budget, + * compression stops *immediately*, and the function result is zero. + * In which case, 'dst' content is undefined (invalid). + * srcSize : max supported value is LZ4_MAX_INPUT_SIZE. + * dstCapacity : size of buffer 'dst' (which must be already allocated) + * @return : the number of bytes written into buffer 'dst' (necessarily <= dstCapacity) + * or 0 if compression fails + * Note : This function is protected against buffer overflow scenarios (never writes outside 'dst' buffer, nor read outside 'source' buffer). + */ +LZ4LIB_API int LZ4_compress_default(const char* src, char* dst, int srcSize, int dstCapacity); + +/*! LZ4_decompress_safe() : + * compressedSize : is the exact complete size of the compressed block. + * dstCapacity : is the size of destination buffer (which must be already allocated), presumed an upper bound of decompressed size. + * @return : the number of bytes decompressed into destination buffer (necessarily <= dstCapacity) + * If destination buffer is not large enough, decoding will stop and output an error code (negative value). + * If the source stream is detected malformed, the function will stop decoding and return a negative result. 
+ * Note 1 : This function is protected against malicious data packets : + * it will never writes outside 'dst' buffer, nor read outside 'source' buffer, + * even if the compressed block is maliciously modified to order the decoder to do these actions. + * In such case, the decoder stops immediately, and considers the compressed block malformed. + * Note 2 : compressedSize and dstCapacity must be provided to the function, the compressed block does not contain them. + * The implementation is free to send / store / derive this information in whichever way is most beneficial. + * If there is a need for a different format which bundles together both compressed data and its metadata, consider looking at lz4frame.h instead. + */ +LZ4LIB_API int LZ4_decompress_safe (const char* src, char* dst, int compressedSize, int dstCapacity); + + +/*-************************************ +* Advanced Functions +**************************************/ +#define LZ4_MAX_INPUT_SIZE 0x7E000000 /* 2 113 929 216 bytes */ +#define LZ4_COMPRESSBOUND(isize) ((unsigned)(isize) > (unsigned)LZ4_MAX_INPUT_SIZE ? 0 : (isize) + ((isize)/255) + 16) + +/*! LZ4_compressBound() : + Provides the maximum size that LZ4 compression may output in a "worst case" scenario (input data not compressible) + This function is primarily useful for memory allocation purposes (destination buffer size). + Macro LZ4_COMPRESSBOUND() is also provided for compilation-time evaluation (stack memory allocation for example). + Note that LZ4_compress_default() compresses faster when dstCapacity is >= LZ4_compressBound(srcSize) + inputSize : max supported value is LZ4_MAX_INPUT_SIZE + return : maximum output size in a "worst case" scenario + or 0, if input size is incorrect (too large or negative) +*/ +LZ4LIB_API int LZ4_compressBound(int inputSize); + +/*! LZ4_compress_fast() : + Same as LZ4_compress_default(), but allows selection of "acceleration" factor. 
+ The larger the acceleration value, the faster the algorithm, but also the lesser the compression. + It's a trade-off. It can be fine tuned, with each successive value providing roughly +~3% to speed. + An acceleration value of "1" is the same as regular LZ4_compress_default() + Values <= 0 will be replaced by LZ4_ACCELERATION_DEFAULT (currently == 1, see lz4.c). + Values > LZ4_ACCELERATION_MAX will be replaced by LZ4_ACCELERATION_MAX (currently == 65537, see lz4.c). +*/ +LZ4LIB_API int LZ4_compress_fast (const char* src, char* dst, int srcSize, int dstCapacity, int acceleration); + + +/*! LZ4_compress_fast_extState() : + * Same as LZ4_compress_fast(), using an externally allocated memory space for its state. + * Use LZ4_sizeofState() to know how much memory must be allocated, + * and allocate it on 8-bytes boundaries (using `malloc()` typically). + * Then, provide this buffer as `void* state` to compression function. + */ +LZ4LIB_API int LZ4_sizeofState(void); +LZ4LIB_API int LZ4_compress_fast_extState (void* state, const char* src, char* dst, int srcSize, int dstCapacity, int acceleration); + + +/*! LZ4_compress_destSize() : + * Reverse the logic : compresses as much data as possible from 'src' buffer + * into already allocated buffer 'dst', of size >= 'targetDestSize'. + * This function either compresses the entire 'src' content into 'dst' if it's large enough, + * or fill 'dst' buffer completely with as much data as possible from 'src'. + * note: acceleration parameter is fixed to "default". + * + * *srcSizePtr : will be modified to indicate how many bytes where read from 'src' to fill 'dst'. + * New value is necessarily <= input value. + * @return : Nb bytes written into 'dst' (necessarily <= targetDestSize) + * or 0 if compression fails. 
+ * + * Note : from v1.8.2 to v1.9.1, this function had a bug (fixed in v1.9.2+): + * the produced compressed content could, in specific circumstances, + * require to be decompressed into a destination buffer larger + * by at least 1 byte than the content to decompress. + * If an application uses `LZ4_compress_destSize()`, + * it's highly recommended to update liblz4 to v1.9.2 or better. + * If this can't be done or ensured, + * the receiving decompression function should provide + * a dstCapacity which is > decompressedSize, by at least 1 byte. + * See https://github.com/lz4/lz4/issues/859 for details + */ +LZ4LIB_API int LZ4_compress_destSize (const char* src, char* dst, int* srcSizePtr, int targetDstSize); + + +/*! LZ4_decompress_safe_partial() : + * Decompress an LZ4 compressed block, of size 'srcSize' at position 'src', + * into destination buffer 'dst' of size 'dstCapacity'. + * Up to 'targetOutputSize' bytes will be decoded. + * The function stops decoding on reaching this objective. + * This can be useful to boost performance + * whenever only the beginning of a block is required. + * + * @return : the number of bytes decoded in `dst` (necessarily <= targetOutputSize) + * If source stream is detected malformed, function returns a negative result. + * + * Note 1 : @return can be < targetOutputSize, if compressed block contains less data. + * + * Note 2 : targetOutputSize must be <= dstCapacity + * + * Note 3 : this function effectively stops decoding on reaching targetOutputSize, + * so dstCapacity is kind of redundant. + * This is because in older versions of this function, + * decoding operation would still write complete sequences. + * Therefore, there was no guarantee that it would stop writing at exactly targetOutputSize, + * it could write more bytes, though only up to dstCapacity. + * Some "margin" used to be required for this operation to work properly. + * Thankfully, this is no longer necessary. 
+ * The function nonetheless keeps the same signature, in an effort to preserve API compatibility. + * + * Note 4 : If srcSize is the exact size of the block, + * then targetOutputSize can be any value, + * including larger than the block's decompressed size. + * The function will, at most, generate block's decompressed size. + * + * Note 5 : If srcSize is _larger_ than block's compressed size, + * then targetOutputSize **MUST** be <= block's decompressed size. + * Otherwise, *silent corruption will occur*. + */ +LZ4LIB_API int LZ4_decompress_safe_partial (const char* src, char* dst, int srcSize, int targetOutputSize, int dstCapacity); + + +/*-********************************************* +* Streaming Compression Functions +***********************************************/ +typedef union LZ4_stream_u LZ4_stream_t; /* incomplete type (defined later) */ + +LZ4LIB_API LZ4_stream_t* LZ4_createStream(void); +LZ4LIB_API int LZ4_freeStream (LZ4_stream_t* streamPtr); + +/*! LZ4_resetStream_fast() : v1.9.0+ + * Use this to prepare an LZ4_stream_t for a new chain of dependent blocks + * (e.g., LZ4_compress_fast_continue()). + * + * An LZ4_stream_t must be initialized once before usage. + * This is automatically done when created by LZ4_createStream(). + * However, should the LZ4_stream_t be simply declared on stack (for example), + * it's necessary to initialize it first, using LZ4_initStream(). + * + * After init, start any new stream with LZ4_resetStream_fast(). + * A same LZ4_stream_t can be re-used multiple times consecutively + * and compress multiple streams, + * provided that it starts each new stream with LZ4_resetStream_fast(). + * + * LZ4_resetStream_fast() is much faster than LZ4_initStream(), + * but is not compatible with memory regions containing garbage data. + * + * Note: it's only useful to call LZ4_resetStream_fast() + * in the context of streaming compression. + * The *extState* functions perform their own resets. 
+ * Invoking LZ4_resetStream_fast() before is redundant, and even counterproductive. + */ +LZ4LIB_API void LZ4_resetStream_fast (LZ4_stream_t* streamPtr); + +/*! LZ4_loadDict() : + * Use this function to reference a static dictionary into LZ4_stream_t. + * The dictionary must remain available during compression. + * LZ4_loadDict() triggers a reset, so any previous data will be forgotten. + * The same dictionary will have to be loaded on decompression side for successful decoding. + * Dictionary are useful for better compression of small data (KB range). + * While LZ4 accept any input as dictionary, + * results are generally better when using Zstandard's Dictionary Builder. + * Loading a size of 0 is allowed, and is the same as reset. + * @return : loaded dictionary size, in bytes (necessarily <= 64 KB) + */ +LZ4LIB_API int LZ4_loadDict (LZ4_stream_t* streamPtr, const char* dictionary, int dictSize); + +/*! LZ4_compress_fast_continue() : + * Compress 'src' content using data from previously compressed blocks, for better compression ratio. + * 'dst' buffer must be already allocated. + * If dstCapacity >= LZ4_compressBound(srcSize), compression is guaranteed to succeed, and runs faster. + * + * @return : size of compressed block + * or 0 if there is an error (typically, cannot fit into 'dst'). + * + * Note 1 : Each invocation to LZ4_compress_fast_continue() generates a new block. + * Each block has precise boundaries. + * Each block must be decompressed separately, calling LZ4_decompress_*() with relevant metadata. + * It's not possible to append blocks together and expect a single invocation of LZ4_decompress_*() to decompress them together. + * + * Note 2 : The previous 64KB of source data is __assumed__ to remain present, unmodified, at same address in memory ! + * + * Note 3 : When input is structured as a double-buffer, each buffer can have any size, including < 64 KB. + * Make sure that buffers are separated, by at least one byte. 
+ * This construction ensures that each block only depends on previous block. + * + * Note 4 : If input buffer is a ring-buffer, it can have any size, including < 64 KB. + * + * Note 5 : After an error, the stream status is undefined (invalid), it can only be reset or freed. + */ +LZ4LIB_API int LZ4_compress_fast_continue (LZ4_stream_t* streamPtr, const char* src, char* dst, int srcSize, int dstCapacity, int acceleration); + +/*! LZ4_saveDict() : + * If last 64KB data cannot be guaranteed to remain available at its current memory location, + * save it into a safer place (char* safeBuffer). + * This is schematically equivalent to a memcpy() followed by LZ4_loadDict(), + * but is much faster, because LZ4_saveDict() doesn't need to rebuild tables. + * @return : saved dictionary size in bytes (necessarily <= maxDictSize), or 0 if error. + */ +LZ4LIB_API int LZ4_saveDict (LZ4_stream_t* streamPtr, char* safeBuffer, int maxDictSize); + + +/*-********************************************** +* Streaming Decompression Functions +* Bufferless synchronous API +************************************************/ +typedef union LZ4_streamDecode_u LZ4_streamDecode_t; /* tracking context */ + +/*! LZ4_createStreamDecode() and LZ4_freeStreamDecode() : + * creation / destruction of streaming decompression tracking context. + * A tracking context can be re-used multiple times. + */ +LZ4LIB_API LZ4_streamDecode_t* LZ4_createStreamDecode(void); +LZ4LIB_API int LZ4_freeStreamDecode (LZ4_streamDecode_t* LZ4_stream); + +/*! LZ4_setStreamDecode() : + * An LZ4_streamDecode_t context can be allocated once and re-used multiple times. + * Use this function to start decompression of a new stream of blocks. + * A dictionary can optionally be set. Use NULL or size 0 for a reset order. + * Dictionary is presumed stable : it must remain accessible and unmodified during next decompression. 
+ * @return : 1 if OK, 0 if error + */ +LZ4LIB_API int LZ4_setStreamDecode (LZ4_streamDecode_t* LZ4_streamDecode, const char* dictionary, int dictSize); + +/*! LZ4_decoderRingBufferSize() : v1.8.2+ + * Note : in a ring buffer scenario (optional), + * blocks are presumed decompressed next to each other + * up to the moment there is not enough remaining space for next block (remainingSize < maxBlockSize), + * at which stage it resumes from beginning of ring buffer. + * When setting such a ring buffer for streaming decompression, + * provides the minimum size of this ring buffer + * to be compatible with any source respecting maxBlockSize condition. + * @return : minimum ring buffer size, + * or 0 if there is an error (invalid maxBlockSize). + */ +LZ4LIB_API int LZ4_decoderRingBufferSize(int maxBlockSize); +#define LZ4_DECODER_RING_BUFFER_SIZE(maxBlockSize) (65536 + 14 + (maxBlockSize)) /* for static allocation; maxBlockSize presumed valid */ + +/*! LZ4_decompress_*_continue() : + * These decoding functions allow decompression of consecutive blocks in "streaming" mode. + * A block is an unsplittable entity, it must be presented entirely to a decompression function. + * Decompression functions only accepts one block at a time. + * The last 64KB of previously decoded data *must* remain available and unmodified at the memory position where they were decoded. + * If less than 64KB of data has been decoded, all the data must be present. + * + * Special : if decompression side sets a ring buffer, it must respect one of the following conditions : + * - Decompression buffer size is _at least_ LZ4_decoderRingBufferSize(maxBlockSize). + * maxBlockSize is the maximum size of any single block. It can have any value > 16 bytes. + * In which case, encoding and decoding buffers do not need to be synchronized. + * Actually, data can be produced by any source compliant with LZ4 format specification, and respecting maxBlockSize. 
+ * - Synchronized mode : + * Decompression buffer size is _exactly_ the same as compression buffer size, + * and follows exactly same update rule (block boundaries at same positions), + * and decoding function is provided with exact decompressed size of each block (exception for last block of the stream), + * _then_ decoding & encoding ring buffer can have any size, including small ones ( < 64 KB). + * - Decompression buffer is larger than encoding buffer, by a minimum of maxBlockSize more bytes. + * In which case, encoding and decoding buffers do not need to be synchronized, + * and encoding ring buffer can have any size, including small ones ( < 64 KB). + * + * Whenever these conditions are not possible, + * save the last 64KB of decoded data into a safe buffer where it can't be modified during decompression, + * then indicate where this data is saved using LZ4_setStreamDecode(), before decompressing next block. +*/ +LZ4LIB_API int LZ4_decompress_safe_continue (LZ4_streamDecode_t* LZ4_streamDecode, const char* src, char* dst, int srcSize, int dstCapacity); + + +/*! LZ4_decompress_*_usingDict() : + * These decoding functions work the same as + * a combination of LZ4_setStreamDecode() followed by LZ4_decompress_*_continue() + * They are stand-alone, and don't need an LZ4_streamDecode_t structure. + * Dictionary is presumed stable : it must remain accessible and unmodified during decompression. + * Performance tip : Decompression speed can be substantially increased + * when dst == dictStart + dictSize. + */ +LZ4LIB_API int LZ4_decompress_safe_usingDict (const char* src, char* dst, int srcSize, int dstCapcity, const char* dictStart, int dictSize); + +#endif /* LZ4_H_2983827168210 */ + + +/*^************************************* + * !!!!!! STATIC LINKING ONLY !!!!!! 
+ ***************************************/ + +/*-**************************************************************************** + * Experimental section + * + * Symbols declared in this section must be considered unstable. Their + * signatures or semantics may change, or they may be removed altogether in the + * future. They are therefore only safe to depend on when the caller is + * statically linked against the library. + * + * To protect against unsafe usage, not only are the declarations guarded, + * the definitions are hidden by default + * when building LZ4 as a shared/dynamic library. + * + * In order to access these declarations, + * define LZ4_STATIC_LINKING_ONLY in your application + * before including LZ4's headers. + * + * In order to make their implementations accessible dynamically, you must + * define LZ4_PUBLISH_STATIC_FUNCTIONS when building the LZ4 library. + ******************************************************************************/ + +#ifdef LZ4_STATIC_LINKING_ONLY + +#ifndef LZ4_STATIC_3504398509 +#define LZ4_STATIC_3504398509 + +#ifdef LZ4_PUBLISH_STATIC_FUNCTIONS +#define LZ4LIB_STATIC_API LZ4LIB_API +#else +#define LZ4LIB_STATIC_API +#endif + + +/*! LZ4_compress_fast_extState_fastReset() : + * A variant of LZ4_compress_fast_extState(). + * + * Using this variant avoids an expensive initialization step. + * It is only safe to call if the state buffer is known to be correctly initialized already + * (see above comment on LZ4_resetStream_fast() for a definition of "correctly initialized"). + * From a high level, the difference is that + * this function initializes the provided state with a call to something like LZ4_resetStream_fast() + * while LZ4_compress_fast_extState() starts with a call to LZ4_resetStream(). + */ +LZ4LIB_STATIC_API int LZ4_compress_fast_extState_fastReset (void* state, const char* src, char* dst, int srcSize, int dstCapacity, int acceleration); + +/*! 
LZ4_attach_dictionary() : + * This is an experimental API that allows + * efficient use of a static dictionary many times. + * + * Rather than re-loading the dictionary buffer into a working context before + * each compression, or copying a pre-loaded dictionary's LZ4_stream_t into a + * working LZ4_stream_t, this function introduces a no-copy setup mechanism, + * in which the working stream references the dictionary stream in-place. + * + * Several assumptions are made about the state of the dictionary stream. + * Currently, only streams which have been prepared by LZ4_loadDict() should + * be expected to work. + * + * Alternatively, the provided dictionaryStream may be NULL, + * in which case any existing dictionary stream is unset. + * + * If a dictionary is provided, it replaces any pre-existing stream history. + * The dictionary contents are the only history that can be referenced and + * logically immediately precede the data compressed in the first subsequent + * compression call. + * + * The dictionary will only remain attached to the working stream through the + * first compression call, at the end of which it is cleared. The dictionary + * stream (and source buffer) must remain in-place / accessible / unchanged + * through the completion of the first compression call on the stream. + */ +LZ4LIB_STATIC_API void LZ4_attach_dictionary(LZ4_stream_t* workingStream, const LZ4_stream_t* dictionaryStream); + + +/*! In-place compression and decompression + * + * It's possible to have input and output sharing the same buffer, + * for highly constrained memory environments. + * In both cases, it requires input to lay at the end of the buffer, + * and decompression to start at beginning of the buffer. + * Buffer size must feature some margin, hence be larger than final size. 
+ * + * |<------------------------buffer--------------------------------->| + * |<-----------compressed data--------->| + * |<-----------decompressed size------------------>| + * |<----margin---->| + * + * This technique is more useful for decompression, + * since decompressed size is typically larger, + * and margin is short. + * + * In-place decompression will work inside any buffer + * which size is >= LZ4_DECOMPRESS_INPLACE_BUFFER_SIZE(decompressedSize). + * This presumes that decompressedSize > compressedSize. + * Otherwise, it means compression actually expanded data, + * and it would be more efficient to store such data with a flag indicating it's not compressed. + * This can happen when data is not compressible (already compressed, or encrypted). + * + * For in-place compression, margin is larger, as it must be able to cope with both + * history preservation, requiring input data to remain unmodified up to LZ4_DISTANCE_MAX, + * and data expansion, which can happen when input is not compressible. + * As a consequence, buffer size requirements are much higher, + * and memory savings offered by in-place compression are more limited. + * + * There are ways to limit this cost for compression : + * - Reduce history size, by modifying LZ4_DISTANCE_MAX. + * Note that it is a compile-time constant, so all compressions will apply this limit. + * Lower values will reduce compression ratio, except when input_size < LZ4_DISTANCE_MAX, + * so it's a reasonable trick when inputs are known to be small. + * - Require the compressor to deliver a "maximum compressed size". + * This is the `dstCapacity` parameter in `LZ4_compress*()`. + * When this size is < LZ4_COMPRESSBOUND(inputSize), then compression can fail, + * in which case, the return code will be 0 (zero). + * The caller must be ready for these cases to happen, + * and typically design a backup scheme to send data uncompressed. 
+ * The combination of both techniques can significantly reduce + * the amount of margin required for in-place compression. + * + * In-place compression can work in any buffer + * which size is >= (maxCompressedSize) + * with maxCompressedSize == LZ4_COMPRESSBOUND(srcSize) for guaranteed compression success. + * LZ4_COMPRESS_INPLACE_BUFFER_SIZE() depends on both maxCompressedSize and LZ4_DISTANCE_MAX, + * so it's possible to reduce memory requirements by playing with them. + */ + +#define LZ4_DECOMPRESS_INPLACE_MARGIN(compressedSize) (((compressedSize) >> 8) + 32) +#define LZ4_DECOMPRESS_INPLACE_BUFFER_SIZE(decompressedSize) ((decompressedSize) + LZ4_DECOMPRESS_INPLACE_MARGIN(decompressedSize)) /**< note: presumes that compressedSize < decompressedSize. note2: margin is overestimated a bit, since it could use compressedSize instead */ + +#ifndef LZ4_DISTANCE_MAX /* history window size; can be user-defined at compile time */ +# define LZ4_DISTANCE_MAX 65535 /* set to maximum value by default */ +#endif + +#define LZ4_COMPRESS_INPLACE_MARGIN (LZ4_DISTANCE_MAX + 32) /* LZ4_DISTANCE_MAX can be safely replaced by srcSize when it's smaller */ +#define LZ4_COMPRESS_INPLACE_BUFFER_SIZE(maxCompressedSize) ((maxCompressedSize) + LZ4_COMPRESS_INPLACE_MARGIN) /**< maxCompressedSize is generally LZ4_COMPRESSBOUND(inputSize), but can be set to any lower value, with the risk that compression can fail (return code 0(zero)) */ + +#endif /* LZ4_STATIC_3504398509 */ +#endif /* LZ4_STATIC_LINKING_ONLY */ + + + +#ifndef LZ4_H_98237428734687 +#define LZ4_H_98237428734687 + +/*-************************************************************ + * Private Definitions + ************************************************************** + * Do not use these definitions directly. + * They are only exposed to allow static allocation of `LZ4_stream_t` and `LZ4_streamDecode_t`. + * Accessing members will expose user code to API and/or ABI break in future versions of the library. 
+ **************************************************************/ +#define LZ4_HASHLOG (LZ4_MEMORY_USAGE-2) +#define LZ4_HASHTABLESIZE (1 << LZ4_MEMORY_USAGE) +#define LZ4_HASH_SIZE_U32 (1 << LZ4_HASHLOG) /* required as macro for static allocation */ + +#if defined(__cplusplus) || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) +# include <stdint.h> + typedef int8_t LZ4_i8; + typedef uint8_t LZ4_byte; + typedef uint16_t LZ4_u16; + typedef uint32_t LZ4_u32; +#else + typedef signed char LZ4_i8; + typedef unsigned char LZ4_byte; + typedef unsigned short LZ4_u16; + typedef unsigned int LZ4_u32; +#endif + +typedef struct LZ4_stream_t_internal LZ4_stream_t_internal; +struct LZ4_stream_t_internal { + LZ4_u32 hashTable[LZ4_HASH_SIZE_U32]; + LZ4_u32 currentOffset; + LZ4_u32 tableType; + const LZ4_byte* dictionary; + const LZ4_stream_t_internal* dictCtx; + LZ4_u32 dictSize; +}; + +typedef struct { + const LZ4_byte* externalDict; + size_t extDictSize; + const LZ4_byte* prefixEnd; + size_t prefixSize; +} LZ4_streamDecode_t_internal; + + +/*! LZ4_stream_t : + * Do not use below internal definitions directly ! + * Declare or allocate an LZ4_stream_t instead. + * LZ4_stream_t can also be created using LZ4_createStream(), which is recommended. + * The structure definition can be convenient for static allocation + * (on stack, or as part of larger structure). + * Init this structure with LZ4_initStream() before first use. + * note : only use this definition in association with static linking ! + * this definition is not API/ABI safe, and may change in future versions. + */ +#define LZ4_STREAMSIZE 16416 /* static size, for inter-version compatibility */ +#define LZ4_STREAMSIZE_VOIDP (LZ4_STREAMSIZE / sizeof(void*)) +union LZ4_stream_u { + void* table[LZ4_STREAMSIZE_VOIDP]; + LZ4_stream_t_internal internal_donotuse; +}; /* previously typedef'd to LZ4_stream_t */ + + +/*! 
LZ4_initStream() : v1.9.0+ + * An LZ4_stream_t structure must be initialized at least once. + * This is automatically done when invoking LZ4_createStream(), + * but it's not when the structure is simply declared on stack (for example). + * + * Use LZ4_initStream() to properly initialize a newly declared LZ4_stream_t. + * It can also initialize any arbitrary buffer of sufficient size, + * and will @return a pointer of proper type upon initialization. + * + * Note : initialization fails if size and alignment conditions are not respected. + * In which case, the function will @return NULL. + * Note2: An LZ4_stream_t structure guarantees correct alignment and size. + * Note3: Before v1.9.0, use LZ4_resetStream() instead + */ +LZ4LIB_API LZ4_stream_t* LZ4_initStream (void* buffer, size_t size); + + +/*! LZ4_streamDecode_t : + * information structure to track an LZ4 stream during decompression. + * init this structure using LZ4_setStreamDecode() before first use. + * note : only use in association with static linking ! + * this definition is not API/ABI safe, + * and may change in a future version ! + */ +#define LZ4_STREAMDECODESIZE_U64 (4 + ((sizeof(void*)==16) ? 2 : 0) /*AS-400*/ ) +#define LZ4_STREAMDECODESIZE (LZ4_STREAMDECODESIZE_U64 * sizeof(unsigned long long)) +union LZ4_streamDecode_u { + unsigned long long table[LZ4_STREAMDECODESIZE_U64]; + LZ4_streamDecode_t_internal internal_donotuse; +} ; /* previously typedef'd to LZ4_streamDecode_t */ + + + +/*-************************************ +* Obsolete Functions +**************************************/ + +/*! Deprecation warnings + * + * Deprecated functions make the compiler generate a warning when invoked. + * This is meant to invite users to update their source code. + * Should deprecation warnings be a problem, it is generally possible to disable them, + * typically with -Wno-deprecated-declarations for gcc + * or _CRT_SECURE_NO_WARNINGS in Visual. 
+ * + * Another method is to define LZ4_DISABLE_DEPRECATE_WARNINGS + * before including the header file. + */ +#ifdef LZ4_DISABLE_DEPRECATE_WARNINGS +# define LZ4_DEPRECATED(message) /* disable deprecation warnings */ +#else +# if defined (__cplusplus) && (__cplusplus >= 201402) /* C++14 or greater */ +# define LZ4_DEPRECATED(message) [[deprecated(message)]] +# elif defined(_MSC_VER) +# define LZ4_DEPRECATED(message) __declspec(deprecated(message)) +# elif defined(__clang__) || (defined(__GNUC__) && (__GNUC__ * 10 + __GNUC_MINOR__ >= 45)) +# define LZ4_DEPRECATED(message) __attribute__((deprecated(message))) +# elif defined(__GNUC__) && (__GNUC__ * 10 + __GNUC_MINOR__ >= 31) +# define LZ4_DEPRECATED(message) __attribute__((deprecated)) +# else +# pragma message("WARNING: LZ4_DEPRECATED needs custom implementation for this compiler") +# define LZ4_DEPRECATED(message) /* disabled */ +# endif +#endif /* LZ4_DISABLE_DEPRECATE_WARNINGS */ + +/*! Obsolete compression functions (since v1.7.3) */ +LZ4_DEPRECATED("use LZ4_compress_default() instead") LZ4LIB_API int LZ4_compress (const char* src, char* dest, int srcSize); +LZ4_DEPRECATED("use LZ4_compress_default() instead") LZ4LIB_API int LZ4_compress_limitedOutput (const char* src, char* dest, int srcSize, int maxOutputSize); +LZ4_DEPRECATED("use LZ4_compress_fast_extState() instead") LZ4LIB_API int LZ4_compress_withState (void* state, const char* source, char* dest, int inputSize); +LZ4_DEPRECATED("use LZ4_compress_fast_extState() instead") LZ4LIB_API int LZ4_compress_limitedOutput_withState (void* state, const char* source, char* dest, int inputSize, int maxOutputSize); +LZ4_DEPRECATED("use LZ4_compress_fast_continue() instead") LZ4LIB_API int LZ4_compress_continue (LZ4_stream_t* LZ4_streamPtr, const char* source, char* dest, int inputSize); +LZ4_DEPRECATED("use LZ4_compress_fast_continue() instead") LZ4LIB_API int LZ4_compress_limitedOutput_continue (LZ4_stream_t* LZ4_streamPtr, const char* source, char* dest, int 
inputSize, int maxOutputSize); + +/*! Obsolete decompression functions (since v1.8.0) */ +LZ4_DEPRECATED("use LZ4_decompress_fast() instead") LZ4LIB_API int LZ4_uncompress (const char* source, char* dest, int outputSize); +LZ4_DEPRECATED("use LZ4_decompress_safe() instead") LZ4LIB_API int LZ4_uncompress_unknownOutputSize (const char* source, char* dest, int isize, int maxOutputSize); + +/* Obsolete streaming functions (since v1.7.0) + * degraded functionality; do not use! + * + * In order to perform streaming compression, these functions depended on data + * that is no longer tracked in the state. They have been preserved as well as + * possible: using them will still produce a correct output. However, they don't + * actually retain any history between compression calls. The compression ratio + * achieved will therefore be no better than compressing each chunk + * independently. + */ +LZ4_DEPRECATED("Use LZ4_createStream() instead") LZ4LIB_API void* LZ4_create (char* inputBuffer); +LZ4_DEPRECATED("Use LZ4_createStream() instead") LZ4LIB_API int LZ4_sizeofStreamState(void); +LZ4_DEPRECATED("Use LZ4_resetStream() instead") LZ4LIB_API int LZ4_resetStreamState(void* state, char* inputBuffer); +LZ4_DEPRECATED("Use LZ4_saveDict() instead") LZ4LIB_API char* LZ4_slideInputBuffer (void* state); + +/*! Obsolete streaming decoding functions (since v1.7.0) */ +LZ4_DEPRECATED("use LZ4_decompress_safe_usingDict() instead") LZ4LIB_API int LZ4_decompress_safe_withPrefix64k (const char* src, char* dst, int compressedSize, int maxDstSize); +LZ4_DEPRECATED("use LZ4_decompress_fast_usingDict() instead") LZ4LIB_API int LZ4_decompress_fast_withPrefix64k (const char* src, char* dst, int originalSize); + +/*! Obsolete LZ4_decompress_fast variants (since v1.9.0) : + * These functions used to be faster than LZ4_decompress_safe(), + * but this is no longer the case. They are now slower. 
+ * This is because LZ4_decompress_fast() doesn't know the input size, + * and therefore must progress more cautiously into the input buffer to not read beyond the end of block. + * On top of that `LZ4_decompress_fast()` is not protected vs malformed or malicious inputs, making it a security liability. + * As a consequence, LZ4_decompress_fast() is strongly discouraged, and deprecated. + * + * The last remaining LZ4_decompress_fast() specificity is that + * it can decompress a block without knowing its compressed size. + * Such functionality can be achieved in a more secure manner + * by employing LZ4_decompress_safe_partial(). + * + * Parameters: + * originalSize : is the uncompressed size to regenerate. + * `dst` must be already allocated, its size must be >= 'originalSize' bytes. + * @return : number of bytes read from source buffer (== compressed size). + * The function expects to finish at block's end exactly. + * If the source stream is detected malformed, the function stops decoding and returns a negative result. + * note : LZ4_decompress_fast*() requires originalSize. Thanks to this information, it never writes past the output buffer. + * However, since it doesn't know its 'src' size, it may read an unknown amount of input, past input buffer bounds. + * Also, since match offsets are not validated, match reads from 'src' may underflow too. + * These issues never happen if input (compressed) data is correct. + * But they may happen if input data is invalid (error or intentional tampering). + * As a consequence, use these functions in trusted environments with trusted data **only**. + */ +LZ4_DEPRECATED("This function is deprecated and unsafe. Consider using LZ4_decompress_safe() instead") +LZ4LIB_API int LZ4_decompress_fast (const char* src, char* dst, int originalSize); +LZ4_DEPRECATED("This function is deprecated and unsafe. 
Consider using LZ4_decompress_safe_continue() instead") +LZ4LIB_API int LZ4_decompress_fast_continue (LZ4_streamDecode_t* LZ4_streamDecode, const char* src, char* dst, int originalSize); +LZ4_DEPRECATED("This function is deprecated and unsafe. Consider using LZ4_decompress_safe_usingDict() instead") +LZ4LIB_API int LZ4_decompress_fast_usingDict (const char* src, char* dst, int originalSize, const char* dictStart, int dictSize); + +/*! LZ4_resetStream() : + * An LZ4_stream_t structure must be initialized at least once. + * This is done with LZ4_initStream(), or LZ4_resetStream(). + * Consider switching to LZ4_initStream(), + * invoking LZ4_resetStream() will trigger deprecation warnings in the future. + */ +LZ4LIB_API void LZ4_resetStream (LZ4_stream_t* streamPtr); + + +#endif /* LZ4_H_98237428734687 */ + + +#if defined (__cplusplus) +} +#endif diff --git a/libbutl/lz4.hxx b/libbutl/lz4.hxx new file mode 100644 index 0000000..7886788 --- /dev/null +++ b/libbutl/lz4.hxx @@ -0,0 +1,205 @@ +// file : libbutl/lz4.hxx -*- C++ -*- +// license : MIT; see accompanying LICENSE file + +#pragma once + +#include <cstdint> +#include <cstddef> + +#include <libbutl/optional.hxx> +#include <libbutl/fdstream.hxx> + +#include <libbutl/export.hxx> + +namespace butl +{ + namespace lz4 + { + // Read the content from the input stream, compress it using the specified + // compression level and block size, and write the compressed content to + // the output stream. If content size is specified, then include it into + // the compressed content header. Return the compressed content size. + // + // This function may throw std::bad_alloc as well as exceptions thrown by + // fdstream read/write functions. It may also throw std::invalid_argument + // in case of argument inconsistencies (e.g., content size does not match + // actual) with what() returning the error description. The input stream + // is expected to throw on badbit (but not failbit). 
The output stream is + // expected to throw on badbit or failbit. + // + // The output and most likely the input streams must be in the binary + // mode. + // + // Valid values for the compression level are between 1 (fastest) and 12 + // (best compression level) though, practically, after 9 returns are + // diminished. + // + // Valid block sizes and their IDs: + // + // 4: 64KB + // 5: 256KB + // 6: 1MB + // 7: 4MB + // + // Note that due to the underlying API limitations, 0 content size is + // treated as absent and it's therefore impossible to compress 0-byte + // content with content size. + // + // This function produces compressed content identical to: + // + // lz4 -z -<compression_level> -B<block_size_id> -BD [--content-size] + // + LIBBUTL_SYMEXPORT std::uint64_t + compress (ofdstream&, + ifdstream&, + int compression_level, + int block_size_id, + optional<std::uint64_t> content_size); + + // Low-level iterative compression API. + // + // This API may throw std::bad_alloc in case of memory allocation errors + // and std::invalid_argument in case of argument inconsistencies (e.g., + // content size does not match actual) with what() returning the error + // description. + // + // See the implementation of the compress() function above for usage + // example. + // + // @@ TODO: reset support. + // + struct LIBBUTL_SYMEXPORT compressor + { + // Buffer, current size (part filled with data), and capacity. + // + char* ib; std::size_t in, ic; // Input. + char* ob; std::size_t on, oc; // Output. + + // As a first step call begin(). This function sets the required input + // and output buffer capacities (ic, oc). + // + // The caller normally allocates the input and output buffers and fills + // the input buffer. + // + void + begin (int compression_level, + int block_size_id, + optional<std::uint64_t> content_size); + + // Then call next() to compress the next chunk of input passing true on + // reaching EOF. 
Note that the input buffer should be filled to capacity + // unless end is true and the output buffer must be flushed before each + // subsequent call to next(). + // + void + next (bool end); + + // Not copyable or movable. + // + compressor (const compressor&) = delete; + compressor (compressor&&) = delete; + compressor& operator= (const compressor&) = delete; + compressor& operator= (compressor&&) = delete; + + // Implementation details. + // + compressor (): ctx_ (nullptr) {} + ~compressor (); + + public: + void + init_preferences (void*) const; + + void* ctx_; + int level_; + int block_id_; + optional<std::uint64_t> content_size_; + bool begin_; + }; + + + // Read the compressed content from the input stream, decompress it, and + // write the decompressed content to the output stream. Return the + // decompressed content size. + // + // This function may throw std::bad_alloc as well as exceptions thrown by + // fdstream read/write functions. It may also throw std::invalid_argument + // if the compressed content is invalid with what() returning the error + // description. The input stream is expected to throw on badbit but not + // failbit. The output stream is expected to throw on badbit or failbit. + // + // The input and most likely the output streams must be in the binary + // mode. + // + // Note that this function does not require the input stream to reach EOF + // at the end of compressed content. So if you have this requirement, you + // will need to enforce it yourself. + // + LIBBUTL_SYMEXPORT std::uint64_t + decompress (ofdstream&, ifdstream&); + + // Low-level iterative decompression API. + // + // This API may throw std::bad_alloc in case of memory allocation errors + // and std::invalid_argument if the compressed content is invalid with + // what() returning the error description. + // + // See the implementation of the decompress() function above for usage + // example. 
+ // + // The LZ4F_*() decompression functions return a hint of how much data + // they want on the next call. So the plan is to allocate the input + // buffer large enough to hold anything that can be asked for and then + // fill it in in the asked chunks. This way we avoid having to shift the + // unread data around. + // + // @@ TODO: reset support. + // + struct LIBBUTL_SYMEXPORT decompressor + { + // Buffer, current size (part filled with data), and capacity. + // + char hb[19]; std::size_t hn ; // Header. + char* ib; std::size_t in, ic; // Input. + char* ob; std::size_t on, oc; // Output. + + // As a first step, fill in the header buffer and call begin(). This + // function sets the required input and output buffer capacities (ic, + // oc) and the number of bytes left in the header buffer (hn) and + // returns the number of bytes expected by the following call to next(). + // If content_size is not NULL, then it is set to the decompressed + // content size, if available. + // + // The caller normally allocates the input and output buffers, copies + // remaining header buffer data over to the input buffer, and then fills + // in the remainder of the input buffer up to what's expected by the + // call to next(). + // + std::size_t + begin (optional<std::uint64_t>* content_size = nullptr); + + // Then call next() to decompress the next chunk of input. This function + // returns the number of bytes expected by the following call to next() + // or 0 if no further input is expected. Note that the output buffer + // must be flushed before each subsequent call to next(). + // + std::size_t + next (); + + // Not copyable or movable. + // + decompressor (const decompressor&) = delete; + decompressor (decompressor&&) = delete; + decompressor& operator= (const decompressor&) = delete; + decompressor& operator= (decompressor&&) = delete; + + // Implementation details. 
+ // + decompressor (): hn (0), in (0), on (0), ctx_ (nullptr) {} + ~decompressor (); + + public: + void* ctx_; + }; + } +} diff --git a/libbutl/lz4frame.c b/libbutl/lz4frame.c new file mode 100644 index 0000000..0db8c1e --- /dev/null +++ b/libbutl/lz4frame.c @@ -0,0 +1,1899 @@ +/* + * LZ4 auto-framing library + * Copyright (C) 2011-2016, Yann Collet. + * + * BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following disclaimer + * in the documentation and/or other materials provided with the + * distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + * You can contact the author at : + * - LZ4 homepage : http://www.lz4.org + * - LZ4 source repository : https://github.com/lz4/lz4 + */ + +/* LZ4F is a stand-alone API to create LZ4-compressed Frames + * in full conformance with specification v1.6.1 . + * This library rely upon memory management capabilities (malloc, free) + * provided either by <stdlib.h>, + * or redirected towards another library of user's choice + * (see Memory Routines below). + */ + + +/*-************************************ +* Compiler Options +**************************************/ +#ifdef _MSC_VER /* Visual Studio */ +# pragma warning(disable : 4127) /* disable: C4127: conditional expression is constant */ +#endif + + +/*-************************************ +* Tuning parameters +**************************************/ +/* + * LZ4F_HEAPMODE : + * Select how default compression functions will allocate memory for their hash table, + * in memory stack (0:default, fastest), or in memory heap (1:requires malloc()). + */ +#ifndef LZ4F_HEAPMODE +# define LZ4F_HEAPMODE 0 +#endif + + +/*-************************************ +* Memory routines +**************************************/ +/* + * User may redirect invocations of + * malloc(), calloc() and free() + * towards another library or solution of their choice + * by modifying below section. 
+ */ +#ifndef LZ4_SRC_INCLUDED /* avoid redefinition when sources are coalesced */ +# include <stdlib.h> /* malloc, calloc, free */ +# define ALLOC(s) malloc(s) +# define ALLOC_AND_ZERO(s) calloc(1,(s)) +# define FREEMEM(p) free(p) +#endif + +#include <string.h> /* memset, memcpy, memmove */ +#ifndef LZ4_SRC_INCLUDED /* avoid redefinition when sources are coalesced */ +# define MEM_INIT(p,v,s) memset((p),(v),(s)) +#endif + + +/*-************************************ +* Library declarations +**************************************/ +#define LZ4F_STATIC_LINKING_ONLY +#include "lz4frame.h" +#define LZ4_STATIC_LINKING_ONLY +#include "lz4.h" +#define LZ4_HC_STATIC_LINKING_ONLY +#include "lz4hc.h" +#define XXH_STATIC_LINKING_ONLY +#include "xxhash.h" + + +/*-************************************ +* Debug +**************************************/ +#if defined(LZ4_DEBUG) && (LZ4_DEBUG>=1) +# include <assert.h> +#else +# ifndef assert +# define assert(condition) ((void)0) +# endif +#endif + +#define LZ4F_STATIC_ASSERT(c) { enum { LZ4F_static_assert = 1/(int)(!!(c)) }; } /* use only *after* variable declarations */ + +#if defined(LZ4_DEBUG) && (LZ4_DEBUG>=2) && !defined(DEBUGLOG) +# include <stdio.h> +static int g_debuglog_enable = 1; +# define DEBUGLOG(l, ...) { \ + if ((g_debuglog_enable) && (l<=LZ4_DEBUG)) { \ + fprintf(stderr, __FILE__ ": "); \ + fprintf(stderr, __VA_ARGS__); \ + fprintf(stderr, " \n"); \ + } } +#else +# define DEBUGLOG(l, ...) 
{} /* disabled */ +#endif + + +/*-************************************ +* Basic Types +**************************************/ +#if !defined (__VMS) && (defined (__cplusplus) || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) ) +# include <stdint.h> + typedef uint8_t BYTE; + typedef uint16_t U16; + typedef uint32_t U32; + typedef int32_t S32; + typedef uint64_t U64; +#else + typedef unsigned char BYTE; + typedef unsigned short U16; + typedef unsigned int U32; + typedef signed int S32; + typedef unsigned long long U64; +#endif + + +/* unoptimized version; solves endianess & alignment issues */ +static U32 LZ4F_readLE32 (const void* src) +{ + const BYTE* const srcPtr = (const BYTE*)src; + U32 value32 = srcPtr[0]; + value32 += ((U32)srcPtr[1])<< 8; + value32 += ((U32)srcPtr[2])<<16; + value32 += ((U32)srcPtr[3])<<24; + return value32; +} + +static void LZ4F_writeLE32 (void* dst, U32 value32) +{ + BYTE* const dstPtr = (BYTE*)dst; + dstPtr[0] = (BYTE)value32; + dstPtr[1] = (BYTE)(value32 >> 8); + dstPtr[2] = (BYTE)(value32 >> 16); + dstPtr[3] = (BYTE)(value32 >> 24); +} + +static U64 LZ4F_readLE64 (const void* src) +{ + const BYTE* const srcPtr = (const BYTE*)src; + U64 value64 = srcPtr[0]; + value64 += ((U64)srcPtr[1]<<8); + value64 += ((U64)srcPtr[2]<<16); + value64 += ((U64)srcPtr[3]<<24); + value64 += ((U64)srcPtr[4]<<32); + value64 += ((U64)srcPtr[5]<<40); + value64 += ((U64)srcPtr[6]<<48); + value64 += ((U64)srcPtr[7]<<56); + return value64; +} + +static void LZ4F_writeLE64 (void* dst, U64 value64) +{ + BYTE* const dstPtr = (BYTE*)dst; + dstPtr[0] = (BYTE)value64; + dstPtr[1] = (BYTE)(value64 >> 8); + dstPtr[2] = (BYTE)(value64 >> 16); + dstPtr[3] = (BYTE)(value64 >> 24); + dstPtr[4] = (BYTE)(value64 >> 32); + dstPtr[5] = (BYTE)(value64 >> 40); + dstPtr[6] = (BYTE)(value64 >> 48); + dstPtr[7] = (BYTE)(value64 >> 56); +} + + +/*-************************************ +* Constants +**************************************/ +#ifndef 
LZ4_SRC_INCLUDED /* avoid double definition */ +# define KB *(1<<10) +# define MB *(1<<20) +# define GB *(1<<30) +#endif + +#define _1BIT 0x01 +#define _2BITS 0x03 +#define _3BITS 0x07 +#define _4BITS 0x0F +#define _8BITS 0xFF + +#define LZ4F_MAGIC_SKIPPABLE_START 0x184D2A50U +#define LZ4F_MAGICNUMBER 0x184D2204U +#define LZ4F_BLOCKUNCOMPRESSED_FLAG 0x80000000U +#define LZ4F_BLOCKSIZEID_DEFAULT LZ4F_max64KB + +static const size_t minFHSize = LZ4F_HEADER_SIZE_MIN; /* 7 */ +static const size_t maxFHSize = LZ4F_HEADER_SIZE_MAX; /* 19 */ +static const size_t BHSize = LZ4F_BLOCK_HEADER_SIZE; /* block header : size, and compress flag */ +static const size_t BFSize = LZ4F_BLOCK_CHECKSUM_SIZE; /* block footer : checksum (optional) */ + + +/*-************************************ +* Structures and local types +**************************************/ +typedef struct LZ4F_cctx_s +{ + LZ4F_preferences_t prefs; + U32 version; + U32 cStage; + const LZ4F_CDict* cdict; + size_t maxBlockSize; + size_t maxBufferSize; + BYTE* tmpBuff; + BYTE* tmpIn; + size_t tmpInSize; + U64 totalInSize; + XXH32_state_t xxh; + void* lz4CtxPtr; + U16 lz4CtxAlloc; /* sized for: 0 = none, 1 = lz4 ctx, 2 = lz4hc ctx */ + U16 lz4CtxState; /* in use as: 0 = none, 1 = lz4 ctx, 2 = lz4hc ctx */ +} LZ4F_cctx_t; + + +/*-************************************ +* Error management +**************************************/ +#define LZ4F_GENERATE_STRING(STRING) #STRING, +static const char* LZ4F_errorStrings[] = { LZ4F_LIST_ERRORS(LZ4F_GENERATE_STRING) }; + + +unsigned LZ4F_isError(LZ4F_errorCode_t code) +{ + return (code > (LZ4F_errorCode_t)(-LZ4F_ERROR_maxCode)); +} + +const char* LZ4F_getErrorName(LZ4F_errorCode_t code) +{ + static const char* codeError = "Unspecified error code"; + if (LZ4F_isError(code)) return LZ4F_errorStrings[-(int)(code)]; + return codeError; +} + +LZ4F_errorCodes LZ4F_getErrorCode(size_t functionResult) +{ + if (!LZ4F_isError(functionResult)) return LZ4F_OK_NoError; + return 
(LZ4F_errorCodes)(-(ptrdiff_t)functionResult); +} + +static LZ4F_errorCode_t err0r(LZ4F_errorCodes code) +{ + /* A compilation error here means sizeof(ptrdiff_t) is not large enough */ + LZ4F_STATIC_ASSERT(sizeof(ptrdiff_t) >= sizeof(size_t)); + return (LZ4F_errorCode_t)-(ptrdiff_t)code; +} + +unsigned LZ4F_getVersion(void) { return LZ4F_VERSION; } + +int LZ4F_compressionLevel_max(void) { return LZ4HC_CLEVEL_MAX; } + +size_t LZ4F_getBlockSize(unsigned blockSizeID) +{ + static const size_t blockSizes[4] = { 64 KB, 256 KB, 1 MB, 4 MB }; + + if (blockSizeID == 0) blockSizeID = LZ4F_BLOCKSIZEID_DEFAULT; + if (blockSizeID < LZ4F_max64KB || blockSizeID > LZ4F_max4MB) + return err0r(LZ4F_ERROR_maxBlockSize_invalid); + blockSizeID -= LZ4F_max64KB; + return blockSizes[blockSizeID]; +} + +/*-************************************ +* Private functions +**************************************/ +#define MIN(a,b) ( (a) < (b) ? (a) : (b) ) + +static BYTE LZ4F_headerChecksum (const void* header, size_t length) +{ + U32 const xxh = XXH32(header, length, 0); + return (BYTE)(xxh >> 8); +} + + +/*-************************************ +* Simple-pass compression functions +**************************************/ +static LZ4F_blockSizeID_t LZ4F_optimalBSID(const LZ4F_blockSizeID_t requestedBSID, + const size_t srcSize) +{ + LZ4F_blockSizeID_t proposedBSID = LZ4F_max64KB; + size_t maxBlockSize = 64 KB; + while (requestedBSID > proposedBSID) { + if (srcSize <= maxBlockSize) + return proposedBSID; + proposedBSID = (LZ4F_blockSizeID_t)((int)proposedBSID + 1); + maxBlockSize <<= 2; + } + return requestedBSID; +} + +/*! LZ4F_compressBound_internal() : + * Provides dstCapacity given a srcSize to guarantee operation success in worst case situations. + * prefsPtr is optional : if NULL is provided, preferences will be set to cover worst case scenario. + * @return is always the same for a srcSize and prefsPtr, so it can be relied upon to size reusable buffers. 
+ * When srcSize==0, LZ4F_compressBound() provides an upper bound for LZ4F_flush() and LZ4F_compressEnd() operations. + */ +static size_t LZ4F_compressBound_internal(size_t srcSize, + const LZ4F_preferences_t* preferencesPtr, + size_t alreadyBuffered) +{ + LZ4F_preferences_t prefsNull = LZ4F_INIT_PREFERENCES; + prefsNull.frameInfo.contentChecksumFlag = LZ4F_contentChecksumEnabled; /* worst case */ + prefsNull.frameInfo.blockChecksumFlag = LZ4F_blockChecksumEnabled; /* worst case */ + { const LZ4F_preferences_t* const prefsPtr = (preferencesPtr==NULL) ? &prefsNull : preferencesPtr; + U32 const flush = prefsPtr->autoFlush | (srcSize==0); + LZ4F_blockSizeID_t const blockID = prefsPtr->frameInfo.blockSizeID; + size_t const blockSize = LZ4F_getBlockSize(blockID); + size_t const maxBuffered = blockSize - 1; + size_t const bufferedSize = MIN(alreadyBuffered, maxBuffered); + size_t const maxSrcSize = srcSize + bufferedSize; + unsigned const nbFullBlocks = (unsigned)(maxSrcSize / blockSize); + size_t const partialBlockSize = maxSrcSize & (blockSize-1); + size_t const lastBlockSize = flush ? partialBlockSize : 0; + unsigned const nbBlocks = nbFullBlocks + (lastBlockSize>0); + + size_t const blockCRCSize = BFSize * prefsPtr->frameInfo.blockChecksumFlag; + size_t const frameEnd = BHSize + (prefsPtr->frameInfo.contentChecksumFlag*BFSize); + + return ((BHSize + blockCRCSize) * nbBlocks) + + (blockSize * nbFullBlocks) + lastBlockSize + frameEnd; + } +} + +size_t LZ4F_compressFrameBound(size_t srcSize, const LZ4F_preferences_t* preferencesPtr) +{ + LZ4F_preferences_t prefs; + size_t const headerSize = maxFHSize; /* max header size, including optional fields */ + + if (preferencesPtr!=NULL) prefs = *preferencesPtr; + else MEM_INIT(&prefs, 0, sizeof(prefs)); + prefs.autoFlush = 1; + + return headerSize + LZ4F_compressBound_internal(srcSize, &prefs, 0);; +} + + +/*! LZ4F_compressFrame_usingCDict() : + * Compress srcBuffer using a dictionary, in a single step. 
+ * cdict can be NULL, in which case, no dictionary is used. + * dstBuffer MUST be >= LZ4F_compressFrameBound(srcSize, preferencesPtr). + * The LZ4F_preferences_t structure is optional : you may provide NULL as argument, + * however, it's the only way to provide a dictID, so it's not recommended. + * @return : number of bytes written into dstBuffer, + * or an error code if it fails (can be tested using LZ4F_isError()) + */ +size_t LZ4F_compressFrame_usingCDict(LZ4F_cctx* cctx, + void* dstBuffer, size_t dstCapacity, + const void* srcBuffer, size_t srcSize, + const LZ4F_CDict* cdict, + const LZ4F_preferences_t* preferencesPtr) +{ + LZ4F_preferences_t prefs; + LZ4F_compressOptions_t options; + BYTE* const dstStart = (BYTE*) dstBuffer; + BYTE* dstPtr = dstStart; + BYTE* const dstEnd = dstStart + dstCapacity; + + if (preferencesPtr!=NULL) + prefs = *preferencesPtr; + else + MEM_INIT(&prefs, 0, sizeof(prefs)); + if (prefs.frameInfo.contentSize != 0) + prefs.frameInfo.contentSize = (U64)srcSize; /* auto-correct content size if selected (!=0) */ + + prefs.frameInfo.blockSizeID = LZ4F_optimalBSID(prefs.frameInfo.blockSizeID, srcSize); + prefs.autoFlush = 1; + if (srcSize <= LZ4F_getBlockSize(prefs.frameInfo.blockSizeID)) + prefs.frameInfo.blockMode = LZ4F_blockIndependent; /* only one block => no need for inter-block link */ + + MEM_INIT(&options, 0, sizeof(options)); + options.stableSrc = 1; + + if (dstCapacity < LZ4F_compressFrameBound(srcSize, &prefs)) /* condition to guarantee success */ + return err0r(LZ4F_ERROR_dstMaxSize_tooSmall); + + { size_t const headerSize = LZ4F_compressBegin_usingCDict(cctx, dstBuffer, dstCapacity, cdict, &prefs); /* write header */ + if (LZ4F_isError(headerSize)) return headerSize; + dstPtr += headerSize; /* header size */ } + + assert(dstEnd >= dstPtr); + { size_t const cSize = LZ4F_compressUpdate(cctx, dstPtr, (size_t)(dstEnd-dstPtr), srcBuffer, srcSize, &options); + if (LZ4F_isError(cSize)) return cSize; + dstPtr += cSize; } + + 
assert(dstEnd >= dstPtr); + { size_t const tailSize = LZ4F_compressEnd(cctx, dstPtr, (size_t)(dstEnd-dstPtr), &options); /* flush last block, and generate suffix */ + if (LZ4F_isError(tailSize)) return tailSize; + dstPtr += tailSize; } + + assert(dstEnd >= dstStart); + return (size_t)(dstPtr - dstStart); +} + + +/*! LZ4F_compressFrame() : + * Compress an entire srcBuffer into a valid LZ4 frame, in a single step. + * dstBuffer MUST be >= LZ4F_compressFrameBound(srcSize, preferencesPtr). + * The LZ4F_preferences_t structure is optional : you can provide NULL as argument. All preferences will be set to default. + * @return : number of bytes written into dstBuffer. + * or an error code if it fails (can be tested using LZ4F_isError()) + */ +size_t LZ4F_compressFrame(void* dstBuffer, size_t dstCapacity, + const void* srcBuffer, size_t srcSize, + const LZ4F_preferences_t* preferencesPtr) +{ + size_t result; +#if (LZ4F_HEAPMODE) + LZ4F_cctx_t *cctxPtr; + result = LZ4F_createCompressionContext(&cctxPtr, LZ4F_VERSION); + if (LZ4F_isError(result)) return result; +#else + LZ4F_cctx_t cctx; + LZ4_stream_t lz4ctx; + LZ4F_cctx_t *cctxPtr = &cctx; + + DEBUGLOG(4, "LZ4F_compressFrame"); + MEM_INIT(&cctx, 0, sizeof(cctx)); + cctx.version = LZ4F_VERSION; + cctx.maxBufferSize = 5 MB; /* mess with real buffer size to prevent dynamic allocation; works only because autoflush==1 & stableSrc==1 */ + if (preferencesPtr == NULL || + preferencesPtr->compressionLevel < LZ4HC_CLEVEL_MIN) + { + LZ4_initStream(&lz4ctx, sizeof(lz4ctx)); + cctxPtr->lz4CtxPtr = &lz4ctx; + cctxPtr->lz4CtxAlloc = 1; + cctxPtr->lz4CtxState = 1; + } +#endif + + result = LZ4F_compressFrame_usingCDict(cctxPtr, dstBuffer, dstCapacity, + srcBuffer, srcSize, + NULL, preferencesPtr); + +#if (LZ4F_HEAPMODE) + LZ4F_freeCompressionContext(cctxPtr); +#else + if (preferencesPtr != NULL && + preferencesPtr->compressionLevel >= LZ4HC_CLEVEL_MIN) + { + FREEMEM(cctxPtr->lz4CtxPtr); + } +#endif + return result; +} + + 
+/*-*************************************************** +* Dictionary compression +*****************************************************/ + +struct LZ4F_CDict_s { + void* dictContent; + LZ4_stream_t* fastCtx; + LZ4_streamHC_t* HCCtx; +}; /* typedef'd to LZ4F_CDict within lz4frame_static.h */ + +/*! LZ4F_createCDict() : + * When compressing multiple messages / blocks with the same dictionary, it's recommended to load it just once. + * LZ4F_createCDict() will create a digested dictionary, ready to start future compression operations without startup delay. + * LZ4F_CDict can be created once and shared by multiple threads concurrently, since its usage is read-only. + * `dictBuffer` can be released after LZ4F_CDict creation, since its content is copied within CDict + * @return : digested dictionary for compression, or NULL if failed */ +LZ4F_CDict* LZ4F_createCDict(const void* dictBuffer, size_t dictSize) +{ + const char* dictStart = (const char*)dictBuffer; + LZ4F_CDict* cdict = (LZ4F_CDict*) ALLOC(sizeof(*cdict)); + DEBUGLOG(4, "LZ4F_createCDict"); + if (!cdict) return NULL; + if (dictSize > 64 KB) { + dictStart += dictSize - 64 KB; + dictSize = 64 KB; + } + cdict->dictContent = ALLOC(dictSize); + cdict->fastCtx = LZ4_createStream(); + cdict->HCCtx = LZ4_createStreamHC(); + if (!cdict->dictContent || !cdict->fastCtx || !cdict->HCCtx) { + LZ4F_freeCDict(cdict); + return NULL; + } + memcpy(cdict->dictContent, dictStart, dictSize); + LZ4_loadDict (cdict->fastCtx, (const char*)cdict->dictContent, (int)dictSize); + LZ4_setCompressionLevel(cdict->HCCtx, LZ4HC_CLEVEL_DEFAULT); + LZ4_loadDictHC(cdict->HCCtx, (const char*)cdict->dictContent, (int)dictSize); + return cdict; +} + +void LZ4F_freeCDict(LZ4F_CDict* cdict) +{ + if (cdict==NULL) return; /* support free on NULL */ + FREEMEM(cdict->dictContent); + LZ4_freeStream(cdict->fastCtx); + LZ4_freeStreamHC(cdict->HCCtx); + FREEMEM(cdict); +} + + +/*-********************************* +* Advanced compression functions 
+***********************************/ + +/*! LZ4F_createCompressionContext() : + * The first thing to do is to create a compressionContext object, which will be used in all compression operations. + * This is achieved using LZ4F_createCompressionContext(), which takes as argument a version and an LZ4F_preferences_t structure. + * The version provided MUST be LZ4F_VERSION. It is intended to track potential incompatible differences between different binaries. + * The function will provide a pointer to an allocated LZ4F_compressionContext_t object. + * If the result LZ4F_errorCode_t is not OK_NoError, there was an error during context creation. + * Object can release its memory using LZ4F_freeCompressionContext(); + */ +LZ4F_errorCode_t LZ4F_createCompressionContext(LZ4F_cctx** LZ4F_compressionContextPtr, unsigned version) +{ + LZ4F_cctx_t* const cctxPtr = (LZ4F_cctx_t*)ALLOC_AND_ZERO(sizeof(LZ4F_cctx_t)); + if (cctxPtr==NULL) return err0r(LZ4F_ERROR_allocation_failed); + + cctxPtr->version = version; + cctxPtr->cStage = 0; /* Next stage : init stream */ + + *LZ4F_compressionContextPtr = cctxPtr; + + return LZ4F_OK_NoError; +} + + +LZ4F_errorCode_t LZ4F_freeCompressionContext(LZ4F_cctx* cctxPtr) +{ + if (cctxPtr != NULL) { /* support free on NULL */ + FREEMEM(cctxPtr->lz4CtxPtr); /* note: LZ4_streamHC_t and LZ4_stream_t are simple POD types */ + FREEMEM(cctxPtr->tmpBuff); + FREEMEM(cctxPtr); + } + + return LZ4F_OK_NoError; +} + + +/** + * This function prepares the internal LZ4(HC) stream for a new compression, + * resetting the context and attaching the dictionary, if there is one. + * + * It needs to be called at the beginning of each independent compression + * stream (i.e., at the beginning of a frame in blockLinked mode, or at the + * beginning of each block in blockIndependent mode). 
+ */ +static void LZ4F_initStream(void* ctx, + const LZ4F_CDict* cdict, + int level, + LZ4F_blockMode_t blockMode) { + if (level < LZ4HC_CLEVEL_MIN) { + if (cdict != NULL || blockMode == LZ4F_blockLinked) { + /* In these cases, we will call LZ4_compress_fast_continue(), + * which needs an already reset context. Otherwise, we'll call a + * one-shot API. The non-continued APIs internally perform their own + * resets at the beginning of their calls, where they know what + * tableType they need the context to be in. So in that case this + * would be misguided / wasted work. */ + LZ4_resetStream_fast((LZ4_stream_t*)ctx); + } + LZ4_attach_dictionary((LZ4_stream_t *)ctx, cdict ? cdict->fastCtx : NULL); + } else { + LZ4_resetStreamHC_fast((LZ4_streamHC_t*)ctx, level); + LZ4_attach_HC_dictionary((LZ4_streamHC_t *)ctx, cdict ? cdict->HCCtx : NULL); + } +} + + +/*! LZ4F_compressBegin_usingCDict() : + * init streaming compression and writes frame header into dstBuffer. + * dstBuffer must be >= LZ4F_HEADER_SIZE_MAX bytes. + * @return : number of bytes written into dstBuffer for the header + * or an error code (can be tested using LZ4F_isError()) + */ +size_t LZ4F_compressBegin_usingCDict(LZ4F_cctx* cctxPtr, + void* dstBuffer, size_t dstCapacity, + const LZ4F_CDict* cdict, + const LZ4F_preferences_t* preferencesPtr) +{ + LZ4F_preferences_t prefNull; + BYTE* const dstStart = (BYTE*)dstBuffer; + BYTE* dstPtr = dstStart; + BYTE* headerStart; + + if (dstCapacity < maxFHSize) return err0r(LZ4F_ERROR_dstMaxSize_tooSmall); + MEM_INIT(&prefNull, 0, sizeof(prefNull)); + if (preferencesPtr == NULL) preferencesPtr = &prefNull; + cctxPtr->prefs = *preferencesPtr; + + /* Ctx Management */ + { U16 const ctxTypeID = (cctxPtr->prefs.compressionLevel < LZ4HC_CLEVEL_MIN) ? 
1 : 2; + if (cctxPtr->lz4CtxAlloc < ctxTypeID) { + FREEMEM(cctxPtr->lz4CtxPtr); + if (cctxPtr->prefs.compressionLevel < LZ4HC_CLEVEL_MIN) { + cctxPtr->lz4CtxPtr = LZ4_createStream(); + } else { + cctxPtr->lz4CtxPtr = LZ4_createStreamHC(); + } + if (cctxPtr->lz4CtxPtr == NULL) + return err0r(LZ4F_ERROR_allocation_failed); + cctxPtr->lz4CtxAlloc = ctxTypeID; + cctxPtr->lz4CtxState = ctxTypeID; + } else if (cctxPtr->lz4CtxState != ctxTypeID) { + /* otherwise, a sufficient buffer is allocated, but we need to + * reset it to the correct context type */ + if (cctxPtr->prefs.compressionLevel < LZ4HC_CLEVEL_MIN) { + LZ4_initStream((LZ4_stream_t *) cctxPtr->lz4CtxPtr, sizeof (LZ4_stream_t)); + } else { + LZ4_initStreamHC((LZ4_streamHC_t *) cctxPtr->lz4CtxPtr, sizeof(LZ4_streamHC_t)); + LZ4_setCompressionLevel((LZ4_streamHC_t *) cctxPtr->lz4CtxPtr, cctxPtr->prefs.compressionLevel); + } + cctxPtr->lz4CtxState = ctxTypeID; + } + } + + /* Buffer Management */ + if (cctxPtr->prefs.frameInfo.blockSizeID == 0) + cctxPtr->prefs.frameInfo.blockSizeID = LZ4F_BLOCKSIZEID_DEFAULT; + cctxPtr->maxBlockSize = LZ4F_getBlockSize(cctxPtr->prefs.frameInfo.blockSizeID); + + { size_t const requiredBuffSize = preferencesPtr->autoFlush ? + ((cctxPtr->prefs.frameInfo.blockMode == LZ4F_blockLinked) ? 64 KB : 0) : /* only needs past data up to window size */ + cctxPtr->maxBlockSize + ((cctxPtr->prefs.frameInfo.blockMode == LZ4F_blockLinked) ? 
128 KB : 0); + + if (cctxPtr->maxBufferSize < requiredBuffSize) { + cctxPtr->maxBufferSize = 0; + FREEMEM(cctxPtr->tmpBuff); + cctxPtr->tmpBuff = (BYTE*)ALLOC_AND_ZERO(requiredBuffSize); + if (cctxPtr->tmpBuff == NULL) return err0r(LZ4F_ERROR_allocation_failed); + cctxPtr->maxBufferSize = requiredBuffSize; + } } + cctxPtr->tmpIn = cctxPtr->tmpBuff; + cctxPtr->tmpInSize = 0; + (void)XXH32_reset(&(cctxPtr->xxh), 0); + + /* context init */ + cctxPtr->cdict = cdict; + if (cctxPtr->prefs.frameInfo.blockMode == LZ4F_blockLinked) { + /* frame init only for blockLinked : blockIndependent will be init at each block */ + LZ4F_initStream(cctxPtr->lz4CtxPtr, cdict, cctxPtr->prefs.compressionLevel, LZ4F_blockLinked); + } + if (preferencesPtr->compressionLevel >= LZ4HC_CLEVEL_MIN) { + LZ4_favorDecompressionSpeed((LZ4_streamHC_t*)cctxPtr->lz4CtxPtr, (int)preferencesPtr->favorDecSpeed); + } + + /* Magic Number */ + LZ4F_writeLE32(dstPtr, LZ4F_MAGICNUMBER); + dstPtr += 4; + headerStart = dstPtr; + + /* FLG Byte */ + *dstPtr++ = (BYTE)(((1 & _2BITS) << 6) /* Version('01') */ + + ((cctxPtr->prefs.frameInfo.blockMode & _1BIT ) << 5) + + ((cctxPtr->prefs.frameInfo.blockChecksumFlag & _1BIT ) << 4) + + ((unsigned)(cctxPtr->prefs.frameInfo.contentSize > 0) << 3) + + ((cctxPtr->prefs.frameInfo.contentChecksumFlag & _1BIT ) << 2) + + (cctxPtr->prefs.frameInfo.dictID > 0) ); + /* BD Byte */ + *dstPtr++ = (BYTE)((cctxPtr->prefs.frameInfo.blockSizeID & _3BITS) << 4); + /* Optional Frame content size field */ + if (cctxPtr->prefs.frameInfo.contentSize) { + LZ4F_writeLE64(dstPtr, cctxPtr->prefs.frameInfo.contentSize); + dstPtr += 8; + cctxPtr->totalInSize = 0; + } + /* Optional dictionary ID field */ + if (cctxPtr->prefs.frameInfo.dictID) { + LZ4F_writeLE32(dstPtr, cctxPtr->prefs.frameInfo.dictID); + dstPtr += 4; + } + /* Header CRC Byte */ + *dstPtr = LZ4F_headerChecksum(headerStart, (size_t)(dstPtr - headerStart)); + dstPtr++; + + cctxPtr->cStage = 1; /* header written, now request input data 
block */ + return (size_t)(dstPtr - dstStart); +} + + +/*! LZ4F_compressBegin() : + * init streaming compression and writes frame header into dstBuffer. + * dstBuffer must be >= LZ4F_HEADER_SIZE_MAX bytes. + * preferencesPtr can be NULL, in which case default parameters are selected. + * @return : number of bytes written into dstBuffer for the header + * or an error code (can be tested using LZ4F_isError()) + */ +size_t LZ4F_compressBegin(LZ4F_cctx* cctxPtr, + void* dstBuffer, size_t dstCapacity, + const LZ4F_preferences_t* preferencesPtr) +{ + return LZ4F_compressBegin_usingCDict(cctxPtr, dstBuffer, dstCapacity, + NULL, preferencesPtr); +} + + +/* LZ4F_compressBound() : + * @return minimum capacity of dstBuffer for a given srcSize to handle worst case scenario. + * LZ4F_preferences_t structure is optional : if NULL, preferences will be set to cover worst case scenario. + * This function cannot fail. + */ +size_t LZ4F_compressBound(size_t srcSize, const LZ4F_preferences_t* preferencesPtr) +{ + if (preferencesPtr && preferencesPtr->autoFlush) { + return LZ4F_compressBound_internal(srcSize, preferencesPtr, 0); + } + return LZ4F_compressBound_internal(srcSize, preferencesPtr, (size_t)-1); +} + + +typedef int (*compressFunc_t)(void* ctx, const char* src, char* dst, int srcSize, int dstSize, int level, const LZ4F_CDict* cdict); + + +/*! LZ4F_makeBlock(): + * compress a single block, add header and optional checksum. 
+ * assumption : dst buffer capacity is >= BHSize + srcSize + crcSize + */ +static size_t LZ4F_makeBlock(void* dst, + const void* src, size_t srcSize, + compressFunc_t compress, void* lz4ctx, int level, + const LZ4F_CDict* cdict, + LZ4F_blockChecksum_t crcFlag) +{ + BYTE* const cSizePtr = (BYTE*)dst; + U32 cSize = (U32)compress(lz4ctx, (const char*)src, (char*)(cSizePtr+BHSize), + (int)(srcSize), (int)(srcSize-1), + level, cdict); + if (cSize == 0) { /* compression failed */ + DEBUGLOG(5, "LZ4F_makeBlock: compression failed, creating a raw block (size %u)", (U32)srcSize); + cSize = (U32)srcSize; + LZ4F_writeLE32(cSizePtr, cSize | LZ4F_BLOCKUNCOMPRESSED_FLAG); + memcpy(cSizePtr+BHSize, src, srcSize); + } else { + LZ4F_writeLE32(cSizePtr, cSize); + } + if (crcFlag) { + U32 const crc32 = XXH32(cSizePtr+BHSize, cSize, 0); /* checksum of compressed data */ + LZ4F_writeLE32(cSizePtr+BHSize+cSize, crc32); + } + return BHSize + cSize + ((U32)crcFlag)*BFSize; +} + + +static int LZ4F_compressBlock(void* ctx, const char* src, char* dst, int srcSize, int dstCapacity, int level, const LZ4F_CDict* cdict) +{ + int const acceleration = (level < 0) ? -level + 1 : 1; + LZ4F_initStream(ctx, cdict, level, LZ4F_blockIndependent); + if (cdict) { + return LZ4_compress_fast_continue((LZ4_stream_t*)ctx, src, dst, srcSize, dstCapacity, acceleration); + } else { + return LZ4_compress_fast_extState_fastReset(ctx, src, dst, srcSize, dstCapacity, acceleration); + } +} + +static int LZ4F_compressBlock_continue(void* ctx, const char* src, char* dst, int srcSize, int dstCapacity, int level, const LZ4F_CDict* cdict) +{ + int const acceleration = (level < 0) ? 
-level + 1 : 1; + (void)cdict; /* init once at beginning of frame */ + return LZ4_compress_fast_continue((LZ4_stream_t*)ctx, src, dst, srcSize, dstCapacity, acceleration); +} + +static int LZ4F_compressBlockHC(void* ctx, const char* src, char* dst, int srcSize, int dstCapacity, int level, const LZ4F_CDict* cdict) +{ + LZ4F_initStream(ctx, cdict, level, LZ4F_blockIndependent); + if (cdict) { + return LZ4_compress_HC_continue((LZ4_streamHC_t*)ctx, src, dst, srcSize, dstCapacity); + } + return LZ4_compress_HC_extStateHC_fastReset(ctx, src, dst, srcSize, dstCapacity, level); +} + +static int LZ4F_compressBlockHC_continue(void* ctx, const char* src, char* dst, int srcSize, int dstCapacity, int level, const LZ4F_CDict* cdict) +{ + (void)level; (void)cdict; /* init once at beginning of frame */ + return LZ4_compress_HC_continue((LZ4_streamHC_t*)ctx, src, dst, srcSize, dstCapacity); +} + +static compressFunc_t LZ4F_selectCompression(LZ4F_blockMode_t blockMode, int level) +{ + if (level < LZ4HC_CLEVEL_MIN) { + if (blockMode == LZ4F_blockIndependent) return LZ4F_compressBlock; + return LZ4F_compressBlock_continue; + } + if (blockMode == LZ4F_blockIndependent) return LZ4F_compressBlockHC; + return LZ4F_compressBlockHC_continue; +} + +static int LZ4F_localSaveDict(LZ4F_cctx_t* cctxPtr) +{ + if (cctxPtr->prefs.compressionLevel < LZ4HC_CLEVEL_MIN) + return LZ4_saveDict ((LZ4_stream_t*)(cctxPtr->lz4CtxPtr), (char*)(cctxPtr->tmpBuff), 64 KB); + return LZ4_saveDictHC ((LZ4_streamHC_t*)(cctxPtr->lz4CtxPtr), (char*)(cctxPtr->tmpBuff), 64 KB); +} + +typedef enum { notDone, fromTmpBuffer, fromSrcBuffer } LZ4F_lastBlockStatus; + +/*! LZ4F_compressUpdate() : + * LZ4F_compressUpdate() can be called repetitively to compress as much data as necessary. + * dstBuffer MUST be >= LZ4F_compressBound(srcSize, preferencesPtr). + * LZ4F_compressOptions_t structure is optional : you can provide NULL as argument. + * @return : the number of bytes written into dstBuffer. 
It can be zero, meaning input data was just buffered. + * or an error code if it fails (which can be tested using LZ4F_isError()) + */ +size_t LZ4F_compressUpdate(LZ4F_cctx* cctxPtr, + void* dstBuffer, size_t dstCapacity, + const void* srcBuffer, size_t srcSize, + const LZ4F_compressOptions_t* compressOptionsPtr) +{ + LZ4F_compressOptions_t cOptionsNull; + size_t const blockSize = cctxPtr->maxBlockSize; + const BYTE* srcPtr = (const BYTE*)srcBuffer; + const BYTE* const srcEnd = srcPtr + srcSize; + BYTE* const dstStart = (BYTE*)dstBuffer; + BYTE* dstPtr = dstStart; + LZ4F_lastBlockStatus lastBlockCompressed = notDone; + compressFunc_t const compress = LZ4F_selectCompression(cctxPtr->prefs.frameInfo.blockMode, cctxPtr->prefs.compressionLevel); + + DEBUGLOG(4, "LZ4F_compressUpdate (srcSize=%zu)", srcSize); + + if (cctxPtr->cStage != 1) return err0r(LZ4F_ERROR_GENERIC); + if (dstCapacity < LZ4F_compressBound_internal(srcSize, &(cctxPtr->prefs), cctxPtr->tmpInSize)) + return err0r(LZ4F_ERROR_dstMaxSize_tooSmall); + MEM_INIT(&cOptionsNull, 0, sizeof(cOptionsNull)); + if (compressOptionsPtr == NULL) compressOptionsPtr = &cOptionsNull; + + /* complete tmp buffer */ + if (cctxPtr->tmpInSize > 0) { /* some data already within tmp buffer */ + size_t const sizeToCopy = blockSize - cctxPtr->tmpInSize; + if (sizeToCopy > srcSize) { + /* add src to tmpIn buffer */ + memcpy(cctxPtr->tmpIn + cctxPtr->tmpInSize, srcBuffer, srcSize); + srcPtr = srcEnd; + cctxPtr->tmpInSize += srcSize; + /* still needs some CRC */ + } else { + /* complete tmpIn block and then compress it */ + lastBlockCompressed = fromTmpBuffer; + memcpy(cctxPtr->tmpIn + cctxPtr->tmpInSize, srcBuffer, sizeToCopy); + srcPtr += sizeToCopy; + + dstPtr += LZ4F_makeBlock(dstPtr, + cctxPtr->tmpIn, blockSize, + compress, cctxPtr->lz4CtxPtr, cctxPtr->prefs.compressionLevel, + cctxPtr->cdict, + cctxPtr->prefs.frameInfo.blockChecksumFlag); + + if (cctxPtr->prefs.frameInfo.blockMode==LZ4F_blockLinked) cctxPtr->tmpIn += 
blockSize; + cctxPtr->tmpInSize = 0; + } + } + + while ((size_t)(srcEnd - srcPtr) >= blockSize) { + /* compress full blocks */ + lastBlockCompressed = fromSrcBuffer; + dstPtr += LZ4F_makeBlock(dstPtr, + srcPtr, blockSize, + compress, cctxPtr->lz4CtxPtr, cctxPtr->prefs.compressionLevel, + cctxPtr->cdict, + cctxPtr->prefs.frameInfo.blockChecksumFlag); + srcPtr += blockSize; + } + + if ((cctxPtr->prefs.autoFlush) && (srcPtr < srcEnd)) { + /* compress remaining input < blockSize */ + lastBlockCompressed = fromSrcBuffer; + dstPtr += LZ4F_makeBlock(dstPtr, + srcPtr, (size_t)(srcEnd - srcPtr), + compress, cctxPtr->lz4CtxPtr, cctxPtr->prefs.compressionLevel, + cctxPtr->cdict, + cctxPtr->prefs.frameInfo.blockChecksumFlag); + srcPtr = srcEnd; + } + + /* preserve dictionary if necessary */ + if ((cctxPtr->prefs.frameInfo.blockMode==LZ4F_blockLinked) && (lastBlockCompressed==fromSrcBuffer)) { + if (compressOptionsPtr->stableSrc) { + cctxPtr->tmpIn = cctxPtr->tmpBuff; + } else { + int const realDictSize = LZ4F_localSaveDict(cctxPtr); + if (realDictSize==0) return err0r(LZ4F_ERROR_GENERIC); + cctxPtr->tmpIn = cctxPtr->tmpBuff + realDictSize; + } + } + + /* keep tmpIn within limits */ + if (!(cctxPtr->prefs.autoFlush) && + (cctxPtr->tmpIn + blockSize) > (cctxPtr->tmpBuff + cctxPtr->maxBufferSize)) /* necessarily LZ4F_blockLinked && lastBlockCompressed==fromTmpBuffer */ + { + int const realDictSize = LZ4F_localSaveDict(cctxPtr); + cctxPtr->tmpIn = cctxPtr->tmpBuff + realDictSize; + } + + /* some input data left, necessarily < blockSize */ + if (srcPtr < srcEnd) { + /* fill tmp buffer */ + size_t const sizeToCopy = (size_t)(srcEnd - srcPtr); + memcpy(cctxPtr->tmpIn, srcPtr, sizeToCopy); + cctxPtr->tmpInSize = sizeToCopy; + } + + if (cctxPtr->prefs.frameInfo.contentChecksumFlag == LZ4F_contentChecksumEnabled) + (void)XXH32_update(&(cctxPtr->xxh), srcBuffer, srcSize); + + cctxPtr->totalInSize += srcSize; + return (size_t)(dstPtr - dstStart); +} + + +/*! 
LZ4F_flush() : + * When compressed data must be sent immediately, without waiting for a block to be filled, + * invoke LZ4_flush(), which will immediately compress any remaining data stored within LZ4F_cctx. + * The result of the function is the number of bytes written into dstBuffer. + * It can be zero, this means there was no data left within LZ4F_cctx. + * The function outputs an error code if it fails (can be tested using LZ4F_isError()) + * LZ4F_compressOptions_t* is optional. NULL is a valid argument. + */ +size_t LZ4F_flush(LZ4F_cctx* cctxPtr, + void* dstBuffer, size_t dstCapacity, + const LZ4F_compressOptions_t* compressOptionsPtr) +{ + BYTE* const dstStart = (BYTE*)dstBuffer; + BYTE* dstPtr = dstStart; + compressFunc_t compress; + + if (cctxPtr->tmpInSize == 0) return 0; /* nothing to flush */ + if (cctxPtr->cStage != 1) return err0r(LZ4F_ERROR_GENERIC); + if (dstCapacity < (cctxPtr->tmpInSize + BHSize + BFSize)) + return err0r(LZ4F_ERROR_dstMaxSize_tooSmall); + (void)compressOptionsPtr; /* not yet useful */ + + /* select compression function */ + compress = LZ4F_selectCompression(cctxPtr->prefs.frameInfo.blockMode, cctxPtr->prefs.compressionLevel); + + /* compress tmp buffer */ + dstPtr += LZ4F_makeBlock(dstPtr, + cctxPtr->tmpIn, cctxPtr->tmpInSize, + compress, cctxPtr->lz4CtxPtr, cctxPtr->prefs.compressionLevel, + cctxPtr->cdict, + cctxPtr->prefs.frameInfo.blockChecksumFlag); + assert(((void)"flush overflows dstBuffer!", (size_t)(dstPtr - dstStart) <= dstCapacity)); + + if (cctxPtr->prefs.frameInfo.blockMode == LZ4F_blockLinked) + cctxPtr->tmpIn += cctxPtr->tmpInSize; + cctxPtr->tmpInSize = 0; + + /* keep tmpIn within limits */ + if ((cctxPtr->tmpIn + cctxPtr->maxBlockSize) > (cctxPtr->tmpBuff + cctxPtr->maxBufferSize)) { /* necessarily LZ4F_blockLinked */ + int const realDictSize = LZ4F_localSaveDict(cctxPtr); + cctxPtr->tmpIn = cctxPtr->tmpBuff + realDictSize; + } + + return (size_t)(dstPtr - dstStart); +} + + +/*! 
LZ4F_compressEnd() : + * When you want to properly finish the compressed frame, just call LZ4F_compressEnd(). + * It will flush whatever data remained within compressionContext (like LZ4_flush()) + * but also properly finalize the frame, with an endMark and an (optional) checksum. + * LZ4F_compressOptions_t structure is optional : you can provide NULL as argument. + * @return: the number of bytes written into dstBuffer (necessarily >= 4 (endMark size)) + * or an error code if it fails (can be tested using LZ4F_isError()) + * The context can then be used again to compress a new frame, starting with LZ4F_compressBegin(). + */ +size_t LZ4F_compressEnd(LZ4F_cctx* cctxPtr, + void* dstBuffer, size_t dstCapacity, + const LZ4F_compressOptions_t* compressOptionsPtr) +{ + BYTE* const dstStart = (BYTE*)dstBuffer; + BYTE* dstPtr = dstStart; + + size_t const flushSize = LZ4F_flush(cctxPtr, dstBuffer, dstCapacity, compressOptionsPtr); + DEBUGLOG(5,"LZ4F_compressEnd: dstCapacity=%u", (unsigned)dstCapacity); + if (LZ4F_isError(flushSize)) return flushSize; + dstPtr += flushSize; + + assert(flushSize <= dstCapacity); + dstCapacity -= flushSize; + + if (dstCapacity < 4) return err0r(LZ4F_ERROR_dstMaxSize_tooSmall); + LZ4F_writeLE32(dstPtr, 0); + dstPtr += 4; /* endMark */ + + if (cctxPtr->prefs.frameInfo.contentChecksumFlag == LZ4F_contentChecksumEnabled) { + U32 const xxh = XXH32_digest(&(cctxPtr->xxh)); + if (dstCapacity < 8) return err0r(LZ4F_ERROR_dstMaxSize_tooSmall); + DEBUGLOG(5,"Writing 32-bit content checksum"); + LZ4F_writeLE32(dstPtr, xxh); + dstPtr+=4; /* content Checksum */ + } + + cctxPtr->cStage = 0; /* state is now re-usable (with identical preferences) */ + cctxPtr->maxBufferSize = 0; /* reuse HC context */ + + if (cctxPtr->prefs.frameInfo.contentSize) { + if (cctxPtr->prefs.frameInfo.contentSize != cctxPtr->totalInSize) + return err0r(LZ4F_ERROR_frameSize_wrong); + } + + return (size_t)(dstPtr - dstStart); +} + + 
+/*-*************************************************** +* Frame Decompression +*****************************************************/ + +typedef enum { + dstage_getFrameHeader=0, dstage_storeFrameHeader, + dstage_init, + dstage_getBlockHeader, dstage_storeBlockHeader, + dstage_copyDirect, dstage_getBlockChecksum, + dstage_getCBlock, dstage_storeCBlock, + dstage_flushOut, + dstage_getSuffix, dstage_storeSuffix, + dstage_getSFrameSize, dstage_storeSFrameSize, + dstage_skipSkippable +} dStage_t; + +struct LZ4F_dctx_s { + LZ4F_frameInfo_t frameInfo; + U32 version; + dStage_t dStage; + U64 frameRemainingSize; + size_t maxBlockSize; + size_t maxBufferSize; + BYTE* tmpIn; + size_t tmpInSize; + size_t tmpInTarget; + BYTE* tmpOutBuffer; + const BYTE* dict; + size_t dictSize; + BYTE* tmpOut; + size_t tmpOutSize; + size_t tmpOutStart; + XXH32_state_t xxh; + XXH32_state_t blockChecksum; + BYTE header[LZ4F_HEADER_SIZE_MAX]; +}; /* typedef'd to LZ4F_dctx in lz4frame.h */ + + +/*! LZ4F_createDecompressionContext() : + * Create a decompressionContext object, which will track all decompression operations. + * Provides a pointer to a fully allocated and initialized LZ4F_decompressionContext object. + * Object can later be released using LZ4F_freeDecompressionContext(). + * @return : if != 0, there was an error during context creation. 
+ */ +LZ4F_errorCode_t LZ4F_createDecompressionContext(LZ4F_dctx** LZ4F_decompressionContextPtr, unsigned versionNumber) +{ + LZ4F_dctx* const dctx = (LZ4F_dctx*)ALLOC_AND_ZERO(sizeof(LZ4F_dctx)); + if (dctx == NULL) { /* failed allocation */ + *LZ4F_decompressionContextPtr = NULL; + return err0r(LZ4F_ERROR_allocation_failed); + } + + dctx->version = versionNumber; + *LZ4F_decompressionContextPtr = dctx; + return LZ4F_OK_NoError; +} + +LZ4F_errorCode_t LZ4F_freeDecompressionContext(LZ4F_dctx* dctx) +{ + LZ4F_errorCode_t result = LZ4F_OK_NoError; + if (dctx != NULL) { /* can accept NULL input, like free() */ + result = (LZ4F_errorCode_t)dctx->dStage; + FREEMEM(dctx->tmpIn); + FREEMEM(dctx->tmpOutBuffer); + FREEMEM(dctx); + } + return result; +} + + +/*==--- Streaming Decompression operations ---==*/ + +void LZ4F_resetDecompressionContext(LZ4F_dctx* dctx) +{ + dctx->dStage = dstage_getFrameHeader; + dctx->dict = NULL; + dctx->dictSize = 0; +} + + +/*! LZ4F_decodeHeader() : + * input : `src` points at the **beginning of the frame** + * output : set internal values of dctx, such as + * dctx->frameInfo and dctx->dStage. + * Also allocates internal buffers. 
+ * @return : nb Bytes read from src (necessarily <= srcSize) + * or an error code (testable with LZ4F_isError()) + */ +static size_t LZ4F_decodeHeader(LZ4F_dctx* dctx, const void* src, size_t srcSize) +{ + unsigned blockMode, blockChecksumFlag, contentSizeFlag, contentChecksumFlag, dictIDFlag, blockSizeID; + size_t frameHeaderSize; + const BYTE* srcPtr = (const BYTE*)src; + + DEBUGLOG(5, "LZ4F_decodeHeader"); + /* need to decode header to get frameInfo */ + if (srcSize < minFHSize) return err0r(LZ4F_ERROR_frameHeader_incomplete); /* minimal frame header size */ + MEM_INIT(&(dctx->frameInfo), 0, sizeof(dctx->frameInfo)); + + /* special case : skippable frames */ + if ((LZ4F_readLE32(srcPtr) & 0xFFFFFFF0U) == LZ4F_MAGIC_SKIPPABLE_START) { + dctx->frameInfo.frameType = LZ4F_skippableFrame; + if (src == (void*)(dctx->header)) { + dctx->tmpInSize = srcSize; + dctx->tmpInTarget = 8; + dctx->dStage = dstage_storeSFrameSize; + return srcSize; + } else { + dctx->dStage = dstage_getSFrameSize; + return 4; + } + } + + /* control magic number */ +#ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION + if (LZ4F_readLE32(srcPtr) != LZ4F_MAGICNUMBER) { + DEBUGLOG(4, "frame header error : unknown magic number"); + return err0r(LZ4F_ERROR_frameType_unknown); + } +#endif + dctx->frameInfo.frameType = LZ4F_frame; + + /* Flags */ + { U32 const FLG = srcPtr[4]; + U32 const version = (FLG>>6) & _2BITS; + blockChecksumFlag = (FLG>>4) & _1BIT; + blockMode = (FLG>>5) & _1BIT; + contentSizeFlag = (FLG>>3) & _1BIT; + contentChecksumFlag = (FLG>>2) & _1BIT; + dictIDFlag = FLG & _1BIT; + /* validate */ + if (((FLG>>1)&_1BIT) != 0) return err0r(LZ4F_ERROR_reservedFlag_set); /* Reserved bit */ + if (version != 1) return err0r(LZ4F_ERROR_headerVersion_wrong); /* Version Number, only supported value */ + } + + /* Frame Header Size */ + frameHeaderSize = minFHSize + (contentSizeFlag?8:0) + (dictIDFlag?4:0); + + if (srcSize < frameHeaderSize) { + /* not enough input to fully decode frame header */ + if 
(srcPtr != dctx->header) + memcpy(dctx->header, srcPtr, srcSize); + dctx->tmpInSize = srcSize; + dctx->tmpInTarget = frameHeaderSize; + dctx->dStage = dstage_storeFrameHeader; + return srcSize; + } + + { U32 const BD = srcPtr[5]; + blockSizeID = (BD>>4) & _3BITS; + /* validate */ + if (((BD>>7)&_1BIT) != 0) return err0r(LZ4F_ERROR_reservedFlag_set); /* Reserved bit */ + if (blockSizeID < 4) return err0r(LZ4F_ERROR_maxBlockSize_invalid); /* 4-7 only supported values for the time being */ + if (((BD>>0)&_4BITS) != 0) return err0r(LZ4F_ERROR_reservedFlag_set); /* Reserved bits */ + } + + /* check header */ + assert(frameHeaderSize > 5); +#ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION + { BYTE const HC = LZ4F_headerChecksum(srcPtr+4, frameHeaderSize-5); + if (HC != srcPtr[frameHeaderSize-1]) + return err0r(LZ4F_ERROR_headerChecksum_invalid); + } +#endif + + /* save */ + dctx->frameInfo.blockMode = (LZ4F_blockMode_t)blockMode; + dctx->frameInfo.blockChecksumFlag = (LZ4F_blockChecksum_t)blockChecksumFlag; + dctx->frameInfo.contentChecksumFlag = (LZ4F_contentChecksum_t)contentChecksumFlag; + dctx->frameInfo.blockSizeID = (LZ4F_blockSizeID_t)blockSizeID; + dctx->maxBlockSize = LZ4F_getBlockSize(blockSizeID); + if (contentSizeFlag) + dctx->frameRemainingSize = + dctx->frameInfo.contentSize = LZ4F_readLE64(srcPtr+6); + if (dictIDFlag) + dctx->frameInfo.dictID = LZ4F_readLE32(srcPtr + frameHeaderSize - 5); + + dctx->dStage = dstage_init; + + return frameHeaderSize; +} + + +/*! 
LZ4F_headerSize() : + * @return : size of frame header + * or an error code, which can be tested using LZ4F_isError() + */ +size_t LZ4F_headerSize(const void* src, size_t srcSize) +{ + if (src == NULL) return err0r(LZ4F_ERROR_srcPtr_wrong); + + /* minimal srcSize to determine header size */ + if (srcSize < LZ4F_MIN_SIZE_TO_KNOW_HEADER_LENGTH) + return err0r(LZ4F_ERROR_frameHeader_incomplete); + + /* special case : skippable frames */ + if ((LZ4F_readLE32(src) & 0xFFFFFFF0U) == LZ4F_MAGIC_SKIPPABLE_START) + return 8; + + /* control magic number */ +#ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION + if (LZ4F_readLE32(src) != LZ4F_MAGICNUMBER) + return err0r(LZ4F_ERROR_frameType_unknown); +#endif + + /* Frame Header Size */ + { BYTE const FLG = ((const BYTE*)src)[4]; + U32 const contentSizeFlag = (FLG>>3) & _1BIT; + U32 const dictIDFlag = FLG & _1BIT; + return minFHSize + (contentSizeFlag?8:0) + (dictIDFlag?4:0); + } +} + +/*! LZ4F_getFrameInfo() : + * This function extracts frame parameters (max blockSize, frame checksum, etc.). + * Usage is optional. Objective is to provide relevant information for allocation purposes. + * This function works in 2 situations : + * - At the beginning of a new frame, in which case it will decode this information from `srcBuffer`, and start the decoding process. + * Amount of input data provided must be large enough to successfully decode the frame header. + * A header size is variable, but is guaranteed to be <= LZ4F_HEADER_SIZE_MAX bytes. It's possible to provide more input data than this minimum. + * - After decoding has been started. In which case, no input is read, frame parameters are extracted from dctx. + * The number of bytes consumed from srcBuffer will be updated within *srcSizePtr (necessarily <= original value). + * Decompression must resume from (srcBuffer + *srcSizePtr). 
+ * @return : an hint about how many srcSize bytes LZ4F_decompress() expects for next call, + * or an error code which can be tested using LZ4F_isError() + * note 1 : in case of error, dctx is not modified. Decoding operations can resume from where they stopped. + * note 2 : frame parameters are *copied into* an already allocated LZ4F_frameInfo_t structure. + */ +LZ4F_errorCode_t LZ4F_getFrameInfo(LZ4F_dctx* dctx, + LZ4F_frameInfo_t* frameInfoPtr, + const void* srcBuffer, size_t* srcSizePtr) +{ + LZ4F_STATIC_ASSERT(dstage_getFrameHeader < dstage_storeFrameHeader); + if (dctx->dStage > dstage_storeFrameHeader) { + /* frameInfo already decoded */ + size_t o=0, i=0; + *srcSizePtr = 0; + *frameInfoPtr = dctx->frameInfo; + /* returns : recommended nb of bytes for LZ4F_decompress() */ + return LZ4F_decompress(dctx, NULL, &o, NULL, &i, NULL); + } else { + if (dctx->dStage == dstage_storeFrameHeader) { + /* frame decoding already started, in the middle of header => automatic fail */ + *srcSizePtr = 0; + return err0r(LZ4F_ERROR_frameDecoding_alreadyStarted); + } else { + size_t const hSize = LZ4F_headerSize(srcBuffer, *srcSizePtr); + if (LZ4F_isError(hSize)) { *srcSizePtr=0; return hSize; } + if (*srcSizePtr < hSize) { + *srcSizePtr=0; + return err0r(LZ4F_ERROR_frameHeader_incomplete); + } + + { size_t decodeResult = LZ4F_decodeHeader(dctx, srcBuffer, hSize); + if (LZ4F_isError(decodeResult)) { + *srcSizePtr = 0; + } else { + *srcSizePtr = decodeResult; + decodeResult = BHSize; /* block header size */ + } + *frameInfoPtr = dctx->frameInfo; + return decodeResult; + } } } +} + + +/* LZ4F_updateDict() : + * only used for LZ4F_blockLinked mode + * Condition : dstPtr != NULL + */ +static void LZ4F_updateDict(LZ4F_dctx* dctx, + const BYTE* dstPtr, size_t dstSize, const BYTE* dstBufferStart, + unsigned withinTmp) +{ + assert(dstPtr != NULL); + if (dctx->dictSize==0) { + dctx->dict = (const BYTE*)dstPtr; /* priority to prefix mode */ + } + assert(dctx->dict != NULL); + + if 
(dctx->dict + dctx->dictSize == dstPtr) { /* prefix mode, everything within dstBuffer */ + dctx->dictSize += dstSize; + return; + } + + assert(dstPtr >= dstBufferStart); + if ((size_t)(dstPtr - dstBufferStart) + dstSize >= 64 KB) { /* history in dstBuffer becomes large enough to become dictionary */ + dctx->dict = (const BYTE*)dstBufferStart; + dctx->dictSize = (size_t)(dstPtr - dstBufferStart) + dstSize; + return; + } + + assert(dstSize < 64 KB); /* if dstSize >= 64 KB, dictionary would be set into dstBuffer directly */ + + /* dstBuffer does not contain whole useful history (64 KB), so it must be saved within tmpOutBuffer */ + assert(dctx->tmpOutBuffer != NULL); + + if (withinTmp && (dctx->dict == dctx->tmpOutBuffer)) { /* continue history within tmpOutBuffer */ + /* withinTmp expectation : content of [dstPtr,dstSize] is same as [dict+dictSize,dstSize], so we just extend it */ + assert(dctx->dict + dctx->dictSize == dctx->tmpOut + dctx->tmpOutStart); + dctx->dictSize += dstSize; + return; + } + + if (withinTmp) { /* copy relevant dict portion in front of tmpOut within tmpOutBuffer */ + size_t const preserveSize = (size_t)(dctx->tmpOut - dctx->tmpOutBuffer); + size_t copySize = 64 KB - dctx->tmpOutSize; + const BYTE* const oldDictEnd = dctx->dict + dctx->dictSize - dctx->tmpOutStart; + if (dctx->tmpOutSize > 64 KB) copySize = 0; + if (copySize > preserveSize) copySize = preserveSize; + + memcpy(dctx->tmpOutBuffer + preserveSize - copySize, oldDictEnd - copySize, copySize); + + dctx->dict = dctx->tmpOutBuffer; + dctx->dictSize = preserveSize + dctx->tmpOutStart + dstSize; + return; + } + + if (dctx->dict == dctx->tmpOutBuffer) { /* copy dst into tmp to complete dict */ + if (dctx->dictSize + dstSize > dctx->maxBufferSize) { /* tmp buffer not large enough */ + size_t const preserveSize = 64 KB - dstSize; + memcpy(dctx->tmpOutBuffer, dctx->dict + dctx->dictSize - preserveSize, preserveSize); + dctx->dictSize = preserveSize; + } + memcpy(dctx->tmpOutBuffer + 
dctx->dictSize, dstPtr, dstSize); + dctx->dictSize += dstSize; + return; + } + + /* join dict & dest into tmp */ + { size_t preserveSize = 64 KB - dstSize; + if (preserveSize > dctx->dictSize) preserveSize = dctx->dictSize; + memcpy(dctx->tmpOutBuffer, dctx->dict + dctx->dictSize - preserveSize, preserveSize); + memcpy(dctx->tmpOutBuffer + preserveSize, dstPtr, dstSize); + dctx->dict = dctx->tmpOutBuffer; + dctx->dictSize = preserveSize + dstSize; + } +} + + + +/*! LZ4F_decompress() : + * Call this function repetitively to regenerate compressed data in srcBuffer. + * The function will attempt to decode up to *srcSizePtr bytes from srcBuffer + * into dstBuffer of capacity *dstSizePtr. + * + * The number of bytes regenerated into dstBuffer will be provided within *dstSizePtr (necessarily <= original value). + * + * The number of bytes effectively read from srcBuffer will be provided within *srcSizePtr (necessarily <= original value). + * If number of bytes read is < number of bytes provided, then decompression operation is not complete. + * Remaining data will have to be presented again in a subsequent invocation. + * + * The function result is an hint of the better srcSize to use for next call to LZ4F_decompress. + * Schematically, it's the size of the current (or remaining) compressed block + header of next block. + * Respecting the hint provides a small boost to performance, since it allows less buffer shuffling. + * Note that this is just a hint, and it's always possible to any srcSize value. + * When a frame is fully decoded, @return will be 0. + * If decompression failed, @return is an error code which can be tested using LZ4F_isError(). 
+ */ +size_t LZ4F_decompress(LZ4F_dctx* dctx, + void* dstBuffer, size_t* dstSizePtr, + const void* srcBuffer, size_t* srcSizePtr, + const LZ4F_decompressOptions_t* decompressOptionsPtr) +{ + LZ4F_decompressOptions_t optionsNull; + const BYTE* const srcStart = (const BYTE*)srcBuffer; + const BYTE* const srcEnd = srcStart + *srcSizePtr; + const BYTE* srcPtr = srcStart; + BYTE* const dstStart = (BYTE*)dstBuffer; + BYTE* const dstEnd = dstStart ? dstStart + *dstSizePtr : NULL; + BYTE* dstPtr = dstStart; + const BYTE* selectedIn = NULL; + unsigned doAnotherStage = 1; + size_t nextSrcSizeHint = 1; + + + DEBUGLOG(5, "LZ4F_decompress : %p,%u => %p,%u", + srcBuffer, (unsigned)*srcSizePtr, dstBuffer, (unsigned)*dstSizePtr); + if (dstBuffer == NULL) assert(*dstSizePtr == 0); + MEM_INIT(&optionsNull, 0, sizeof(optionsNull)); + if (decompressOptionsPtr==NULL) decompressOptionsPtr = &optionsNull; + *srcSizePtr = 0; + *dstSizePtr = 0; + assert(dctx != NULL); + + /* behaves as a state machine */ + + while (doAnotherStage) { + + switch(dctx->dStage) + { + + case dstage_getFrameHeader: + DEBUGLOG(6, "dstage_getFrameHeader"); + if ((size_t)(srcEnd-srcPtr) >= maxFHSize) { /* enough to decode - shortcut */ + size_t const hSize = LZ4F_decodeHeader(dctx, srcPtr, (size_t)(srcEnd-srcPtr)); /* will update dStage appropriately */ + if (LZ4F_isError(hSize)) return hSize; + srcPtr += hSize; + break; + } + dctx->tmpInSize = 0; + if (srcEnd-srcPtr == 0) return minFHSize; /* 0-size input */ + dctx->tmpInTarget = minFHSize; /* minimum size to decode header */ + dctx->dStage = dstage_storeFrameHeader; + /* fall-through */ + + case dstage_storeFrameHeader: + DEBUGLOG(6, "dstage_storeFrameHeader"); + { size_t const sizeToCopy = MIN(dctx->tmpInTarget - dctx->tmpInSize, (size_t)(srcEnd - srcPtr)); + memcpy(dctx->header + dctx->tmpInSize, srcPtr, sizeToCopy); + dctx->tmpInSize += sizeToCopy; + srcPtr += sizeToCopy; + } + if (dctx->tmpInSize < dctx->tmpInTarget) { + nextSrcSizeHint = (dctx->tmpInTarget - 
dctx->tmpInSize) + BHSize; /* rest of header + nextBlockHeader */ + doAnotherStage = 0; /* not enough src data, ask for some more */ + break; + } + { size_t const hSize = LZ4F_decodeHeader(dctx, dctx->header, dctx->tmpInTarget); /* will update dStage appropriately */ + if (LZ4F_isError(hSize)) return hSize; + } + break; + + case dstage_init: + DEBUGLOG(6, "dstage_init"); + if (dctx->frameInfo.contentChecksumFlag) (void)XXH32_reset(&(dctx->xxh), 0); + /* internal buffers allocation */ + { size_t const bufferNeeded = dctx->maxBlockSize + + ((dctx->frameInfo.blockMode==LZ4F_blockLinked) ? 128 KB : 0); + if (bufferNeeded > dctx->maxBufferSize) { /* tmp buffers too small */ + dctx->maxBufferSize = 0; /* ensure allocation will be re-attempted on next entry*/ + FREEMEM(dctx->tmpIn); + dctx->tmpIn = (BYTE*)ALLOC(dctx->maxBlockSize + BFSize /* block checksum */); + if (dctx->tmpIn == NULL) + return err0r(LZ4F_ERROR_allocation_failed); + FREEMEM(dctx->tmpOutBuffer); + dctx->tmpOutBuffer= (BYTE*)ALLOC(bufferNeeded); + if (dctx->tmpOutBuffer== NULL) + return err0r(LZ4F_ERROR_allocation_failed); + dctx->maxBufferSize = bufferNeeded; + } } + dctx->tmpInSize = 0; + dctx->tmpInTarget = 0; + dctx->tmpOut = dctx->tmpOutBuffer; + dctx->tmpOutStart = 0; + dctx->tmpOutSize = 0; + + dctx->dStage = dstage_getBlockHeader; + /* fall-through */ + + case dstage_getBlockHeader: + if ((size_t)(srcEnd - srcPtr) >= BHSize) { + selectedIn = srcPtr; + srcPtr += BHSize; + } else { + /* not enough input to read cBlockSize field */ + dctx->tmpInSize = 0; + dctx->dStage = dstage_storeBlockHeader; + } + + if (dctx->dStage == dstage_storeBlockHeader) /* can be skipped */ + case dstage_storeBlockHeader: + { size_t const remainingInput = (size_t)(srcEnd - srcPtr); + size_t const wantedData = BHSize - dctx->tmpInSize; + size_t const sizeToCopy = MIN(wantedData, remainingInput); + memcpy(dctx->tmpIn + dctx->tmpInSize, srcPtr, sizeToCopy); + srcPtr += sizeToCopy; + dctx->tmpInSize += sizeToCopy; + + if 
(dctx->tmpInSize < BHSize) { /* not enough input for cBlockSize */ + nextSrcSizeHint = BHSize - dctx->tmpInSize; + doAnotherStage = 0; + break; + } + selectedIn = dctx->tmpIn; + } /* if (dctx->dStage == dstage_storeBlockHeader) */ + + /* decode block header */ + { U32 const blockHeader = LZ4F_readLE32(selectedIn); + size_t const nextCBlockSize = blockHeader & 0x7FFFFFFFU; + size_t const crcSize = dctx->frameInfo.blockChecksumFlag * BFSize; + if (blockHeader==0) { /* frameEnd signal, no more block */ + DEBUGLOG(5, "end of frame"); + dctx->dStage = dstage_getSuffix; + break; + } + if (nextCBlockSize > dctx->maxBlockSize) { + return err0r(LZ4F_ERROR_maxBlockSize_invalid); + } + if (blockHeader & LZ4F_BLOCKUNCOMPRESSED_FLAG) { + /* next block is uncompressed */ + dctx->tmpInTarget = nextCBlockSize; + DEBUGLOG(5, "next block is uncompressed (size %u)", (U32)nextCBlockSize); + if (dctx->frameInfo.blockChecksumFlag) { + (void)XXH32_reset(&dctx->blockChecksum, 0); + } + dctx->dStage = dstage_copyDirect; + break; + } + /* next block is a compressed block */ + dctx->tmpInTarget = nextCBlockSize + crcSize; + dctx->dStage = dstage_getCBlock; + if (dstPtr==dstEnd || srcPtr==srcEnd) { + nextSrcSizeHint = BHSize + nextCBlockSize + crcSize; + doAnotherStage = 0; + } + break; + } + + case dstage_copyDirect: /* uncompressed block */ + DEBUGLOG(6, "dstage_copyDirect"); + { size_t sizeToCopy; + if (dstPtr == NULL) { + sizeToCopy = 0; + } else { + size_t const minBuffSize = MIN((size_t)(srcEnd-srcPtr), (size_t)(dstEnd-dstPtr)); + sizeToCopy = MIN(dctx->tmpInTarget, minBuffSize); + memcpy(dstPtr, srcPtr, sizeToCopy); + if (dctx->frameInfo.blockChecksumFlag) { + (void)XXH32_update(&dctx->blockChecksum, srcPtr, sizeToCopy); + } + if (dctx->frameInfo.contentChecksumFlag) + (void)XXH32_update(&dctx->xxh, srcPtr, sizeToCopy); + if (dctx->frameInfo.contentSize) + dctx->frameRemainingSize -= sizeToCopy; + + /* history management (linked blocks only)*/ + if (dctx->frameInfo.blockMode == 
LZ4F_blockLinked) { + LZ4F_updateDict(dctx, dstPtr, sizeToCopy, dstStart, 0); + } } + + srcPtr += sizeToCopy; + dstPtr += sizeToCopy; + if (sizeToCopy == dctx->tmpInTarget) { /* all done */ + if (dctx->frameInfo.blockChecksumFlag) { + dctx->tmpInSize = 0; + dctx->dStage = dstage_getBlockChecksum; + } else + dctx->dStage = dstage_getBlockHeader; /* new block */ + break; + } + dctx->tmpInTarget -= sizeToCopy; /* need to copy more */ + } + nextSrcSizeHint = dctx->tmpInTarget + + +(dctx->frameInfo.blockChecksumFlag ? BFSize : 0) + + BHSize /* next header size */; + doAnotherStage = 0; + break; + + /* check block checksum for recently transferred uncompressed block */ + case dstage_getBlockChecksum: + DEBUGLOG(6, "dstage_getBlockChecksum"); + { const void* crcSrc; + if ((srcEnd-srcPtr >= 4) && (dctx->tmpInSize==0)) { + crcSrc = srcPtr; + srcPtr += 4; + } else { + size_t const stillToCopy = 4 - dctx->tmpInSize; + size_t const sizeToCopy = MIN(stillToCopy, (size_t)(srcEnd-srcPtr)); + memcpy(dctx->header + dctx->tmpInSize, srcPtr, sizeToCopy); + dctx->tmpInSize += sizeToCopy; + srcPtr += sizeToCopy; + if (dctx->tmpInSize < 4) { /* all input consumed */ + doAnotherStage = 0; + break; + } + crcSrc = dctx->header; + } + { U32 const readCRC = LZ4F_readLE32(crcSrc); + U32 const calcCRC = XXH32_digest(&dctx->blockChecksum); +#ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION + DEBUGLOG(6, "compare block checksum"); + if (readCRC != calcCRC) { + DEBUGLOG(4, "incorrect block checksum: %08X != %08X", + readCRC, calcCRC); + return err0r(LZ4F_ERROR_blockChecksum_invalid); + } +#else + (void)readCRC; + (void)calcCRC; +#endif + } } + dctx->dStage = dstage_getBlockHeader; /* new block */ + break; + + case dstage_getCBlock: + DEBUGLOG(6, "dstage_getCBlock"); + if ((size_t)(srcEnd-srcPtr) < dctx->tmpInTarget) { + dctx->tmpInSize = 0; + dctx->dStage = dstage_storeCBlock; + break; + } + /* input large enough to read full block directly */ + selectedIn = srcPtr; + srcPtr += dctx->tmpInTarget; 
+ + if (0) /* always jump over next block */ + case dstage_storeCBlock: + { size_t const wantedData = dctx->tmpInTarget - dctx->tmpInSize; + size_t const inputLeft = (size_t)(srcEnd-srcPtr); + size_t const sizeToCopy = MIN(wantedData, inputLeft); + memcpy(dctx->tmpIn + dctx->tmpInSize, srcPtr, sizeToCopy); + dctx->tmpInSize += sizeToCopy; + srcPtr += sizeToCopy; + if (dctx->tmpInSize < dctx->tmpInTarget) { /* need more input */ + nextSrcSizeHint = (dctx->tmpInTarget - dctx->tmpInSize) + + (dctx->frameInfo.blockChecksumFlag ? BFSize : 0) + + BHSize /* next header size */; + doAnotherStage = 0; + break; + } + selectedIn = dctx->tmpIn; + } + + /* At this stage, input is large enough to decode a block */ + if (dctx->frameInfo.blockChecksumFlag) { + dctx->tmpInTarget -= 4; + assert(selectedIn != NULL); /* selectedIn is defined at this stage (either srcPtr, or dctx->tmpIn) */ + { U32 const readBlockCrc = LZ4F_readLE32(selectedIn + dctx->tmpInTarget); + U32 const calcBlockCrc = XXH32(selectedIn, dctx->tmpInTarget, 0); +#ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION + if (readBlockCrc != calcBlockCrc) + return err0r(LZ4F_ERROR_blockChecksum_invalid); +#else + (void)readBlockCrc; + (void)calcBlockCrc; +#endif + } } + + if ((size_t)(dstEnd-dstPtr) >= dctx->maxBlockSize) { + const char* dict = (const char*)dctx->dict; + size_t dictSize = dctx->dictSize; + int decodedSize; + assert(dstPtr != NULL); + if (dict && dictSize > 1 GB) { + /* the dictSize param is an int, avoid truncation / sign issues */ + dict += dictSize - 64 KB; + dictSize = 64 KB; + } + /* enough capacity in `dst` to decompress directly there */ + decodedSize = LZ4_decompress_safe_usingDict( + (const char*)selectedIn, (char*)dstPtr, + (int)dctx->tmpInTarget, (int)dctx->maxBlockSize, + dict, (int)dictSize); + if (decodedSize < 0) return err0r(LZ4F_ERROR_GENERIC); /* decompression failed */ + if (dctx->frameInfo.contentChecksumFlag) + XXH32_update(&(dctx->xxh), dstPtr, (size_t)decodedSize); + if 
(dctx->frameInfo.contentSize) + dctx->frameRemainingSize -= (size_t)decodedSize; + + /* dictionary management */ + if (dctx->frameInfo.blockMode==LZ4F_blockLinked) { + LZ4F_updateDict(dctx, dstPtr, (size_t)decodedSize, dstStart, 0); + } + + dstPtr += decodedSize; + dctx->dStage = dstage_getBlockHeader; + break; + } + + /* not enough place into dst : decode into tmpOut */ + /* ensure enough place for tmpOut */ + if (dctx->frameInfo.blockMode == LZ4F_blockLinked) { + if (dctx->dict == dctx->tmpOutBuffer) { + if (dctx->dictSize > 128 KB) { + memcpy(dctx->tmpOutBuffer, dctx->dict + dctx->dictSize - 64 KB, 64 KB); + dctx->dictSize = 64 KB; + } + dctx->tmpOut = dctx->tmpOutBuffer + dctx->dictSize; + } else { /* dict not within tmp */ + size_t const reservedDictSpace = MIN(dctx->dictSize, 64 KB); + dctx->tmpOut = dctx->tmpOutBuffer + reservedDictSpace; + } } + + /* Decode block */ + { const char* dict = (const char*)dctx->dict; + size_t dictSize = dctx->dictSize; + int decodedSize; + if (dict && dictSize > 1 GB) { + /* the dictSize param is an int, avoid truncation / sign issues */ + dict += dictSize - 64 KB; + dictSize = 64 KB; + } + decodedSize = LZ4_decompress_safe_usingDict( + (const char*)selectedIn, (char*)dctx->tmpOut, + (int)dctx->tmpInTarget, (int)dctx->maxBlockSize, + dict, (int)dictSize); + if (decodedSize < 0) /* decompression failed */ + return err0r(LZ4F_ERROR_decompressionFailed); + if (dctx->frameInfo.contentChecksumFlag) + XXH32_update(&(dctx->xxh), dctx->tmpOut, (size_t)decodedSize); + if (dctx->frameInfo.contentSize) + dctx->frameRemainingSize -= (size_t)decodedSize; + dctx->tmpOutSize = (size_t)decodedSize; + dctx->tmpOutStart = 0; + dctx->dStage = dstage_flushOut; + } + /* fall-through */ + + case dstage_flushOut: /* flush decoded data from tmpOut to dstBuffer */ + DEBUGLOG(6, "dstage_flushOut"); + if (dstPtr != NULL) { + size_t const sizeToCopy = MIN(dctx->tmpOutSize - dctx->tmpOutStart, (size_t)(dstEnd-dstPtr)); + memcpy(dstPtr, dctx->tmpOut + 
dctx->tmpOutStart, sizeToCopy); + + /* dictionary management */ + if (dctx->frameInfo.blockMode == LZ4F_blockLinked) + LZ4F_updateDict(dctx, dstPtr, sizeToCopy, dstStart, 1 /*withinTmp*/); + + dctx->tmpOutStart += sizeToCopy; + dstPtr += sizeToCopy; + } + if (dctx->tmpOutStart == dctx->tmpOutSize) { /* all flushed */ + dctx->dStage = dstage_getBlockHeader; /* get next block */ + break; + } + /* could not flush everything : stop there, just request a block header */ + doAnotherStage = 0; + nextSrcSizeHint = BHSize; + break; + + case dstage_getSuffix: + if (dctx->frameRemainingSize) + return err0r(LZ4F_ERROR_frameSize_wrong); /* incorrect frame size decoded */ + if (!dctx->frameInfo.contentChecksumFlag) { /* no checksum, frame is completed */ + nextSrcSizeHint = 0; + LZ4F_resetDecompressionContext(dctx); + doAnotherStage = 0; + break; + } + if ((srcEnd - srcPtr) < 4) { /* not enough size for entire CRC */ + dctx->tmpInSize = 0; + dctx->dStage = dstage_storeSuffix; + } else { + selectedIn = srcPtr; + srcPtr += 4; + } + + if (dctx->dStage == dstage_storeSuffix) /* can be skipped */ + case dstage_storeSuffix: + { size_t const remainingInput = (size_t)(srcEnd - srcPtr); + size_t const wantedData = 4 - dctx->tmpInSize; + size_t const sizeToCopy = MIN(wantedData, remainingInput); + memcpy(dctx->tmpIn + dctx->tmpInSize, srcPtr, sizeToCopy); + srcPtr += sizeToCopy; + dctx->tmpInSize += sizeToCopy; + if (dctx->tmpInSize < 4) { /* not enough input to read complete suffix */ + nextSrcSizeHint = 4 - dctx->tmpInSize; + doAnotherStage=0; + break; + } + selectedIn = dctx->tmpIn; + } /* if (dctx->dStage == dstage_storeSuffix) */ + + /* case dstage_checkSuffix: */ /* no direct entry, avoid initialization risks */ + { U32 const readCRC = LZ4F_readLE32(selectedIn); + U32 const resultCRC = XXH32_digest(&(dctx->xxh)); +#ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION + if (readCRC != resultCRC) + return err0r(LZ4F_ERROR_contentChecksum_invalid); +#else + (void)readCRC; + (void)resultCRC; 
+#endif + nextSrcSizeHint = 0; + LZ4F_resetDecompressionContext(dctx); + doAnotherStage = 0; + break; + } + + case dstage_getSFrameSize: + if ((srcEnd - srcPtr) >= 4) { + selectedIn = srcPtr; + srcPtr += 4; + } else { + /* not enough input to read cBlockSize field */ + dctx->tmpInSize = 4; + dctx->tmpInTarget = 8; + dctx->dStage = dstage_storeSFrameSize; + } + + if (dctx->dStage == dstage_storeSFrameSize) + case dstage_storeSFrameSize: + { size_t const sizeToCopy = MIN(dctx->tmpInTarget - dctx->tmpInSize, + (size_t)(srcEnd - srcPtr) ); + memcpy(dctx->header + dctx->tmpInSize, srcPtr, sizeToCopy); + srcPtr += sizeToCopy; + dctx->tmpInSize += sizeToCopy; + if (dctx->tmpInSize < dctx->tmpInTarget) { + /* not enough input to get full sBlockSize; wait for more */ + nextSrcSizeHint = dctx->tmpInTarget - dctx->tmpInSize; + doAnotherStage = 0; + break; + } + selectedIn = dctx->header + 4; + } /* if (dctx->dStage == dstage_storeSFrameSize) */ + + /* case dstage_decodeSFrameSize: */ /* no direct entry */ + { size_t const SFrameSize = LZ4F_readLE32(selectedIn); + dctx->frameInfo.contentSize = SFrameSize; + dctx->tmpInTarget = SFrameSize; + dctx->dStage = dstage_skipSkippable; + break; + } + + case dstage_skipSkippable: + { size_t const skipSize = MIN(dctx->tmpInTarget, (size_t)(srcEnd-srcPtr)); + srcPtr += skipSize; + dctx->tmpInTarget -= skipSize; + doAnotherStage = 0; + nextSrcSizeHint = dctx->tmpInTarget; + if (nextSrcSizeHint) break; /* still more to skip */ + /* frame fully skipped : prepare context for a new frame */ + LZ4F_resetDecompressionContext(dctx); + break; + } + } /* switch (dctx->dStage) */ + } /* while (doAnotherStage) */ + + /* preserve history within tmp whenever necessary */ + LZ4F_STATIC_ASSERT((unsigned)dstage_init == 2); + if ( (dctx->frameInfo.blockMode==LZ4F_blockLinked) /* next block will use up to 64KB from previous ones */ + && (dctx->dict != dctx->tmpOutBuffer) /* dictionary is not already within tmp */ + && (dctx->dict != NULL) /* dictionary 
exists */ + && (!decompressOptionsPtr->stableDst) /* cannot rely on dst data to remain there for next call */ + && ((unsigned)(dctx->dStage)-2 < (unsigned)(dstage_getSuffix)-2) ) /* valid stages : [init ... getSuffix[ */ + { + if (dctx->dStage == dstage_flushOut) { + size_t const preserveSize = (size_t)(dctx->tmpOut - dctx->tmpOutBuffer); + size_t copySize = 64 KB - dctx->tmpOutSize; + const BYTE* oldDictEnd = dctx->dict + dctx->dictSize - dctx->tmpOutStart; + if (dctx->tmpOutSize > 64 KB) copySize = 0; + if (copySize > preserveSize) copySize = preserveSize; + assert(dctx->tmpOutBuffer != NULL); + + memcpy(dctx->tmpOutBuffer + preserveSize - copySize, oldDictEnd - copySize, copySize); + + dctx->dict = dctx->tmpOutBuffer; + dctx->dictSize = preserveSize + dctx->tmpOutStart; + } else { + const BYTE* const oldDictEnd = dctx->dict + dctx->dictSize; + size_t const newDictSize = MIN(dctx->dictSize, 64 KB); + + memcpy(dctx->tmpOutBuffer, oldDictEnd - newDictSize, newDictSize); + + dctx->dict = dctx->tmpOutBuffer; + dctx->dictSize = newDictSize; + dctx->tmpOut = dctx->tmpOutBuffer + newDictSize; + } + } + + *srcSizePtr = (size_t)(srcPtr - srcStart); + *dstSizePtr = (size_t)(dstPtr - dstStart); + return nextSrcSizeHint; +} + +/*! LZ4F_decompress_usingDict() : + * Same as LZ4F_decompress(), using a predefined dictionary. + * Dictionary is used "in place", without any preprocessing. + * It must remain accessible throughout the entire frame decoding. 
+ */ +size_t LZ4F_decompress_usingDict(LZ4F_dctx* dctx, + void* dstBuffer, size_t* dstSizePtr, + const void* srcBuffer, size_t* srcSizePtr, + const void* dict, size_t dictSize, + const LZ4F_decompressOptions_t* decompressOptionsPtr) +{ /* The caller's dictionary is bound to dctx only while dStage <= dstage_init; on any later call the dict/dictSize already stored in dctx are left untouched. */ + if (dctx->dStage <= dstage_init) { + dctx->dict = (const BYTE*)dict; + dctx->dictSize = dictSize; + } /* Everything else is delegated unchanged to LZ4F_decompress() on the same context; its return value is passed through. */ + return LZ4F_decompress(dctx, dstBuffer, dstSizePtr, + srcBuffer, srcSizePtr, + decompressOptionsPtr); +} diff --git a/libbutl/lz4frame.h b/libbutl/lz4frame.h new file mode 100644 index 0000000..4573317 --- /dev/null +++ b/libbutl/lz4frame.h @@ -0,0 +1,623 @@ +/* + LZ4 auto-framing library + Header File + Copyright (C) 2011-2017, Yann Collet. + BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following disclaimer + in the documentation and/or other materials provided with the + distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + You can contact the author at : + - LZ4 source repository : https://github.com/lz4/lz4 + - LZ4 public forum : https://groups.google.com/forum/#!forum/lz4c +*/ + +/* LZ4F is a stand-alone API able to create and decode LZ4 frames + * conformant with specification v1.6.1 in doc/lz4_Frame_format.md . + * Generated frames are compatible with `lz4` CLI. + * + * LZ4F also offers streaming capabilities. + * + * lz4.h is not required when using lz4frame.h, + * except to extract common constant such as LZ4_VERSION_NUMBER. + * */ + +#ifndef LZ4F_H_09782039843 +#define LZ4F_H_09782039843 + +#if defined (__cplusplus) +extern "C" { +#endif + +/* --- Dependency --- */ +#include <stddef.h> /* size_t */ + + +/** + Introduction + + lz4frame.h implements LZ4 frame specification (doc/lz4_Frame_format.md). + lz4frame.h provides frame compression functions that take care + of encoding standard metadata alongside LZ4-compressed blocks. +*/ + +/*-*************************************************************** + * Compiler specifics + *****************************************************************/ +/* LZ4_DLL_EXPORT : + * Enable exporting of functions when building a Windows DLL + * LZ4FLIB_VISIBILITY : + * Control library symbols visibility. 
+ */ +#ifndef LZ4FLIB_VISIBILITY +# if defined(__GNUC__) && (__GNUC__ >= 4) +# define LZ4FLIB_VISIBILITY __attribute__ ((visibility ("default"))) +# else +# define LZ4FLIB_VISIBILITY +# endif +#endif +#if defined(LZ4_DLL_EXPORT) && (LZ4_DLL_EXPORT==1) +# define LZ4FLIB_API __declspec(dllexport) LZ4FLIB_VISIBILITY +#elif defined(LZ4_DLL_IMPORT) && (LZ4_DLL_IMPORT==1) +# define LZ4FLIB_API __declspec(dllimport) LZ4FLIB_VISIBILITY +#else +# define LZ4FLIB_API LZ4FLIB_VISIBILITY +#endif + +#ifdef LZ4F_DISABLE_DEPRECATE_WARNINGS +# define LZ4F_DEPRECATE(x) x +#else +# if defined(_MSC_VER) +# define LZ4F_DEPRECATE(x) x /* __declspec(deprecated) x - only works with C++ */ +# elif defined(__clang__) || (defined(__GNUC__) && (__GNUC__ >= 6)) +# define LZ4F_DEPRECATE(x) x __attribute__((deprecated)) +# else +# define LZ4F_DEPRECATE(x) x /* no deprecation warning for this compiler */ +# endif +#endif + + +/*-************************************ + * Error management + **************************************/ +typedef size_t LZ4F_errorCode_t; + +LZ4FLIB_API unsigned LZ4F_isError(LZ4F_errorCode_t code); /**< tells when a function result is an error code */ +LZ4FLIB_API const char* LZ4F_getErrorName(LZ4F_errorCode_t code); /**< return error code string; for debugging */ + + +/*-************************************ + * Frame compression types + ************************************* */ +/* #define LZ4F_ENABLE_OBSOLETE_ENUMS // uncomment to enable obsolete enums */ +#ifdef LZ4F_ENABLE_OBSOLETE_ENUMS +# define LZ4F_OBSOLETE_ENUM(x) , LZ4F_DEPRECATE(x) = LZ4F_##x +#else +# define LZ4F_OBSOLETE_ENUM(x) +#endif + +/* The larger the block size, the (slightly) better the compression ratio, + * though there are diminishing returns. + * Larger blocks also increase memory usage on both compression and decompression sides. 
+ */ +typedef enum { + LZ4F_default=0, + LZ4F_max64KB=4, + LZ4F_max256KB=5, + LZ4F_max1MB=6, + LZ4F_max4MB=7 + LZ4F_OBSOLETE_ENUM(max64KB) + LZ4F_OBSOLETE_ENUM(max256KB) + LZ4F_OBSOLETE_ENUM(max1MB) + LZ4F_OBSOLETE_ENUM(max4MB) +} LZ4F_blockSizeID_t; + +/* Linked blocks sharply reduce inefficiencies when using small blocks, + * they compress better. + * However, some LZ4 decoders are only compatible with independent blocks */ +typedef enum { + LZ4F_blockLinked=0, + LZ4F_blockIndependent + LZ4F_OBSOLETE_ENUM(blockLinked) + LZ4F_OBSOLETE_ENUM(blockIndependent) +} LZ4F_blockMode_t; + +typedef enum { + LZ4F_noContentChecksum=0, + LZ4F_contentChecksumEnabled + LZ4F_OBSOLETE_ENUM(noContentChecksum) + LZ4F_OBSOLETE_ENUM(contentChecksumEnabled) +} LZ4F_contentChecksum_t; + +typedef enum { + LZ4F_noBlockChecksum=0, + LZ4F_blockChecksumEnabled +} LZ4F_blockChecksum_t; + +typedef enum { + LZ4F_frame=0, + LZ4F_skippableFrame + LZ4F_OBSOLETE_ENUM(skippableFrame) +} LZ4F_frameType_t; + +#ifdef LZ4F_ENABLE_OBSOLETE_ENUMS +typedef LZ4F_blockSizeID_t blockSizeID_t; +typedef LZ4F_blockMode_t blockMode_t; +typedef LZ4F_frameType_t frameType_t; +typedef LZ4F_contentChecksum_t contentChecksum_t; +#endif + +/*! LZ4F_frameInfo_t : + * makes it possible to set or read frame parameters. + * Structure must be first init to 0, using memset() or LZ4F_INIT_FRAMEINFO, + * setting all parameters to default. 
+ * It's then possible to update selectively some parameters */ +typedef struct { + LZ4F_blockSizeID_t blockSizeID; /* max64KB, max256KB, max1MB, max4MB; 0 == default */ + LZ4F_blockMode_t blockMode; /* LZ4F_blockLinked, LZ4F_blockIndependent; 0 == default */ + LZ4F_contentChecksum_t contentChecksumFlag; /* 1: frame terminated with 32-bit checksum of decompressed data; 0: disabled (default) */ + LZ4F_frameType_t frameType; /* read-only field : LZ4F_frame or LZ4F_skippableFrame */ + unsigned long long contentSize; /* Size of uncompressed content ; 0 == unknown */ + unsigned dictID; /* Dictionary ID, sent by compressor to help decoder select correct dictionary; 0 == no dictID provided */ + LZ4F_blockChecksum_t blockChecksumFlag; /* 1: each block followed by a checksum of block's compressed data; 0: disabled (default) */ +} LZ4F_frameInfo_t; + +#define LZ4F_INIT_FRAMEINFO { LZ4F_default, LZ4F_blockLinked, LZ4F_noContentChecksum, LZ4F_frame, 0ULL, 0U, LZ4F_noBlockChecksum } /* v1.8.3+ */ + +/*! LZ4F_preferences_t : + * makes it possible to supply advanced compression instructions to streaming interface. + * Structure must be first init to 0, using memset() or LZ4F_INIT_PREFERENCES, + * setting all parameters to default. + * All reserved fields must be set to zero. */ +typedef struct { + LZ4F_frameInfo_t frameInfo; + int compressionLevel; /* 0: default (fast mode); values > LZ4HC_CLEVEL_MAX count as LZ4HC_CLEVEL_MAX; values < 0 trigger "fast acceleration" */ + unsigned autoFlush; /* 1: always flush; reduces usage of internal buffers */ + unsigned favorDecSpeed; /* 1: parser favors decompression speed vs compression ratio. 
Only works for high compression modes (>= LZ4HC_CLEVEL_OPT_MIN) */ /* v1.8.2+ */ + unsigned reserved[3]; /* must be zero for forward compatibility */ +} LZ4F_preferences_t; + +#define LZ4F_INIT_PREFERENCES { LZ4F_INIT_FRAMEINFO, 0, 0u, 0u, { 0u, 0u, 0u } } /* v1.8.3+ */ + + +/*-********************************* +* Simple compression function +***********************************/ + +LZ4FLIB_API int LZ4F_compressionLevel_max(void); /* v1.8.0+ */ + +/*! LZ4F_compressFrameBound() : + * Returns the maximum possible compressed size with LZ4F_compressFrame() given srcSize and preferences. + * `preferencesPtr` is optional. It can be replaced by NULL, in which case, the function will assume default preferences. + * Note : this result is only usable with LZ4F_compressFrame(). + * It may also be used with LZ4F_compressUpdate() _if no flush() operation_ is performed. + */ +LZ4FLIB_API size_t LZ4F_compressFrameBound(size_t srcSize, const LZ4F_preferences_t* preferencesPtr); + +/*! LZ4F_compressFrame() : + * Compress an entire srcBuffer into a valid LZ4 frame. + * dstCapacity MUST be >= LZ4F_compressFrameBound(srcSize, preferencesPtr). + * The LZ4F_preferences_t structure is optional : you can provide NULL as argument. All preferences will be set to default. + * @return : number of bytes written into dstBuffer. 
+ * or an error code if it fails (can be tested using LZ4F_isError()) + */ +LZ4FLIB_API size_t LZ4F_compressFrame(void* dstBuffer, size_t dstCapacity, + const void* srcBuffer, size_t srcSize, + const LZ4F_preferences_t* preferencesPtr); + + +/*-*********************************** +* Advanced compression functions +*************************************/ +typedef struct LZ4F_cctx_s LZ4F_cctx; /* incomplete type */ +typedef LZ4F_cctx* LZ4F_compressionContext_t; /* for compatibility with previous API version */ + +typedef struct { + unsigned stableSrc; /* 1 == src content will remain present on future calls to LZ4F_compressUpdate(); skip copying src content within tmp buffer */ + unsigned reserved[3]; +} LZ4F_compressOptions_t; + +/*--- Resource Management ---*/ + +#define LZ4F_VERSION 100 /* This number can be used to check for an incompatible API breaking change */ +LZ4FLIB_API unsigned LZ4F_getVersion(void); + +/*! LZ4F_createCompressionContext() : + * The first thing to do is to create a compressionContext object, which will be used in all compression operations. + * This is achieved using LZ4F_createCompressionContext(), which takes as argument a version. + * The version provided MUST be LZ4F_VERSION. It is intended to track potential version mismatch, notably when using DLL. + * The function will provide a pointer to a fully allocated LZ4F_cctx object. + * If @return != zero, there was an error during context creation. + * The object's memory can be released using LZ4F_freeCompressionContext(). + */ +LZ4FLIB_API LZ4F_errorCode_t LZ4F_createCompressionContext(LZ4F_cctx** cctxPtr, unsigned version); +LZ4FLIB_API LZ4F_errorCode_t LZ4F_freeCompressionContext(LZ4F_cctx* cctx); + + +/*---- Compression ----*/ + +#define LZ4F_HEADER_SIZE_MIN 7 /* LZ4 Frame header size can vary, depending on selected parameters */ +#define LZ4F_HEADER_SIZE_MAX 19 + +/* Size in bytes of a block header in little-endian format. 
Highest bit indicates if block data is uncompressed */ +#define LZ4F_BLOCK_HEADER_SIZE 4 + +/* Size in bytes of a block checksum footer in little-endian format. */ +#define LZ4F_BLOCK_CHECKSUM_SIZE 4 + +/* Size in bytes of the content checksum. */ +#define LZ4F_CONTENT_CHECKSUM_SIZE 4 + +/*! LZ4F_compressBegin() : + * will write the frame header into dstBuffer. + * dstCapacity must be >= LZ4F_HEADER_SIZE_MAX bytes. + * `prefsPtr` is optional : you can provide NULL as argument, all preferences will then be set to default. + * @return : number of bytes written into dstBuffer for the header + * or an error code (which can be tested using LZ4F_isError()) + */ +LZ4FLIB_API size_t LZ4F_compressBegin(LZ4F_cctx* cctx, + void* dstBuffer, size_t dstCapacity, + const LZ4F_preferences_t* prefsPtr); + +/*! LZ4F_compressBound() : + * Provides minimum dstCapacity required to guarantee success of + * LZ4F_compressUpdate(), given a srcSize and preferences, for a worst case scenario. + * When srcSize==0, LZ4F_compressBound() provides an upper bound for LZ4F_flush() and LZ4F_compressEnd() instead. + * Note that the result is only valid for a single invocation of LZ4F_compressUpdate(). + * When invoking LZ4F_compressUpdate() multiple times, + * if the output buffer is gradually filled up instead of emptied and re-used from its start, + * one must check if there is enough remaining capacity before each invocation, using LZ4F_compressBound(). + * @return is always the same for a srcSize and prefsPtr. + * prefsPtr is optional : when NULL is provided, preferences will be set to cover worst case scenario. + * tech details : + * @return if automatic flushing is not enabled, includes the possibility that internal buffer might already be filled by up to (blockSize-1) bytes. + * It also includes frame footer (ending + checksum), since it might be generated by LZ4F_compressEnd(). + * @return doesn't include frame header, as it was already generated by LZ4F_compressBegin(). 
+ */ +LZ4FLIB_API size_t LZ4F_compressBound(size_t srcSize, const LZ4F_preferences_t* prefsPtr); + +/*! LZ4F_compressUpdate() : + * LZ4F_compressUpdate() can be called repeatedly to compress as much data as necessary. + * Important rule: dstCapacity MUST be large enough to ensure operation success even in worst case situations. + * This value is provided by LZ4F_compressBound(). + * If this condition is not respected, LZ4F_compressUpdate() will fail (result is an errorCode). + * LZ4F_compressUpdate() doesn't guarantee error recovery. + * When an error occurs, compression context must be freed or resized. + * `cOptPtr` is optional : NULL can be provided, in which case all options are set to default. + * @return : number of bytes written into `dstBuffer` (it can be zero, meaning input data was just buffered). + * or an error code if it fails (which can be tested using LZ4F_isError()) + */ +LZ4FLIB_API size_t LZ4F_compressUpdate(LZ4F_cctx* cctx, + void* dstBuffer, size_t dstCapacity, + const void* srcBuffer, size_t srcSize, + const LZ4F_compressOptions_t* cOptPtr); + +/*! LZ4F_flush() : + * When data must be generated and sent immediately, without waiting for a block to be completely filled, + * it's possible to call LZ4F_flush(). It will immediately compress any data buffered within cctx. + * `dstCapacity` must be large enough to ensure the operation will be successful. + * `cOptPtr` is optional : it's possible to provide NULL, all options will be set to default. + * @return : nb of bytes written into dstBuffer (can be zero, when there is no data stored within cctx) + * or an error code if it fails (which can be tested using LZ4F_isError()) + * Note : LZ4F_flush() is guaranteed to be successful when dstCapacity >= LZ4F_compressBound(0, prefsPtr). + */ +LZ4FLIB_API size_t LZ4F_flush(LZ4F_cctx* cctx, + void* dstBuffer, size_t dstCapacity, + const LZ4F_compressOptions_t* cOptPtr); + +/*! LZ4F_compressEnd() : + * To properly finish an LZ4 frame, invoke LZ4F_compressEnd(). 
+ * It will flush whatever data remained within `cctx` (like LZ4F_flush()) + * and properly finalize the frame, with an endMark and a checksum. + * `cOptPtr` is optional : NULL can be provided, in which case all options will be set to default. + * @return : nb of bytes written into dstBuffer, necessarily >= 4 (endMark), + * or an error code if it fails (which can be tested using LZ4F_isError()) + * Note : LZ4F_compressEnd() is guaranteed to be successful when dstCapacity >= LZ4F_compressBound(0, prefsPtr). + * A successful call to LZ4F_compressEnd() makes `cctx` available again for another compression task. + */ +LZ4FLIB_API size_t LZ4F_compressEnd(LZ4F_cctx* cctx, + void* dstBuffer, size_t dstCapacity, + const LZ4F_compressOptions_t* cOptPtr); + + +/*-********************************* +* Decompression functions +***********************************/ +typedef struct LZ4F_dctx_s LZ4F_dctx; /* incomplete type */ +typedef LZ4F_dctx* LZ4F_decompressionContext_t; /* compatibility with previous API versions */ + +typedef struct { + unsigned stableDst; /* pledges that last 64KB decompressed data will remain available unmodified. This optimization skips storage operations in tmp buffers. */ + unsigned reserved[3]; /* must be set to zero for forward compatibility */ +} LZ4F_decompressOptions_t; + + +/* Resource management */ + +/*! LZ4F_createDecompressionContext() : + * Create an LZ4F_dctx object, to track all decompression operations. + * The version provided MUST be LZ4F_VERSION. + * The function provides a pointer to an allocated and initialized LZ4F_dctx object. + * The result is an errorCode, which can be tested using LZ4F_isError(). + * dctx memory can be released using LZ4F_freeDecompressionContext(); + * Result of LZ4F_freeDecompressionContext() indicates current state of decompressionContext when being released. + * That is, it should be == 0 if decompression has been completed fully and correctly. 
+ */ +LZ4FLIB_API LZ4F_errorCode_t LZ4F_createDecompressionContext(LZ4F_dctx** dctxPtr, unsigned version); +LZ4FLIB_API LZ4F_errorCode_t LZ4F_freeDecompressionContext(LZ4F_dctx* dctx); + + +/*-*********************************** +* Streaming decompression functions +*************************************/ + +#define LZ4F_MIN_SIZE_TO_KNOW_HEADER_LENGTH 5 + +/*! LZ4F_headerSize() : v1.9.0+ + * Provide the header size of a frame starting at `src`. + * `srcSize` must be >= LZ4F_MIN_SIZE_TO_KNOW_HEADER_LENGTH, + * which is enough to decode the header length. + * @return : size of frame header + * or an error code, which can be tested using LZ4F_isError() + * note : Frame header size is variable, but is guaranteed to be + * >= LZ4F_HEADER_SIZE_MIN bytes, and <= LZ4F_HEADER_SIZE_MAX bytes. + */ +LZ4FLIB_API size_t LZ4F_headerSize(const void* src, size_t srcSize); + +/*! LZ4F_getFrameInfo() : + * This function extracts frame parameters (max blockSize, dictID, etc.). + * Its usage is optional: user can call LZ4F_decompress() directly. + * + * Extracted information will fill an existing LZ4F_frameInfo_t structure. + * This can be useful for allocation and dictionary identification purposes. + * + * LZ4F_getFrameInfo() can work in the following situations : + * + * 1) At the beginning of a new frame, before any invocation of LZ4F_decompress(). + * It will decode header from `srcBuffer`, + * consuming the header and starting the decoding process. + * + * Input size must be large enough to contain the full frame header. + * Frame header size can be known beforehand by LZ4F_headerSize(). + * Frame header size is variable, but is guaranteed to be >= LZ4F_HEADER_SIZE_MIN bytes, + * and <= LZ4F_HEADER_SIZE_MAX bytes. + * Hence, blindly providing LZ4F_HEADER_SIZE_MAX bytes or more will always work. + * It's allowed to provide more input data than the header size, + * LZ4F_getFrameInfo() will only consume the header. 
+ * + * If input size is not large enough, + * aka if it's smaller than header size, + * function will fail and return an error code. + * + * 2) After decoding has been started, + * it's possible to invoke LZ4F_getFrameInfo() anytime + * to extract already decoded frame parameters stored within dctx. + * + * Note that, if decoding has barely started, + * and not yet read enough information to decode the header, + * LZ4F_getFrameInfo() will fail. + * + * The number of bytes consumed from srcBuffer will be updated in *srcSizePtr (necessarily <= original value). + * LZ4F_getFrameInfo() only consumes bytes when decoding has not yet started, + * and when decoding the header has been successful. + * Decompression must then resume from (srcBuffer + *srcSizePtr). + * + * @return : a hint about how many srcSize bytes LZ4F_decompress() expects for next call, + * or an error code which can be tested using LZ4F_isError(). + * note 1 : in case of error, dctx is not modified. Decoding operation can resume from beginning safely. + * note 2 : frame parameters are *copied into* an already allocated LZ4F_frameInfo_t structure. + */ +LZ4FLIB_API size_t LZ4F_getFrameInfo(LZ4F_dctx* dctx, + LZ4F_frameInfo_t* frameInfoPtr, + const void* srcBuffer, size_t* srcSizePtr); + +/*! LZ4F_decompress() : + * Call this function repetitively to regenerate data compressed in `srcBuffer`. + * + * The function requires a valid dctx state. + * It will read up to *srcSizePtr bytes from srcBuffer, + * and decompress data into dstBuffer, of capacity *dstSizePtr. + * + * The nb of bytes consumed from srcBuffer will be written into *srcSizePtr (necessarily <= original value). + * The nb of bytes decompressed into dstBuffer will be written into *dstSizePtr (necessarily <= original value). + * + * The function does not necessarily read all input bytes, so always check value in *srcSizePtr. + * Unconsumed source data must be presented again in subsequent invocations. 
+ * + * `dstBuffer` can freely change between each consecutive function invocation. + * `dstBuffer` content will be overwritten. + * + * @return : a hint of how many `srcSize` bytes LZ4F_decompress() expects for next call. + * Schematically, it's the size of the current (or remaining) compressed block + header of next block. + * Respecting the hint provides some small speed benefit, because it skips intermediate buffers. + * This is just a hint though, it's always possible to provide any srcSize. + * + * When a frame is fully decoded, @return will be 0 (no more data expected). + * When provided with more bytes than necessary to decode a frame, + * LZ4F_decompress() will stop reading exactly at end of current frame, and @return 0. + * + * If decompression failed, @return is an error code, which can be tested using LZ4F_isError(). + * After a decompression error, the `dctx` context is not resumable. + * Use LZ4F_resetDecompressionContext() to return to clean state. + * + * After a frame is fully decoded, dctx can be used again to decompress another frame. + */ +LZ4FLIB_API size_t LZ4F_decompress(LZ4F_dctx* dctx, + void* dstBuffer, size_t* dstSizePtr, + const void* srcBuffer, size_t* srcSizePtr, + const LZ4F_decompressOptions_t* dOptPtr); + + +/*! LZ4F_resetDecompressionContext() : added in v1.8.0 + * In case of an error, the context is left in "undefined" state. + * In that case, it must be reset before being re-used. + * This method can also be used to abruptly stop any unfinished decompression, + * and start a new one using same context resources. */ +LZ4FLIB_API void LZ4F_resetDecompressionContext(LZ4F_dctx* dctx); /* always successful */ + + + +#if defined (__cplusplus) +} +#endif + +#endif /* LZ4F_H_09782039843 */ + +#if defined(LZ4F_STATIC_LINKING_ONLY) && !defined(LZ4F_H_STATIC_09782039843) +#define LZ4F_H_STATIC_09782039843 + +#if defined (__cplusplus) +extern "C" { +#endif + +/* These declarations are not stable and may change in the future. 
+ * They are therefore only safe to depend on + * when the caller is statically linked against the library. + * To access their declarations, define LZ4F_STATIC_LINKING_ONLY. + * + * By default, these symbols aren't published into shared/dynamic libraries. + * You can override this behavior and force them to be published + * by defining LZ4F_PUBLISH_STATIC_FUNCTIONS. + * Use at your own risk. + */ +#ifdef LZ4F_PUBLISH_STATIC_FUNCTIONS +# define LZ4FLIB_STATIC_API LZ4FLIB_API +#else +# define LZ4FLIB_STATIC_API +#endif + + +/* --- Error List --- */ +#define LZ4F_LIST_ERRORS(ITEM) \ + ITEM(OK_NoError) \ + ITEM(ERROR_GENERIC) \ + ITEM(ERROR_maxBlockSize_invalid) \ + ITEM(ERROR_blockMode_invalid) \ + ITEM(ERROR_contentChecksumFlag_invalid) \ + ITEM(ERROR_compressionLevel_invalid) \ + ITEM(ERROR_headerVersion_wrong) \ + ITEM(ERROR_blockChecksum_invalid) \ + ITEM(ERROR_reservedFlag_set) \ + ITEM(ERROR_allocation_failed) \ + ITEM(ERROR_srcSize_tooLarge) \ + ITEM(ERROR_dstMaxSize_tooSmall) \ + ITEM(ERROR_frameHeader_incomplete) \ + ITEM(ERROR_frameType_unknown) \ + ITEM(ERROR_frameSize_wrong) \ + ITEM(ERROR_srcPtr_wrong) \ + ITEM(ERROR_decompressionFailed) \ + ITEM(ERROR_headerChecksum_invalid) \ + ITEM(ERROR_contentChecksum_invalid) \ + ITEM(ERROR_frameDecoding_alreadyStarted) \ + ITEM(ERROR_maxCode) + +#define LZ4F_GENERATE_ENUM(ENUM) LZ4F_##ENUM, + +/* enum list is exposed, to handle specific errors */ +typedef enum { LZ4F_LIST_ERRORS(LZ4F_GENERATE_ENUM) + _LZ4F_dummy_error_enum_for_c89_never_used } LZ4F_errorCodes; + +LZ4FLIB_STATIC_API LZ4F_errorCodes LZ4F_getErrorCode(size_t functionResult); + +LZ4FLIB_STATIC_API size_t LZ4F_getBlockSize(unsigned); + +/********************************** + * Bulk processing dictionary API + *********************************/ + +/* A Dictionary is useful for the compression of small messages (KB range). + * It dramatically improves compression efficiency. 
+ * + * LZ4 can ingest any input as dictionary, though only the last 64 KB are useful. + * Best results are generally achieved by using Zstandard's Dictionary Builder + * to generate a high-quality dictionary from a set of samples. + * + * Loading a dictionary has a cost, since it involves construction of tables. + * The Bulk processing dictionary API makes it possible to share this cost + * over an arbitrary number of compression jobs, even concurrently, + * markedly improving compression latency for these cases. + * + * The same dictionary will have to be used on the decompression side + * for decoding to be successful. + * To help identify the correct dictionary at decoding stage, + * the frame header allows optional embedding of a dictID field. + */ +typedef struct LZ4F_CDict_s LZ4F_CDict; + +/*! LZ4F_createCDict() : + * When compressing multiple messages / blocks using the same dictionary, it's recommended to load it just once. + * LZ4F_createCDict() will create a digested dictionary, ready to start future compression operations without startup delay. + * LZ4F_CDict can be created once and shared by multiple threads concurrently, since its usage is read-only. + * `dictBuffer` can be released after LZ4F_CDict creation, since its content is copied within CDict */ +LZ4FLIB_STATIC_API LZ4F_CDict* LZ4F_createCDict(const void* dictBuffer, size_t dictSize); +LZ4FLIB_STATIC_API void LZ4F_freeCDict(LZ4F_CDict* CDict); + + +/*! LZ4F_compressFrame_usingCDict() : + * Compress an entire srcBuffer into a valid LZ4 frame using a digested Dictionary. + * cctx must point to a context created by LZ4F_createCompressionContext(). + * If cdict==NULL, compress without a dictionary. + * dstCapacity MUST be >= LZ4F_compressFrameBound(srcSize, preferencesPtr). + * If this condition is not respected, function will fail (@return an errorCode). 
+ * The LZ4F_preferences_t structure is optional : you may provide NULL as argument, + * but it's not recommended, as it's the only way to provide dictID in the frame header. + * @return : number of bytes written into dstBuffer. + * or an error code if it fails (can be tested using LZ4F_isError()) */ +LZ4FLIB_STATIC_API size_t LZ4F_compressFrame_usingCDict( + LZ4F_cctx* cctx, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, + const LZ4F_CDict* cdict, + const LZ4F_preferences_t* preferencesPtr); + + +/*! LZ4F_compressBegin_usingCDict() : + * Inits streaming dictionary compression, and writes the frame header into dstBuffer. + * dstCapacity must be >= LZ4F_HEADER_SIZE_MAX bytes. + * `prefsPtr` is optional : you may provide NULL as argument, + * however, it's the only way to provide dictID in the frame header. + * @return : number of bytes written into dstBuffer for the header, + * or an error code (which can be tested using LZ4F_isError()) */ +LZ4FLIB_STATIC_API size_t LZ4F_compressBegin_usingCDict( + LZ4F_cctx* cctx, + void* dstBuffer, size_t dstCapacity, + const LZ4F_CDict* cdict, + const LZ4F_preferences_t* prefsPtr); + + +/*! LZ4F_decompress_usingDict() : + * Same as LZ4F_decompress(), using a predefined dictionary. + * Dictionary is used "in place", without any preprocessing. + * It must remain accessible throughout the entire frame decoding. 
*/ +LZ4FLIB_STATIC_API size_t LZ4F_decompress_usingDict( + LZ4F_dctx* dctxPtr, + void* dstBuffer, size_t* dstSizePtr, + const void* srcBuffer, size_t* srcSizePtr, + const void* dict, size_t dictSize, + const LZ4F_decompressOptions_t* decompressOptionsPtr); + +#if defined (__cplusplus) +} +#endif + +#endif /* defined(LZ4F_STATIC_LINKING_ONLY) && !defined(LZ4F_H_STATIC_09782039843) */ diff --git a/libbutl/lz4hc.c b/libbutl/lz4hc.c new file mode 100644 index 0000000..77c9f43 --- /dev/null +++ b/libbutl/lz4hc.c @@ -0,0 +1,1615 @@ +/* + LZ4 HC - High Compression Mode of LZ4 + Copyright (C) 2011-2017, Yann Collet. + + BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following disclaimer + in the documentation and/or other materials provided with the + distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ + You can contact the author at : + - LZ4 source repository : https://github.com/lz4/lz4 + - LZ4 public forum : https://groups.google.com/forum/#!forum/lz4c +*/ +/* note : lz4hc is not an independent module, it requires lz4.h/lz4.c for proper compilation */ + + +/* ************************************* +* Tuning Parameter +***************************************/ + +/*! HEAPMODE : + * Select how default compression function will allocate workplace memory, + * in stack (0:fastest), or in heap (1:requires malloc()). + * Since workplace is rather large, heap mode is recommended. + */ +#ifndef LZ4HC_HEAPMODE +# define LZ4HC_HEAPMODE 1 +#endif + + +/*=== Dependency ===*/ +#define LZ4_HC_STATIC_LINKING_ONLY +#include "lz4hc.h" + + +/*=== Common definitions ===*/ +#if defined(__GNUC__) +# pragma GCC diagnostic ignored "-Wunused-function" +#endif +#if defined (__clang__) +# pragma clang diagnostic ignored "-Wunused-function" +#endif + +#define LZ4_COMMONDEFS_ONLY +#ifndef LZ4_SRC_INCLUDED +#include "lz4.c" /* LZ4_count, constants, mem */ +#endif + + +/*=== Enums ===*/ +typedef enum { noDictCtx, usingDictCtxHc } dictCtx_directive; + + +/*=== Constants ===*/ +#define OPTIMAL_ML (int)((ML_MASK-1)+MINMATCH) +#define LZ4_OPT_NUM (1<<12) + + +/*=== Macros ===*/ +#define MIN(a,b) ( (a) < (b) ? (a) : (b) ) +#define MAX(a,b) ( (a) > (b) ? 
(a) : (b) ) +#define HASH_FUNCTION(i) (((i) * 2654435761U) >> ((MINMATCH*8)-LZ4HC_HASH_LOG)) +#define DELTANEXTMAXD(p) chainTable[(p) & LZ4HC_MAXD_MASK] /* flexible, LZ4HC_MAXD dependent */ +#define DELTANEXTU16(table, pos) table[(U16)(pos)] /* faster */ +/* Make fields passed to, and updated by LZ4HC_encodeSequence explicit */ +#define UPDATABLE(ip, op, anchor) &ip, &op, &anchor + +static U32 LZ4HC_hashPtr(const void* ptr) { return HASH_FUNCTION(LZ4_read32(ptr)); } + + +/************************************** +* HC Compression +**************************************/ +static void LZ4HC_clearTables (LZ4HC_CCtx_internal* hc4) +{ + MEM_INIT(hc4->hashTable, 0, sizeof(hc4->hashTable)); + MEM_INIT(hc4->chainTable, 0xFF, sizeof(hc4->chainTable)); +} + +static void LZ4HC_init_internal (LZ4HC_CCtx_internal* hc4, const BYTE* start) +{ + uptrval startingOffset = (uptrval)(hc4->end - hc4->base); + if (startingOffset > 1 GB) { + LZ4HC_clearTables(hc4); + startingOffset = 0; + } + startingOffset += 64 KB; + hc4->nextToUpdate = (U32) startingOffset; + hc4->base = start - startingOffset; + hc4->end = start; + hc4->dictBase = start - startingOffset; + hc4->dictLimit = (U32) startingOffset; + hc4->lowLimit = (U32) startingOffset; +} + + +/* Update chains up to ip (excluded) */ +LZ4_FORCE_INLINE void LZ4HC_Insert (LZ4HC_CCtx_internal* hc4, const BYTE* ip) +{ + U16* const chainTable = hc4->chainTable; + U32* const hashTable = hc4->hashTable; + const BYTE* const base = hc4->base; + U32 const target = (U32)(ip - base); + U32 idx = hc4->nextToUpdate; + + while (idx < target) { + U32 const h = LZ4HC_hashPtr(base+idx); + size_t delta = idx - hashTable[h]; + if (delta>LZ4_DISTANCE_MAX) delta = LZ4_DISTANCE_MAX; + DELTANEXTU16(chainTable, idx) = (U16)delta; + hashTable[h] = idx; + idx++; + } + + hc4->nextToUpdate = target; +} + +/** LZ4HC_countBack() : + * @return : negative value, nb of common bytes before ip/match */ +LZ4_FORCE_INLINE +int LZ4HC_countBack(const BYTE* const ip, const BYTE* 
const match, + const BYTE* const iMin, const BYTE* const mMin) +{ + int back = 0; + int const min = (int)MAX(iMin - ip, mMin - match); + assert(min <= 0); + assert(ip >= iMin); assert((size_t)(ip-iMin) < (1U<<31)); + assert(match >= mMin); assert((size_t)(match - mMin) < (1U<<31)); + while ( (back > min) + && (ip[back-1] == match[back-1]) ) + back--; + return back; +} + +#if defined(_MSC_VER) +# define LZ4HC_rotl32(x,r) _rotl(x,r) +#else +# define LZ4HC_rotl32(x,r) ((x << r) | (x >> (32 - r))) +#endif + + +static U32 LZ4HC_rotatePattern(size_t const rotate, U32 const pattern) +{ + size_t const bitsToRotate = (rotate & (sizeof(pattern) - 1)) << 3; + if (bitsToRotate == 0) return pattern; + return LZ4HC_rotl32(pattern, (int)bitsToRotate); +} + +/* LZ4HC_countPattern() : + * pattern32 must be a sample of repetitive pattern of length 1, 2 or 4 (but not 3!) */ +static unsigned +LZ4HC_countPattern(const BYTE* ip, const BYTE* const iEnd, U32 const pattern32) +{ + const BYTE* const iStart = ip; + reg_t const pattern = (sizeof(pattern)==8) ? + (reg_t)pattern32 + (((reg_t)pattern32) << (sizeof(pattern)*4)) : pattern32; + + while (likely(ip < iEnd-(sizeof(pattern)-1))) { + reg_t const diff = LZ4_read_ARCH(ip) ^ pattern; + if (!diff) { ip+=sizeof(pattern); continue; } + ip += LZ4_NbCommonBytes(diff); + return (unsigned)(ip - iStart); + } + + if (LZ4_isLittleEndian()) { + reg_t patternByte = pattern; + while ((ip<iEnd) && (*ip == (BYTE)patternByte)) { + ip++; patternByte >>= 8; + } + } else { /* big endian */ + U32 bitOffset = (sizeof(pattern)*8) - 8; + while (ip < iEnd) { + BYTE const byte = (BYTE)(pattern >> bitOffset); + if (*ip != byte) break; + ip ++; bitOffset -= 8; + } + } + + return (unsigned)(ip - iStart); +} + +/* LZ4HC_reverseCountPattern() : + * pattern must be a sample of repetitive pattern of length 1, 2 or 4 (but not 3!) 
+ * read using natural platform endianess */ +static unsigned +LZ4HC_reverseCountPattern(const BYTE* ip, const BYTE* const iLow, U32 pattern) +{ + const BYTE* const iStart = ip; + + while (likely(ip >= iLow+4)) { + if (LZ4_read32(ip-4) != pattern) break; + ip -= 4; + } + { const BYTE* bytePtr = (const BYTE*)(&pattern) + 3; /* works for any endianess */ + while (likely(ip>iLow)) { + if (ip[-1] != *bytePtr) break; + ip--; bytePtr--; + } } + return (unsigned)(iStart - ip); +} + +/* LZ4HC_protectDictEnd() : + * Checks if the match is in the last 3 bytes of the dictionary, so reading the + * 4 byte MINMATCH would overflow. + * @returns true if the match index is okay. + */ +static int LZ4HC_protectDictEnd(U32 const dictLimit, U32 const matchIndex) +{ + return ((U32)((dictLimit - 1) - matchIndex) >= 3); +} + +typedef enum { rep_untested, rep_not, rep_confirmed } repeat_state_e; +typedef enum { favorCompressionRatio=0, favorDecompressionSpeed } HCfavor_e; + +LZ4_FORCE_INLINE int +LZ4HC_InsertAndGetWiderMatch ( + LZ4HC_CCtx_internal* hc4, + const BYTE* const ip, + const BYTE* const iLowLimit, + const BYTE* const iHighLimit, + int longest, + const BYTE** matchpos, + const BYTE** startpos, + const int maxNbAttempts, + const int patternAnalysis, + const int chainSwap, + const dictCtx_directive dict, + const HCfavor_e favorDecSpeed) +{ + U16* const chainTable = hc4->chainTable; + U32* const HashTable = hc4->hashTable; + const LZ4HC_CCtx_internal * const dictCtx = hc4->dictCtx; + const BYTE* const base = hc4->base; + const U32 dictLimit = hc4->dictLimit; + const BYTE* const lowPrefixPtr = base + dictLimit; + const U32 ipIndex = (U32)(ip - base); + const U32 lowestMatchIndex = (hc4->lowLimit + (LZ4_DISTANCE_MAX + 1) > ipIndex) ? 
hc4->lowLimit : ipIndex - LZ4_DISTANCE_MAX; + const BYTE* const dictBase = hc4->dictBase; + int const lookBackLength = (int)(ip-iLowLimit); + int nbAttempts = maxNbAttempts; + U32 matchChainPos = 0; + U32 const pattern = LZ4_read32(ip); + U32 matchIndex; + repeat_state_e repeat = rep_untested; + size_t srcPatternLength = 0; + + DEBUGLOG(7, "LZ4HC_InsertAndGetWiderMatch"); + /* First Match */ + LZ4HC_Insert(hc4, ip); + matchIndex = HashTable[LZ4HC_hashPtr(ip)]; + DEBUGLOG(7, "First match at index %u / %u (lowestMatchIndex)", + matchIndex, lowestMatchIndex); + + while ((matchIndex>=lowestMatchIndex) && (nbAttempts>0)) { + int matchLength=0; + nbAttempts--; + assert(matchIndex < ipIndex); + if (favorDecSpeed && (ipIndex - matchIndex < 8)) { + /* do nothing */ + } else if (matchIndex >= dictLimit) { /* within current Prefix */ + const BYTE* const matchPtr = base + matchIndex; + assert(matchPtr >= lowPrefixPtr); + assert(matchPtr < ip); + assert(longest >= 1); + if (LZ4_read16(iLowLimit + longest - 1) == LZ4_read16(matchPtr - lookBackLength + longest - 1)) { + if (LZ4_read32(matchPtr) == pattern) { + int const back = lookBackLength ? 
LZ4HC_countBack(ip, matchPtr, iLowLimit, lowPrefixPtr) : 0; + matchLength = MINMATCH + (int)LZ4_count(ip+MINMATCH, matchPtr+MINMATCH, iHighLimit); + matchLength -= back; + if (matchLength > longest) { + longest = matchLength; + *matchpos = matchPtr + back; + *startpos = ip + back; + } } } + } else { /* lowestMatchIndex <= matchIndex < dictLimit */ + const BYTE* const matchPtr = dictBase + matchIndex; + if (LZ4_read32(matchPtr) == pattern) { + const BYTE* const dictStart = dictBase + hc4->lowLimit; + int back = 0; + const BYTE* vLimit = ip + (dictLimit - matchIndex); + if (vLimit > iHighLimit) vLimit = iHighLimit; + matchLength = (int)LZ4_count(ip+MINMATCH, matchPtr+MINMATCH, vLimit) + MINMATCH; + if ((ip+matchLength == vLimit) && (vLimit < iHighLimit)) + matchLength += LZ4_count(ip+matchLength, lowPrefixPtr, iHighLimit); + back = lookBackLength ? LZ4HC_countBack(ip, matchPtr, iLowLimit, dictStart) : 0; + matchLength -= back; + if (matchLength > longest) { + longest = matchLength; + *matchpos = base + matchIndex + back; /* virtual pos, relative to ip, to retrieve offset */ + *startpos = ip + back; + } } } + + if (chainSwap && matchLength==longest) { /* better match => select a better chain */ + assert(lookBackLength==0); /* search forward only */ + if (matchIndex + (U32)longest <= ipIndex) { + int const kTrigger = 4; + U32 distanceToNextMatch = 1; + int const end = longest - MINMATCH + 1; + int step = 1; + int accel = 1 << kTrigger; + int pos; + for (pos = 0; pos < end; pos += step) { + U32 const candidateDist = DELTANEXTU16(chainTable, matchIndex + (U32)pos); + step = (accel++ >> kTrigger); + if (candidateDist > distanceToNextMatch) { + distanceToNextMatch = candidateDist; + matchChainPos = (U32)pos; + accel = 1 << kTrigger; + } + } + if (distanceToNextMatch > 1) { + if (distanceToNextMatch > matchIndex) break; /* avoid overflow */ + matchIndex -= distanceToNextMatch; + continue; + } } } + + { U32 const distNextMatch = DELTANEXTU16(chainTable, matchIndex); + if 
(patternAnalysis && distNextMatch==1 && matchChainPos==0) { + U32 const matchCandidateIdx = matchIndex-1; + /* may be a repeated pattern */ + if (repeat == rep_untested) { + if ( ((pattern & 0xFFFF) == (pattern >> 16)) + & ((pattern & 0xFF) == (pattern >> 24)) ) { + repeat = rep_confirmed; + srcPatternLength = LZ4HC_countPattern(ip+sizeof(pattern), iHighLimit, pattern) + sizeof(pattern); + } else { + repeat = rep_not; + } } + if ( (repeat == rep_confirmed) && (matchCandidateIdx >= lowestMatchIndex) + && LZ4HC_protectDictEnd(dictLimit, matchCandidateIdx) ) { + const int extDict = matchCandidateIdx < dictLimit; + const BYTE* const matchPtr = (extDict ? dictBase : base) + matchCandidateIdx; + if (LZ4_read32(matchPtr) == pattern) { /* good candidate */ + const BYTE* const dictStart = dictBase + hc4->lowLimit; + const BYTE* const iLimit = extDict ? dictBase + dictLimit : iHighLimit; + size_t forwardPatternLength = LZ4HC_countPattern(matchPtr+sizeof(pattern), iLimit, pattern) + sizeof(pattern); + if (extDict && matchPtr + forwardPatternLength == iLimit) { + U32 const rotatedPattern = LZ4HC_rotatePattern(forwardPatternLength, pattern); + forwardPatternLength += LZ4HC_countPattern(lowPrefixPtr, iHighLimit, rotatedPattern); + } + { const BYTE* const lowestMatchPtr = extDict ? 
dictStart : lowPrefixPtr; + size_t backLength = LZ4HC_reverseCountPattern(matchPtr, lowestMatchPtr, pattern); + size_t currentSegmentLength; + if (!extDict && matchPtr - backLength == lowPrefixPtr && hc4->lowLimit < dictLimit) { + U32 const rotatedPattern = LZ4HC_rotatePattern((U32)(-(int)backLength), pattern); + backLength += LZ4HC_reverseCountPattern(dictBase + dictLimit, dictStart, rotatedPattern); + } + /* Limit backLength not go further than lowestMatchIndex */ + backLength = matchCandidateIdx - MAX(matchCandidateIdx - (U32)backLength, lowestMatchIndex); + assert(matchCandidateIdx - backLength >= lowestMatchIndex); + currentSegmentLength = backLength + forwardPatternLength; + /* Adjust to end of pattern if the source pattern fits, otherwise the beginning of the pattern */ + if ( (currentSegmentLength >= srcPatternLength) /* current pattern segment large enough to contain full srcPatternLength */ + && (forwardPatternLength <= srcPatternLength) ) { /* haven't reached this position yet */ + U32 const newMatchIndex = matchCandidateIdx + (U32)forwardPatternLength - (U32)srcPatternLength; /* best position, full pattern, might be followed by more match */ + if (LZ4HC_protectDictEnd(dictLimit, newMatchIndex)) + matchIndex = newMatchIndex; + else { + /* Can only happen if started in the prefix */ + assert(newMatchIndex >= dictLimit - 3 && newMatchIndex < dictLimit && !extDict); + matchIndex = dictLimit; + } + } else { + U32 const newMatchIndex = matchCandidateIdx - (U32)backLength; /* farthest position in current segment, will find a match of length currentSegmentLength + maybe some back */ + if (!LZ4HC_protectDictEnd(dictLimit, newMatchIndex)) { + assert(newMatchIndex >= dictLimit - 3 && newMatchIndex < dictLimit && !extDict); + matchIndex = dictLimit; + } else { + matchIndex = newMatchIndex; + if (lookBackLength==0) { /* no back possible */ + size_t const maxML = MIN(currentSegmentLength, srcPatternLength); + if ((size_t)longest < maxML) { + assert(base + matchIndex 
!= ip); + if ((size_t)(ip - base) - matchIndex > LZ4_DISTANCE_MAX) break; + assert(maxML < 2 GB); + longest = (int)maxML; + *matchpos = base + matchIndex; /* virtual pos, relative to ip, to retrieve offset */ + *startpos = ip; + } + { U32 const distToNextPattern = DELTANEXTU16(chainTable, matchIndex); + if (distToNextPattern > matchIndex) break; /* avoid overflow */ + matchIndex -= distToNextPattern; + } } } } } + continue; + } } + } } /* PA optimization */ + + /* follow current chain */ + matchIndex -= DELTANEXTU16(chainTable, matchIndex + matchChainPos); + + } /* while ((matchIndex>=lowestMatchIndex) && (nbAttempts)) */ + + if ( dict == usingDictCtxHc + && nbAttempts > 0 + && ipIndex - lowestMatchIndex < LZ4_DISTANCE_MAX) { + size_t const dictEndOffset = (size_t)(dictCtx->end - dictCtx->base); + U32 dictMatchIndex = dictCtx->hashTable[LZ4HC_hashPtr(ip)]; + assert(dictEndOffset <= 1 GB); + matchIndex = dictMatchIndex + lowestMatchIndex - (U32)dictEndOffset; + while (ipIndex - matchIndex <= LZ4_DISTANCE_MAX && nbAttempts--) { + const BYTE* const matchPtr = dictCtx->base + dictMatchIndex; + + if (LZ4_read32(matchPtr) == pattern) { + int mlt; + int back = 0; + const BYTE* vLimit = ip + (dictEndOffset - dictMatchIndex); + if (vLimit > iHighLimit) vLimit = iHighLimit; + mlt = (int)LZ4_count(ip+MINMATCH, matchPtr+MINMATCH, vLimit) + MINMATCH; + back = lookBackLength ? 
LZ4HC_countBack(ip, matchPtr, iLowLimit, dictCtx->base + dictCtx->dictLimit) : 0; + mlt -= back; + if (mlt > longest) { + longest = mlt; + *matchpos = base + matchIndex + back; + *startpos = ip + back; + } } + + { U32 const nextOffset = DELTANEXTU16(dictCtx->chainTable, dictMatchIndex); + dictMatchIndex -= nextOffset; + matchIndex -= nextOffset; + } } } + + return longest; +} + +LZ4_FORCE_INLINE +int LZ4HC_InsertAndFindBestMatch(LZ4HC_CCtx_internal* const hc4, /* Index table will be updated */ + const BYTE* const ip, const BYTE* const iLimit, + const BYTE** matchpos, + const int maxNbAttempts, + const int patternAnalysis, + const dictCtx_directive dict) +{ + const BYTE* uselessPtr = ip; + /* note : LZ4HC_InsertAndGetWiderMatch() is able to modify the starting position of a match (*startpos), + * but this won't be the case here, as we define iLowLimit==ip, + * so LZ4HC_InsertAndGetWiderMatch() won't be allowed to search past ip */ + return LZ4HC_InsertAndGetWiderMatch(hc4, ip, ip, iLimit, MINMATCH-1, matchpos, &uselessPtr, maxNbAttempts, patternAnalysis, 0 /*chainSwap*/, dict, favorCompressionRatio); +} + +/* LZ4HC_encodeSequence() : + * @return : 0 if ok, + * 1 if buffer issue detected */ +LZ4_FORCE_INLINE int LZ4HC_encodeSequence ( + const BYTE** _ip, + BYTE** _op, + const BYTE** _anchor, + int matchLength, + const BYTE* const match, + limitedOutput_directive limit, + BYTE* oend) +{ +#define ip (*_ip) +#define op (*_op) +#define anchor (*_anchor) + + size_t length; + BYTE* const token = op++; + +#if defined(LZ4_DEBUG) && (LZ4_DEBUG >= 6) + static const BYTE* start = NULL; + static U32 totalCost = 0; + U32 const pos = (start==NULL) ? 0 : (U32)(anchor - start); + U32 const ll = (U32)(ip - anchor); + U32 const llAdd = (ll>=15) ? ((ll-15) / 255) + 1 : 0; + U32 const mlAdd = (matchLength>=19) ? 
((matchLength-19) / 255) + 1 : 0; + U32 const cost = 1 + llAdd + ll + 2 + mlAdd; + if (start==NULL) start = anchor; /* only works for single segment */ + /* g_debuglog_enable = (pos >= 2228) & (pos <= 2262); */ + DEBUGLOG(6, "pos:%7u -- literals:%4u, match:%4i, offset:%5u, cost:%4u + %5u", + pos, + (U32)(ip - anchor), matchLength, (U32)(ip-match), + cost, totalCost); + totalCost += cost; +#endif + + /* Encode Literal length */ + length = (size_t)(ip - anchor); + LZ4_STATIC_ASSERT(notLimited == 0); + /* Check output limit */ + if (limit && ((op + (length / 255) + length + (2 + 1 + LASTLITERALS)) > oend)) { + DEBUGLOG(6, "Not enough room to write %i literals (%i bytes remaining)", + (int)length, (int)(oend - op)); + return 1; + } + if (length >= RUN_MASK) { + size_t len = length - RUN_MASK; + *token = (RUN_MASK << ML_BITS); + for(; len >= 255 ; len -= 255) *op++ = 255; + *op++ = (BYTE)len; + } else { + *token = (BYTE)(length << ML_BITS); + } + + /* Copy Literals */ + LZ4_wildCopy8(op, anchor, op + length); + op += length; + + /* Encode Offset */ + assert( (ip - match) <= LZ4_DISTANCE_MAX ); /* note : consider providing offset as a value, rather than as a pointer difference */ + LZ4_writeLE16(op, (U16)(ip - match)); op += 2; + + /* Encode MatchLength */ + assert(matchLength >= MINMATCH); + length = (size_t)matchLength - MINMATCH; + if (limit && (op + (length / 255) + (1 + LASTLITERALS) > oend)) { + DEBUGLOG(6, "Not enough room to write match length"); + return 1; /* Check output limit */ + } + if (length >= ML_MASK) { + *token += ML_MASK; + length -= ML_MASK; + for(; length >= 510 ; length -= 510) { *op++ = 255; *op++ = 255; } + if (length >= 255) { length -= 255; *op++ = 255; } + *op++ = (BYTE)length; + } else { + *token += (BYTE)(length); + } + + /* Prepare next loop */ + ip += matchLength; + anchor = ip; + + return 0; +} +#undef ip +#undef op +#undef anchor + +LZ4_FORCE_INLINE int LZ4HC_compress_hashChain ( + LZ4HC_CCtx_internal* const ctx, + const char* const 
source, + char* const dest, + int* srcSizePtr, + int const maxOutputSize, + int maxNbAttempts, + const limitedOutput_directive limit, + const dictCtx_directive dict + ) +{ + const int inputSize = *srcSizePtr; + const int patternAnalysis = (maxNbAttempts > 128); /* levels 9+ */ + + const BYTE* ip = (const BYTE*) source; + const BYTE* anchor = ip; + const BYTE* const iend = ip + inputSize; + const BYTE* const mflimit = iend - MFLIMIT; + const BYTE* const matchlimit = (iend - LASTLITERALS); + + BYTE* optr = (BYTE*) dest; + BYTE* op = (BYTE*) dest; + BYTE* oend = op + maxOutputSize; + + int ml0, ml, ml2, ml3; + const BYTE* start0; + const BYTE* ref0; + const BYTE* ref = NULL; + const BYTE* start2 = NULL; + const BYTE* ref2 = NULL; + const BYTE* start3 = NULL; + const BYTE* ref3 = NULL; + + /* init */ + *srcSizePtr = 0; + if (limit == fillOutput) oend -= LASTLITERALS; /* Hack for support LZ4 format restriction */ + if (inputSize < LZ4_minLength) goto _last_literals; /* Input too small, no compression (all literals) */ + + /* Main Loop */ + while (ip <= mflimit) { + ml = LZ4HC_InsertAndFindBestMatch(ctx, ip, matchlimit, &ref, maxNbAttempts, patternAnalysis, dict); + if (ml<MINMATCH) { ip++; continue; } + + /* saved, in case we would skip too much */ + start0 = ip; ref0 = ref; ml0 = ml; + +_Search2: + if (ip+ml <= mflimit) { + ml2 = LZ4HC_InsertAndGetWiderMatch(ctx, + ip + ml - 2, ip + 0, matchlimit, ml, &ref2, &start2, + maxNbAttempts, patternAnalysis, 0, dict, favorCompressionRatio); + } else { + ml2 = ml; + } + + if (ml2 == ml) { /* No better match => encode ML1 */ + optr = op; + if (LZ4HC_encodeSequence(UPDATABLE(ip, op, anchor), ml, ref, limit, oend)) goto _dest_overflow; + continue; + } + + if (start0 < ip) { /* first match was skipped at least once */ + if (start2 < ip + ml0) { /* squeezing ML1 between ML0(original ML1) and ML2 */ + ip = start0; ref = ref0; ml = ml0; /* restore initial ML1 */ + } } + + /* Here, start0==ip */ + if ((start2 - ip) < 3) { /* First 
Match too small : removed */ + ml = ml2; + ip = start2; + ref =ref2; + goto _Search2; + } + +_Search3: + /* At this stage, we have : + * ml2 > ml1, and + * ip1+3 <= ip2 (usually < ip1+ml1) */ + if ((start2 - ip) < OPTIMAL_ML) { + int correction; + int new_ml = ml; + if (new_ml > OPTIMAL_ML) new_ml = OPTIMAL_ML; + if (ip+new_ml > start2 + ml2 - MINMATCH) new_ml = (int)(start2 - ip) + ml2 - MINMATCH; + correction = new_ml - (int)(start2 - ip); + if (correction > 0) { + start2 += correction; + ref2 += correction; + ml2 -= correction; + } + } + /* Now, we have start2 = ip+new_ml, with new_ml = min(ml, OPTIMAL_ML=18) */ + + if (start2 + ml2 <= mflimit) { + ml3 = LZ4HC_InsertAndGetWiderMatch(ctx, + start2 + ml2 - 3, start2, matchlimit, ml2, &ref3, &start3, + maxNbAttempts, patternAnalysis, 0, dict, favorCompressionRatio); + } else { + ml3 = ml2; + } + + if (ml3 == ml2) { /* No better match => encode ML1 and ML2 */ + /* ip & ref are known; Now for ml */ + if (start2 < ip+ml) ml = (int)(start2 - ip); + /* Now, encode 2 sequences */ + optr = op; + if (LZ4HC_encodeSequence(UPDATABLE(ip, op, anchor), ml, ref, limit, oend)) goto _dest_overflow; + ip = start2; + optr = op; + if (LZ4HC_encodeSequence(UPDATABLE(ip, op, anchor), ml2, ref2, limit, oend)) { + ml = ml2; + ref = ref2; + goto _dest_overflow; + } + continue; + } + + if (start3 < ip+ml+3) { /* Not enough space for match 2 : remove it */ + if (start3 >= (ip+ml)) { /* can write Seq1 immediately ==> Seq2 is removed, so Seq3 becomes Seq1 */ + if (start2 < ip+ml) { + int correction = (int)(ip+ml - start2); + start2 += correction; + ref2 += correction; + ml2 -= correction; + if (ml2 < MINMATCH) { + start2 = start3; + ref2 = ref3; + ml2 = ml3; + } + } + + optr = op; + if (LZ4HC_encodeSequence(UPDATABLE(ip, op, anchor), ml, ref, limit, oend)) goto _dest_overflow; + ip = start3; + ref = ref3; + ml = ml3; + + start0 = start2; + ref0 = ref2; + ml0 = ml2; + goto _Search2; + } + + start2 = start3; + ref2 = ref3; + ml2 = ml3; + goto 
_Search3; + } + + /* + * OK, now we have 3 ascending matches; + * let's write the first one ML1. + * ip & ref are known; Now decide ml. + */ + if (start2 < ip+ml) { + if ((start2 - ip) < OPTIMAL_ML) { + int correction; + if (ml > OPTIMAL_ML) ml = OPTIMAL_ML; + if (ip + ml > start2 + ml2 - MINMATCH) ml = (int)(start2 - ip) + ml2 - MINMATCH; + correction = ml - (int)(start2 - ip); + if (correction > 0) { + start2 += correction; + ref2 += correction; + ml2 -= correction; + } + } else { + ml = (int)(start2 - ip); + } + } + optr = op; + if (LZ4HC_encodeSequence(UPDATABLE(ip, op, anchor), ml, ref, limit, oend)) goto _dest_overflow; + + /* ML2 becomes ML1 */ + ip = start2; ref = ref2; ml = ml2; + + /* ML3 becomes ML2 */ + start2 = start3; ref2 = ref3; ml2 = ml3; + + /* let's find a new ML3 */ + goto _Search3; + } + +_last_literals: + /* Encode Last Literals */ + { size_t lastRunSize = (size_t)(iend - anchor); /* literals */ + size_t llAdd = (lastRunSize + 255 - RUN_MASK) / 255; + size_t const totalSize = 1 + llAdd + lastRunSize; + if (limit == fillOutput) oend += LASTLITERALS; /* restore correct value */ + if (limit && (op + totalSize > oend)) { + if (limit == limitedOutput) return 0; + /* adapt lastRunSize to fill 'dest' */ + lastRunSize = (size_t)(oend - op) - 1 /*token*/; + llAdd = (lastRunSize + 256 - RUN_MASK) / 256; + lastRunSize -= llAdd; + } + DEBUGLOG(6, "Final literal run : %i literals", (int)lastRunSize); + ip = anchor + lastRunSize; /* can be != iend if limit==fillOutput */ + + if (lastRunSize >= RUN_MASK) { + size_t accumulator = lastRunSize - RUN_MASK; + *op++ = (RUN_MASK << ML_BITS); + for(; accumulator >= 255 ; accumulator -= 255) *op++ = 255; + *op++ = (BYTE) accumulator; + } else { + *op++ = (BYTE)(lastRunSize << ML_BITS); + } + memcpy(op, anchor, lastRunSize); + op += lastRunSize; + } + + /* End */ + *srcSizePtr = (int) (((const char*)ip) - source); + return (int) (((char*)op)-dest); + +_dest_overflow: + if (limit == fillOutput) { + /* Assumption : ip, 
anchor, ml and ref must be set correctly */ + size_t const ll = (size_t)(ip - anchor); + size_t const ll_addbytes = (ll + 240) / 255; + size_t const ll_totalCost = 1 + ll_addbytes + ll; + BYTE* const maxLitPos = oend - 3; /* 2 for offset, 1 for token */ + DEBUGLOG(6, "Last sequence overflowing"); + op = optr; /* restore correct out pointer */ + if (op + ll_totalCost <= maxLitPos) { + /* ll validated; now adjust match length */ + size_t const bytesLeftForMl = (size_t)(maxLitPos - (op+ll_totalCost)); + size_t const maxMlSize = MINMATCH + (ML_MASK-1) + (bytesLeftForMl * 255); + assert(maxMlSize < INT_MAX); assert(ml >= 0); + if ((size_t)ml > maxMlSize) ml = (int)maxMlSize; + if ((oend + LASTLITERALS) - (op + ll_totalCost + 2) - 1 + ml >= MFLIMIT) { + LZ4HC_encodeSequence(UPDATABLE(ip, op, anchor), ml, ref, notLimited, oend); + } } + goto _last_literals; + } + /* compression failed */ + return 0; +} + + +static int LZ4HC_compress_optimal( LZ4HC_CCtx_internal* ctx, + const char* const source, char* dst, + int* srcSizePtr, int dstCapacity, + int const nbSearches, size_t sufficient_len, + const limitedOutput_directive limit, int const fullUpdate, + const dictCtx_directive dict, + const HCfavor_e favorDecSpeed); + + +LZ4_FORCE_INLINE int LZ4HC_compress_generic_internal ( + LZ4HC_CCtx_internal* const ctx, + const char* const src, + char* const dst, + int* const srcSizePtr, + int const dstCapacity, + int cLevel, + const limitedOutput_directive limit, + const dictCtx_directive dict + ) +{ + typedef enum { lz4hc, lz4opt } lz4hc_strat_e; + typedef struct { + lz4hc_strat_e strat; + int nbSearches; + U32 targetLength; + } cParams_t; + static const cParams_t clTable[LZ4HC_CLEVEL_MAX+1] = { + { lz4hc, 2, 16 }, /* 0, unused */ + { lz4hc, 2, 16 }, /* 1, unused */ + { lz4hc, 2, 16 }, /* 2, unused */ + { lz4hc, 4, 16 }, /* 3 */ + { lz4hc, 8, 16 }, /* 4 */ + { lz4hc, 16, 16 }, /* 5 */ + { lz4hc, 32, 16 }, /* 6 */ + { lz4hc, 64, 16 }, /* 7 */ + { lz4hc, 128, 16 }, /* 8 */ + { lz4hc, 256, 
16 }, /* 9 */ + { lz4opt, 96, 64 }, /*10==LZ4HC_CLEVEL_OPT_MIN*/ + { lz4opt, 512,128 }, /*11 */ + { lz4opt,16384,LZ4_OPT_NUM }, /* 12==LZ4HC_CLEVEL_MAX */ + }; + + DEBUGLOG(4, "LZ4HC_compress_generic(ctx=%p, src=%p, srcSize=%d, limit=%d)", + ctx, src, *srcSizePtr, limit); + + if (limit == fillOutput && dstCapacity < 1) return 0; /* Impossible to store anything */ + if ((U32)*srcSizePtr > (U32)LZ4_MAX_INPUT_SIZE) return 0; /* Unsupported input size (too large or negative) */ + + ctx->end += *srcSizePtr; + if (cLevel < 1) cLevel = LZ4HC_CLEVEL_DEFAULT; /* note : convention is different from lz4frame, maybe something to review */ + cLevel = MIN(LZ4HC_CLEVEL_MAX, cLevel); + { cParams_t const cParam = clTable[cLevel]; + HCfavor_e const favor = ctx->favorDecSpeed ? favorDecompressionSpeed : favorCompressionRatio; + int result; + + if (cParam.strat == lz4hc) { + result = LZ4HC_compress_hashChain(ctx, + src, dst, srcSizePtr, dstCapacity, + cParam.nbSearches, limit, dict); + } else { + assert(cParam.strat == lz4opt); + result = LZ4HC_compress_optimal(ctx, + src, dst, srcSizePtr, dstCapacity, + cParam.nbSearches, cParam.targetLength, limit, + cLevel == LZ4HC_CLEVEL_MAX, /* ultra mode */ + dict, favor); + } + if (result <= 0) ctx->dirty = 1; + return result; + } +} + +static void LZ4HC_setExternalDict(LZ4HC_CCtx_internal* ctxPtr, const BYTE* newBlock); + +static int +LZ4HC_compress_generic_noDictCtx ( + LZ4HC_CCtx_internal* const ctx, + const char* const src, + char* const dst, + int* const srcSizePtr, + int const dstCapacity, + int cLevel, + limitedOutput_directive limit + ) +{ + assert(ctx->dictCtx == NULL); + return LZ4HC_compress_generic_internal(ctx, src, dst, srcSizePtr, dstCapacity, cLevel, limit, noDictCtx); +} + +static int +LZ4HC_compress_generic_dictCtx ( + LZ4HC_CCtx_internal* const ctx, + const char* const src, + char* const dst, + int* const srcSizePtr, + int const dstCapacity, + int cLevel, + limitedOutput_directive limit + ) +{ + const size_t position = 
(size_t)(ctx->end - ctx->base) - ctx->lowLimit; + assert(ctx->dictCtx != NULL); + if (position >= 64 KB) { + ctx->dictCtx = NULL; + return LZ4HC_compress_generic_noDictCtx(ctx, src, dst, srcSizePtr, dstCapacity, cLevel, limit); + } else if (position == 0 && *srcSizePtr > 4 KB) { + memcpy(ctx, ctx->dictCtx, sizeof(LZ4HC_CCtx_internal)); + LZ4HC_setExternalDict(ctx, (const BYTE *)src); + ctx->compressionLevel = (short)cLevel; + return LZ4HC_compress_generic_noDictCtx(ctx, src, dst, srcSizePtr, dstCapacity, cLevel, limit); + } else { + return LZ4HC_compress_generic_internal(ctx, src, dst, srcSizePtr, dstCapacity, cLevel, limit, usingDictCtxHc); + } +} + +static int +LZ4HC_compress_generic ( + LZ4HC_CCtx_internal* const ctx, + const char* const src, + char* const dst, + int* const srcSizePtr, + int const dstCapacity, + int cLevel, + limitedOutput_directive limit + ) +{ + if (ctx->dictCtx == NULL) { + return LZ4HC_compress_generic_noDictCtx(ctx, src, dst, srcSizePtr, dstCapacity, cLevel, limit); + } else { + return LZ4HC_compress_generic_dictCtx(ctx, src, dst, srcSizePtr, dstCapacity, cLevel, limit); + } +} + + +int LZ4_sizeofStateHC(void) { return (int)sizeof(LZ4_streamHC_t); } + +static size_t LZ4_streamHC_t_alignment(void) +{ +#if LZ4_ALIGN_TEST + typedef struct { char c; LZ4_streamHC_t t; } t_a; + return sizeof(t_a) - sizeof(LZ4_streamHC_t); +#else + return 1; /* effectively disabled */ +#endif +} + +/* state is presumed correctly initialized, + * in which case its size and alignment have already been validate */ +int LZ4_compress_HC_extStateHC_fastReset (void* state, const char* src, char* dst, int srcSize, int dstCapacity, int compressionLevel) +{ + LZ4HC_CCtx_internal* const ctx = &((LZ4_streamHC_t*)state)->internal_donotuse; + if (!LZ4_isAligned(state, LZ4_streamHC_t_alignment())) return 0; + LZ4_resetStreamHC_fast((LZ4_streamHC_t*)state, compressionLevel); + LZ4HC_init_internal (ctx, (const BYTE*)src); + if (dstCapacity < LZ4_compressBound(srcSize)) + return 
LZ4HC_compress_generic (ctx, src, dst, &srcSize, dstCapacity, compressionLevel, limitedOutput); + else + return LZ4HC_compress_generic (ctx, src, dst, &srcSize, dstCapacity, compressionLevel, notLimited); +} + +int LZ4_compress_HC_extStateHC (void* state, const char* src, char* dst, int srcSize, int dstCapacity, int compressionLevel) +{ + LZ4_streamHC_t* const ctx = LZ4_initStreamHC(state, sizeof(*ctx)); + if (ctx==NULL) return 0; /* init failure */ + return LZ4_compress_HC_extStateHC_fastReset(state, src, dst, srcSize, dstCapacity, compressionLevel); +} + +int LZ4_compress_HC(const char* src, char* dst, int srcSize, int dstCapacity, int compressionLevel) +{ +#if defined(LZ4HC_HEAPMODE) && LZ4HC_HEAPMODE==1 + LZ4_streamHC_t* const statePtr = (LZ4_streamHC_t*)ALLOC(sizeof(LZ4_streamHC_t)); +#else + LZ4_streamHC_t state; + LZ4_streamHC_t* const statePtr = &state; +#endif + int const cSize = LZ4_compress_HC_extStateHC(statePtr, src, dst, srcSize, dstCapacity, compressionLevel); +#if defined(LZ4HC_HEAPMODE) && LZ4HC_HEAPMODE==1 + FREEMEM(statePtr); +#endif + return cSize; +} + +/* state is presumed sized correctly (>= sizeof(LZ4_streamHC_t)) */ +int LZ4_compress_HC_destSize(void* state, const char* source, char* dest, int* sourceSizePtr, int targetDestSize, int cLevel) +{ + LZ4_streamHC_t* const ctx = LZ4_initStreamHC(state, sizeof(*ctx)); + if (ctx==NULL) return 0; /* init failure */ + LZ4HC_init_internal(&ctx->internal_donotuse, (const BYTE*) source); + LZ4_setCompressionLevel(ctx, cLevel); + return LZ4HC_compress_generic(&ctx->internal_donotuse, source, dest, sourceSizePtr, targetDestSize, cLevel, fillOutput); +} + + + +/************************************** +* Streaming Functions +**************************************/ +/* allocation */ +LZ4_streamHC_t* LZ4_createStreamHC(void) +{ + LZ4_streamHC_t* const state = + (LZ4_streamHC_t*)ALLOC_AND_ZERO(sizeof(LZ4_streamHC_t)); + if (state == NULL) return NULL; + LZ4_setCompressionLevel(state, LZ4HC_CLEVEL_DEFAULT); + 
return state; +} + +int LZ4_freeStreamHC (LZ4_streamHC_t* LZ4_streamHCPtr) +{ + DEBUGLOG(4, "LZ4_freeStreamHC(%p)", LZ4_streamHCPtr); + if (!LZ4_streamHCPtr) return 0; /* support free on NULL */ + FREEMEM(LZ4_streamHCPtr); + return 0; +} + + +LZ4_streamHC_t* LZ4_initStreamHC (void* buffer, size_t size) +{ + LZ4_streamHC_t* const LZ4_streamHCPtr = (LZ4_streamHC_t*)buffer; + /* if compilation fails here, LZ4_STREAMHCSIZE must be increased */ + LZ4_STATIC_ASSERT(sizeof(LZ4HC_CCtx_internal) <= LZ4_STREAMHCSIZE); + DEBUGLOG(4, "LZ4_initStreamHC(%p, %u)", buffer, (unsigned)size); + /* check conditions */ + if (buffer == NULL) return NULL; + if (size < sizeof(LZ4_streamHC_t)) return NULL; + if (!LZ4_isAligned(buffer, LZ4_streamHC_t_alignment())) return NULL; + /* init */ + { LZ4HC_CCtx_internal* const hcstate = &(LZ4_streamHCPtr->internal_donotuse); + MEM_INIT(hcstate, 0, sizeof(*hcstate)); } + LZ4_setCompressionLevel(LZ4_streamHCPtr, LZ4HC_CLEVEL_DEFAULT); + return LZ4_streamHCPtr; +} + +/* just a stub */ +void LZ4_resetStreamHC (LZ4_streamHC_t* LZ4_streamHCPtr, int compressionLevel) +{ + LZ4_initStreamHC(LZ4_streamHCPtr, sizeof(*LZ4_streamHCPtr)); + LZ4_setCompressionLevel(LZ4_streamHCPtr, compressionLevel); +} + +void LZ4_resetStreamHC_fast (LZ4_streamHC_t* LZ4_streamHCPtr, int compressionLevel) +{ + DEBUGLOG(4, "LZ4_resetStreamHC_fast(%p, %d)", LZ4_streamHCPtr, compressionLevel); + if (LZ4_streamHCPtr->internal_donotuse.dirty) { + LZ4_initStreamHC(LZ4_streamHCPtr, sizeof(*LZ4_streamHCPtr)); + } else { + /* preserve end - base : can trigger clearTable's threshold */ + LZ4_streamHCPtr->internal_donotuse.end -= (uptrval)LZ4_streamHCPtr->internal_donotuse.base; + LZ4_streamHCPtr->internal_donotuse.base = NULL; + LZ4_streamHCPtr->internal_donotuse.dictCtx = NULL; + } + LZ4_setCompressionLevel(LZ4_streamHCPtr, compressionLevel); +} + +void LZ4_setCompressionLevel(LZ4_streamHC_t* LZ4_streamHCPtr, int compressionLevel) +{ + DEBUGLOG(5, "LZ4_setCompressionLevel(%p, %d)", 
LZ4_streamHCPtr, compressionLevel); + if (compressionLevel < 1) compressionLevel = LZ4HC_CLEVEL_DEFAULT; + if (compressionLevel > LZ4HC_CLEVEL_MAX) compressionLevel = LZ4HC_CLEVEL_MAX; + LZ4_streamHCPtr->internal_donotuse.compressionLevel = (short)compressionLevel; +} + +void LZ4_favorDecompressionSpeed(LZ4_streamHC_t* LZ4_streamHCPtr, int favor) +{ + LZ4_streamHCPtr->internal_donotuse.favorDecSpeed = (favor!=0); +} + +/* LZ4_loadDictHC() : + * LZ4_streamHCPtr is presumed properly initialized */ +int LZ4_loadDictHC (LZ4_streamHC_t* LZ4_streamHCPtr, + const char* dictionary, int dictSize) +{ + LZ4HC_CCtx_internal* const ctxPtr = &LZ4_streamHCPtr->internal_donotuse; + DEBUGLOG(4, "LZ4_loadDictHC(ctx:%p, dict:%p, dictSize:%d)", LZ4_streamHCPtr, dictionary, dictSize); + assert(LZ4_streamHCPtr != NULL); + if (dictSize > 64 KB) { + dictionary += (size_t)dictSize - 64 KB; + dictSize = 64 KB; + } + /* need a full initialization, there are bad side-effects when using resetFast() */ + { int const cLevel = ctxPtr->compressionLevel; + LZ4_initStreamHC(LZ4_streamHCPtr, sizeof(*LZ4_streamHCPtr)); + LZ4_setCompressionLevel(LZ4_streamHCPtr, cLevel); + } + LZ4HC_init_internal (ctxPtr, (const BYTE*)dictionary); + ctxPtr->end = (const BYTE*)dictionary + dictSize; + if (dictSize >= 4) LZ4HC_Insert (ctxPtr, ctxPtr->end-3); + return dictSize; +} + +void LZ4_attach_HC_dictionary(LZ4_streamHC_t *working_stream, const LZ4_streamHC_t *dictionary_stream) { + working_stream->internal_donotuse.dictCtx = dictionary_stream != NULL ? 
&(dictionary_stream->internal_donotuse) : NULL; +} + +/* compression */ + +static void LZ4HC_setExternalDict(LZ4HC_CCtx_internal* ctxPtr, const BYTE* newBlock) +{ + DEBUGLOG(4, "LZ4HC_setExternalDict(%p, %p)", ctxPtr, newBlock); + if (ctxPtr->end >= ctxPtr->base + ctxPtr->dictLimit + 4) + LZ4HC_Insert (ctxPtr, ctxPtr->end-3); /* Referencing remaining dictionary content */ + + /* Only one memory segment for extDict, so any previous extDict is lost at this stage */ + ctxPtr->lowLimit = ctxPtr->dictLimit; + ctxPtr->dictLimit = (U32)(ctxPtr->end - ctxPtr->base); + ctxPtr->dictBase = ctxPtr->base; + ctxPtr->base = newBlock - ctxPtr->dictLimit; + ctxPtr->end = newBlock; + ctxPtr->nextToUpdate = ctxPtr->dictLimit; /* match referencing will resume from there */ + + /* cannot reference an extDict and a dictCtx at the same time */ + ctxPtr->dictCtx = NULL; +} + +static int +LZ4_compressHC_continue_generic (LZ4_streamHC_t* LZ4_streamHCPtr, + const char* src, char* dst, + int* srcSizePtr, int dstCapacity, + limitedOutput_directive limit) +{ + LZ4HC_CCtx_internal* const ctxPtr = &LZ4_streamHCPtr->internal_donotuse; + DEBUGLOG(5, "LZ4_compressHC_continue_generic(ctx=%p, src=%p, srcSize=%d, limit=%d)", + LZ4_streamHCPtr, src, *srcSizePtr, limit); + assert(ctxPtr != NULL); + /* auto-init if forgotten */ + if (ctxPtr->base == NULL) LZ4HC_init_internal (ctxPtr, (const BYTE*) src); + + /* Check overflow */ + if ((size_t)(ctxPtr->end - ctxPtr->base) > 2 GB) { + size_t dictSize = (size_t)(ctxPtr->end - ctxPtr->base) - ctxPtr->dictLimit; + if (dictSize > 64 KB) dictSize = 64 KB; + LZ4_loadDictHC(LZ4_streamHCPtr, (const char*)(ctxPtr->end) - dictSize, (int)dictSize); + } + + /* Check if blocks follow each other */ + if ((const BYTE*)src != ctxPtr->end) + LZ4HC_setExternalDict(ctxPtr, (const BYTE*)src); + + /* Check overlapping input/dictionary space */ + { const BYTE* sourceEnd = (const BYTE*) src + *srcSizePtr; + const BYTE* const dictBegin = ctxPtr->dictBase + ctxPtr->lowLimit; + const 
BYTE* const dictEnd = ctxPtr->dictBase + ctxPtr->dictLimit; + if ((sourceEnd > dictBegin) && ((const BYTE*)src < dictEnd)) { + if (sourceEnd > dictEnd) sourceEnd = dictEnd; + ctxPtr->lowLimit = (U32)(sourceEnd - ctxPtr->dictBase); + if (ctxPtr->dictLimit - ctxPtr->lowLimit < 4) ctxPtr->lowLimit = ctxPtr->dictLimit; + } } + + return LZ4HC_compress_generic (ctxPtr, src, dst, srcSizePtr, dstCapacity, ctxPtr->compressionLevel, limit); +} + +int LZ4_compress_HC_continue (LZ4_streamHC_t* LZ4_streamHCPtr, const char* src, char* dst, int srcSize, int dstCapacity) +{ + if (dstCapacity < LZ4_compressBound(srcSize)) + return LZ4_compressHC_continue_generic (LZ4_streamHCPtr, src, dst, &srcSize, dstCapacity, limitedOutput); + else + return LZ4_compressHC_continue_generic (LZ4_streamHCPtr, src, dst, &srcSize, dstCapacity, notLimited); +} + +int LZ4_compress_HC_continue_destSize (LZ4_streamHC_t* LZ4_streamHCPtr, const char* src, char* dst, int* srcSizePtr, int targetDestSize) +{ + return LZ4_compressHC_continue_generic(LZ4_streamHCPtr, src, dst, srcSizePtr, targetDestSize, fillOutput); +} + + + +/* LZ4_saveDictHC : + * save history content + * into a user-provided buffer + * which is then used to continue compression + */ +int LZ4_saveDictHC (LZ4_streamHC_t* LZ4_streamHCPtr, char* safeBuffer, int dictSize) +{ + LZ4HC_CCtx_internal* const streamPtr = &LZ4_streamHCPtr->internal_donotuse; + int const prefixSize = (int)(streamPtr->end - (streamPtr->base + streamPtr->dictLimit)); + DEBUGLOG(5, "LZ4_saveDictHC(%p, %p, %d)", LZ4_streamHCPtr, safeBuffer, dictSize); + assert(prefixSize >= 0); + if (dictSize > 64 KB) dictSize = 64 KB; + if (dictSize < 4) dictSize = 0; + if (dictSize > prefixSize) dictSize = prefixSize; + if (safeBuffer == NULL) assert(dictSize == 0); + if (dictSize > 0) + memmove(safeBuffer, streamPtr->end - dictSize, dictSize); + { U32 const endIndex = (U32)(streamPtr->end - streamPtr->base); + streamPtr->end = (const BYTE*)safeBuffer + dictSize; + streamPtr->base = 
streamPtr->end - endIndex; + streamPtr->dictLimit = endIndex - (U32)dictSize; + streamPtr->lowLimit = endIndex - (U32)dictSize; + if (streamPtr->nextToUpdate < streamPtr->dictLimit) + streamPtr->nextToUpdate = streamPtr->dictLimit; + } + return dictSize; +} + + +/*************************************************** +* Deprecated Functions +***************************************************/ + +/* These functions currently generate deprecation warnings */ + +/* Wrappers for deprecated compression functions */ +int LZ4_compressHC(const char* src, char* dst, int srcSize) { return LZ4_compress_HC (src, dst, srcSize, LZ4_compressBound(srcSize), 0); } +int LZ4_compressHC_limitedOutput(const char* src, char* dst, int srcSize, int maxDstSize) { return LZ4_compress_HC(src, dst, srcSize, maxDstSize, 0); } +int LZ4_compressHC2(const char* src, char* dst, int srcSize, int cLevel) { return LZ4_compress_HC (src, dst, srcSize, LZ4_compressBound(srcSize), cLevel); } +int LZ4_compressHC2_limitedOutput(const char* src, char* dst, int srcSize, int maxDstSize, int cLevel) { return LZ4_compress_HC(src, dst, srcSize, maxDstSize, cLevel); } +int LZ4_compressHC_withStateHC (void* state, const char* src, char* dst, int srcSize) { return LZ4_compress_HC_extStateHC (state, src, dst, srcSize, LZ4_compressBound(srcSize), 0); } +int LZ4_compressHC_limitedOutput_withStateHC (void* state, const char* src, char* dst, int srcSize, int maxDstSize) { return LZ4_compress_HC_extStateHC (state, src, dst, srcSize, maxDstSize, 0); } +int LZ4_compressHC2_withStateHC (void* state, const char* src, char* dst, int srcSize, int cLevel) { return LZ4_compress_HC_extStateHC(state, src, dst, srcSize, LZ4_compressBound(srcSize), cLevel); } +int LZ4_compressHC2_limitedOutput_withStateHC (void* state, const char* src, char* dst, int srcSize, int maxDstSize, int cLevel) { return LZ4_compress_HC_extStateHC(state, src, dst, srcSize, maxDstSize, cLevel); } +int LZ4_compressHC_continue (LZ4_streamHC_t* ctx, const char* 
src, char* dst, int srcSize) { return LZ4_compress_HC_continue (ctx, src, dst, srcSize, LZ4_compressBound(srcSize)); } +int LZ4_compressHC_limitedOutput_continue (LZ4_streamHC_t* ctx, const char* src, char* dst, int srcSize, int maxDstSize) { return LZ4_compress_HC_continue (ctx, src, dst, srcSize, maxDstSize); } + + +/* Deprecated streaming functions */ +int LZ4_sizeofStreamStateHC(void) { return LZ4_STREAMHCSIZE; } + +/* state is presumed correctly sized, aka >= sizeof(LZ4_streamHC_t) + * @return : 0 on success, !=0 if error */ +int LZ4_resetStreamStateHC(void* state, char* inputBuffer) +{ + LZ4_streamHC_t* const hc4 = LZ4_initStreamHC(state, sizeof(*hc4)); + if (hc4 == NULL) return 1; /* init failed */ + LZ4HC_init_internal (&hc4->internal_donotuse, (const BYTE*)inputBuffer); + return 0; +} + +void* LZ4_createHC (const char* inputBuffer) +{ + LZ4_streamHC_t* const hc4 = LZ4_createStreamHC(); + if (hc4 == NULL) return NULL; /* not enough memory */ + LZ4HC_init_internal (&hc4->internal_donotuse, (const BYTE*)inputBuffer); + return hc4; +} + +int LZ4_freeHC (void* LZ4HC_Data) +{ + if (!LZ4HC_Data) return 0; /* support free on NULL */ + FREEMEM(LZ4HC_Data); + return 0; +} + +int LZ4_compressHC2_continue (void* LZ4HC_Data, const char* src, char* dst, int srcSize, int cLevel) +{ + return LZ4HC_compress_generic (&((LZ4_streamHC_t*)LZ4HC_Data)->internal_donotuse, src, dst, &srcSize, 0, cLevel, notLimited); +} + +int LZ4_compressHC2_limitedOutput_continue (void* LZ4HC_Data, const char* src, char* dst, int srcSize, int dstCapacity, int cLevel) +{ + return LZ4HC_compress_generic (&((LZ4_streamHC_t*)LZ4HC_Data)->internal_donotuse, src, dst, &srcSize, dstCapacity, cLevel, limitedOutput); +} + +char* LZ4_slideInputBufferHC(void* LZ4HC_Data) +{ + LZ4_streamHC_t *ctx = (LZ4_streamHC_t*)LZ4HC_Data; + const BYTE *bufferStart = ctx->internal_donotuse.base + ctx->internal_donotuse.lowLimit; + LZ4_resetStreamHC_fast(ctx, ctx->internal_donotuse.compressionLevel); + /* avoid const 
char * -> char * conversion warning :( */ + return (char *)(uptrval)bufferStart; +} + + +/* ================================================ + * LZ4 Optimal parser (levels [LZ4HC_CLEVEL_OPT_MIN - LZ4HC_CLEVEL_MAX]) + * ===============================================*/ +typedef struct { + int price; + int off; + int mlen; + int litlen; +} LZ4HC_optimal_t; + +/* price in bytes */ +LZ4_FORCE_INLINE int LZ4HC_literalsPrice(int const litlen) +{ + int price = litlen; + assert(litlen >= 0); + if (litlen >= (int)RUN_MASK) + price += 1 + ((litlen-(int)RUN_MASK) / 255); + return price; +} + + +/* requires mlen >= MINMATCH */ +LZ4_FORCE_INLINE int LZ4HC_sequencePrice(int litlen, int mlen) +{ + int price = 1 + 2 ; /* token + 16-bit offset */ + assert(litlen >= 0); + assert(mlen >= MINMATCH); + + price += LZ4HC_literalsPrice(litlen); + + if (mlen >= (int)(ML_MASK+MINMATCH)) + price += 1 + ((mlen-(int)(ML_MASK+MINMATCH)) / 255); + + return price; +} + + +typedef struct { + int off; + int len; +} LZ4HC_match_t; + +LZ4_FORCE_INLINE LZ4HC_match_t +LZ4HC_FindLongerMatch(LZ4HC_CCtx_internal* const ctx, + const BYTE* ip, const BYTE* const iHighLimit, + int minLen, int nbSearches, + const dictCtx_directive dict, + const HCfavor_e favorDecSpeed) +{ + LZ4HC_match_t match = { 0 , 0 }; + const BYTE* matchPtr = NULL; + /* note : LZ4HC_InsertAndGetWiderMatch() is able to modify the starting position of a match (*startpos), + * but this won't be the case here, as we define iLowLimit==ip, + * so LZ4HC_InsertAndGetWiderMatch() won't be allowed to search past ip */ + int matchLength = LZ4HC_InsertAndGetWiderMatch(ctx, ip, ip, iHighLimit, minLen, &matchPtr, &ip, nbSearches, 1 /*patternAnalysis*/, 1 /*chainSwap*/, dict, favorDecSpeed); + if (matchLength <= minLen) return match; + if (favorDecSpeed) { + if ((matchLength>18) & (matchLength<=36)) matchLength=18; /* favor shortcut */ + } + match.len = matchLength; + match.off = (int)(ip-matchPtr); + return match; +} + + +static int 
LZ4HC_compress_optimal ( LZ4HC_CCtx_internal* ctx, + const char* const source, + char* dst, + int* srcSizePtr, + int dstCapacity, + int const nbSearches, + size_t sufficient_len, + const limitedOutput_directive limit, + int const fullUpdate, + const dictCtx_directive dict, + const HCfavor_e favorDecSpeed) +{ + int retval = 0; +#define TRAILING_LITERALS 3 +#ifdef LZ4HC_HEAPMODE + LZ4HC_optimal_t* const opt = (LZ4HC_optimal_t*)ALLOC(sizeof(LZ4HC_optimal_t) * (LZ4_OPT_NUM + TRAILING_LITERALS)); +#else + LZ4HC_optimal_t opt[LZ4_OPT_NUM + TRAILING_LITERALS]; /* ~64 KB, which is a bit large for stack... */ +#endif + + const BYTE* ip = (const BYTE*) source; + const BYTE* anchor = ip; + const BYTE* const iend = ip + *srcSizePtr; + const BYTE* const mflimit = iend - MFLIMIT; + const BYTE* const matchlimit = iend - LASTLITERALS; + BYTE* op = (BYTE*) dst; + BYTE* opSaved = (BYTE*) dst; + BYTE* oend = op + dstCapacity; + int ovml = MINMATCH; /* overflow - last sequence */ + const BYTE* ovref = NULL; + + /* init */ +#ifdef LZ4HC_HEAPMODE + if (opt == NULL) goto _return_label; +#endif + DEBUGLOG(5, "LZ4HC_compress_optimal(dst=%p, dstCapa=%u)", dst, (unsigned)dstCapacity); + *srcSizePtr = 0; + if (limit == fillOutput) oend -= LASTLITERALS; /* Hack for support LZ4 format restriction */ + if (sufficient_len >= LZ4_OPT_NUM) sufficient_len = LZ4_OPT_NUM-1; + + /* Main Loop */ + while (ip <= mflimit) { + int const llen = (int)(ip - anchor); + int best_mlen, best_off; + int cur, last_match_pos = 0; + + LZ4HC_match_t const firstMatch = LZ4HC_FindLongerMatch(ctx, ip, matchlimit, MINMATCH-1, nbSearches, dict, favorDecSpeed); + if (firstMatch.len==0) { ip++; continue; } + + if ((size_t)firstMatch.len > sufficient_len) { + /* good enough solution : immediate encoding */ + int const firstML = firstMatch.len; + const BYTE* const matchPos = ip - firstMatch.off; + opSaved = op; + if ( LZ4HC_encodeSequence(UPDATABLE(ip, op, anchor), firstML, matchPos, limit, oend) ) { /* updates ip, op and 
anchor */ + ovml = firstML; + ovref = matchPos; + goto _dest_overflow; + } + continue; + } + + /* set prices for first positions (literals) */ + { int rPos; + for (rPos = 0 ; rPos < MINMATCH ; rPos++) { + int const cost = LZ4HC_literalsPrice(llen + rPos); + opt[rPos].mlen = 1; + opt[rPos].off = 0; + opt[rPos].litlen = llen + rPos; + opt[rPos].price = cost; + DEBUGLOG(7, "rPos:%3i => price:%3i (litlen=%i) -- initial setup", + rPos, cost, opt[rPos].litlen); + } } + /* set prices using initial match */ + { int mlen = MINMATCH; + int const matchML = firstMatch.len; /* necessarily <= sufficient_len < LZ4_OPT_NUM */ + int const offset = firstMatch.off; + assert(matchML < LZ4_OPT_NUM); + for ( ; mlen <= matchML ; mlen++) { + int const cost = LZ4HC_sequencePrice(llen, mlen); + opt[mlen].mlen = mlen; + opt[mlen].off = offset; + opt[mlen].litlen = llen; + opt[mlen].price = cost; + DEBUGLOG(7, "rPos:%3i => price:%3i (matchlen=%i) -- initial setup", + mlen, cost, mlen); + } } + last_match_pos = firstMatch.len; + { int addLit; + for (addLit = 1; addLit <= TRAILING_LITERALS; addLit ++) { + opt[last_match_pos+addLit].mlen = 1; /* literal */ + opt[last_match_pos+addLit].off = 0; + opt[last_match_pos+addLit].litlen = addLit; + opt[last_match_pos+addLit].price = opt[last_match_pos].price + LZ4HC_literalsPrice(addLit); + DEBUGLOG(7, "rPos:%3i => price:%3i (litlen=%i) -- initial setup", + last_match_pos+addLit, opt[last_match_pos+addLit].price, addLit); + } } + + /* check further positions */ + for (cur = 1; cur < last_match_pos; cur++) { + const BYTE* const curPtr = ip + cur; + LZ4HC_match_t newMatch; + + if (curPtr > mflimit) break; + DEBUGLOG(7, "rPos:%u[%u] vs [%u]%u", + cur, opt[cur].price, opt[cur+1].price, cur+1); + if (fullUpdate) { + /* not useful to search here if next position has same (or lower) cost */ + if ( (opt[cur+1].price <= opt[cur].price) + /* in some cases, next position has same cost, but cost rises sharply after, so a small match would still be beneficial */ + && 
(opt[cur+MINMATCH].price < opt[cur].price + 3/*min seq price*/) ) + continue; + } else { + /* not useful to search here if next position has same (or lower) cost */ + if (opt[cur+1].price <= opt[cur].price) continue; + } + + DEBUGLOG(7, "search at rPos:%u", cur); + if (fullUpdate) + newMatch = LZ4HC_FindLongerMatch(ctx, curPtr, matchlimit, MINMATCH-1, nbSearches, dict, favorDecSpeed); + else + /* only test matches of minimum length; slightly faster, but misses a few bytes */ + newMatch = LZ4HC_FindLongerMatch(ctx, curPtr, matchlimit, last_match_pos - cur, nbSearches, dict, favorDecSpeed); + if (!newMatch.len) continue; + + if ( ((size_t)newMatch.len > sufficient_len) + || (newMatch.len + cur >= LZ4_OPT_NUM) ) { + /* immediate encoding */ + best_mlen = newMatch.len; + best_off = newMatch.off; + last_match_pos = cur + 1; + goto encode; + } + + /* before match : set price with literals at beginning */ + { int const baseLitlen = opt[cur].litlen; + int litlen; + for (litlen = 1; litlen < MINMATCH; litlen++) { + int const price = opt[cur].price - LZ4HC_literalsPrice(baseLitlen) + LZ4HC_literalsPrice(baseLitlen+litlen); + int const pos = cur + litlen; + if (price < opt[pos].price) { + opt[pos].mlen = 1; /* literal */ + opt[pos].off = 0; + opt[pos].litlen = baseLitlen+litlen; + opt[pos].price = price; + DEBUGLOG(7, "rPos:%3i => price:%3i (litlen=%i)", + pos, price, opt[pos].litlen); + } } } + + /* set prices using match at position = cur */ + { int const matchML = newMatch.len; + int ml = MINMATCH; + + assert(cur + newMatch.len < LZ4_OPT_NUM); + for ( ; ml <= matchML ; ml++) { + int const pos = cur + ml; + int const offset = newMatch.off; + int price; + int ll; + DEBUGLOG(7, "testing price rPos %i (last_match_pos=%i)", + pos, last_match_pos); + if (opt[cur].mlen == 1) { + ll = opt[cur].litlen; + price = ((cur > ll) ? 
opt[cur - ll].price : 0) + + LZ4HC_sequencePrice(ll, ml); + } else { + ll = 0; + price = opt[cur].price + LZ4HC_sequencePrice(0, ml); + } + + assert((U32)favorDecSpeed <= 1); + if (pos > last_match_pos+TRAILING_LITERALS + || price <= opt[pos].price - (int)favorDecSpeed) { + DEBUGLOG(7, "rPos:%3i => price:%3i (matchlen=%i)", + pos, price, ml); + assert(pos < LZ4_OPT_NUM); + if ( (ml == matchML) /* last pos of last match */ + && (last_match_pos < pos) ) + last_match_pos = pos; + opt[pos].mlen = ml; + opt[pos].off = offset; + opt[pos].litlen = ll; + opt[pos].price = price; + } } } + /* complete following positions with literals */ + { int addLit; + for (addLit = 1; addLit <= TRAILING_LITERALS; addLit ++) { + opt[last_match_pos+addLit].mlen = 1; /* literal */ + opt[last_match_pos+addLit].off = 0; + opt[last_match_pos+addLit].litlen = addLit; + opt[last_match_pos+addLit].price = opt[last_match_pos].price + LZ4HC_literalsPrice(addLit); + DEBUGLOG(7, "rPos:%3i => price:%3i (litlen=%i)", last_match_pos+addLit, opt[last_match_pos+addLit].price, addLit); + } } + } /* for (cur = 1; cur < last_match_pos; cur++) */ + + assert(last_match_pos < LZ4_OPT_NUM + TRAILING_LITERALS); + best_mlen = opt[last_match_pos].mlen; + best_off = opt[last_match_pos].off; + cur = last_match_pos - best_mlen; + +encode: /* cur, last_match_pos, best_mlen, best_off must be set */ + assert(cur < LZ4_OPT_NUM); + assert(last_match_pos >= 1); /* == 1 when only one candidate */ + DEBUGLOG(6, "reverse traversal, looking for shortest path (last_match_pos=%i)", last_match_pos); + { int candidate_pos = cur; + int selected_matchLength = best_mlen; + int selected_offset = best_off; + while (1) { /* from end to beginning */ + int const next_matchLength = opt[candidate_pos].mlen; /* can be 1, means literal */ + int const next_offset = opt[candidate_pos].off; + DEBUGLOG(7, "pos %i: sequence length %i", candidate_pos, selected_matchLength); + opt[candidate_pos].mlen = selected_matchLength; + opt[candidate_pos].off 
= selected_offset; + selected_matchLength = next_matchLength; + selected_offset = next_offset; + if (next_matchLength > candidate_pos) break; /* last match elected, first match to encode */ + assert(next_matchLength > 0); /* can be 1, means literal */ + candidate_pos -= next_matchLength; + } } + + /* encode all recorded sequences in order */ + { int rPos = 0; /* relative position (to ip) */ + while (rPos < last_match_pos) { + int const ml = opt[rPos].mlen; + int const offset = opt[rPos].off; + if (ml == 1) { ip++; rPos++; continue; } /* literal; note: can end up with several literals, in which case, skip them */ + rPos += ml; + assert(ml >= MINMATCH); + assert((offset >= 1) && (offset <= LZ4_DISTANCE_MAX)); + opSaved = op; + if ( LZ4HC_encodeSequence(UPDATABLE(ip, op, anchor), ml, ip - offset, limit, oend) ) { /* updates ip, op and anchor */ + ovml = ml; + ovref = ip - offset; + goto _dest_overflow; + } } } + } /* while (ip <= mflimit) */ + +_last_literals: + /* Encode Last Literals */ + { size_t lastRunSize = (size_t)(iend - anchor); /* literals */ + size_t llAdd = (lastRunSize + 255 - RUN_MASK) / 255; + size_t const totalSize = 1 + llAdd + lastRunSize; + if (limit == fillOutput) oend += LASTLITERALS; /* restore correct value */ + if (limit && (op + totalSize > oend)) { + if (limit == limitedOutput) { /* Check output limit */ + retval = 0; + goto _return_label; + } + /* adapt lastRunSize to fill 'dst' */ + lastRunSize = (size_t)(oend - op) - 1 /*token*/; + llAdd = (lastRunSize + 256 - RUN_MASK) / 256; + lastRunSize -= llAdd; + } + DEBUGLOG(6, "Final literal run : %i literals", (int)lastRunSize); + ip = anchor + lastRunSize; /* can be != iend if limit==fillOutput */ + + if (lastRunSize >= RUN_MASK) { + size_t accumulator = lastRunSize - RUN_MASK; + *op++ = (RUN_MASK << ML_BITS); + for(; accumulator >= 255 ; accumulator -= 255) *op++ = 255; + *op++ = (BYTE) accumulator; + } else { + *op++ = (BYTE)(lastRunSize << ML_BITS); + } + memcpy(op, anchor, lastRunSize); + op 
+= lastRunSize; + } + + /* End */ + *srcSizePtr = (int) (((const char*)ip) - source); + retval = (int) ((char*)op-dst); + goto _return_label; + +_dest_overflow: +if (limit == fillOutput) { + /* Assumption : ip, anchor, ovml and ovref must be set correctly */ + size_t const ll = (size_t)(ip - anchor); + size_t const ll_addbytes = (ll + 240) / 255; + size_t const ll_totalCost = 1 + ll_addbytes + ll; + BYTE* const maxLitPos = oend - 3; /* 2 for offset, 1 for token */ + DEBUGLOG(6, "Last sequence overflowing (only %i bytes remaining)", (int)(oend-1-opSaved)); + op = opSaved; /* restore correct out pointer */ + if (op + ll_totalCost <= maxLitPos) { + /* ll validated; now adjust match length */ + size_t const bytesLeftForMl = (size_t)(maxLitPos - (op+ll_totalCost)); + size_t const maxMlSize = MINMATCH + (ML_MASK-1) + (bytesLeftForMl * 255); + assert(maxMlSize < INT_MAX); assert(ovml >= 0); + if ((size_t)ovml > maxMlSize) ovml = (int)maxMlSize; + if ((oend + LASTLITERALS) - (op + ll_totalCost + 2) - 1 + ovml >= MFLIMIT) { + DEBUGLOG(6, "Space to end : %i + ml (%i)", (int)((oend + LASTLITERALS) - (op + ll_totalCost + 2) - 1), ovml); + DEBUGLOG(6, "Before : ip = %p, anchor = %p", ip, anchor); + LZ4HC_encodeSequence(UPDATABLE(ip, op, anchor), ovml, ovref, notLimited, oend); + DEBUGLOG(6, "After : ip = %p, anchor = %p", ip, anchor); + } } + goto _last_literals; +} +_return_label: +#ifdef LZ4HC_HEAPMODE + FREEMEM(opt); +#endif + return retval; +} diff --git a/libbutl/lz4hc.h b/libbutl/lz4hc.h new file mode 100644 index 0000000..3d441fb --- /dev/null +++ b/libbutl/lz4hc.h @@ -0,0 +1,413 @@ +/* + LZ4 HC - High Compression Mode of LZ4 + Header File + Copyright (C) 2011-2017, Yann Collet. 
+ BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following disclaimer + in the documentation and/or other materials provided with the + distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ + You can contact the author at : + - LZ4 source repository : https://github.com/lz4/lz4 + - LZ4 public forum : https://groups.google.com/forum/#!forum/lz4c +*/ +#ifndef LZ4_HC_H_19834876238432 +#define LZ4_HC_H_19834876238432 + +#if defined (__cplusplus) +extern "C" { +#endif + +/* --- Dependency --- */ +/* note : lz4hc requires lz4.h/lz4.c for compilation */ +#include "lz4.h" /* stddef, LZ4LIB_API, LZ4_DEPRECATED */ + + +/* --- Useful constants --- */ +#define LZ4HC_CLEVEL_MIN 3 +#define LZ4HC_CLEVEL_DEFAULT 9 +#define LZ4HC_CLEVEL_OPT_MIN 10 +#define LZ4HC_CLEVEL_MAX 12 + + +/*-************************************ + * Block Compression + **************************************/ +/*! LZ4_compress_HC() : + * Compress data from `src` into `dst`, using the powerful but slower "HC" algorithm. + * `dst` must be already allocated. + * Compression is guaranteed to succeed if `dstCapacity >= LZ4_compressBound(srcSize)` (see "lz4.h") + * Max supported `srcSize` value is LZ4_MAX_INPUT_SIZE (see "lz4.h") + * `compressionLevel` : any value between 1 and LZ4HC_CLEVEL_MAX will work. + * Values > LZ4HC_CLEVEL_MAX behave the same as LZ4HC_CLEVEL_MAX. + * @return : the number of bytes written into 'dst' + * or 0 if compression fails. + */ +LZ4LIB_API int LZ4_compress_HC (const char* src, char* dst, int srcSize, int dstCapacity, int compressionLevel); + + +/* Note : + * Decompression functions are provided within "lz4.h" (BSD license) + */ + + +/*! LZ4_compress_HC_extStateHC() : + * Same as LZ4_compress_HC(), but using an externally allocated memory segment for `state`. + * `state` size is provided by LZ4_sizeofStateHC(). + * Memory segment must be aligned on 8-bytes boundaries (which a normal malloc() should do properly). + */ +LZ4LIB_API int LZ4_sizeofStateHC(void); +LZ4LIB_API int LZ4_compress_HC_extStateHC(void* stateHC, const char* src, char* dst, int srcSize, int maxDstSize, int compressionLevel); + + +/*! 
LZ4_compress_HC_destSize() : v1.9.0+ + * Will compress as much data as possible from `src` + * to fit into `targetDstSize` budget. + * Result is provided in 2 parts : + * @return : the number of bytes written into 'dst' (necessarily <= targetDstSize) + * or 0 if compression fails. + * `srcSizePtr` : on success, *srcSizePtr is updated to indicate how many bytes were read from `src` + */ +LZ4LIB_API int LZ4_compress_HC_destSize(void* stateHC, + const char* src, char* dst, + int* srcSizePtr, int targetDstSize, + int compressionLevel); + + +/*-************************************ + * Streaming Compression + * Bufferless synchronous API + **************************************/ + typedef union LZ4_streamHC_u LZ4_streamHC_t; /* incomplete type (defined later) */ + +/*! LZ4_createStreamHC() and LZ4_freeStreamHC() : + * These functions create and release memory for LZ4 HC streaming state. + * Newly created states are automatically initialized. + * The same state can be used multiple times consecutively, + * starting with LZ4_resetStreamHC_fast() to start a new stream of blocks. + */ +LZ4LIB_API LZ4_streamHC_t* LZ4_createStreamHC(void); +LZ4LIB_API int LZ4_freeStreamHC (LZ4_streamHC_t* streamHCPtr); + +/* + These functions compress data in successive blocks of any size, + using previous blocks as dictionary, to improve compression ratio. + One key assumption is that previous blocks (up to 64 KB) remain read-accessible while compressing next blocks. + There is an exception for ring buffers, which can be smaller than 64 KB. + Ring-buffer scenario is automatically detected and handled within LZ4_compress_HC_continue(). + + Before starting compression, state must be allocated and properly initialized. + LZ4_createStreamHC() does both, though compression level is set to LZ4HC_CLEVEL_DEFAULT. + + Selecting the compression level can be done with LZ4_resetStreamHC_fast() (starts a new stream) + or LZ4_setCompressionLevel() (anytime, between blocks in the same stream) (experimental). 
+ LZ4_resetStreamHC_fast() only works on states which have been properly initialized at least once, + which is automatically the case when state is created using LZ4_createStreamHC(). + + After reset, a first "fictional block" can be designated as initial dictionary, + using LZ4_loadDictHC() (Optional). + + Invoke LZ4_compress_HC_continue() to compress each successive block. + The number of blocks is unlimited. + Previous input blocks, including initial dictionary when present, + must remain accessible and unmodified during compression. + + It's allowed to update compression level anytime between blocks, + using LZ4_setCompressionLevel() (experimental). + + 'dst' buffer should be sized to handle worst case scenarios + (see LZ4_compressBound(), it ensures compression success). + In case of failure, the API does not guarantee recovery, + so the state _must_ be reset. + To ensure compression success + whenever `dst` buffer size cannot be made >= LZ4_compressBound(), + consider using LZ4_compress_HC_continue_destSize(). + + Whenever previous input blocks can't be preserved unmodified in-place during compression of next blocks, + it's possible to copy the last blocks into a more stable memory space, using LZ4_saveDictHC(). + Return value of LZ4_saveDictHC() is the size of dictionary effectively saved into 'safeBuffer' (<= 64 KB) + + After completing a streaming compression, + it's possible to start a new stream of blocks, using the same LZ4_streamHC_t state, + just by resetting it, using LZ4_resetStreamHC_fast(). +*/ + +LZ4LIB_API void LZ4_resetStreamHC_fast(LZ4_streamHC_t* streamHCPtr, int compressionLevel); /* v1.9.0+ */ +LZ4LIB_API int LZ4_loadDictHC (LZ4_streamHC_t* streamHCPtr, const char* dictionary, int dictSize); + +LZ4LIB_API int LZ4_compress_HC_continue (LZ4_streamHC_t* streamHCPtr, + const char* src, char* dst, + int srcSize, int maxDstSize); + +/*! 
LZ4_compress_HC_continue_destSize() : v1.9.0+ + * Similar to LZ4_compress_HC_continue(), + * but will read as much data as possible from `src` + * to fit into `targetDstSize` budget. + * Result is provided into 2 parts : + * @return : the number of bytes written into 'dst' (necessarily <= targetDstSize) + * or 0 if compression fails. + * `srcSizePtr` : on success, *srcSizePtr will be updated to indicate how much bytes were read from `src`. + * Note that this function may not consume the entire input. + */ +LZ4LIB_API int LZ4_compress_HC_continue_destSize(LZ4_streamHC_t* LZ4_streamHCPtr, + const char* src, char* dst, + int* srcSizePtr, int targetDstSize); + +LZ4LIB_API int LZ4_saveDictHC (LZ4_streamHC_t* streamHCPtr, char* safeBuffer, int maxDictSize); + + + +/*^********************************************** + * !!!!!! STATIC LINKING ONLY !!!!!! + ***********************************************/ + +/*-****************************************************************** + * PRIVATE DEFINITIONS : + * Do not use these definitions directly. + * They are merely exposed to allow static allocation of `LZ4_streamHC_t`. + * Declare an `LZ4_streamHC_t` directly, rather than any type below. + * Even then, only do so in the context of static linking, as definitions may change between versions. 
+ ********************************************************************/ + +#define LZ4HC_DICTIONARY_LOGSIZE 16 +#define LZ4HC_MAXD (1<<LZ4HC_DICTIONARY_LOGSIZE) +#define LZ4HC_MAXD_MASK (LZ4HC_MAXD - 1) + +#define LZ4HC_HASH_LOG 15 +#define LZ4HC_HASHTABLESIZE (1 << LZ4HC_HASH_LOG) +#define LZ4HC_HASH_MASK (LZ4HC_HASHTABLESIZE - 1) + + +typedef struct LZ4HC_CCtx_internal LZ4HC_CCtx_internal; +struct LZ4HC_CCtx_internal +{ + LZ4_u32 hashTable[LZ4HC_HASHTABLESIZE]; + LZ4_u16 chainTable[LZ4HC_MAXD]; + const LZ4_byte* end; /* next block here to continue on current prefix */ + const LZ4_byte* base; /* All index relative to this position */ + const LZ4_byte* dictBase; /* alternate base for extDict */ + LZ4_u32 dictLimit; /* below that point, need extDict */ + LZ4_u32 lowLimit; /* below that point, no more dict */ + LZ4_u32 nextToUpdate; /* index from which to continue dictionary update */ + short compressionLevel; + LZ4_i8 favorDecSpeed; /* favor decompression speed if this flag set, + otherwise, favor compression ratio */ + LZ4_i8 dirty; /* stream has to be fully reset if this flag is set */ + const LZ4HC_CCtx_internal* dictCtx; +}; + + +/* Do not use these definitions directly ! + * Declare or allocate an LZ4_streamHC_t instead. + */ +#define LZ4_STREAMHCSIZE 262200 /* static size, for inter-version compatibility */ +#define LZ4_STREAMHCSIZE_VOIDP (LZ4_STREAMHCSIZE / sizeof(void*)) +union LZ4_streamHC_u { + void* table[LZ4_STREAMHCSIZE_VOIDP]; + LZ4HC_CCtx_internal internal_donotuse; +}; /* previously typedef'd to LZ4_streamHC_t */ + +/* LZ4_streamHC_t : + * This structure allows static allocation of LZ4 HC streaming state. + * This can be used to allocate statically, on state, or as part of a larger structure. + * + * Such state **must** be initialized using LZ4_initStreamHC() before first use. + * + * Note that invoking LZ4_initStreamHC() is not required when + * the state was created using LZ4_createStreamHC() (which is recommended). 
+ * Using the normal builder, a newly created state is automatically initialized. + * + * Static allocation shall only be used in combination with static linking. + */ + +/* LZ4_initStreamHC() : v1.9.0+ + * Required before first use of a statically allocated LZ4_streamHC_t. + * Before v1.9.0 : use LZ4_resetStreamHC() instead + */ +LZ4LIB_API LZ4_streamHC_t* LZ4_initStreamHC (void* buffer, size_t size); + + +/*-************************************ +* Deprecated Functions +**************************************/ +/* see lz4.h LZ4_DISABLE_DEPRECATE_WARNINGS to turn off deprecation warnings */ + +/* deprecated compression functions */ +LZ4_DEPRECATED("use LZ4_compress_HC() instead") LZ4LIB_API int LZ4_compressHC (const char* source, char* dest, int inputSize); +LZ4_DEPRECATED("use LZ4_compress_HC() instead") LZ4LIB_API int LZ4_compressHC_limitedOutput (const char* source, char* dest, int inputSize, int maxOutputSize); +LZ4_DEPRECATED("use LZ4_compress_HC() instead") LZ4LIB_API int LZ4_compressHC2 (const char* source, char* dest, int inputSize, int compressionLevel); +LZ4_DEPRECATED("use LZ4_compress_HC() instead") LZ4LIB_API int LZ4_compressHC2_limitedOutput(const char* source, char* dest, int inputSize, int maxOutputSize, int compressionLevel); +LZ4_DEPRECATED("use LZ4_compress_HC_extStateHC() instead") LZ4LIB_API int LZ4_compressHC_withStateHC (void* state, const char* source, char* dest, int inputSize); +LZ4_DEPRECATED("use LZ4_compress_HC_extStateHC() instead") LZ4LIB_API int LZ4_compressHC_limitedOutput_withStateHC (void* state, const char* source, char* dest, int inputSize, int maxOutputSize); +LZ4_DEPRECATED("use LZ4_compress_HC_extStateHC() instead") LZ4LIB_API int LZ4_compressHC2_withStateHC (void* state, const char* source, char* dest, int inputSize, int compressionLevel); +LZ4_DEPRECATED("use LZ4_compress_HC_extStateHC() instead") LZ4LIB_API int LZ4_compressHC2_limitedOutput_withStateHC(void* state, const char* source, char* dest, int inputSize, int 
maxOutputSize, int compressionLevel); +LZ4_DEPRECATED("use LZ4_compress_HC_continue() instead") LZ4LIB_API int LZ4_compressHC_continue (LZ4_streamHC_t* LZ4_streamHCPtr, const char* source, char* dest, int inputSize); +LZ4_DEPRECATED("use LZ4_compress_HC_continue() instead") LZ4LIB_API int LZ4_compressHC_limitedOutput_continue (LZ4_streamHC_t* LZ4_streamHCPtr, const char* source, char* dest, int inputSize, int maxOutputSize); + +/* Obsolete streaming functions; degraded functionality; do not use! + * + * In order to perform streaming compression, these functions depended on data + * that is no longer tracked in the state. They have been preserved as well as + * possible: using them will still produce a correct output. However, use of + * LZ4_slideInputBufferHC() will truncate the history of the stream, rather + * than preserve a window-sized chunk of history. + */ +LZ4_DEPRECATED("use LZ4_createStreamHC() instead") LZ4LIB_API void* LZ4_createHC (const char* inputBuffer); +LZ4_DEPRECATED("use LZ4_saveDictHC() instead") LZ4LIB_API char* LZ4_slideInputBufferHC (void* LZ4HC_Data); +LZ4_DEPRECATED("use LZ4_freeStreamHC() instead") LZ4LIB_API int LZ4_freeHC (void* LZ4HC_Data); +LZ4_DEPRECATED("use LZ4_compress_HC_continue() instead") LZ4LIB_API int LZ4_compressHC2_continue (void* LZ4HC_Data, const char* source, char* dest, int inputSize, int compressionLevel); +LZ4_DEPRECATED("use LZ4_compress_HC_continue() instead") LZ4LIB_API int LZ4_compressHC2_limitedOutput_continue (void* LZ4HC_Data, const char* source, char* dest, int inputSize, int maxOutputSize, int compressionLevel); +LZ4_DEPRECATED("use LZ4_createStreamHC() instead") LZ4LIB_API int LZ4_sizeofStreamStateHC(void); +LZ4_DEPRECATED("use LZ4_initStreamHC() instead") LZ4LIB_API int LZ4_resetStreamStateHC(void* state, char* inputBuffer); + + +/* LZ4_resetStreamHC() is now replaced by LZ4_initStreamHC(). 
+ * The intention is to emphasize the difference with LZ4_resetStreamHC_fast(), + * which is now the recommended function to start a new stream of blocks, + * but cannot be used to initialize a memory segment containing arbitrary garbage data. + * + * It is recommended to switch to LZ4_initStreamHC(). + * LZ4_resetStreamHC() will generate deprecation warnings in a future version. + */ +LZ4LIB_API void LZ4_resetStreamHC (LZ4_streamHC_t* streamHCPtr, int compressionLevel); + + +#if defined (__cplusplus) +} +#endif + +#endif /* LZ4_HC_H_19834876238432 */ + + +/*-************************************************** + * !!!!! STATIC LINKING ONLY !!!!! + * Following definitions are considered experimental. + * They should not be linked from DLL, + * as there is no guarantee of API stability yet. + * Prototypes will be promoted to "stable" status + * after successfull usage in real-life scenarios. + ***************************************************/ +#ifdef LZ4_HC_STATIC_LINKING_ONLY /* protection macro */ +#ifndef LZ4_HC_SLO_098092834 +#define LZ4_HC_SLO_098092834 + +#define LZ4_STATIC_LINKING_ONLY /* LZ4LIB_STATIC_API */ +#include "lz4.h" + +#if defined (__cplusplus) +extern "C" { +#endif + +/*! LZ4_setCompressionLevel() : v1.8.0+ (experimental) + * It's possible to change compression level + * between successive invocations of LZ4_compress_HC_continue*() + * for dynamic adaptation. + */ +LZ4LIB_STATIC_API void LZ4_setCompressionLevel( + LZ4_streamHC_t* LZ4_streamHCPtr, int compressionLevel); + +/*! LZ4_favorDecompressionSpeed() : v1.8.2+ (experimental) + * Opt. Parser will favor decompression speed over compression ratio. + * Only applicable to levels >= LZ4HC_CLEVEL_OPT_MIN. + */ +LZ4LIB_STATIC_API void LZ4_favorDecompressionSpeed( + LZ4_streamHC_t* LZ4_streamHCPtr, int favor); + +/*! 
LZ4_resetStreamHC_fast() : v1.9.0+ + * When an LZ4_streamHC_t is known to be in a internally coherent state, + * it can often be prepared for a new compression with almost no work, only + * sometimes falling back to the full, expensive reset that is always required + * when the stream is in an indeterminate state (i.e., the reset performed by + * LZ4_resetStreamHC()). + * + * LZ4_streamHCs are guaranteed to be in a valid state when: + * - returned from LZ4_createStreamHC() + * - reset by LZ4_resetStreamHC() + * - memset(stream, 0, sizeof(LZ4_streamHC_t)) + * - the stream was in a valid state and was reset by LZ4_resetStreamHC_fast() + * - the stream was in a valid state and was then used in any compression call + * that returned success + * - the stream was in an indeterminate state and was used in a compression + * call that fully reset the state (LZ4_compress_HC_extStateHC()) and that + * returned success + * + * Note: + * A stream that was last used in a compression call that returned an error + * may be passed to this function. However, it will be fully reset, which will + * clear any existing history and settings from the context. + */ +LZ4LIB_STATIC_API void LZ4_resetStreamHC_fast( + LZ4_streamHC_t* LZ4_streamHCPtr, int compressionLevel); + +/*! LZ4_compress_HC_extStateHC_fastReset() : + * A variant of LZ4_compress_HC_extStateHC(). + * + * Using this variant avoids an expensive initialization step. It is only safe + * to call if the state buffer is known to be correctly initialized already + * (see above comment on LZ4_resetStreamHC_fast() for a definition of + * "correctly initialized"). From a high level, the difference is that this + * function initializes the provided state with a call to + * LZ4_resetStreamHC_fast() while LZ4_compress_HC_extStateHC() starts with a + * call to LZ4_resetStreamHC(). 
+ */ +LZ4LIB_STATIC_API int LZ4_compress_HC_extStateHC_fastReset ( + void* state, + const char* src, char* dst, + int srcSize, int dstCapacity, + int compressionLevel); + +/*! LZ4_attach_HC_dictionary() : + * This is an experimental API that allows for the efficient use of a + * static dictionary many times. + * + * Rather than re-loading the dictionary buffer into a working context before + * each compression, or copying a pre-loaded dictionary's LZ4_streamHC_t into a + * working LZ4_streamHC_t, this function introduces a no-copy setup mechanism, + * in which the working stream references the dictionary stream in-place. + * + * Several assumptions are made about the state of the dictionary stream. + * Currently, only streams which have been prepared by LZ4_loadDictHC() should + * be expected to work. + * + * Alternatively, the provided dictionary stream pointer may be NULL, in which + * case any existing dictionary stream is unset. + * + * A dictionary should only be attached to a stream without any history (i.e., + * a stream that has just been reset). + * + * The dictionary will remain attached to the working stream only for the + * current stream session. Calls to LZ4_resetStreamHC(_fast) will remove the + * dictionary context association from the working stream. The dictionary + * stream (and source buffer) must remain in-place / accessible / unchanged + * through the lifetime of the stream session. 
+ */ +LZ4LIB_STATIC_API void LZ4_attach_HC_dictionary( + LZ4_streamHC_t *working_stream, + const LZ4_streamHC_t *dictionary_stream); + +#if defined (__cplusplus) +} +#endif + +#endif /* LZ4_HC_SLO_098092834 */ +#endif /* LZ4_HC_STATIC_LINKING_ONLY */ diff --git a/libbutl/manifest-parser.cxx b/libbutl/manifest-parser.cxx index 9514bbd..904910a 100644 --- a/libbutl/manifest-parser.cxx +++ b/libbutl/manifest-parser.cxx @@ -1,39 +1,10 @@ // file : libbutl/manifest-parser.cxx -*- C++ -*- // license : MIT; see accompanying LICENSE file -#ifndef __cpp_modules_ts -#include <libbutl/manifest-parser.mxx> -#endif +#include <libbutl/manifest-parser.hxx> -#include <cassert> - -#ifndef __cpp_lib_modules_ts #include <string> -#include <vector> -#include <cstdint> -#include <utility> -#include <stdexcept> - -#include <sstream> -#endif - -// Other includes. - -#ifdef __cpp_modules_ts -module butl.manifest_parser; - -// Only imports additional to interface. -#ifdef __clang__ -#ifdef __cpp_lib_modules_ts -import std.core; -import std.io; -#endif -import butl.optional; -import butl.char_scanner; -import butl.manifest_types; -#endif - -#endif +#include <cassert> using namespace std; @@ -177,41 +148,136 @@ namespace butl { using iterator = string::const_iterator; - auto space = [] (char c) -> bool {return c == ' ' || c == '\t';}; + // Parse the value differently depending on whether it is multi-line or + // not. + // + if (v.find ('\n') == string::npos) // Single-line. + { + auto space = [] (char c) {return c == ' ' || c == '\t';}; - iterator i (v.begin ()); - iterator e (v.end ()); + iterator i (v.begin ()); + iterator e (v.end ()); - string r; - size_t n (0); - for (char c; i != e && (c = *i) != ';'; ++i) - { - // Unescape ';' character. + string r; + size_t n (0); + for (char c; i != e && (c = *i) != ';'; ++i) + { + // Unescape ';' and '\' characters. 
+ // + if (c == '\\' && i + 1 != e && (*(i + 1) == ';' || *(i + 1) == '\\')) + c = *++i; + + r += c; + + if (!space (c)) + n = r.size (); + } + + // Strip the value trailing spaces. // - if (c == '\\' && i + 1 != e && *(i + 1) == ';') - c = *++i; + if (r.size () != n) + r.resize (n); - r += c; + // Find beginning of a comment (i). + // + if (i != e) + { + // Skip spaces. + // + for (++i; i != e && space (*i); ++i); + } - if (!space (c)) - n = r.size (); + return make_pair (move (r), string (i, e)); } + else // Multi-line. + { + string r; + string c; - // Strip the value trailing spaces. - // - if (r.size () != n) - r.resize (n); + // Parse the value lines until the comment separator is encountered or + // the end of the value is reached. Add these lines to the resulting + // value, unescaping them if required. + // + // Note that we only need to unescape lines which have the '\+;' form. + // + auto i (v.begin ()); + auto e (v.end ()); - // Find beginning of a comment (i). - // - if (i != e) - { - // Skip spaces. + while (i != e) + { + // Find the end of the line and while at it the first non-backslash + // character. + // + auto le (i); + auto nb (e); + for (; le != e && *le != '\n'; ++le) + { + if (nb == e && *le != '\\') + nb = le; + } + + // If the value end is not reached then position to the beginning of + // the next line and to the end of the value otherwise. + // + auto next = [&i, &le, &e] () {i = (le != e ? le + 1 : e);}; + + // If the first non-backslash character is ';' and it is the last + // character on the line, then this is either the comment separator or + // an escape sequence. + // + if (nb != e && *nb == ';' && nb + 1 == le) + { + // If ';' is the first (and thus the only) character on the line, + // then this is the comment separator and we bail out from this + // loop. Note that in this case we need to trim the trailing newline + // (but only one) from the resulting value since it is considered as + // a part of the separator. 
+ // + if (nb == i) + { + if (!r.empty ()) + { + assert (r.back () == '\n'); + r.pop_back (); + } + + next (); + break; + } + // + // Otherwise, this is an escape sequence, so unescape it. For that + // just take the rightmost half of the string: + // + // \; -> ; + // \\; -> \; + // \\\; -> \; + // \\\\; -> \\; + // \\\\\; -> \\; + // + else + i += (le - i) / 2; + } + + // Add the line to the resulting value together with the trailing + // newline, if present. + // + r.append (i, le); + + if (le != e) + r += '\n'; + + next (); + } + + // If we haven't reached the end of the value then it means we've + // encountered the comment separator. In this case save the remaining + // value part as a comment. // - for (++i; i != e && space (*i); ++i); - } + if (i != e) + c = string (i, e); - return make_pair (move (r), string (i, e)); + return make_pair (move (r), move (c)); + } } void manifest_parser:: @@ -251,7 +317,8 @@ namespace butl string& v (r.value); string::size_type n (0); // Size of last non-space character (simple mode). - // Detect the multi-line mode introductor. + // Detect the old-fashioned multi-line mode introducer (like in + // 'foo:\<newline>'). // bool ml (false); if (c == '\\') @@ -266,11 +333,46 @@ namespace butl ml = true; } else if (eos (p)) + { + c = p; // Set to EOF. ml = true; + } else unget (c); } + // Detect the new-fashioned multi-line mode introducer (like in + // 'foo:<newline>\<newline>'). + // + if (!ml && c == '\n') + { + get (); + xchar p1 (peek ()); + + if (p1 == '\\') + { + get (); + xchar p2 (peek ()); + + if (p2 == '\n') + { + get (); // Newline is not part of the value so skip it. + c = peek (); + ml = true; + } + else if (eos (p2)) + { + c = p2; // Set to EOF. + ml = true; + } + else + unget (p1); // Unget '\\'. Note: '\n' will be ungot below. + } + + if (!ml) + unget (c); // Unget '\n'. + } + // Multi-line value starts from the line that follows the name. 
// if (ml) @@ -281,7 +383,7 @@ namespace butl // The nl flag signals that the preceding character was a "special // newline", that is, a newline that was part of the milti-line mode - // introductor or an escape sequence. + // introducer or an escape sequence. // for (bool nl (ml); !eos (c); c = peek ()) { @@ -299,7 +401,7 @@ namespace butl // // The first block handles the special sequence that starts with // a special newline. In multi-line mode, this is an "immediate - // termination" where we "use" the newline from the introductor. + // termination" where we "use" the newline from the introducer. // Note also that in the simple mode the special sequence can // only start with a special (i.e., escaped) newline. // @@ -472,11 +574,21 @@ namespace butl static inline string format (const string& n, uint64_t l, uint64_t c, const string& d) { - ostringstream os; + using std::to_string; + + string r; if (!n.empty ()) - os << n << ':'; - os << l << ':' << c << ": error: " << d; - return os.str (); + { + r += n; + r += ':'; + } + + r += to_string (l); + r += ':'; + r += to_string (c); + r += ": error: "; + r += d; + return r; } manifest_parsing:: diff --git a/libbutl/manifest-parser.mxx b/libbutl/manifest-parser.hxx index 77addff..601fb2d 100644 --- a/libbutl/manifest-parser.mxx +++ b/libbutl/manifest-parser.hxx @@ -1,13 +1,8 @@ -// file : libbutl/manifest-parser.mxx -*- C++ -*- +// file : libbutl/manifest-parser.hxx -*- C++ -*- // license : MIT; see accompanying LICENSE file -#ifndef __cpp_modules_ts #pragma once -#endif -// C includes. - -#ifndef __cpp_lib_modules_ts #include <string> #include <vector> #include <iosfwd> @@ -15,30 +10,15 @@ #include <utility> // pair, move() #include <stdexcept> // runtime_error #include <functional> -#endif - -// Other includes. 
- -#ifdef __cpp_modules_ts -export module butl.manifest_parser; -#ifdef __cpp_lib_modules_ts -import std.core; -import std.io; -#endif -import butl.utf8; -import butl.optional; -import butl.char_scanner; -import butl.manifest_types; -#else -#include <libbutl/utf8.mxx> -#include <libbutl/optional.mxx> -#include <libbutl/char-scanner.mxx> -#include <libbutl/manifest-types.mxx> -#endif + +#include <libbutl/utf8.hxx> +#include <libbutl/optional.hxx> +#include <libbutl/char-scanner.hxx> +#include <libbutl/manifest-types.hxx> #include <libbutl/export.hxx> -LIBBUTL_MODEXPORT namespace butl +namespace butl { class LIBBUTL_SYMEXPORT manifest_parsing: public std::runtime_error { @@ -57,7 +37,7 @@ LIBBUTL_MODEXPORT namespace butl }; class LIBBUTL_SYMEXPORT manifest_parser: - protected char_scanner<utf8_validator> + protected char_scanner<utf8_validator, 2> { public: // The filter, if specified, is called by next() prior to returning the @@ -103,7 +83,7 @@ LIBBUTL_MODEXPORT namespace butl split_comment (const std::string&); private: - using base = char_scanner<utf8_validator>; + using base = char_scanner<utf8_validator, 2>; void parse_next (manifest_name_value&); diff --git a/libbutl/manifest-rewriter.cxx b/libbutl/manifest-rewriter.cxx index e38d5f4..1232e9c 100644 --- a/libbutl/manifest-rewriter.cxx +++ b/libbutl/manifest-rewriter.cxx @@ -1,41 +1,15 @@ // file : libbutl/manifest-rewriter.cxx -*- C++ -*- // license : MIT; see accompanying LICENSE file -#ifndef __cpp_modules_ts -#include <libbutl/manifest-rewriter.mxx> -#endif +#include <libbutl/manifest-rewriter.hxx> -#include <cassert> - -// C includes. - -#ifndef __cpp_lib_modules_ts #include <string> +#include <cassert> #include <cstdint> // uint64_t #include <cstddef> // size_t -#endif - -// Other includes. - -#ifdef __cpp_modules_ts -module butl.manifest_rewriter; - -// Only imports additional to interface. 
-#ifdef __clang__ -#ifdef __cpp_lib_modules_ts -import std.core; -#endif -import butl.path; -import butl.fdstream; -import butl.manifest_types; -#endif - -import butl.utility; // utf8_length() -import butl.manifest_serializer; -#else -#include <libbutl/utility.mxx> -#include <libbutl/manifest-serializer.mxx> -#endif + +#include <libbutl/utility.hxx> // utf8_length() +#include <libbutl/manifest-serializer.hxx> using namespace std; @@ -64,7 +38,7 @@ namespace butl // Temporary move the descriptor into the stream. // ifdstream is (move (fd)); - fdbuf& buf (static_cast<fdbuf&> (*is.rdbuf ())); + fdstreambuf& buf (static_cast<fdstreambuf&> (*is.rdbuf ())); // Read suffix. // @@ -99,8 +73,6 @@ namespace butl if (!nv.value.empty ()) { - os << ' '; - manifest_serializer s (os, path_.string (), long_lines_); // Note that the name can be surrounded with the ASCII whitespace @@ -112,7 +84,7 @@ namespace butl // s.write_value (nv.value, static_cast<size_t> (nv.colon_pos - nv.start_pos) - - (nv.name.size () - utf8_length (nv.name)) + 2); + (nv.name.size () - utf8_length (nv.name)) + 1); } os << suffix; @@ -144,15 +116,13 @@ namespace butl if (!nv.value.empty ()) { - os << ' '; - // Note that the name can be surrounded with the ASCII whitespace // characters and the start_pos refers to the first character in the // line. // s.write_value (nv.value, static_cast<size_t> (nv.colon_pos - nv.start_pos) - - (nv.name.size () - n) + 2); + (nv.name.size () - n) + 1); } os << suffix; diff --git a/libbutl/manifest-rewriter.mxx b/libbutl/manifest-rewriter.hxx index 907c990..02a533a 100644 --- a/libbutl/manifest-rewriter.mxx +++ b/libbutl/manifest-rewriter.hxx @@ -1,33 +1,15 @@ -// file : libbutl/manifest-rewriter.mxx -*- C++ -*- +// file : libbutl/manifest-rewriter.hxx -*- C++ -*- // license : MIT; see accompanying LICENSE file -#ifndef __cpp_modules_ts #pragma once -#endif -// C includes. - -#ifndef __cpp_lib_modules_ts -#endif - -// Other includes. 
- -#ifdef __cpp_modules_ts -export module butl.manifest_rewriter; -#ifdef __cpp_lib_modules_ts -#endif -import butl.path; -import butl.fdstream; -import butl.manifest_types; -#else -#include <libbutl/path.mxx> -#include <libbutl/fdstream.mxx> -#include <libbutl/manifest-types.mxx> -#endif +#include <libbutl/path.hxx> +#include <libbutl/fdstream.hxx> +#include <libbutl/manifest-types.hxx> #include <libbutl/export.hxx> -LIBBUTL_MODEXPORT namespace butl +namespace butl { // Rewrite a hand-written manifest file preserving formatting, comments, // etc., of the unaffected parts. The general workflow is as follows: diff --git a/libbutl/manifest-serializer.cxx b/libbutl/manifest-serializer.cxx index 6a26a15..26699e0 100644 --- a/libbutl/manifest-serializer.cxx +++ b/libbutl/manifest-serializer.cxx @@ -1,41 +1,13 @@ // file : libbutl/manifest-serializer.cxx -*- C++ -*- // license : MIT; see accompanying LICENSE file -#ifndef __cpp_modules_ts -#include <libbutl/manifest-serializer.mxx> -#endif - -#include <cassert> - -#ifndef __cpp_lib_modules_ts -#include <string> -#include <vector> -#include <cstddef> -#include <stdexcept> +#include <libbutl/manifest-serializer.hxx> #include <ostream> -#endif - -// Other includes. - -#ifdef __cpp_modules_ts -module butl.manifest_serializer; - -// Only imports additional to interface. 
-#ifdef __clang__ -#ifdef __cpp_lib_modules_ts -import std.core; -import std.io; -#endif -import butl.manifest_types; -#endif +#include <cassert> -import butl.utf8; -import butl.utility; -#else -#include <libbutl/utf8.mxx> -#include <libbutl/utility.mxx> -#endif +#include <libbutl/utf8.hxx> +#include <libbutl/utility.hxx> using namespace std; @@ -95,10 +67,7 @@ namespace butl os_ << ':'; if (!v.empty ()) - { - os_ << ' '; - write_value (v, l + 2); - } + write_value (v, l + 1); os_ << endl; break; @@ -132,22 +101,89 @@ namespace butl merge_comment (const string& value, const string& comment) { string r; - for (char c: value) + + // Merge the value and comment differently depending on whether any of + // them is multi-line or not. + // + if (value.find ('\n') == string::npos && // Single-line. + comment.find ('\n') == string::npos) { - // Escape ';' character. - // - if (c == ';') - r += '\\'; + for (char c: value) + { + // Escape ';' and '\' characters. + // + if (c == ';' || c == '\\') + r += '\\'; - r += c; - } + r += c; + } - // Add the comment. - // - if (!comment.empty ()) + // Add the comment. + // + if (!comment.empty ()) + { + r += "; "; + r += comment; + } + } + else // Multi-line. { - r += "; "; - r += comment; + // Parse the value lines and add them to the resulting value, escaping + // them if required. + // + // Note that we only need to escape lines which have the '\*;' form. + // + for (auto i (value.begin ()), e (value.end ()); i != e; ) + { + // Find the end of the line and while at it the first non-backslash + // character. + // + auto le (i); + auto nb (e); + for (; le != e && *le != '\n'; ++le) + { + if (nb == e && *le != '\\') + nb = le; + } + + // If the first non-backslash character is ';' and it is the last + // character on the line, then we need to escape the line characters. + // Note that we only escape ';' if it is the only character on the + // line. 
Otherwise, we only escape backslashes doubling the number of + // them from the left: + // + // ; -> \; + // \; -> \\; + // \\; -> \\\\; + // \\\; -> \\\\\\; + // + if (nb != e && *nb == ';' && nb + 1 == le) + r.append (nb == i ? 1 : nb - i, '\\'); + + // Add the line to the resulting value together with the trailing + // newline, if present. + // + r.append (i, le); + + if (le != e) + r += '\n'; + + // If the value end is not reached then position to the beginning of + // the next line and to the end of the value otherwise. + // + i = (le != e ? le + 1 : e); + } + + // Append the comment, if present. + // + if (!comment.empty ()) + { + if (!r.empty ()) + r += '\n'; + + r += ";\n"; + r += comment; + } } return r; @@ -301,6 +337,8 @@ namespace butl void manifest_serializer:: write_value (const string& v, size_t cl) { + assert (!v.empty ()); + // Consider both \r and \n characters as line separators, and the // \r\n characters sequence as a single line separator. // @@ -319,11 +357,17 @@ namespace butl // readability, still allowing the user to easily copy the value which // seems to be the main reason for using the flag. // - if (cl > 39 || nl () != string::npos || - v.front () == ' ' || v.front () == '\t' || - v.back () == ' ' || v.back () == '\t') + if (cl + 1 > 39 || // '+ 1' for the space after the colon. + nl () != string::npos || + v.front () == ' ' || + v.front () == '\t' || + v.back () == ' ' || + v.back () == '\t') { - os_ << "\\" << endl; // Multi-line mode introductor. + if (multiline_v2_) + os_ << endl; + + os_ << "\\" << endl; // Multi-line mode introducer. // Chunk the value into fragments separated by newlines. // @@ -346,7 +390,10 @@ namespace butl os_ << endl << "\\"; // Multi-line mode terminator. 
} else - write_value (v.c_str (), v.size (), cl); + { + os_ << ' '; + write_value (v.c_str (), v.size (), cl + 1); + } } // manifest_serialization diff --git a/libbutl/manifest-serializer.mxx b/libbutl/manifest-serializer.hxx index b73c255..2159901 100644 --- a/libbutl/manifest-serializer.mxx +++ b/libbutl/manifest-serializer.hxx @@ -1,37 +1,20 @@ -// file : libbutl/manifest-serializer.mxx -*- C++ -*- +// file : libbutl/manifest-serializer.hxx -*- C++ -*- // license : MIT; see accompanying LICENSE file -#ifndef __cpp_modules_ts #pragma once -#endif -// C includes. - -#ifndef __cpp_lib_modules_ts #include <string> #include <vector> #include <iosfwd> #include <cstddef> // size_t #include <stdexcept> // runtime_error #include <functional> -#endif - -// Other includes. -#ifdef __cpp_modules_ts -export module butl.manifest_serializer; -#ifdef __cpp_lib_modules_ts -import std.core; -import std.io; -#endif -import butl.manifest_types; -#else -#include <libbutl/manifest-types.mxx> -#endif +#include <libbutl/manifest-types.hxx> #include <libbutl/export.hxx> -LIBBUTL_MODEXPORT namespace butl +namespace butl { class LIBBUTL_SYMEXPORT manifest_serialization: public std::runtime_error { @@ -62,14 +45,19 @@ LIBBUTL_MODEXPORT namespace butl // Unless long_lines is true, break lines in values (including multi-line) // so that their length does not exceed 78 codepoints (including '\n'). // + // Note that the multiline_v2 flag is temporary and should not be used + // except by the implementation for testing. 
+ // manifest_serializer (std::ostream& os, const std::string& name, bool long_lines = false, - std::function<filter_function> filter = {}) + std::function<filter_function> filter = {}, + bool multiline_v2 = false) : os_ (os), name_ (name), long_lines_ (long_lines), - filter_ (std::move (filter)) + filter_ (std::move (filter)), + multiline_v2_ (multiline_v2) { } @@ -113,10 +101,12 @@ LIBBUTL_MODEXPORT namespace butl size_t write_name (const std::string&); - // Write a value assuming the current line already has the specified - // codepoint offset. If the resulting line length would be too large then - // the multi-line representation will be used. It is assumed that the - // name, followed by the colon, is already written. + // Write a non-empty value assuming the current line already has the + // specified codepoint offset. If the resulting line length would be too + // large then the multi-line representation will be used. For the + // single-line representation the space character is written before the + // value. It is assumed that the name, followed by the colon, is already + // written. // void write_value (const std::string&, std::size_t offset); @@ -138,6 +128,7 @@ LIBBUTL_MODEXPORT namespace butl const std::string name_; bool long_lines_; const std::function<filter_function> filter_; + bool multiline_v2_; }; // Serialize a manifest to a stream adding the leading format version pair diff --git a/libbutl/manifest-types.mxx b/libbutl/manifest-types.hxx index 93f6fc6..23318f0 100644 --- a/libbutl/manifest-types.mxx +++ b/libbutl/manifest-types.hxx @@ -1,30 +1,14 @@ -// file : libbutl/manifest-types.mxx -*- C++ -*- +// file : libbutl/manifest-types.hxx -*- C++ -*- // license : MIT; see accompanying LICENSE file -#ifndef __cpp_modules_ts #pragma once -#endif -// C includes. - -#ifndef __cpp_lib_modules_ts #include <string> -#include <cstdint> // uint64_t -#endif - -// Other includes. 
- -#ifdef __cpp_modules_ts -export module butl.manifest_types; -#ifdef __cpp_lib_modules_ts -import std.core; -#endif -#else -#endif +#include <cstdint> // uint64_t #include <libbutl/export.hxx> -LIBBUTL_MODEXPORT namespace butl +namespace butl { class manifest_name_value { diff --git a/libbutl/mingw-condition_variable.hxx b/libbutl/mingw-condition_variable.hxx new file mode 100644 index 0000000..965f533 --- /dev/null +++ b/libbutl/mingw-condition_variable.hxx @@ -0,0 +1,275 @@ +/** +* std::condition_variable implementation for MinGW-w64 +* +* Copyright (c) 2013-2016 by Mega Limited, Auckland, New Zealand +* Copyright (c) 2022 the build2 authors +* +* Licensed under the simplified (2-clause) BSD License. +* You should have received a copy of the license along with this +* program. +* +* This code is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. +*/ + +#ifndef LIBBUTL_MINGW_CONDITION_VARIABLE_HXX +#define LIBBUTL_MINGW_CONDITION_VARIABLE_HXX + +#if !defined(__cplusplus) || (__cplusplus < 201402L) +# error C++14 compiler required +#endif + +#if !defined(_WIN32_WINNT) || _WIN32_WINNT < 0x0601 +# error _WIN32_WINNT should be 0x0601 (Windows 7) or greater +#endif + +#include <condition_variable> // Use std::cv_status, if available. + +#include <cassert> +#include <chrono> +#include <system_error> + +#include <synchapi.h> + +#include <libbutl/mingw-mutex.hxx> +#include <libbutl/mingw-shared_mutex.hxx> + +namespace mingw_stdthread +{ +#if defined(__MINGW32__ ) && !defined(_GLIBCXX_HAS_GTHREADS) + enum class cv_status { no_timeout, timeout }; +#else + using std::cv_status; +#endif + + // Native condition variable-based implementation. 
+ // + class condition_variable + { + static constexpr DWORD kInfinite = 0xffffffffl; +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wzero-as-null-pointer-constant" + CONDITION_VARIABLE cvariable_ = CONDITION_VARIABLE_INIT; +#pragma GCC diagnostic pop + + friend class condition_variable_any; + + bool wait_unique (mutex * pmutex, DWORD time) + { + BOOL success = SleepConditionVariableSRW(native_handle(), + pmutex->native_handle(), + time, +// CONDITION_VARIABLE_LOCKMODE_SHARED has a value not specified by +// Microsoft's Dev Center, but is known to be (convertible to) a ULONG. To +// ensure that the value passed to this function is not equal to Microsoft's +// constant, we can either use a static_assert, or simply generate an +// appropriate value. + !CONDITION_VARIABLE_LOCKMODE_SHARED); + return success; + } + bool wait_impl (unique_lock<mutex> & lock, DWORD time) + { + mutex * pmutex = lock.release(); + bool success = wait_unique(pmutex, time); + lock = unique_lock<mutex>(*pmutex, adopt_lock); + return success; + } +public: + using native_handle_type = PCONDITION_VARIABLE; + native_handle_type native_handle () + { + return &cvariable_; + } + + condition_variable () = default; + ~condition_variable () = default; + + condition_variable (const condition_variable &) = delete; + condition_variable & operator= (const condition_variable &) = delete; + + void notify_one () noexcept + { + WakeConditionVariable(&cvariable_); + } + + void notify_all () noexcept + { + WakeAllConditionVariable(&cvariable_); + } + + void wait (unique_lock<mutex> & lock) + { + wait_impl(lock, kInfinite); + } + + template<class Predicate> + void wait (unique_lock<mutex> & lock, Predicate pred) + { + while (!pred()) + wait(lock); + } + + template <class Rep, class Period> + cv_status wait_for(unique_lock<mutex>& lock, + const std::chrono::duration<Rep, Period>& rel_time) + { + using namespace std::chrono; + auto timeout = duration_cast<milliseconds>(rel_time).count(); + DWORD 
waittime = (timeout < kInfinite) ? ((timeout < 0) ? 0 : static_cast<DWORD>(timeout)) : (kInfinite - 1); + bool result = wait_impl(lock, waittime) || (timeout >= kInfinite); + return result ? cv_status::no_timeout : cv_status::timeout; + } + + template <class Rep, class Period, class Predicate> + bool wait_for(unique_lock<mutex>& lock, + const std::chrono::duration<Rep, Period>& rel_time, + Predicate pred) + { +#if __cplusplus >= 201703L + using steady_duration = typename std::chrono::steady_clock::duration; + return wait_until(lock, + std::chrono::steady_clock::now() + + std::chrono::ceil<steady_duration> (rel_time), + std::move(pred)); +#else + return wait_until(lock, + std::chrono::steady_clock::now() + rel_time, + std::move(pred)); +#endif + } + template <class Clock, class Duration> + cv_status wait_until (unique_lock<mutex>& lock, + const std::chrono::time_point<Clock,Duration>& abs_time) + { + return wait_for(lock, abs_time - Clock::now()); + } + template <class Clock, class Duration, class Predicate> + bool wait_until (unique_lock<mutex>& lock, + const std::chrono::time_point<Clock, Duration>& abs_time, + Predicate pred) + { + while (!pred()) + { + if (wait_until(lock, abs_time) == cv_status::timeout) + { + return pred(); + } + } + return true; + } + }; + + class condition_variable_any + { + static constexpr DWORD kInfinite = 0xffffffffl; + + condition_variable internal_cv_ {}; + mutex internal_mutex_ {}; + + template<class L> + bool wait_impl (L & lock, DWORD time) + { + unique_lock<decltype(internal_mutex_)> internal_lock(internal_mutex_); + lock.unlock(); + bool success = internal_cv_.wait_impl(internal_lock, time); + lock.lock(); + return success; + } + // If the lock happens to be called on a native Windows mutex, skip any + // extra contention. 
+ inline bool wait_impl (unique_lock<mutex> & lock, DWORD time) + { + return internal_cv_.wait_impl(lock, time); + } + bool wait_impl (unique_lock<shared_mutex> & lock, DWORD time) + { + shared_mutex * pmutex = lock.release(); + bool success = internal_cv_.wait_unique(pmutex, time); + lock = unique_lock<shared_mutex>(*pmutex, adopt_lock); + return success; + } + bool wait_impl (shared_lock<shared_mutex> & lock, DWORD time) + { + shared_mutex * pmutex = lock.release(); + BOOL success = SleepConditionVariableSRW(native_handle(), + pmutex->native_handle(), time, + CONDITION_VARIABLE_LOCKMODE_SHARED); + lock = shared_lock<shared_mutex>(*pmutex, adopt_lock); + return success; + } + public: + using native_handle_type = typename condition_variable::native_handle_type; + + native_handle_type native_handle () + { + return internal_cv_.native_handle(); + } + + void notify_one () noexcept + { + internal_cv_.notify_one(); + } + + void notify_all () noexcept + { + internal_cv_.notify_all(); + } + + condition_variable_any () = default; + ~condition_variable_any () = default; + + template<class L> + void wait (L & lock) + { + wait_impl(lock, kInfinite); + } + + template<class L, class Predicate> + void wait (L & lock, Predicate pred) + { + while (!pred()) + wait(lock); + } + + template <class L, class Rep, class Period> + cv_status wait_for(L& lock, const std::chrono::duration<Rep,Period>& period) + { + using namespace std::chrono; + auto timeout = duration_cast<milliseconds>(period).count(); + DWORD waittime = (timeout < kInfinite) ? ((timeout < 0) ? 0 : static_cast<DWORD>(timeout)) : (kInfinite - 1); + bool result = wait_impl(lock, waittime) || (timeout >= kInfinite); + return result ? 
cv_status::no_timeout : cv_status::timeout; + } + + template <class L, class Rep, class Period, class Predicate> + bool wait_for(L& lock, const std::chrono::duration<Rep, Period>& period, + Predicate pred) + { + return wait_until(lock, std::chrono::steady_clock::now() + period, + std::move(pred)); + } + template <class L, class Clock, class Duration> + cv_status wait_until (L& lock, + const std::chrono::time_point<Clock,Duration>& abs_time) + { + return wait_for(lock, abs_time - Clock::now()); + } + template <class L, class Clock, class Duration, class Predicate> + bool wait_until (L& lock, + const std::chrono::time_point<Clock, Duration>& abs_time, + Predicate pred) + { + while (!pred()) + { + if (wait_until(lock, abs_time) == cv_status::timeout) + { + return pred(); + } + } + return true; + } + }; +} + +#endif // LIBBUTL_MINGW_CONDITION_VARIABLE_HXX diff --git a/libbutl/mingw-invoke.hxx b/libbutl/mingw-invoke.hxx new file mode 100644 index 0000000..65810e7 --- /dev/null +++ b/libbutl/mingw-invoke.hxx @@ -0,0 +1,109 @@ +/** +* Lightweight std::invoke() implementation for C++11 and C++14 +* +* Copyright (c) 2018-2019 by Nathaniel J. McClatchey, San Jose, CA, United States +* Copyright (c) 2022 the build2 authors +* +* Licensed under the simplified (2-clause) BSD License. +* You should have received a copy of the license along with this +* program. +* +* This code is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. +*/ + +#ifndef LIBBUTL_MINGW_INVOKE_HXX +#define LIBBUTL_MINGW_INVOKE_HXX + +#include <type_traits> // For std::result_of, etc. +#include <utility> // For std::forward +#include <functional> // For std::reference_wrapper + +namespace mingw_stdthread +{ + namespace detail + { + // For compatibility, implement std::invoke for C++11 and C++14. + // + template<bool PMemFunc, bool PMemData> + struct Invoker + { + template<class F, class... 
Args> + inline static typename std::result_of<F(Args...)>::type invoke (F&& f, Args&&... args) + { + return std::forward<F>(f)(std::forward<Args>(args)...); + } + }; + template<bool> + struct InvokerHelper; + + template<> + struct InvokerHelper<false> + { + template<class T1> + inline static auto get (T1&& t1) -> decltype(*std::forward<T1>(t1)) + { + return *std::forward<T1>(t1); + } + + template<class T1> + inline static auto get (const std::reference_wrapper<T1>& t1) -> decltype(t1.get()) + { + return t1.get(); + } + }; + + template<> + struct InvokerHelper<true> + { + template<class T1> + inline static auto get (T1&& t1) -> decltype(std::forward<T1>(t1)) + { + return std::forward<T1>(t1); + } + }; + + template<> + struct Invoker<true, false> + { + template<class T, class F, class T1, class... Args> + inline static auto invoke (F T::* f, T1&& t1, Args&&... args) -> \ + decltype((InvokerHelper<std::is_base_of<T,typename std::decay<T1>::type>::value>::get(std::forward<T1>(t1)).*f)(std::forward<Args>(args)...)) + { + return (InvokerHelper<std::is_base_of<T,typename std::decay<T1>::type>::value>::get(std::forward<T1>(t1)).*f)(std::forward<Args>(args)...); + } + }; + + template<> + struct Invoker<false, true> + { + template<class T, class F, class T1, class... Args> + inline static auto invoke (F T::* f, T1&& t1, Args&&... args) -> \ + decltype(InvokerHelper<std::is_base_of<T,typename std::decay<T1>::type>::value>::get(t1).*f) + { + return InvokerHelper<std::is_base_of<T,typename std::decay<T1>::type>::value>::get(t1).*f; + } + }; + + template<class F, class... Args> + struct InvokeResult + { + typedef Invoker<std::is_member_function_pointer<typename std::remove_reference<F>::type>::value, + std::is_member_object_pointer<typename std::remove_reference<F>::type>::value && + (sizeof...(Args) == 1)> invoker; + inline static auto invoke (F&& f, Args&&... 
args) -> decltype(invoker::invoke(std::forward<F>(f), std::forward<Args>(args)...)) + { + return invoker::invoke(std::forward<F>(f), std::forward<Args>(args)...); + } + }; + + template<class F, class...Args> + auto invoke (F&& f, Args&&... args) -> decltype(InvokeResult<F, Args...>::invoke(std::forward<F>(f), std::forward<Args>(args)...)) + { + return InvokeResult<F, Args...>::invoke(std::forward<F>(f), std::forward<Args>(args)...); + } + } +} + +#endif // LIBBUTL_MINGW_INVOKE_HXX diff --git a/libbutl/mingw-mutex.hxx b/libbutl/mingw-mutex.hxx new file mode 100644 index 0000000..d297786 --- /dev/null +++ b/libbutl/mingw-mutex.hxx @@ -0,0 +1,210 @@ +/** +* std::mutex et al implementation for MinGW-w64 +* +* Copyright (c) 2013-2016 by Mega Limited, Auckland, New Zealand +* Copyright (c) 2022 the build2 authors +* +* Licensed under the simplified (2-clause) BSD License. +* You should have received a copy of the license along with this +* program. +* +* This code is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. +*/ + +#ifndef LIBBUTL_MINGW_MUTEX_HXX +#define LIBBUTL_MINGW_MUTEX_HXX + +#if !defined(__cplusplus) || (__cplusplus < 201402L) +# error C++14 compiler required +#endif + +#if !defined(_WIN32_WINNT) || _WIN32_WINNT < 0x0601 +# error _WIN32_WINNT should be 0x0601 (Windows 7) or greater +#endif + +#include <chrono> +#include <system_error> +#include <atomic> + +#include <mutex> + +#include <synchapi.h> // For InitializeCriticalSection, etc. +#include <errhandlingapi.h> // For GetLastError +#include <handleapi.h> + +namespace mingw_stdthread +{ + // To make this namespace equivalent to the thread-related subset of std, + // pull in the classes and class templates supplied by std but not by this + // implementation. 
+ // + using std::lock_guard; + using std::unique_lock; + using std::adopt_lock_t; + using std::defer_lock_t; + using std::try_to_lock_t; + using std::adopt_lock; + using std::defer_lock; + using std::try_to_lock; + + class recursive_mutex + { + CRITICAL_SECTION mHandle; + public: + typedef LPCRITICAL_SECTION native_handle_type; + native_handle_type native_handle() {return &mHandle;} + recursive_mutex() noexcept : mHandle() + { + InitializeCriticalSection(&mHandle); + } + recursive_mutex (const recursive_mutex&) = delete; + recursive_mutex& operator=(const recursive_mutex&) = delete; + ~recursive_mutex() noexcept + { + DeleteCriticalSection(&mHandle); + } + void lock() + { + EnterCriticalSection(&mHandle); + } + void unlock() + { + LeaveCriticalSection(&mHandle); + } + bool try_lock() + { + return (TryEnterCriticalSection(&mHandle)!=0); + } + }; + + // Slim Reader-Writer (SRW)-based implementation that requires Windows 7. + // + class mutex + { + protected: + SRWLOCK mHandle; + public: + typedef PSRWLOCK native_handle_type; +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wzero-as-null-pointer-constant" + constexpr mutex () noexcept : mHandle(SRWLOCK_INIT) { } +#pragma GCC diagnostic pop + mutex (const mutex&) = delete; + mutex & operator= (const mutex&) = delete; + void lock () + { + AcquireSRWLockExclusive(&mHandle); + } + void unlock () + { + ReleaseSRWLockExclusive(&mHandle); + } + // TryAcquireSRW functions are a Windows 7 feature. 
+ bool try_lock () + { + BOOL ret = TryAcquireSRWLockExclusive(&mHandle); + return ret; + } + native_handle_type native_handle () + { + return &mHandle; + } + }; + + class recursive_timed_mutex + { + static constexpr DWORD kWaitAbandoned = 0x00000080l; + static constexpr DWORD kWaitObject0 = 0x00000000l; + static constexpr DWORD kInfinite = 0xffffffffl; + inline bool try_lock_internal (DWORD ms) noexcept + { + DWORD ret = WaitForSingleObject(mHandle, ms); + + /* + @@ TODO +#ifndef NDEBUG + if (ret == kWaitAbandoned) + { + using namespace std; + fprintf(stderr, "FATAL: Thread terminated while holding a mutex."); + terminate(); + } +#endif + */ + + return (ret == kWaitObject0) || (ret == kWaitAbandoned); + } + protected: + HANDLE mHandle; + public: + typedef HANDLE native_handle_type; + native_handle_type native_handle() const {return mHandle;} + recursive_timed_mutex(const recursive_timed_mutex&) = delete; + recursive_timed_mutex& operator=(const recursive_timed_mutex&) = delete; + recursive_timed_mutex(): mHandle(CreateMutex(NULL, FALSE, NULL)) {} + ~recursive_timed_mutex() + { + CloseHandle(mHandle); + } + void lock() + { + DWORD ret = WaitForSingleObject(mHandle, kInfinite); + + /* + @@ TODO + +// If (ret == WAIT_ABANDONED), then the thread that held ownership was +// terminated. Behavior is undefined, but Windows will pass ownership to this +// thread. 
+#ifndef NDEBUG + if (ret == kWaitAbandoned) + { + using namespace std; + fprintf(stderr, "FATAL: Thread terminated while holding a mutex."); + terminate(); + } +#endif + */ + + if ((ret != kWaitObject0) && (ret != kWaitAbandoned)) + { + throw std::system_error(GetLastError(), std::system_category()); + } + } + void unlock() + { + if (!ReleaseMutex(mHandle)) + throw std::system_error(GetLastError(), std::system_category()); + } + bool try_lock() + { + return try_lock_internal(0); + } + template <class Rep, class Period> + bool try_lock_for(const std::chrono::duration<Rep,Period>& dur) + { + using namespace std::chrono; + auto timeout = duration_cast<milliseconds>(dur).count(); + while (timeout > 0) // Wait in steps since a single wait is limited to kInfinite-1 ms. + { + constexpr auto kMaxStep = static_cast<decltype(timeout)>(kInfinite-1); + auto step = (timeout < kMaxStep) ? timeout : kMaxStep; + if (try_lock_internal(static_cast<DWORD>(step))) + return true; + timeout -= step; + } + return try_lock_internal(0); // Zero, negative, or sub-millisecond timeout: behave like try_lock(). + } + template <class Clock, class Duration> + bool try_lock_until(const std::chrono::time_point<Clock,Duration>& timeout_time) + { + return try_lock_for(timeout_time - Clock::now()); + } + }; + + typedef recursive_timed_mutex timed_mutex; +} + +#endif // LIBBUTL_MINGW_MUTEX_HXX diff --git a/libbutl/mingw-shared_mutex.hxx b/libbutl/mingw-shared_mutex.hxx new file mode 100644 index 0000000..aacbaf8 --- /dev/null +++ b/libbutl/mingw-shared_mutex.hxx @@ -0,0 +1,124 @@ +/** +* std::shared_mutex et al implementation for MinGW-w64 +* +* Copyright (c) 2017 by Nathaniel J. McClatchey, Athens OH, United States +* Copyright (c) 2022 the build2 authors +* +* Licensed under the simplified (2-clause) BSD License. +* You should have received a copy of the license along with this +* program. +* +* This code is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
+*/ + +#ifndef LIBBUTL_MINGW_SHARED_MUTEX_HXX +#define LIBBUTL_MINGW_SHARED_MUTEX_HXX + +#if !defined(__cplusplus) || (__cplusplus < 201402L) +# error C++14 compiler required +#endif + +#if !defined(_WIN32_WINNT) || _WIN32_WINNT < 0x0601 +# error _WIN32_WINNT should be 0x0601 (Windows 7) or greater +#endif + +#include <cassert> +// For descriptive errors. +#include <system_error> +// For timing in shared_timed_mutex. +#include <chrono> +#include <limits> + +#include <shared_mutex> // shared_lock + +// For defer_lock_t, adopt_lock_t, and try_to_lock_t +#include <libbutl/mingw-mutex.hxx> + +#include <synchapi.h> + +namespace mingw_stdthread +{ + using std::shared_lock; + + class condition_variable_any; + + // Slim Reader-Writer (SRW)-based implementation that requires Windows 7. + // + class shared_mutex : mutex + { + friend class condition_variable_any; + public: + using mutex::native_handle_type; + using mutex::lock; + using mutex::try_lock; + using mutex::unlock; + using mutex::native_handle; + + void lock_shared () + { + AcquireSRWLockShared(&mHandle); + } + + void unlock_shared () + { + ReleaseSRWLockShared(&mHandle); + } + + bool try_lock_shared () + { + return TryAcquireSRWLockShared(&mHandle) != 0; + } + }; + + class shared_timed_mutex : shared_mutex + { + typedef shared_mutex Base; + public: + using Base::lock; + using Base::try_lock; + using Base::unlock; + using Base::lock_shared; + using Base::try_lock_shared; + using Base::unlock_shared; + + template< class Clock, class Duration > + bool try_lock_until ( const std::chrono::time_point<Clock,Duration>& cutoff ) + { + do + { + if (try_lock()) + return true; + } + while (std::chrono::steady_clock::now() < cutoff); + return false; + } + + template< class Rep, class Period > + bool try_lock_for (const std::chrono::duration<Rep,Period>& rel_time) + { + return try_lock_until(std::chrono::steady_clock::now() + rel_time); + } + + template< class Clock, class Duration > + bool try_lock_shared_until ( const 
std::chrono::time_point<Clock,Duration>& cutoff ) + { + do + { + if (try_lock_shared()) + return true; + } + while (std::chrono::steady_clock::now() < cutoff); + return false; + } + + template< class Rep, class Period > + bool try_lock_shared_for (const std::chrono::duration<Rep,Period>& rel_time) + { + return try_lock_shared_until(std::chrono::steady_clock::now() + rel_time); + } + }; +} + +#endif // LIBBUTL_MINGW_SHARED_MUTEX_HXX diff --git a/libbutl/mingw-thread.hxx b/libbutl/mingw-thread.hxx new file mode 100644 index 0000000..66f98aa --- /dev/null +++ b/libbutl/mingw-thread.hxx @@ -0,0 +1,330 @@ +/** +* std::thread implementation for MinGW-w64 +* +* Copyright (c) 2013-2016 by Mega Limited, Auckland, New Zealand +* Copyright (c) 2022 the build2 authors +* +* Licensed under the simplified (2-clause) BSD License. +* You should have received a copy of the license along with this +* program. +* +* This code is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. +*/ + +#ifndef LIBBUTL_MINGW_THREAD_HXX +#define LIBBUTL_MINGW_THREAD_HXX + +#if !defined(__cplusplus) || (__cplusplus < 201402L) +# error C++14 compiler required +#endif + +#if !defined(_WIN32_WINNT) || _WIN32_WINNT < 0x0601 +# error _WIN32_WINNT should be 0x0601 (Windows 7) or greater +#endif + +#include <cstddef> // For std::size_t +#include <cerrno> // Detect error type. +#include <exception> // For std::terminate +#include <system_error> // For std::system_error +#include <functional> // For std::hash, std::invoke (C++17) +#include <tuple> // For std::tuple +#include <chrono> // For sleep timing. +#include <memory> // For std::unique_ptr +#include <iosfwd> // Stream output for thread ids. +#include <utility> // For std::swap, std::forward + +#include <synchapi.h> // For WaitForSingleObject +#include <handleapi.h> // For CloseHandle, etc. 
+#include <sysinfoapi.h> // For GetNativeSystemInfo +#include <processthreadsapi.h> // For GetCurrentThreadId + +#include <process.h> // For _beginthreadex + +#if __cplusplus < 201703L +# include <libbutl/mingw-invoke.hxx> +#endif + +namespace mingw_stdthread +{ + // @@ I think can get rid of this in C++14. + // + namespace detail + { + template<std::size_t...> + struct IntSeq {}; + + template<std::size_t N, std::size_t... S> + struct GenIntSeq : GenIntSeq<N-1, N-1, S...> { }; + + template<std::size_t... S> + struct GenIntSeq<0, S...> { typedef IntSeq<S...> type; }; + +// Use a template specialization to avoid relying on compiler optimization +// when determining the parameter integer sequence. + template<class Func, class T, typename... Args> + class ThreadFuncCall; +// We can't define the Call struct in the function - the standard forbids template methods in that case + template<class Func, std::size_t... S, typename... Args> + class ThreadFuncCall<Func, detail::IntSeq<S...>, Args...> + { + static_assert(sizeof...(S) == sizeof...(Args), "Args must match."); + using Tuple = std::tuple<typename std::decay<Args>::type...>; + typename std::decay<Func>::type mFunc; + Tuple mArgs; + + public: + ThreadFuncCall(Func&& aFunc, Args&&... aArgs) + : mFunc(std::forward<Func>(aFunc)), + mArgs(std::forward<Args>(aArgs)...) + { + } + + void callFunc() + { +#if __cplusplus < 201703L + detail::invoke(std::move(mFunc), std::move(std::get<S>(mArgs)) ...); +#else + std::invoke (std::move(mFunc), std::move(std::get<S>(mArgs)) ...); +#endif + } + }; + + // Allow construction of threads without exposing implementation. 
+ class ThreadIdTool; + } + + class thread + { + public: + class id + { + DWORD mId = 0; + friend class thread; + friend class std::hash<id>; + friend class detail::ThreadIdTool; + explicit id(DWORD aId) noexcept : mId(aId){} + public: + id () noexcept = default; + friend bool operator==(id x, id y) noexcept {return x.mId == y.mId; } + friend bool operator!=(id x, id y) noexcept {return x.mId != y.mId; } + friend bool operator< (id x, id y) noexcept {return x.mId < y.mId; } + friend bool operator<=(id x, id y) noexcept {return x.mId <= y.mId; } + friend bool operator> (id x, id y) noexcept {return x.mId > y.mId; } + friend bool operator>=(id x, id y) noexcept {return x.mId >= y.mId; } + + template<class _CharT, class _Traits> + friend std::basic_ostream<_CharT, _Traits>& + operator<<(std::basic_ostream<_CharT, _Traits>& __out, id __id) + { + if (__id.mId == 0) + { + return __out << "<invalid std::thread::id>"; + } + else + { + return __out << __id.mId; + } + } + }; + private: + static constexpr HANDLE kInvalidHandle = nullptr; + static constexpr DWORD kInfinite = 0xffffffffl; + HANDLE mHandle; + id mThreadId; + + template <class Call> + static unsigned __stdcall threadfunc(void* arg) + { + std::unique_ptr<Call> call(static_cast<Call*>(arg)); + call->callFunc(); + return 0; + } + + static unsigned int _hardware_concurrency_helper() noexcept + { + SYSTEM_INFO sysinfo; + ::GetNativeSystemInfo(&sysinfo); + return sysinfo.dwNumberOfProcessors; + } + public: + typedef HANDLE native_handle_type; + id get_id() const noexcept {return mThreadId;} + native_handle_type native_handle() const {return mHandle;} + thread(): mHandle(kInvalidHandle), mThreadId(){} + + thread(thread&& other) noexcept + :mHandle(other.mHandle), mThreadId(other.mThreadId) + { + other.mHandle = kInvalidHandle; + other.mThreadId = id{}; + } + + thread(const thread &other) = delete; + + template<class Func, typename... Args> + explicit thread(Func&& func, Args&&... 
args) : mHandle(), mThreadId() + { + // Instead of INVALID_HANDLE_VALUE, _beginthreadex returns 0. + + using ArgSequence = typename detail::GenIntSeq<sizeof...(Args)>::type; + using Call = detail::ThreadFuncCall<Func, ArgSequence, Args...>; + auto call = new Call(std::forward<Func>(func), std::forward<Args>(args)...); + unsigned int id_receiver; + auto int_handle = _beginthreadex(NULL, 0, threadfunc<Call>, + static_cast<LPVOID>(call), 0, &id_receiver); + if (int_handle == 0) + { + mHandle = kInvalidHandle; + int errnum = errno; + delete call; + // Note: Should only throw EINVAL, EAGAIN, EACCES + throw std::system_error(errnum, std::generic_category()); + } else { + mThreadId.mId = id_receiver; + mHandle = reinterpret_cast<HANDLE>(int_handle); + } + } + + bool joinable() const {return mHandle != kInvalidHandle;} + + // Note: Due to lack of synchronization, this function has a race + // condition if called concurrently, which leads to undefined + // behavior. The same applies to all other member functions of this + // class, but this one is mentioned explicitly. 
+ void join() + { + using namespace std; + if (get_id() == id(GetCurrentThreadId())) + throw system_error(make_error_code(errc::resource_deadlock_would_occur)); + if (mHandle == kInvalidHandle) + throw system_error(make_error_code(errc::no_such_process)); + if (!joinable()) + throw system_error(make_error_code(errc::invalid_argument)); + WaitForSingleObject(mHandle, kInfinite); + CloseHandle(mHandle); + mHandle = kInvalidHandle; + mThreadId = id{}; + } + + ~thread() + { + if (joinable()) + { + // @@ TODO + /* +#ifndef NDEBUG + std::printf("Error: Must join() or detach() a thread before \ +destroying it.\n"); +#endif + */ + std::terminate(); + } + } + thread& operator=(const thread&) = delete; + thread& operator=(thread&& other) noexcept + { + if (joinable()) + { + // @@ TODO + /* +#ifndef NDEBUG + std::printf("Error: Must join() or detach() a thread before \ +moving another thread to it.\n"); +#endif + */ + std::terminate(); + } + swap(other); + return *this; + } + void swap(thread& other) noexcept + { + std::swap(mHandle, other.mHandle); + std::swap(mThreadId.mId, other.mThreadId.mId); + } + + static unsigned int hardware_concurrency() noexcept + { + // @@ TODO: this seems like a bad idea. 
+ // + /*static*/ unsigned int cached = _hardware_concurrency_helper(); + return cached; + } + + void detach() + { + if (!joinable()) + { + using namespace std; + throw system_error(make_error_code(errc::invalid_argument)); + } + if (mHandle != kInvalidHandle) + { + CloseHandle(mHandle); + mHandle = kInvalidHandle; + } + mThreadId = id{}; + } + }; + + namespace detail + { + class ThreadIdTool + { + public: + static thread::id make_id (DWORD base_id) noexcept + { + return thread::id(base_id); + } + }; + } + + namespace this_thread + { + inline thread::id get_id() noexcept + { + return detail::ThreadIdTool::make_id(GetCurrentThreadId()); + } + inline void yield() noexcept {Sleep(0);} + template< class Rep, class Period > + void sleep_for( const std::chrono::duration<Rep,Period>& sleep_duration) + { + static constexpr DWORD kInfinite = 0xffffffffl; + using namespace std::chrono; + using rep = milliseconds::rep; + rep ms = duration_cast<milliseconds>(sleep_duration).count(); + while (ms > 0) + { + constexpr rep kMaxRep = static_cast<rep>(kInfinite - 1); + auto sleepTime = (ms < kMaxRep) ? ms : kMaxRep; + Sleep(static_cast<DWORD>(sleepTime)); + ms -= sleepTime; + } + } + template <class Clock, class Duration> + void sleep_until(const std::chrono::time_point<Clock,Duration>& sleep_time) + { + sleep_for(sleep_time-Clock::now()); + } + } +} + +namespace std +{ + // Specialize hash for this implementation's thread::id, even if the + // std::thread::id already has a hash. 
+ template<> + struct hash<mingw_stdthread::thread::id> + { + typedef mingw_stdthread::thread::id argument_type; + typedef size_t result_type; + size_t operator() (const argument_type & i) const noexcept + { + return i.mId; + } + }; +} + +#endif // LIBBUTL_MINGW_THREAD_HXX diff --git a/libbutl/move-only-function.hxx b/libbutl/move-only-function.hxx new file mode 100644 index 0000000..e5cfe51 --- /dev/null +++ b/libbutl/move-only-function.hxx @@ -0,0 +1,177 @@ +// file : libbutl/move-only-function.hxx -*- C++ -*- +// license : MIT; see accompanying LICENSE file + +#pragma once + +#include <utility> +#include <functional> +#include <type_traits> + +namespace butl +{ + // This is a move-only std::function version which is implemented in terms + // of std::function. It is similar to C++23 std::move_only_function but + // still provides target() (but not target_type()). + // + template <typename> + class move_only_function_ex; + + // Alias butl::move_only_function to std::move_only_function if available + // and to move_only_function_ex otherwise. + // +#ifdef __cpp_lib_move_only_function + using std::move_only_function; +#else + template <typename F> + using move_only_function = move_only_function_ex<F>; +#endif + + template <typename R, typename... A> + class move_only_function_ex<R (A...)> + { + public: + using result_type = R; + + move_only_function_ex () = default; + move_only_function_ex (std::nullptr_t) noexcept {} + + // Note: according to the spec we should also disable these if F is not + // callable, but that is not easy to do in C++14. Maybe we should do + // something for C++17 and later (without this the diagnostics is quite + // hairy). 
+ // + template <typename F> + move_only_function_ex (F&& f, typename std::enable_if<!std::is_same<typename std::remove_reference<F>::type, move_only_function_ex>::value>::type* = 0) + { + using FV = typename std::decay<F>::type; + + if (!null (f)) + f_ = wrapper<FV> (std::forward<F> (f)); + } + + template <typename F> + typename std::enable_if<!std::is_same<typename std::remove_reference<F>::type, move_only_function_ex>::value, move_only_function_ex>::type& + operator= (F&& f) + { + move_only_function_ex (std::forward<F> (f)).swap (*this); + return *this; + } + + move_only_function_ex& + operator= (std::nullptr_t) noexcept + { + f_ = nullptr; + return *this; + } + + void swap (move_only_function_ex& f) noexcept + { + f_.swap (f.f_); + } + + R operator() (A... args) const + { + return f_ (std::forward<A> (args)...); + } + + explicit operator bool () const noexcept + { + return static_cast<bool> (f_); + } + + template <typename T> + T* target() noexcept + { + wrapper<T>* r (f_.template target<wrapper<T>> ()); + return r != nullptr ? &r->f : nullptr; + } + + template <typename T> + const T* target() const noexcept + { + const wrapper<T>* r (f_.template target<wrapper<T>> ()); + return r != nullptr ? &r->f : nullptr; + } + + move_only_function_ex (move_only_function_ex&&) = default; + move_only_function_ex& operator= (move_only_function_ex&&) = default; + + move_only_function_ex (const move_only_function_ex&) = delete; + move_only_function_ex& operator= (const move_only_function_ex&) = delete; + + private: + template <typename F> + struct wrapper + { + struct empty {}; + + union + { + F f; + empty e; + }; + + explicit wrapper (F&& f_): f (std::move (f_)) {} + explicit wrapper (const F& f_): f (f_) {} + + R operator() (A... args) + { + return f (std::forward<A> (args)...); + } + + R operator() (A... 
args) const + { + return f (std::forward<A> (args)...); + } + + wrapper (wrapper&& w) + noexcept (std::is_nothrow_move_constructible<F>::value) + : f (std::move (w.f)) {} + + wrapper& operator= (wrapper&&) = delete; // Shouldn't be needed. + + ~wrapper () {f.~F ();} + + // These shouldn't be called. + // + wrapper (const wrapper&) {} + wrapper& operator= (const wrapper&) {return *this;} + }; + + template <typename F> static bool null (const F&) {return false;} + template <typename R1, typename... A1> static bool null (R1 (*p) (A1...)) {return p == nullptr;} + template <typename R1, typename... A1> static bool null (const move_only_function_ex<R1 (A1...)>& f) {return !f;} + template <typename R1, typename C, typename... A1> static bool null (R1 (C::*p) (A1...)) {return p == nullptr;} + template <typename R1, typename C, typename... A1> static bool null (R1 (C::*p) (A1...) const) {return p == nullptr;} + + std::function<R (A...)> f_; + }; + + template <typename R, typename... A> + inline bool + operator== (const move_only_function_ex<R (A...)>& f, std::nullptr_t) noexcept + { + return !f; + } + + template <typename R, typename... A> + inline bool + operator== (std::nullptr_t, const move_only_function_ex<R (A...)>& f) noexcept + { + return !f; + } + + template <typename R, typename... A> + inline bool + operator!= (const move_only_function_ex<R (A...)>& f, std::nullptr_t) noexcept + { + return static_cast<bool> (f); + } + + template <typename R, typename... A> + inline bool + operator!= (std::nullptr_t, const move_only_function_ex<R (A...)>& f) noexcept + { + return static_cast<bool> (f); + } +} diff --git a/libbutl/multi-index.mxx b/libbutl/multi-index.hxx index d51bdfc..a6754cd 100644 --- a/libbutl/multi-index.mxx +++ b/libbutl/multi-index.hxx @@ -1,29 +1,14 @@ -// file : libbutl/multi-index.mxx -*- C++ -*- +// file : libbutl/multi-index.hxx -*- C++ -*- // license : MIT; see accompanying LICENSE file -#ifndef __cpp_modules_ts #pragma once -#endif -// C includes. 
- -#ifndef __cpp_lib_modules_ts #include <utility> // declval() #include <functional> // hash -#endif - -// Other includes. - -#ifdef __cpp_modules_ts -export module butl.multi_index; -#ifdef __cpp_lib_modules_ts -import std.core; -#endif -#endif #include <libbutl/export.hxx> -LIBBUTL_MODEXPORT namespace butl +namespace butl { // Google the "Emulating Boost.MultiIndex with Standard Containers" blog // post for details. @@ -57,7 +42,7 @@ LIBBUTL_MODEXPORT namespace butl }; } -LIBBUTL_MODEXPORT namespace std +namespace std { template <typename T> struct hash<butl::map_key<T>>: hash<T> diff --git a/libbutl/openssl.cxx b/libbutl/openssl.cxx index 8741b35..f9df2e7 100644 --- a/libbutl/openssl.cxx +++ b/libbutl/openssl.cxx @@ -1,35 +1,10 @@ // file : libbutl/openssl.cxx -*- C++ -*- // license : MIT; see accompanying LICENSE file -#ifndef __cpp_modules_ts -#include <libbutl/openssl.mxx> -#endif +#include <libbutl/openssl.hxx> #include <cassert> - -#ifndef __cpp_lib_modules_ts -#include <string> - #include <utility> // move() -#endif - -// Other includes. - -#ifdef __cpp_modules_ts -module butl.openssl; - -// Only imports additional to interface. -#ifdef __clang__ -#ifdef __cpp_lib_modules_ts -import std.core; -#endif -import butl.path; -import butl.process; -import butl.fdstream; -import butl.small_vector; -#endif - -#endif using namespace std; diff --git a/libbutl/openssl.mxx b/libbutl/openssl.hxx index 6a0907e..b340f5c 100644 --- a/libbutl/openssl.mxx +++ b/libbutl/openssl.hxx @@ -1,41 +1,21 @@ -// file : libbutl/openssl.mxx -*- C++ -*- +// file : libbutl/openssl.hxx -*- C++ -*- // license : MIT; see accompanying LICENSE file -#ifndef __cpp_modules_ts #pragma once -#endif -// C includes. - -#ifndef __cpp_lib_modules_ts #include <string> #include <type_traits> -#include <cstddef> // size_t -#include <utility> // move(), forward() -#endif - -// Other includes. 
- -#ifdef __cpp_modules_ts -export module butl.openssl; -#ifdef __cpp_lib_modules_ts -import std.core; -#endif -import butl.path; -import butl.process; //@@ MOD TODO: should we re-export? -import butl.fdstream; -import butl.small_vector; -#else -#include <libbutl/path.mxx> -#include <libbutl/process.mxx> -#include <libbutl/fdstream.mxx> -#include <libbutl/small-vector.mxx> -#endif +#include <libbutl/path.hxx> +#include <libbutl/process.hxx> +#include <libbutl/optional.hxx> +#include <libbutl/fdstream.hxx> +#include <libbutl/small-vector.hxx> +#include <libbutl/semantic-version.hxx> #include <libbutl/export.hxx> -LIBBUTL_MODEXPORT namespace butl +namespace butl { // Perform a crypto operation using the openssl(1) program. Throw // process_error and io_error (both derive from system_error) in case of @@ -100,6 +80,23 @@ LIBBUTL_MODEXPORT namespace butl // department (that were apparently fixed in 1.0.2). To work around these // bugs pass user-supplied options first. // + struct openssl_info + { + // Note that the program name can be used by the caller to properly + // interpret the version. + // + // The name/version examples: + // + // OpenSSL 3.0.0 + // OpenSSL 1.1.1l + // LibreSSL 2.8.3 + // + // The `l` component above ends up in semantic_version::build. + // + std::string name; + semantic_version version; + }; + class LIBBUTL_SYMEXPORT openssl: public process { public: @@ -133,6 +130,22 @@ LIBBUTL_MODEXPORT namespace butl const std::string& command, A&&... options); + // Run `openssl version` command and try to parse and return the + // information it prints to stdout. Return nullopt if the process hasn't + // terminated successfully or stdout parsing has failed. Throw + // process_error and io_error in case of errors. 
+ // + template <typename E> + static optional<openssl_info> + info (E&& err, const process_env&); + + template <typename C, + typename E> + static optional<openssl_info> + info (const C&, + E&& err, + const process_env&); + private: template <typename T> struct is_other diff --git a/libbutl/openssl.ixx b/libbutl/openssl.ixx index c685b65..db2fbcd 100644 --- a/libbutl/openssl.ixx +++ b/libbutl/openssl.ixx @@ -1,7 +1,10 @@ // file : libbutl/openssl.ixx -*- C++ -*- // license : MIT; see accompanying LICENSE file -LIBBUTL_MODEXPORT namespace butl //@@ MOD Clang needs this for some reason. +#include <cstddef> // size_t +#include <utility> // forward() + +namespace butl { template <typename I, typename O, @@ -23,4 +26,13 @@ LIBBUTL_MODEXPORT namespace butl //@@ MOD Clang needs this for some reason. std::forward<A> (options)...) { } + + template <typename E> + inline optional<openssl_info> openssl:: + info (E&& err, const process_env& env) + { + return info ([] (const char* [], std::size_t) {}, + std::forward<E> (err), + env); + } } diff --git a/libbutl/openssl.txx b/libbutl/openssl.txx index 3a2c579..f55432d 100644 --- a/libbutl/openssl.txx +++ b/libbutl/openssl.txx @@ -1,7 +1,10 @@ // file : libbutl/openssl.txx -*- C++ -*- // license : MIT; see accompanying LICENSE file -LIBBUTL_MODEXPORT namespace butl //@@ MOD Clang needs this for some reason. +#include <cstddef> // size_t +#include <utility> // forward() + +namespace butl { template <typename I> typename std::enable_if<openssl::is_other<I>::value, I>::type openssl:: @@ -47,4 +50,67 @@ LIBBUTL_MODEXPORT namespace butl //@@ MOD Clang needs this for some reason. // Note: leaving this scope closes any open ends of the pipes in io_data. } + + template <typename C, + typename E> + optional<openssl_info> openssl:: + info (const C& cmdc, E&& err, const process_env& env) + { + using namespace std; + + // Run the `openssl version` command. 
+ // + openssl os (cmdc, + nullfd, fdstream_mode::text, forward<E> (err), + env, + "version"); + + // Read the command's stdout and wait for its completion. Bail out if the + // command didn't terminate successfully or stdout contains no data. + // + string s; + if (!getline (os.in, s)) + return nullopt; + + os.in.close (); + + if (!os.wait ()) + return nullopt; + + // Parse the version string. + // + // Note that there is some variety in the version representations: + // + // OpenSSL 3.0.0 7 sep 2021 (Library: OpenSSL 3.0.0 7 sep 2021) + // OpenSSL 1.1.1l FIPS 24 Aug 2021 + // LibreSSL 2.8.3 + // + // We will only consider the first two space separated components as the + // program name and version. We will also assume that there are no leading + // spaces and the version is delimited from the program name with a single + // space character. + // + size_t e (s.find (' ')); + + // Bail out if there is no version present in the string or the program + // name is empty. + // + if (e == string::npos || e == 0) + return nullopt; + + string nm (s, 0, e); + + size_t b (e + 1); // The beginning of the version. + e = s.find (' ', b); // The end of the version. + + optional<semantic_version> ver ( + parse_semantic_version (string (s, b, e != string::npos ? e - b : e), + semantic_version::allow_build, + "" /* build_separators */)); + + if (!ver) + return nullopt; + + return openssl_info {move (nm), move (*ver)}; + } } diff --git a/libbutl/optional.mxx b/libbutl/optional.hxx index d32e14b..f22189b 100644 --- a/libbutl/optional.mxx +++ b/libbutl/optional.hxx @@ -1,11 +1,7 @@ -// file : libbutl/optional.mxx -*- C++ -*- +// file : libbutl/optional.hxx -*- C++ -*- // license : MIT; see accompanying LICENSE file -#ifndef __cpp_modules_ts #pragma once -#endif - -// C includes. // Note: the Clang check must come before GCC since it also defines __GNUC__. 
// @@ -54,7 +50,6 @@ # endif #endif -#ifndef __cpp_lib_modules_ts #ifdef LIBBUTL_STD_OPTIONAL # include <optional> #else @@ -62,31 +57,19 @@ # include <functional> // hash # include <type_traits> // is_* #endif -#endif - -// Other includes. - -#ifdef __cpp_modules_ts -export module butl.optional; -#ifdef __cpp_lib_modules_ts -import std.core; -#endif -#endif #include <libbutl/export.hxx> #ifdef LIBBUTL_STD_OPTIONAL -LIBBUTL_MODEXPORT namespace butl +namespace butl { - template <typename T> - using optional = std::optional<T>; - + using std::optional; using std::nullopt_t; using std::nullopt; } #else -LIBBUTL_MODEXPORT namespace butl +namespace butl { // Simple optional class template while waiting for std::optional. // @@ -125,10 +108,16 @@ LIBBUTL_MODEXPORT namespace butl #if (!defined(_MSC_VER) || _MSC_VER > 1900) && \ (!defined(__GNUC__) || __GNUC__ > 4 || defined(__clang__)) constexpr optional_data (const optional_data& o): v_ (o.v_) {if (v_) new (&d_) T (o.d_);} - constexpr optional_data (optional_data&& o): v_ (o.v_) {if (v_) new (&d_) T (std::move (o.d_));} + + constexpr optional_data (optional_data&& o) + noexcept (std::is_nothrow_move_constructible<T>::value) + : v_ (o.v_) {if (v_) new (&d_) T (std::move (o.d_));} #else optional_data (const optional_data& o): v_ (o.v_) {if (v_) new (&d_) T (o.d_);} - optional_data (optional_data&& o): v_ (o.v_) {if (v_) new (&d_) T (std::move (o.d_));} + + optional_data (optional_data&& o) + noexcept (std::is_nothrow_move_constructible<T>::value) + : v_ (o.v_) {if (v_) new (&d_) T (std::move (o.d_));} #endif optional_data& operator= (nullopt_t); @@ -136,7 +125,11 @@ LIBBUTL_MODEXPORT namespace butl optional_data& operator= (T&&); optional_data& operator= (const optional_data&); - optional_data& operator= (optional_data&&); + + optional_data& operator= (optional_data&&) + noexcept (std::is_nothrow_move_constructible<T>::value && + std::is_nothrow_move_assignable<T>::value && + std::is_nothrow_destructible<T>::value); 
~optional_data (); }; @@ -168,10 +161,16 @@ LIBBUTL_MODEXPORT namespace butl #if (!defined(_MSC_VER) || _MSC_VER > 1900) && \ (!defined(__GNUC__) || __GNUC__ > 4 || defined(__clang__)) constexpr optional_data (const optional_data& o): v_ (o.v_) {if (v_) new (&d_) T (o.d_);} - constexpr optional_data (optional_data&& o): v_ (o.v_) {if (v_) new (&d_) T (std::move (o.d_));} + + constexpr optional_data (optional_data&& o) + noexcept (std::is_nothrow_move_constructible<T>::value) + : v_ (o.v_) {if (v_) new (&d_) T (std::move (o.d_));} #else optional_data (const optional_data& o): v_ (o.v_) {if (v_) new (&d_) T (o.d_);} - optional_data (optional_data&& o): v_ (o.v_) {if (v_) new (&d_) T (std::move (o.d_));} + + optional_data (optional_data&& o) + noexcept (std::is_nothrow_move_constructible<T>::value) + : v_ (o.v_) {if (v_) new (&d_) T (std::move (o.d_));} #endif optional_data& operator= (nullopt_t); @@ -179,7 +178,12 @@ LIBBUTL_MODEXPORT namespace butl optional_data& operator= (T&&); optional_data& operator= (const optional_data&); - optional_data& operator= (optional_data&&); + + // Note: it is trivially destructible and thus is no-throw destructible. + // + optional_data& operator= (optional_data&&) + noexcept (std::is_nothrow_move_constructible<T>::value && + std::is_nothrow_move_assignable<T>::value); }; template <typename T, @@ -306,6 +310,8 @@ LIBBUTL_MODEXPORT namespace butl explicit operator bool () const {return this->v_;} }; + // optional ? optional + // template <typename T> inline auto operator== (const optional<T>& x, const optional<T>& y) @@ -335,6 +341,131 @@ LIBBUTL_MODEXPORT namespace butl { return y < x; } + + // optional ? nullopt + // nullopt ? 
optional + // + template <typename T> + inline auto + operator== (const optional<T>& x, nullopt_t) + { + bool px (x); + return !px; + } + + template <typename T> + inline auto + operator== (nullopt_t, const optional<T>& y) + { + bool py (y); + return !py; + } + + template <typename T> + inline auto + operator!= (const optional<T>& x, nullopt_t y) + { + return !(x == y); + } + + template <typename T> + inline auto + operator!= (nullopt_t x, const optional<T>& y) + { + return !(x == y); + } + + template <typename T> + inline auto + operator< (const optional<T>&, nullopt_t) + { + return false; + } + + template <typename T> + inline auto + operator< (nullopt_t, const optional<T>& y) + { + bool py (y); + return py; + } + + template <typename T> + inline auto + operator> (const optional<T>& x, nullopt_t y) + { + return y < x; + } + + template <typename T> + inline auto + operator> (nullopt_t x, const optional<T>& y) + { + return y < x; + } + + // optional ? T + // T ? optional + // + template <typename T> + inline auto + operator== (const optional<T>& x, const T& y) + { + bool px (x); + return px && *x == y; + } + + template <typename T> + inline auto + operator== (const T& x, const optional<T>& y) + { + bool py (y); + return py && x == *y; + } + + template <typename T> + inline auto + operator!= (const optional<T>& x, const T& y) + { + return !(x == y); + } + + template <typename T> + inline auto + operator!= (const T& x, const optional<T>& y) + { + return !(x == y); + } + + template <typename T> + inline auto + operator< (const optional<T>& x, const T& y) + { + bool px (x); + return !px || *x < y; + } + + template <typename T> + inline auto + operator< (const T& x, const optional<T>& y) + { + bool py (y); + return py && x < *y; + } + + template <typename T> + inline auto + operator> (const optional<T>& x, const T& y) + { + return y < x; + } + + template <typename T> + inline auto + operator> (const T& x, const optional<T>& y) + { + return y < x; + } } namespace std 
diff --git a/libbutl/optional.ixx b/libbutl/optional.ixx index e2b552f..fdd0ac5 100644 --- a/libbutl/optional.ixx +++ b/libbutl/optional.ixx @@ -77,6 +77,9 @@ namespace butl template <typename T> inline optional_data<T, false>& optional_data<T, false>:: operator= (optional_data&& o) + noexcept (std::is_nothrow_move_constructible<T>::value && + std::is_nothrow_move_assignable<T>::value && + std::is_nothrow_destructible<T>::value) { if (o.v_) { @@ -171,6 +174,8 @@ namespace butl template <typename T> inline optional_data<T, true>& optional_data<T, true>:: operator= (optional_data&& o) + noexcept (std::is_nothrow_move_constructible<T>::value && + std::is_nothrow_move_assignable<T>::value) { if (o.v_) { diff --git a/libbutl/pager.cxx b/libbutl/pager.cxx index 44aa83e..e647948 100644 --- a/libbutl/pager.cxx +++ b/libbutl/pager.cxx @@ -1,9 +1,7 @@ // file : libbutl/pager.cxx -*- C++ -*- // license : MIT; see accompanying LICENSE file -#ifndef __cpp_modules_ts -#include <libbutl/pager.mxx> -#endif +#include <libbutl/pager.hxx> #include <errno.h> // E* @@ -14,46 +12,20 @@ # include <libbutl/win32-utility.hxx> #endif -#ifndef __cpp_lib_modules_ts #include <string> #include <vector> -#include <iostream> - +#include <cstddef> // size_t #include <cstring> // strchr() #include <utility> // move() + #ifndef _WIN32 # include <chrono> # include <thread> // this_thread::sleep_for() #endif -#endif - -// Other includes. - -#ifdef __cpp_modules_ts -module butl.pager; - -// Only imports additional to interface. 
-#ifdef __clang__ -#ifdef __cpp_lib_modules_ts -import std.core; -import std.io; -#endif -import butl.process; -import butl.fdstream; -#endif -#ifndef _WIN32 -import std.threading; -#endif - -import butl.utility; // operator<<(ostream, exception), throw_generic_error() -import butl.optional; -import butl.fdstream; // fdclose() -#else -#include <libbutl/utility.mxx> -#include <libbutl/optional.mxx> -#include <libbutl/fdstream.mxx> -#endif +#include <libbutl/utility.hxx> +#include <libbutl/optional.hxx> +#include <libbutl/fdstream.hxx> using namespace std; diff --git a/libbutl/pager.mxx b/libbutl/pager.hxx index a1f640f..12a6670 100644 --- a/libbutl/pager.mxx +++ b/libbutl/pager.hxx @@ -1,36 +1,18 @@ -// file : libbutl/pager.mxx -*- C++ -*- +// file : libbutl/pager.hxx -*- C++ -*- // license : MIT; see accompanying LICENSE file -#ifndef __cpp_modules_ts #pragma once -#endif -// C includes. - -#ifndef __cpp_lib_modules_ts #include <string> #include <vector> #include <iostream> -#endif - -// Other includes. -#ifdef __cpp_modules_ts -export module butl.pager; -#ifdef __cpp_lib_modules_ts -import std.core; -import std.io; -#endif -import butl.process; -import butl.fdstream; -#else -#include <libbutl/process.mxx> -#include <libbutl/fdstream.mxx> -#endif +#include <libbutl/process.hxx> +#include <libbutl/fdstream.hxx> #include <libbutl/export.hxx> -LIBBUTL_MODEXPORT namespace butl +namespace butl { // Try to run the output through a pager program, such as more or less (no // pun intended, less is used by default). If the default pager program is diff --git a/libbutl/path-io.mxx b/libbutl/path-io.hxx index 6b6dbcf..a60527d 100644 --- a/libbutl/path-io.mxx +++ b/libbutl/path-io.hxx @@ -1,34 +1,16 @@ -// file : libbutl/path-io.mxx -*- C++ -*- +// file : libbutl/path-io.hxx -*- C++ -*- // license : MIT; see accompanying LICENSE file -#ifndef __cpp_modules_ts #pragma once -#endif - -// C includes. 
#include <cassert> - -#ifndef __cpp_lib_modules_ts #include <ostream> -#endif - -// Other includes. -#ifdef __cpp_modules_ts -export module butl.path_io; -#ifdef __cpp_lib_modules_ts -import std.core; //@@ MOD TMP (should not be needed). -import std.io; -#endif -import butl.path; -#else -#include <libbutl/path.mxx> -#endif +#include <libbutl/path.hxx> #include <libbutl/export.hxx> -LIBBUTL_MODEXPORT namespace butl +namespace butl { // This is the default path IO implementation. It is separate to allow // custom implementations. For example, we may want to print paths as diff --git a/libbutl/path-map.mxx b/libbutl/path-map.hxx index daaf0a4..e3d776a 100644 --- a/libbutl/path-map.mxx +++ b/libbutl/path-map.hxx @@ -1,33 +1,16 @@ -// file : libbutl/path-map.mxx -*- C++ -*- +// file : libbutl/path-map.hxx -*- C++ -*- // license : MIT; see accompanying LICENSE file -#ifndef __cpp_modules_ts #pragma once -#endif -// C includes. - -#ifndef __cpp_lib_modules_ts #include <algorithm> // min() -#endif - -// Other includes. 
- -#ifdef __cpp_modules_ts -export module butl.path_map; -#ifdef __cpp_lib_modules_ts -import std.core; -#endif -import butl.path; -import butl.prefix_map; -#else -#include <libbutl/path.mxx> -#include <libbutl/prefix-map.mxx> -#endif + +#include <libbutl/path.hxx> +#include <libbutl/prefix-map.hxx> #include <libbutl/export.hxx> -LIBBUTL_MODEXPORT namespace butl +namespace butl { // prefix_map for filesystem paths // @@ -142,4 +125,12 @@ LIBBUTL_MODEXPORT namespace butl template <typename T> using dir_path_map = prefix_map<dir_path, T, dir_path::traits_type::directory_separator>; + + template <typename T> + using path_multimap = + prefix_multimap<path, T, path::traits_type::directory_separator>; + + template <typename T> + using dir_path_multimap = + prefix_multimap<dir_path, T, dir_path::traits_type::directory_separator>; } diff --git a/libbutl/path-pattern.cxx b/libbutl/path-pattern.cxx index cea5aa7..ed36eb5 100644 --- a/libbutl/path-pattern.cxx +++ b/libbutl/path-pattern.cxx @@ -1,41 +1,14 @@ // file : libbutl/path-pattern.cxx -*- C++ -*- // license : MIT; see accompanying LICENSE file -#ifndef __cpp_modules_ts -#include <libbutl/path-pattern.mxx> -#endif +#include <libbutl/path-pattern.hxx> #include <cassert> - -#ifndef __cpp_lib_modules_ts -#include <string> -#include <cstdint> -#include <cstddef> #include <iterator> // reverse_iterator - #include <algorithm> // find() -#endif - -// Other includes. - -#ifdef __cpp_modules_ts -module butl.path_pattern; -// Only imports additional to interface. 
-#ifdef __clang__ -#ifdef __cpp_lib_modules_ts -import std.core; -#endif -import butl.path; -import butl.optional; -#endif - -import butl.utility; // lcase()[_WIN32] -import butl.filesystem; // path_search() -#else -#include <libbutl/utility.mxx> -#include <libbutl/filesystem.mxx> -#endif +#include <libbutl/utility.hxx> // lcase()[_WIN32] +#include <libbutl/filesystem.hxx> // path_search() using namespace std; diff --git a/libbutl/path-pattern.mxx b/libbutl/path-pattern.hxx index 6d9684a..f6e01be 100644 --- a/libbutl/path-pattern.mxx +++ b/libbutl/path-pattern.hxx @@ -1,37 +1,20 @@ -// file : libbutl/path-pattern.mxx -*- C++ -*- +// file : libbutl/path-pattern.hxx -*- C++ -*- // license : MIT; see accompanying LICENSE file -#ifndef __cpp_modules_ts #pragma once -#endif -#include <cassert> - -#ifndef __cpp_lib_modules_ts #include <string> +#include <cassert> #include <cstdint> // uint16_t #include <cstddef> // ptrdiff_t, size_t #include <iterator> // input_iterator_tag -#endif - -// Other includes. -#ifdef __cpp_modules_ts -export module butl.path_pattern; - -#ifdef __cpp_lib_modules_ts -import std.core; -#endif -import butl.path; -import butl.optional; -#else -#include <libbutl/path.mxx> -#include <libbutl/optional.mxx> -#endif +#include <libbutl/path.hxx> +#include <libbutl/optional.hxx> #include <libbutl/export.hxx> -LIBBUTL_MODEXPORT namespace butl +namespace butl { // Wildcard pattern match (aka glob). 
// diff --git a/libbutl/path-pattern.ixx b/libbutl/path-pattern.ixx index 71f125c..6fee31e 100644 --- a/libbutl/path-pattern.ixx +++ b/libbutl/path-pattern.ixx @@ -3,6 +3,32 @@ namespace butl { + // path_match_flags + // + inline path_match_flags operator& (path_match_flags x, path_match_flags y) + { + return x &= y; + } + + inline path_match_flags operator| (path_match_flags x, path_match_flags y) + { + return x |= y; + } + + inline path_match_flags operator&= (path_match_flags& x, path_match_flags y) + { + return x = static_cast<path_match_flags> ( + static_cast<std::uint16_t> (x) & + static_cast<std::uint16_t> (y)); + } + + inline path_match_flags operator|= (path_match_flags& x, path_match_flags y) + { + return x = static_cast<path_match_flags> ( + static_cast<std::uint16_t> (x) | + static_cast<std::uint16_t> (y)); + } + // path_pattern_iterator // inline path_pattern_iterator:: diff --git a/libbutl/path.cxx b/libbutl/path.cxx index 3b04730..909971b 100644 --- a/libbutl/path.cxx +++ b/libbutl/path.cxx @@ -1,9 +1,7 @@ // file : libbutl/path.cxx -*- C++ -*- // license : MIT; see accompanying LICENSE file -#ifndef __cpp_modules_ts -#include <libbutl/path.mxx> -#endif +#include <libbutl/path.hxx> #ifdef _WIN32 # include <libbutl/win32-utility.hxx> @@ -25,32 +23,13 @@ #endif #include <cassert> - -#ifndef __cpp_lib_modules_ts -#include <string> -#include <cstddef> -#include <utility> - #include <atomic> #include <cstring> // strcpy() -#endif -#ifdef __cpp_modules_ts -module butl.path; +#include <libbutl/ft/lang.hxx> // thread_local -// Only imports additional to interface. 
-#ifdef __clang__ -#ifdef __cpp_lib_modules_ts -import std.core; -#endif -#endif - -import butl.utility; // throw_*_error() -import butl.process; // process::current_id() -#else -#include <libbutl/utility.mxx> -#include <libbutl/process.mxx> -#endif +#include <libbutl/utility.hxx> // throw_*_error() +#include <libbutl/process.hxx> // process::current_id() #include <libbutl/export.hxx> @@ -78,10 +57,21 @@ namespace butl // char // + static +#ifdef __cpp_thread_local + thread_local +#else + __thread +#endif + const path_traits<char>::string_type* current_directory_ = nullptr; + template <> LIBBUTL_SYMEXPORT path_traits<char>::string_type path_traits<char>:: current_directory () { + if (const auto* twd = current_directory_) + return *twd; + #ifdef _WIN32 char cwd[_MAX_PATH]; if (_getcwd (cwd, _MAX_PATH) == 0) @@ -121,6 +111,20 @@ namespace butl #endif } + template <> + LIBBUTL_SYMEXPORT const path_traits<char>::string_type* path_traits<char>:: + thread_current_directory () + { + return current_directory_; + } + + template <> + LIBBUTL_SYMEXPORT void path_traits<char>:: + thread_current_directory (const string_type* twd) + { + current_directory_ = twd; + } + #ifndef _WIN32 static const small_vector<string, 4> tmp_vars ( {"TMPDIR", "TMP", "TEMP", "TEMPDIR"}); @@ -207,8 +211,8 @@ namespace butl using std::to_string; return prefix - + "-" + to_string (process::current_id ()) - + "-" + to_string (temp_name_count++); + + '-' + to_string (process::current_id ()) + + '-' + to_string (temp_name_count++); } template <> diff --git a/libbutl/path.mxx b/libbutl/path.hxx index 12479ce..7d8b862 100644 --- a/libbutl/path.mxx +++ b/libbutl/path.hxx @@ -1,13 +1,8 @@ -// file : libbutl/path.mxx -*- C++ -*- +// file : libbutl/path.hxx -*- C++ -*- // license : MIT; see accompanying LICENSE file -#ifndef __cpp_modules_ts #pragma once -#endif - -#include <cassert> -#ifndef __cpp_lib_modules_ts #include <string> #include <ostream> #include <cstddef> // ptrdiff_t @@ -21,31 +16,17 @@ #ifdef 
_WIN32 #include <algorithm> // replace() #endif -#endif -// Other includes. +#include <libbutl/optional.hxx> +#include <libbutl/small-vector.hxx> -#ifdef __cpp_modules_ts -export module butl.path; -#ifdef __cpp_lib_modules_ts -import std.core; -#endif -import butl.optional; -import butl.small_vector; -#ifdef _WIN32 -import butl.utility; -#endif -#else -#include <libbutl/optional.mxx> -#include <libbutl/small-vector.mxx> #ifdef _WIN32 -#include <libbutl/utility.mxx> // *case*() -#endif +#include <libbutl/utility.hxx> // *case*() #endif #include <libbutl/export.hxx> -LIBBUTL_MODEXPORT namespace butl +namespace butl { // Wish list/ideas for improvements. // @@ -78,7 +59,7 @@ LIBBUTL_MODEXPORT namespace butl string_type path; explicit - invalid_basic_path (const string_type& p): path (p) {} + invalid_basic_path (string_type p): path (std::move (p)) {} explicit invalid_basic_path (const C* p): path (p) {} invalid_basic_path (const C* p, size_type n): path (p, n) {} @@ -392,6 +373,22 @@ LIBBUTL_MODEXPORT namespace butl : (p = rfind_separator (s, n - 1)) == nullptr ? s : ++p; } + // Return true if sb is a sub-path of sp (i.e., sp is a prefix). Expects + // both paths to be normalized. Note that this function returns true if + // the paths are equal. Empty path is considered a prefix of any path. + // + static bool + sub (const C* sb, size_type nb, + const C* sp, size_type np); + + // Return true if sp is a super-path of sb (i.e., sb is a suffix). Expects + // both paths to be normalized. Note that this function returns true if + // the paths are equal. Empty path is considered a prefix of any path. + // + static bool + sup (const C* sp, size_type np, + const C* sb, size_type nb); + static int compare (string_type const& l, string_type const& r, @@ -454,11 +451,31 @@ LIBBUTL_MODEXPORT namespace butl // Get/set current working directory. Throw std::system_error to report // underlying OS errors. 
// + // The current_directory() accessor (as well as the relevant process + // startup functions) have a notion of a "thread working directory" which + // is implemented as a thread-specific override that can be added/removed + // with thread_current_directory() below. + // + // Note that the current_directory() modifier always sets the process-wide + // working directory. + // + // See also thread_env(). + // static string_type current_directory (); static void - current_directory (string_type const&); + current_directory (const string_type&); + + // Get/set thread working directory override. Note that the passed + // pointed-to string should be valid (and immutable) for as long as the + // override is in effect. + // + static const string_type* + thread_current_directory (); + + static void + thread_current_directory (const string_type*); // Return the user home directory. Throw std::system_error to report // underlying OS errors. @@ -615,18 +632,18 @@ LIBBUTL_MODEXPORT namespace butl // Constructors. // - path_data () + path_data () noexcept : tsep_ (0) {} - path_data (string_type&& p, difference_type ts) + path_data (string_type&& p, difference_type ts) noexcept : path_ (std::move (p)), tsep_ (path_.empty () ? 0 : ts) {} explicit - path_data (string_type&& p) + path_data (string_type&& p) noexcept : path_ (std::move (p)) { _init (); } void - _init () + _init () noexcept { size_type n (path_.size ()), i; @@ -654,7 +671,8 @@ LIBBUTL_MODEXPORT namespace butl using path_data<C>::path_data; base_type () = default; - base_type (path_data<C>&& d): path_data<C> (std::move (d)) {} + base_type (path_data<C>&& d) noexcept + : path_data<C> (std::move (d)) {} }; using dir_type = basic_path<C, dir_path_kind<C>>; @@ -892,7 +910,7 @@ LIBBUTL_MODEXPORT namespace butl make_leaf (); // Return the path without the specified directory part. Returns empty - // path if the paths are the same. Throws invalid_path if the directory is + // path if the paths are the same. 
Throw invalid_path if the directory is // not a prefix of *this. Expects both paths to be normalized. // basic_path @@ -910,7 +928,7 @@ LIBBUTL_MODEXPORT namespace butl make_directory (); // Return the directory part of the path without the specified leaf part. - // Throws invalid_path if the leaf is not a suffix of *this. Expects both + // Throw invalid_path if the leaf is not a suffix of *this. Expects both // paths to be normalized. // dir_type @@ -946,12 +964,18 @@ LIBBUTL_MODEXPORT namespace butl extension_cstring () const; // Return a path relative to the specified path that is equivalent - // to *this. Throws invalid_path if a relative path cannot be derived + // to *this. Throw invalid_path if a relative path cannot be derived // (e.g., paths are on different drives on Windows). // basic_path relative (basic_path) const; + // As above but return nullopt rather than throw if a relative path cannot + // be derived. + // + optional<basic_path> + try_relative (basic_path) const; + // Iteration over path components. // // Note that for an absolute POSIX path the first component is empty, @@ -1108,19 +1132,22 @@ LIBBUTL_MODEXPORT namespace butl basic_path& canonicalize (char dir_sep = '\0'); - // Normalize the path and return *this. Normalization involves collapsing - // the '.' and '..' directories if possible, collapsing multiple - // directory separators, and converting all directory separators to the - // canonical form. If cur_empty is true then collapse relative paths - // representing the current directory (for example, '.', './', 'foo/..') - // to an empty path. Otherwise convert it to the canonical form (./ on - // POSIX systems). Note that a non-empty path cannot become an empty one - // in the latter case. + // Normalize the path and return *this. Throw invalid_path if the + // resulting path would be invalid (e.g., /tmp/../..). + // + // Normalization involves collapsing the '.' and '..' 
directories if + // possible, collapsing multiple directory separators, and converting all + // directory separators to the canonical form. If cur_empty is true then + // collapse relative paths representing the current directory (for + // example, '.', './', 'foo/..') to an empty path. Otherwise convert it + // to the canonical form (./ on POSIX systems). Note that a non-empty path + // cannot become an empty one in the latter case. // // If actual is true, then for case-insensitive filesystems obtain the // actual spelling of the path. Only an absolute path can be actualized. // If a path component does not exist, then its (and all subsequent) - // spelling is unchanged. This is a potentially expensive operation. + // spelling is unchanged. Throw system_error on all other underlying + // filesystem errors. Note that this is a potentially expensive operation. // Normally one can assume that "well-known" directories (current, home, // etc.) are returned in their actual spelling. // @@ -1275,7 +1302,8 @@ LIBBUTL_MODEXPORT namespace butl // Direct initialization without init()/cast(). // explicit - basic_path (data_type&& d): base_type (std::move (d)) {} + basic_path (data_type&& d) noexcept + : base_type (std::move (d)) {} using base_type::_size; using base_type::_init; @@ -1474,9 +1502,9 @@ LIBBUTL_MODEXPORT namespace butl basic_path_name (): // Create empty/NULL path name. base (nullptr, &name) {} - basic_path_name (basic_path_name&&); + basic_path_name (basic_path_name&&) noexcept; basic_path_name (const basic_path_name&); - basic_path_name& operator= (basic_path_name&&); + basic_path_name& operator= (basic_path_name&&) noexcept; basic_path_name& operator= (const basic_path_name&); }; @@ -1503,14 +1531,14 @@ LIBBUTL_MODEXPORT namespace butl basic_path_name_value (): base (&path) {} // Create empty/NULL path name. 
- basic_path_name_value (basic_path_name_value&&); + basic_path_name_value (basic_path_name_value&&) noexcept; basic_path_name_value (const basic_path_name_value&); - basic_path_name_value& operator= (basic_path_name_value&&); + basic_path_name_value& operator= (basic_path_name_value&&) noexcept; basic_path_name_value& operator= (const basic_path_name_value&); }; } -LIBBUTL_MODEXPORT namespace std +namespace std { template <typename C, typename K> struct hash<butl::basic_path<C, K>>: hash<basic_string<C>> diff --git a/libbutl/path.ixx b/libbutl/path.ixx index 9c96cfc..b2fdb6f 100644 --- a/libbutl/path.ixx +++ b/libbutl/path.ixx @@ -1,7 +1,7 @@ // file : libbutl/path.ixx -*- C++ -*- // license : MIT; see accompanying LICENSE file -LIBBUTL_MODEXPORT namespace butl //@@ MOD Clang needs this for some reason. +namespace butl { // path_abnormality // @@ -117,6 +117,45 @@ LIBBUTL_MODEXPORT namespace butl //@@ MOD Clang needs this for some reason. return r; } + template <typename C> + inline bool path_traits<C>:: + sub (const C* s, size_type n, + const C* ps, size_type pn) + { + // The thinking here is that we can use the full string representations + // (including the trailing slash in "/"). + // + if (pn == 0) + return true; + + // The second condition guards against the /foo-bar vs /foo case. + // + return n >= pn && + compare (s, pn, ps, pn) == 0 && + (is_separator (ps[pn - 1]) || // p ends with a separator + n == pn || // *this == p + is_separator (s[pn])); // next char is a separator + } + + template <typename C> + inline bool path_traits<C>:: + sup (const C* s, size_type n, + const C* ps, size_type pn) + { + // The thinking here is that we can use the full string representations + // (including the trailing slash in "/"). + // + if (pn == 0) + return true; + + // The second condition guards against the /foo-bar vs bar case. 
+ // + return n >= pn && + compare (s + n - pn, pn, ps, pn) == 0 && + (n == pn || // *this == p + is_separator (s[n - pn - 1])); // Previous char is a separator. + } + #ifdef _WIN32 template <> inline char path_traits<char>:: @@ -230,52 +269,16 @@ LIBBUTL_MODEXPORT namespace butl //@@ MOD Clang needs this for some reason. inline bool basic_path<C, K>:: sub (const basic_path& p) const { - // The thinking here is that we can use the full string representations - // (including the trailing slash in "/"). - // - const string_type& ps (p.path_); - size_type pn (ps.size ()); - - if (pn == 0) - return true; - - const string_type& s (this->path_); - size_type n (s.size ()); - - // The second condition guards against the /foo-bar vs /foo case. - // - return n >= pn && - traits_type::compare (s.c_str (), pn, ps.c_str (), pn) == 0 && - (traits_type::is_separator (ps.back ()) || // p ends with a separator - n == pn || // *this == p - traits_type::is_separator (s[pn])); // next char is a separator + return traits_type::sub (this->path_.c_str (), this->path_.size (), + p.path_.c_str (), p.path_.size ()); } template <typename C, typename K> inline bool basic_path<C, K>:: sup (const basic_path& p) const { - // The thinking here is that we can use the full string representations - // (including the trailing slash in "/"). - // - const string_type& ps (p.path_); - size_type pn (ps.size ()); - - if (pn == 0) - return true; - - const string_type& s (this->path_); - size_type n (s.size ()); - - // The second condition guards against the /foo-bar vs bar case. - // - return n >= pn && - traits_type::compare (s.c_str () + n - pn, pn, ps.c_str (), pn) == 0 && - (n == pn || // *this == p - // - // Previous char is a separator. 
- // - traits_type::is_separator (s[n - pn - 1])); + return traits_type::sup (this->path_.c_str (), this->path_.size (), + p.path_.c_str (), p.path_.size ()); } template <typename C, typename K> @@ -779,7 +782,7 @@ LIBBUTL_MODEXPORT namespace butl //@@ MOD Clang needs this for some reason. // template <typename P> inline basic_path_name<P>:: - basic_path_name (basic_path_name&& p) + basic_path_name (basic_path_name&& p) noexcept : basic_path_name (p.path, std::move (p.name)) { } @@ -793,7 +796,7 @@ LIBBUTL_MODEXPORT namespace butl //@@ MOD Clang needs this for some reason. template <typename P> inline basic_path_name<P>& basic_path_name<P>:: - operator= (basic_path_name&& p) + operator= (basic_path_name&& p) noexcept { if (this != &p) { @@ -821,7 +824,7 @@ LIBBUTL_MODEXPORT namespace butl //@@ MOD Clang needs this for some reason. // template <typename P> inline basic_path_name_value<P>:: - basic_path_name_value (basic_path_name_value&& p) + basic_path_name_value (basic_path_name_value&& p) noexcept : basic_path_name_value (std::move (p.path), std::move (p.name)) { } @@ -835,7 +838,7 @@ LIBBUTL_MODEXPORT namespace butl //@@ MOD Clang needs this for some reason. template <typename P> inline basic_path_name_value<P>& basic_path_name_value<P>:: - operator= (basic_path_name_value&& p) + operator= (basic_path_name_value&& p) noexcept { if (this != &p) { diff --git a/libbutl/path.txx b/libbutl/path.txx index 45b62bd..60e0f1a 100644 --- a/libbutl/path.txx +++ b/libbutl/path.txx @@ -1,7 +1,7 @@ // file : libbutl/path.txx -*- C++ -*- // license : MIT; see accompanying LICENSE file -LIBBUTL_MODEXPORT namespace butl //@@ MOD Clang needs this for some reason. +namespace butl { template <typename C, typename K> basic_path<C, K> basic_path<C, K>:: @@ -103,8 +103,8 @@ LIBBUTL_MODEXPORT namespace butl //@@ MOD Clang needs this for some reason. 
#endif template <typename C, typename K> - basic_path<C, K> basic_path<C, K>:: - relative (basic_path<C, K> d) const + optional<basic_path<C, K>> basic_path<C, K>:: + try_relative (basic_path<C, K> d) const { dir_type r; @@ -118,12 +118,22 @@ LIBBUTL_MODEXPORT namespace butl //@@ MOD Clang needs this for some reason. // Roots of the paths do not match. // if (d.root ()) - throw invalid_basic_path<C> (this->path_); + return nullopt; } return r / leaf (d); } + template <typename C, typename K> + basic_path<C, K> basic_path<C, K>:: + relative (basic_path<C, K> d) const + { + if (optional<basic_path<C, K>> r = try_relative (std::move (d))) + return std::move (*r); + + throw invalid_basic_path<C> (this->path_); + } + #ifdef _WIN32 // Find the actual spelling of a name in the specified dir. If the name is // found, append it to the result and return true. Otherwise, return false. diff --git a/libbutl/prefix-map.mxx b/libbutl/prefix-map.hxx index 75931da..0895d96 100644 --- a/libbutl/prefix-map.mxx +++ b/libbutl/prefix-map.hxx @@ -1,31 +1,16 @@ -// file : libbutl/prefix-map.mxx -*- C++ -*- +// file : libbutl/prefix-map.hxx -*- C++ -*- // license : MIT; see accompanying LICENSE file -#ifndef __cpp_modules_ts #pragma once -#endif -// C includes. - -#ifndef __cpp_lib_modules_ts #include <map> #include <string> #include <utility> // move() #include <algorithm> // min() -#endif - -// Other includes. - -#ifdef __cpp_modules_ts -export module butl.prefix_map; -#ifdef __cpp_lib_modules_ts -import std.core; -#endif -#endif #include <libbutl/export.hxx> -LIBBUTL_MODEXPORT namespace butl +namespace butl { // A map of hierarchical "paths", e.g., 'foo.bar' or 'foo/bar' with the // ability to retrieve a range of entries that have a specific prefix as @@ -149,6 +134,37 @@ LIBBUTL_MODEXPORT namespace butl const_iterator find_sup (const key_type&) const; + + // As above but additionally evaluate a predicate on each matching entry + // returning the one for which it returns true. 
+ // + template <typename P> + iterator + find_sup_if (const key_type&, P); + + template <typename P> + const_iterator + find_sup_if (const key_type&, P) const; + }; + + template <typename M> + struct prefix_multimap_common: prefix_map_common<M> + { + typedef M map_type; + typedef typename map_type::key_type key_type; + typedef typename map_type::iterator iterator; + typedef typename map_type::const_iterator const_iterator; + + using prefix_map_common<M>::prefix_map_common; + + // Find the most qualified entries that are super-prefixes of the + // specified prefix. + // + std::pair<iterator, iterator> + sup_range (const key_type&); + + std::pair<const_iterator, const_iterator> + sup_range (const key_type&) const; }; template <typename M, typename prefix_map_common<M>::delimiter_type D> @@ -161,6 +177,16 @@ LIBBUTL_MODEXPORT namespace butl : prefix_map_common<M> (std::move (i), D) {} }; + template <typename M, typename prefix_map_common<M>::delimiter_type D> + struct prefix_multimap_impl: prefix_multimap_common<M> + { + typedef typename prefix_multimap_common<M>::value_type value_type; + + prefix_multimap_impl (): prefix_multimap_common<M> (D) {} + prefix_multimap_impl (std::initializer_list<value_type> i) + : prefix_multimap_common<M> (std::move (i), D) {} + }; + template <typename K, typename T, typename compare_prefix<K>::delimiter_type D> @@ -170,7 +196,7 @@ LIBBUTL_MODEXPORT namespace butl typename T, typename compare_prefix<K>::delimiter_type D> using prefix_multimap = - prefix_map_impl<std::multimap<K, T, compare_prefix<K>>, D>; + prefix_multimap_impl<std::multimap<K, T, compare_prefix<K>>, D>; } #include <libbutl/prefix-map.txx> diff --git a/libbutl/prefix-map.txx b/libbutl/prefix-map.txx index e9a99c9..80664bf 100644 --- a/libbutl/prefix-map.txx +++ b/libbutl/prefix-map.txx @@ -1,7 +1,7 @@ // file : libbutl/prefix-map.txx -*- C++ -*- // license : MIT; see accompanying LICENSE file -LIBBUTL_MODEXPORT namespace butl //@@ MOD Clang needs this for some reason. 
+namespace butl { template <typename M> auto prefix_map_common<M>:: @@ -127,4 +127,128 @@ LIBBUTL_MODEXPORT namespace butl //@@ MOD Clang needs this for some reason. return i; #endif } + + template <typename M> + template <typename P> + auto prefix_map_common<M>:: + find_sup_if (const key_type& k, P pred) -> iterator + { +#if 0 + const auto& c (this->key_comp ()); + + for (auto i (this->upper_bound (k)), b (this->begin ()); i != b; ) + { + --i; + if (c.prefix (i->first, k) && pred (*i)) + return i; + } + + return this->end (); +#else + auto i (this->find (k)), e (this->end ()); + + if (i == e || !pred (*i)) + { + const auto& c (this->key_comp ()); + + for (key_type p (k); c.prefix (p); ) + { + i = this->find (p); + if (i != e && pred (*i)) + break; + } + } + + return i; +#endif + } + + template <typename M> + template <typename P> + auto prefix_map_common<M>:: + find_sup_if (const key_type& k, P pred) const -> const_iterator + { +#if 0 + const auto& c (this->key_comp ()); + + for (auto i (this->upper_bound (k)), b (this->begin ()); i != b; ) + { + --i; + if (c.prefix (i->first, k) && pred (*i)) + return i; + } + + return this->end (); +#else + auto i (this->find (k)), e (this->end ()); + + if (i == e || !pred (*i)) + { + const auto& c (this->key_comp ()); + + for (key_type p (k); c.prefix (p); ) + { + i = this->find (p); + if (i != e && pred (*i)) + break; + } + } + + return i; +#endif + } + + template <typename M> + auto prefix_multimap_common<M>:: + sup_range (const key_type& k) -> std::pair<iterator, iterator> + { +#if 0 + // TODO (see above). +#else + // First look for the exact match before making any copies. 
+ // + auto r (this->equal_range (k)); + + if (r.first == r.second) + { + const auto& c (this->key_comp ()); + + for (key_type p (k); c.prefix (p); ) + { + r = this->equal_range (p); + if (r.first != r.second) + break; + } + } + + return r; +#endif + } + + template <typename M> + auto prefix_multimap_common<M>:: + sup_range (const key_type& k) const -> std::pair<const_iterator, const_iterator> + { +#if 0 + // TODO (see above). +#else + // First look for the exact match before making any copies. + // + auto r (this->equal_range (k)); + + if (r.first == r.second) + { + const auto& c (this->key_comp ()); + + for (key_type p (k); c.prefix (p); ) + { + r = this->equal_range (p); + if (r.first != r.second) + break; + } + } + + return r; +#endif + } } diff --git a/libbutl/process-details.hxx b/libbutl/process-details.hxx index cf7624d..10d5241 100644 --- a/libbutl/process-details.hxx +++ b/libbutl/process-details.hxx @@ -3,17 +3,25 @@ #pragma once -#include <libbutl/ft/shared_mutex.hxx> +#ifdef LIBBUTL_MINGW_STDTHREAD -#ifdef __cpp_lib_modules_ts -import std.core; //@@ MOD TMP (dummy std.threading). 
-import std.threading; -#else -#include <mutex> -#if defined(__cpp_lib_shared_mutex) || defined(__cpp_lib_shared_timed_mutex) -# include <shared_mutex> -#endif -#endif +# include <libbutl/mingw-shared_mutex.hxx> + +namespace butl +{ + using shared_mutex = mingw_stdthread::shared_mutex; + using ulock = mingw_stdthread::unique_lock<shared_mutex>; + using slock = mingw_stdthread::shared_lock<shared_mutex>; +} + +#else // LIBBUTL_MINGW_STDTHREAD + +# include <libbutl/ft/shared_mutex.hxx> + +# include <mutex> +# if defined(__cpp_lib_shared_mutex) || defined(__cpp_lib_shared_timed_mutex) +# include <shared_mutex> +# endif namespace butl { @@ -41,7 +49,11 @@ namespace butl using ulock = std::unique_lock<shared_mutex>; using slock = ulock; #endif +} +#endif // LIBBUTL_MINGW_STDTHREAD +namespace butl +{ // Mutex that is acquired to make a sequence of operations atomic in regards // to child process spawning. Must be acquired for exclusive access for child // process startup, and for shared access otherwise. Defined in process.cxx. diff --git a/libbutl/process-io.cxx b/libbutl/process-io.cxx index c29bbc0..0be3a77 100644 --- a/libbutl/process-io.cxx +++ b/libbutl/process-io.cxx @@ -1,36 +1,11 @@ // file : libbutl/process-io.cxx -*- C++ -*- // license : MIT; see accompanying LICENSE file -#ifndef __cpp_modules_ts -#include <libbutl/process-io.mxx> -#endif - -// C includes. - -#ifndef __cpp_lib_modules_ts -#include <ostream> +#include <libbutl/process-io.hxx> #include <cstring> // strchr() -#endif - -// Other includes. - -#ifdef __cpp_modules_ts -module butl.process_io; - -// Only imports additional to interface. 
-#ifdef __clang__ -#ifdef __cpp_lib_modules_ts -import std.core; -import std.io; -#endif -import butl.process; -#endif -import butl.path-io; -#else -#include <libbutl/path-io.mxx> -#endif +#include <libbutl/path-io.hxx> using namespace std; diff --git a/libbutl/process-io.mxx b/libbutl/process-io.hxx index d07a212..29d6d8b 100644 --- a/libbutl/process-io.mxx +++ b/libbutl/process-io.hxx @@ -1,32 +1,15 @@ -// file : libbutl/process-io.mxx -*- C++ -*- +// file : libbutl/process-io.hxx -*- C++ -*- // license : MIT; see accompanying LICENSE file -#ifndef __cpp_modules_ts #pragma once -#endif -// C includes. - -#ifndef __cpp_lib_modules_ts #include <ostream> -#endif - -// Other includes. -#ifdef __cpp_modules_ts -export module butl.process_io; -#ifdef __cpp_lib_modules_ts -import std.core; //@@ MOD TMP (should not be needed). -import std.io; -#endif -import butl.process; -#else -#include <libbutl/process.mxx> -#endif +#include <libbutl/process.hxx> #include <libbutl/export.hxx> -LIBBUTL_MODEXPORT namespace butl +namespace butl { inline std::ostream& operator<< (std::ostream& o, const process_path& p) diff --git a/libbutl/process-run.cxx b/libbutl/process-run.cxx index c26c20d..b044ea1 100644 --- a/libbutl/process-run.cxx +++ b/libbutl/process-run.cxx @@ -1,35 +1,12 @@ // file : libbutl/process-run.cxx -*- C++ -*- // license : MIT; see accompanying LICENSE file -#ifndef __cpp_modules_ts -#include <libbutl/process.mxx> -#endif +#include <libbutl/process.hxx> -// C includes. - -#ifndef __cpp_lib_modules_ts #include <cstdlib> // exit() #include <iostream> // cerr -#endif - -// Other includes. - -#ifdef __cpp_modules_ts -module butl.process; - -// Only imports additional to interface. 
-#ifdef __clang__ -#ifdef __cpp_lib_modules_ts -import std.core; -import std.io; -#endif -import butl.path; -#endif -import butl.utility; // operator<<(ostream,exception) -#else -#include <libbutl/utility.mxx> -#endif +#include <libbutl/utility.hxx> // operator<<(ostream,exception) using namespace std; @@ -47,7 +24,7 @@ namespace butl try { return process (pp, cmd, - in, out, err, + move (in), move (out), move (err), cwd != nullptr ? cwd->string ().c_str () : nullptr, envvars); } diff --git a/libbutl/process-run.txx b/libbutl/process-run.txx index aa1e381..6c903a8 100644 --- a/libbutl/process-run.txx +++ b/libbutl/process-run.txx @@ -1,7 +1,9 @@ // file : libbutl/process-run.txx -*- C++ -*- // license : MIT; see accompanying LICENSE file -LIBBUTL_MODEXPORT namespace butl //@@ MOD Clang needs this for some reason. +#include <utility> // forward(), index_sequence + +namespace butl { template <typename V> void process_env:: @@ -85,21 +87,21 @@ LIBBUTL_MODEXPORT namespace butl //@@ MOD Clang needs this for some reason. // valid file descriptor. // inline process::pipe - process_stdin (const process::pipe& v) + process_stdin (process::pipe v) { assert (v.in >= 0); return v; } inline process::pipe - process_stdout (const process::pipe& v) + process_stdout (process::pipe v) { assert (v.out >= 0); return v; } inline process::pipe - process_stderr (const process::pipe& v) + process_stderr (process::pipe v) { assert (v.out >= 0); return v; @@ -129,13 +131,13 @@ LIBBUTL_MODEXPORT namespace butl //@@ MOD Clang needs this for some reason. typename... A, typename std::size_t... index> process - process_start (std::index_sequence<index...>, - const C& cmdc, - I&& in, - O&& out, - E&& err, - const process_env& env, - A&&... args) + process_start_impl (std::index_sequence<index...>, + const C& cmdc, + I&& in, + O&& out, + E&& err, + const process_env& env, + A&&... args) { // Map stdin/stdout/stderr arguments to their integer values, as expected // by the process constructor. 
@@ -168,7 +170,9 @@ LIBBUTL_MODEXPORT namespace butl //@@ MOD Clang needs this for some reason. return process_start (env.cwd, *env.path, cmd.data (), env.vars, - in_i, out_i, err_i); + std::move (in_i), + std::move (out_i), + std::move (err_i)); } template <typename C, @@ -184,13 +188,13 @@ LIBBUTL_MODEXPORT namespace butl //@@ MOD Clang needs this for some reason. const process_env& env, A&&... args) { - return process_start (std::index_sequence_for<A...> (), - cmdc, - std::forward<I> (in), - std::forward<O> (out), - std::forward<E> (err), - env, - std::forward<A> (args)...); + return process_start_impl (std::index_sequence_for<A...> (), + cmdc, + std::forward<I> (in), + std::forward<O> (out), + std::forward<E> (err), + env, + std::forward<A> (args)...); } template <typename I, @@ -255,4 +259,45 @@ LIBBUTL_MODEXPORT namespace butl //@@ MOD Clang needs this for some reason. env, std::forward<A> (args)...); } + + template <typename C, + typename... A, + typename std::size_t... index> + void + process_print_impl (std::index_sequence<index...>, + const C& cmdc, + const process_env& env, + A&&... args) + { + // Construct the command line array. + // + const std::size_t args_size (sizeof... (args)); + + small_vector<const char*, args_size + 2> cmd; + + assert (env.path != nullptr); + cmd.push_back (env.path->recall_string ()); + + std::string storage[args_size != 0 ? args_size : 1]; + + const char* dummy[] = { + nullptr, process_args_as_wrapper (cmd, args, storage[index])... }; + + cmd.push_back (dummy[0]); // NULL (and get rid of unused warning). + + cmdc (cmd.data (), cmd.size ()); + } + + template <typename C, + typename... A> + inline void + process_print_callback (const C& cmdc, + const process_env& env, + A&&... 
args) + { + process_print_impl (std::index_sequence_for<A...> (), + cmdc, + env, + std::forward<A> (args)...); + } } diff --git a/libbutl/process.cxx b/libbutl/process.cxx index 6c736c1..1b8da98 100644 --- a/libbutl/process.cxx +++ b/libbutl/process.cxx @@ -1,9 +1,7 @@ // file : libbutl/process.cxx -*- C++ -*- // license : MIT; see accompanying LICENSE file -#ifndef __cpp_modules_ts -#include <libbutl/process.mxx> -#endif +#include <libbutl/process.hxx> #include <errno.h> @@ -49,6 +47,14 @@ # elif defined(__NetBSD__) && __NetBSD__ >= 6 # define LIBBUTL_POSIX_SPAWN // +// On OpenBSD posix_spawn() appeared in 5.2 (see the man page for details). +// +# elif defined(__OpenBSD__) +# include <sys/param.h> // OpenBSD (yyyymm) +# if OpenBSD >= 201211 // 5.2 released on 1 Nov 2012. +# define LIBBUTL_POSIX_SPAWN +# endif +// // posix_spawn() appeared in Version 3 of the Single UNIX Specification that // was implemented in MacOS 10.5 (see the man page for details). // @@ -87,29 +93,20 @@ # endif // _MSC_VER #endif -#include <cassert> - -#ifndef __cpp_lib_modules_ts -#include <string> -#include <vector> -#include <chrono> -#include <cstdint> -#include <cstddef> -#include <system_error> - #include <ios> // ios_base::failure -#include <cstring> // strlen(), strchr(), strncmp() +#include <memory> // unique_ptr +#include <cstring> // strlen(), strchr(), strpbrk(), strncmp() #include <utility> // move() #include <ostream> +#include <cassert> #ifndef _WIN32 -#include <thread> // this_thread::sleep_for() +# include <thread> // this_thread::sleep_for() #else -#include <map> -#include <ratio> // milli -#include <cstdlib> // __argv[] -#include <algorithm> // find() -#endif +# include <map> +# include <ratio> // milli +# include <cstdlib> // __argv[] +# include <algorithm> // find() #endif #include <libbutl/process-details.hxx> @@ -119,32 +116,8 @@ namespace butl shared_mutex process_spawn_mutex; // Out of module purview. 
} -#ifdef __cpp_modules_ts -module butl.process; - -// Only imports additional to interface. -#ifdef __clang__ -#ifdef __cpp_lib_modules_ts -import std.core; -import std.io; -import std.threading; // Clang wants it in purview (see process-details.hxx). -#endif -import butl.path; -import butl.fdstream; -import butl.vector_view; -import butl.small_vector; -#endif - -#ifndef _WIN32 -import std.threading; -#endif - -import butl.utility; // icasecmp() -import butl.fdstream; // fdopen_null() -#else -#include <libbutl/utility.mxx> -#include <libbutl/fdstream.mxx> -#endif +#include <libbutl/utility.hxx> // icasecmp() +#include <libbutl/fdstream.hxx> // fdopen_null() using namespace std; @@ -217,7 +190,7 @@ namespace butl } void process:: - print (ostream& o, const char* const args[], size_t n) + print (ostream& o, const char* const* args, size_t n) { size_t m (0); const char* const* p (args); @@ -253,6 +226,35 @@ namespace butl } while (*p != nullptr); } +#if defined(LIBBUTL_POSIX_SPAWN) || defined(_WIN32) + // Return true if the NULL-terminated variable list contains an (un)set of + // the specified variable. The NULL list argument denotes an empty list. + // + // Note that on Windows variable names are case-insensitive. + // + static inline bool + contains_envvar (const char* const* vs, const char* v, size_t n) + { + if (vs != nullptr) + { + // Note that we don't expect the number of variables to (un)set to be + // large, so the linear search is OK. + // + while (const char* v1 = *vs++) + { +#ifdef _WIN32 + if (icasecmp (v1, v, n) == 0 && (v1[n] == '=' || v1[n] == '\0')) +#else + if (strncmp (v1, v, n) == 0 && (v1[n] == '=' || v1[n] == '\0')) +#endif + return true; + } + } + + return false; + } +#endif + #ifndef _WIN32 static process_path @@ -260,7 +262,7 @@ namespace butl { // Note that there is a similar version for Win32. 
- typedef path::traits_type traits; + using traits = path::traits_type; size_t fn (strlen (f)); @@ -381,10 +383,10 @@ namespace butl } process:: - process (const process_path& pp, const char* args[], + process (const process_path& pp, const char* const* args, pipe pin, pipe pout, pipe perr, const char* cwd, - const char* const* envvars) + const char* const* evars) { int in (pin.in); int out (pout.out); @@ -452,6 +454,17 @@ namespace butl else if (err == -2) in_efd.out = open_null (); + // If there is no user-supplied CWD and we have thread-specific override, + // use that instead of defaulting to the process-wide value. + // + if (cwd == nullptr || *cwd == '\0') + { + if (const string* twd = path::traits_type::thread_current_directory ()) + cwd = twd->c_str (); + } + + const char* const* tevars (thread_env ()); + // The posix_spawn()-based implementation. // #ifdef LIBBUTL_POSIX_SPAWN @@ -540,47 +553,45 @@ namespace butl fail (r); #endif - // Set/unset environment variables if requested. + // Set/unset the child process environment variables if requested. // - small_vector<const char*, 8> new_env; + vector<const char*> new_env; - if (envvars != nullptr) + if (tevars != nullptr || evars != nullptr) { - for (const char* const* env (environ); *env != nullptr; ++env) + // Copy the non-overridden process environment variables into the + // child's environment. + // + for (const char* const* ev (environ); *ev != nullptr; ++ev) { - // Lookup the existing variable among those that are requested to be - // (un)set. If not present, than add it to the child process - // environment. - // - // Note that on POSIX variable names are case-sensitive. - // - // Alse note that we don't expect the number of variables to (un)set - // to be large, so the linear search is OK. - // - const char* cv (*env); - const char* eq (strchr (cv, '=')); - size_t n (eq != nullptr ? 
eq - cv : strlen (cv)); - - const char* const* ev (envvars); - for (; *ev != nullptr; ++ev) - { - const char* v (*ev); - if (strncmp (cv, v, n) == 0 && (v[n] == '=' || v[n] == '\0')) - break; - } + const char* v (*ev); + const char* e (strchr (v, '=')); + size_t n (e != nullptr ? e - v : strlen (v)); - if (*ev == nullptr) - new_env.push_back (cv); + if (!contains_envvar (tevars, v, n) && + !contains_envvar (evars, v, n)) + new_env.push_back (v); } - // Copy the environment variables that are requested to be set. + // Copy non-overridden variable assignments into the child's + // environment. // - for (const char* const* ev (envvars); *ev != nullptr; ++ev) + auto set_vars = [&new_env] (const char* const* vs, + const char* const* ovs = nullptr) { - const char* v (*ev); - if (strchr (v, '=') != nullptr) - new_env.push_back (v); - } + if (vs != nullptr) + { + while (const char* v = *vs++) + { + const char* e (strchr (v, '=')); + if (e != nullptr && !contains_envvar (ovs, v, e - v)) + new_env.push_back (v); + } + } + }; + + set_vars (tevars, evars); + set_vars (evars); new_env.push_back (nullptr); } @@ -598,9 +609,9 @@ namespace butl &fa, nullptr /* attrp */, const_cast<char* const*> (&args[0]), - envvars != nullptr - ? const_cast<char* const*> (new_env.data ()) - : environ); + new_env.empty () + ? environ + : const_cast<char* const*> (new_env.data ())); if (r != 0) fail (r); } // Release the lock in parent. @@ -641,6 +652,10 @@ namespace butl { // Child. // + // NOTE: make sure not to call anything that may acquire a mutex that + // could be already acquired in another thread, most notably + // malloc(). @@ What about exceptions (all the fail() calls)? + // Duplicate the user-supplied (fd > -1) or the created pipe descriptor // to the standard stream descriptor (read end for STDIN_FILENO, write // end otherwise). Close the pipe afterwards. 
@@ -688,27 +703,38 @@ namespace butl if (cwd != nullptr && *cwd != '\0' && chdir (cwd) != 0) fail (true /* child */); - // Set/unset environment variables if requested. + // Set/unset environment variables. // - if (envvars != nullptr) + auto set_vars = [] (const char* const* vs) { - while (const char* ev = *envvars++) + if (vs != nullptr) { - const char* v (strchr (ev, '=')); - - try + while (const char* v = *vs++) { - if (v != nullptr) - setenv (string (ev, v - ev), v + 1); - else - unsetenv (ev); - } - catch (const system_error& e) - { - throw process_child_error (e.code ().value ()); + const char* e (strchr (v, '=')); + + try + { + // @@ TODO: redo without allocation (PATH_MAX?) Maybe + // also using C API to avoid exceptions. + // + if (e != nullptr) + setenv (string (v, e - v), e + 1); + else + unsetenv (v); + } + catch (const system_error& e) + { + // @@ Should we assume this cannot throw? + // + throw process_child_error (e.code ().value ()); + } } } - } + }; + + set_vars (tevars); + set_vars (evars); // Try to re-exec after the "text file busy" failure for 450ms. // @@ -741,6 +767,13 @@ namespace butl { if (handle != 0) { + // First close any open pipe ends for good measure but ignore any + // errors. + // + out_fd.reset (); + in_ofd.reset (); + in_efd.reset (); + int es; int r (waitpid (handle, &es, 0)); handle = 0; // We have tried. @@ -822,6 +855,12 @@ namespace butl return getpid (); } + process::handle_type process:: + current_handle () + { + return getpid (); + } + // process_exit // process_exit:: @@ -1274,13 +1313,30 @@ namespace butl }; const char* process:: - quote_argument (const char* a, string& s) + quote_argument (const char* a, string& s, bool bat) { - // On Windows we need to protect values with spaces using quotes. - // Since there could be actual quotes in the value, we need to - // escape them. + // On Windows we need to protect values with spaces using quotes. Since + // there could be actual quotes in the value, we need to escape them. 
+ // + // For batch files we also protect equal (`=`), comma (`,`) and semicolon + // (`;`) since otherwise an argument containing any of these will be split + // into several as if they were spaces (that is, the parts will appear in + // %1 %2, etc., instead of all in %1). This of course could break some + // batch files that rely on this semantics (for example, to automatically + // handle --foo=bar as --foo bar) but overall seeing a single argument + // (albeit quoted) is closer to the behavior of real executables. So we do + // this by default and if it becomes a problem we can invent a flag + // (probably in process_env) to disable this quoting (and while at it we + // may add a flag to disable all quoting since the user may need to quote + // some arguments but not others). // - bool q (*a == '\0' || strchr (a, ' ') != nullptr); + // While `()` and `[]` are not special characters, some "subsystems" + // (e.g., Cygwin/MSYS2) try to interpret them in certain contexts (e.g., + // relative paths). So we quote them as well (over-quoting seems to be + // harmless according to the "Parsing C Command-Line Arguments" MSDN + // article). + // + bool q (*a == '\0' || strpbrk (a, bat ? " =,;" : " ()[]") != nullptr); if (!q && strchr (a, '"') == nullptr) return a; @@ -1291,8 +1347,8 @@ namespace butl s += '"'; // Note that backslashes don't need escaping, unless they immediately - // precede the double quote (see `Parsing C Command-Line Arguments` MSDN - // article for more details). For example: + // precede the double quote (see "Parsing C Command-Line Arguments" MSDN + // article for details). 
For example: // // -DPATH="C:\\foo\\" -> -DPATH=\"C:\\foo\\\\\" // -DPATH=C:\foo bar\ -> "-DPATH=C:\foo bar\\" @@ -1331,10 +1387,10 @@ namespace butl static map<string, bool> detect_msys_cache_; process:: - process (const process_path& pp, const char* args[], + process (const process_path& pp, const char* const* args, pipe pin, pipe pout, pipe perr, const char* cwd, - const char* const* envvars) + const char* const* evars) { int in (pin.in); int out (pout.out); @@ -1345,6 +1401,15 @@ namespace butl throw process_error (m == nullptr ? last_error_msg () : m); }; + // If there is no user-supplied CWD and we have thread-specific override, + // use that instead of defaulting to the process-wide value. + // + if (cwd == nullptr || *cwd == '\0') + { + if (const string* twd = path::traits_type::thread_current_directory ()) + cwd = twd->c_str (); + } + // (Un)set the environment variables for the child process. // // Note that we can not do it incrementally, as for POSIX implementation. @@ -1356,7 +1421,9 @@ namespace butl // vector<char> new_env; - if (envvars != nullptr) + const char* const* tevars (thread_env ()); + + if (tevars != nullptr || evars != nullptr) { // The environment block contains the variables in the following format: // @@ -1365,7 +1432,7 @@ namespace butl // Note the trailing NULL character that follows the last variable // (null-terminated) string. // - unique_ptr<char, void (*)(char*)> cvars ( + unique_ptr<char, void (*)(char*)> pevars ( GetEnvironmentStringsA (), [] (char* p) { @@ -1376,50 +1443,45 @@ namespace butl assert (false); }); - if (cvars.get () == nullptr) + if (pevars.get () == nullptr) fail (); - const char* cv (cvars.get ()); - - // Copy the current environment variables. + // Copy the non-overridden process environment variables into the + // child's environment. // - while (*cv != '\0') + for (const char* v (pevars.get ()); *v != '\0'; ) { - // Lookup the existing variable among those that are requested to be - // (un)set. 
If not present, than copy it to the new block. - // - // Note that on Windows variable names are case-insensitive. - // - // Alse note that we don't expect the number of variables to (un)set - // to be large, so the linear search is OK. - // - size_t n (strlen (cv) + 1); // Includes NULL character. + size_t n (strlen (v) + 1); // Includes NULL character. - const char* eq (strchr (cv, '=')); - size_t nn (eq != nullptr ? eq - cv : n - 1); - const char* const* ev (envvars); + const char* e (strchr (v, '=')); + size_t nn (e != nullptr ? e - v : n - 1); - for (; *ev != nullptr; ++ev) - { - const char* v (*ev); - if (icasecmp (cv, v, nn) == 0 && (v[nn] == '=' || v[nn] == '\0')) - break; - } - - if (*ev == nullptr) - new_env.insert (new_env.end (), cv, cv + n); + if (!contains_envvar (tevars, v, nn) && + !contains_envvar (evars, v, nn)) + new_env.insert (new_env.end (), v, v + n); - cv += n; + v += n; } - // Copy the environment variables that are requested to be set. + // Copy non-overridden variable assignments into the child's + // environment. // - for (const char* const* ev (envvars); *ev != nullptr; ++ev) + auto set_vars = [&new_env] (const char* const* vs, + const char* const* ovs = nullptr) { - const char* v (*ev); - if (strchr (v, '=') != nullptr) - new_env.insert (new_env.end (), v, v + strlen (v) + 1); - } + if (vs != nullptr) + { + while (const char* v = *vs++) + { + const char* e (strchr (v, '=')); + if (e != nullptr && !contains_envvar (ovs, v, e - v)) + new_env.insert (new_env.end (), v, v + strlen (v) + 1); + } + } + }; + + set_vars (tevars, evars); + set_vars (evars); new_env.push_back ('\0'); // Terminate the new environment block. 
} @@ -1516,12 +1578,12 @@ namespace butl // string cmd_line; { - auto append = [&cmd_line, buf = string ()] (const char* a) mutable + auto append = [&batch, &cmd_line, buf = string ()] (const char* a) mutable { if (!cmd_line.empty ()) cmd_line += ' '; - cmd_line += quote_argument (a, buf); + cmd_line += quote_argument (a, buf, batch.has_value ()); }; if (batch) @@ -1763,7 +1825,6 @@ namespace butl using namespace chrono; - // Retry for about 1 hour. // system_clock::duration timeout (1h); @@ -1776,7 +1837,7 @@ namespace butl 0, // Primary thread security attributes. true, // Inherit handles. 0, // Creation flags. - envvars != nullptr ? new_env.data () : nullptr, + new_env.empty () ? nullptr : new_env.data (), cwd != nullptr && *cwd != '\0' ? cwd : nullptr, &si, &pi)) @@ -1849,7 +1910,7 @@ namespace butl return PeekNamedPipe (h, &c, 1, &n, nullptr, nullptr) && n == 1; }; - // Hidden by butl::duration that is introduced via fdstream.mxx. + // Hidden by butl::duration that is introduced via fdstream.hxx. // using milli_duration = chrono::duration<DWORD, milli>; @@ -1930,6 +1991,10 @@ namespace butl { if (handle != 0) { + out_fd.reset (); + in_ofd.reset (); + in_efd.reset (); + DWORD es; DWORD e (NO_ERROR); if (WaitForSingleObject (handle, INFINITE) != WAIT_OBJECT_0 || @@ -2037,6 +2102,15 @@ namespace butl return GetCurrentProcessId (); } + process::handle_type process:: + current_handle () + { + // Note that the returned handle is a pseudo handle (-1) that does not + // need to be closed. 
+ // + return GetCurrentProcess (); + } + // process_exit // process_exit:: diff --git a/libbutl/process.mxx b/libbutl/process.hxx index 9106549..bbb7c89 100644 --- a/libbutl/process.mxx +++ b/libbutl/process.hxx @@ -1,17 +1,12 @@ -// file : libbutl/process.mxx -*- C++ -*- +// file : libbutl/process.hxx -*- C++ -*- // license : MIT; see accompanying LICENSE file -#ifndef __cpp_modules_ts #pragma once -#endif #ifndef _WIN32 # include <sys/types.h> // pid_t #endif -#include <cassert> - -#ifndef __cpp_lib_modules_ts #include <string> #include <vector> #include <chrono> @@ -20,33 +15,15 @@ #include <cstdint> // uint32_t #include <system_error> -#include <utility> // move(), forward(), index_sequence -#endif - -// Other includes. - -#ifdef __cpp_modules_ts -export module butl.process; -#ifdef __cpp_lib_modules_ts -import std.core; -import std.io; -#endif -import butl.path; -import butl.optional; -import butl.fdstream; // auto_fd, fdpipe -import butl.vector_view; -import butl.small_vector; -#else -#include <libbutl/path.mxx> -#include <libbutl/optional.mxx> -#include <libbutl/fdstream.mxx> -#include <libbutl/vector-view.mxx> -#include <libbutl/small-vector.mxx> -#endif +#include <libbutl/path.hxx> +#include <libbutl/optional.hxx> +#include <libbutl/fdstream.hxx> // auto_fd, fdpipe +#include <libbutl/vector-view.hxx> +#include <libbutl/small-vector.hxx> #include <libbutl/export.hxx> -LIBBUTL_MODEXPORT namespace butl +namespace butl { struct process_error: std::system_error { @@ -140,8 +117,8 @@ LIBBUTL_MODEXPORT namespace butl // Moveable-only type. 
// - process_path (process_path&&); - process_path& operator= (process_path&&); + process_path (process_path&&) noexcept; + process_path& operator= (process_path&&) noexcept; process_path (const process_path&) = delete; process_path& operator= (const process_path&) = delete; @@ -191,6 +168,26 @@ LIBBUTL_MODEXPORT namespace butl bool normal () const; + // C/C++ don't apply constraints on program exit code other than it being + // of type int. + // + // POSIX specifies that only the least significant 8 bits shall be + // available from wait() and waitpid(); the full value shall be available + // from waitid() (read more at _Exit, _exit Open Group spec). + // + // While the Linux man page for waitid() doesn't mention any deviations + // from the standard, the FreeBSD implementation (as of version 11.0) only + // returns 8 bits like the other wait*() calls. + // + // Windows supports 32-bit exit codes. + // + // Note that in shells some exit values can have special meaning so using + // them can be a source of confusion. For bash values in the [126, 255] + // range are such a special ones (see Appendix E, "Exit Codes With Special + // Meanings" in the Advanced Bash-Scripting Guide). + // + // So [0, 125] appears to be the usable exit code range. + // code_type code () const; @@ -272,7 +269,30 @@ LIBBUTL_MODEXPORT namespace butl // the parent. So you should do this yourself, if required. For example, // to redirect the child process stdout to stderr, you can do: // - // process p (..., 0, 2); + // process pr (..., 0, 2); + // + // Note also that the somewhat roundabout setup with -1 as a redirect + // "instruction" and out_fd/in_ofd/in_efd data members for the result + // helps to make sure the stream instances are destroyed before the + // process instance. 
For example: + // + // process pr (..., 0, -1, 2); + // ifdstream is (move (pr.in_ofd)); + // + // This is important in case an exception is thrown where we want to make + // sure all our pipe ends are closed before we wait for the process exit + // (which happens in the process destructor). + // + // And speaking of the destruction order, another thing to keep in mind is + // that only one stream can use the skip mode (fdstream_mode::skip; + // because skipping is performed in the blocking mode) and the stream that + // skips should come first so that all other streams are destroyed/closed + // before it (failed that, we may end up in a deadlock). For example: + // + // process pr (..., -1, -1, -1); + // ifdstream is (move (pr.in_ofd), fdstream_mode::skip); // Must be first. + // ifdstream es (move (pr.in_efd)); + // ofdstream os (move (pr.out_fd)); // // The cwd argument allows to change the current working directory of the // child process. NULL and empty arguments are ignored. @@ -290,39 +310,104 @@ LIBBUTL_MODEXPORT namespace butl // Note that the versions without the the process_path argument may // temporarily change args[0] (see path_search() for details). 
// - process (const char* [], + process (const char**, int in = 0, int out = 1, int err = 2, const char* cwd = nullptr, const char* const* envvars = nullptr); - process (const process_path&, const char* [], + process (const process_path&, const char* const*, + int in = 0, int out = 1, int err = 2, + const char* cwd = nullptr, + const char* const* envvars = nullptr); + + process (std::vector<const char*>&, + int in = 0, int out = 1, int err = 2, + const char* cwd = nullptr, + const char* const* envvars = nullptr); + + process (const process_path&, const std::vector<const char*>&, int in = 0, int out = 1, int err = 2, const char* cwd = nullptr, const char* const* envvars = nullptr); // If the descriptors are pipes that you have created, then you should use - // this constructor instead to communicate this information. + // this constructor instead to communicate this information (the parent + // end may need to be "probed" on Windows). // // For generality, if the "other" end of the pipe is -1, then assume this // is not a pipe. // struct pipe { - int in = -1; - int out = -1; - pipe () = default; pipe (int i, int o): in (i), out (o) {} explicit pipe (const fdpipe& p): in (p.in.get ()), out (p.out.get ()) {} + + // Transfer ownership to one end of the pipe. + // + pipe (auto_fd i, int o): in (i.release ()), out (o), own_in (true) {} + pipe (int i, auto_fd o): in (i), out (o.release ()), own_out (true) {} + + // Moveable-only type. 
+ // + pipe (pipe&&) noexcept; + pipe& operator= (pipe&&) noexcept; + + pipe (const pipe&) = delete; + pipe& operator= (const pipe&) = delete; + + ~pipe (); + + public: + int in = -1; + int out = -1; + + bool own_in = false; + bool own_out = false; }; - process (const process_path&, const char* [], + process (const char**, pipe in, pipe out, pipe err, const char* cwd = nullptr, const char* const* envvars = nullptr); + process (const char**, + int in, int out, pipe err, + const char* cwd = nullptr, + const char* const* envvars = nullptr); + + process (const process_path&, const char* const*, + pipe in, pipe out, pipe err, + const char* cwd = nullptr, + const char* const* envvars = nullptr); + + process (const process_path&, const char* const*, + int in, int out, pipe err, + const char* cwd = nullptr, + const char* const* envvars = nullptr); + + process (std::vector<const char*>&, + pipe in, pipe out, pipe err, + const char* cwd = nullptr, + const char* const* envvars = nullptr); + + process (std::vector<const char*>&, + int in, int out, pipe err, + const char* cwd = nullptr, + const char* const* envvars = nullptr); + + process (const process_path&, const std::vector<const char*>&, + pipe in, pipe out, pipe err, + const char* cwd = nullptr, + const char* const* envvars = nullptr); + + process (const process_path&, const std::vector<const char*>&, + int in, int out, pipe err, + const char* cwd = nullptr, + const char* const* envvars = nullptr); + // The "piping" constructor, for example: // // process lhs (..., 0, -1); // Redirect stdout to a pipe. @@ -331,16 +416,36 @@ LIBBUTL_MODEXPORT namespace butl // rhs.wait (); // Wait for last first. 
// lhs.wait (); // - process (const char* [], + process (const char**, process&, int out = 1, int err = 2, const char* cwd = nullptr, const char* const* envvars = nullptr); - process (const process_path&, const char* [], + process (const process_path&, const char* const*, process&, int out = 1, int err = 2, const char* cwd = nullptr, const char* const* envvars = nullptr); + process (const char**, + process&, pipe out, pipe err, + const char* cwd = nullptr, + const char* const* envvars = nullptr); + + process (const char**, + process&, int out, pipe err, + const char* cwd = nullptr, + const char* const* envvars = nullptr); + + process (const process_path&, const char* const*, + process&, pipe out, pipe err, + const char* cwd = nullptr, + const char* const* envvars = nullptr); + + process (const process_path&, const char* const*, + process&, int out, pipe err, + const char* cwd = nullptr, + const char* const* envvars = nullptr); + // Wait for the process to terminate. Return true if the process // terminated normally and with the zero exit code. Unless ignore_error // is true, throw process_error if anything goes wrong. This function can @@ -367,7 +472,7 @@ LIBBUTL_MODEXPORT namespace butl // Note that the destructor will wait for the process but will ignore // any errors and the exit status. // - ~process () {if (handle != 0) wait (true);} + ~process () { if (handle != 0) wait (true); } // Process termination. // @@ -394,8 +499,8 @@ LIBBUTL_MODEXPORT namespace butl // Moveable-only type. // - process (process&&); - process& operator= (process&&); + process (process&&) noexcept; + process& operator= (process&&) noexcept (false); // Note: calls wait(). process (const process&) = delete; process& operator= (const process&) = delete; @@ -417,7 +522,7 @@ LIBBUTL_MODEXPORT namespace butl // // ... // E.g., print args[0]. // - // process p (pp, args); + // process pr (pp, args); // // You can also specify the fallback directory which will be tried last. 
// This, for example, can be used to implement the Windows "search in the @@ -501,15 +606,17 @@ LIBBUTL_MODEXPORT namespace butl // nameN arg arg ... nullptr nullptr // static void - print (std::ostream&, const char* const args[], size_t n = 0); + print (std::ostream&, const char* const* args, size_t n = 0); - // Quote and escape the specified command line argument. Return the - // original string if neither is necessary and a pointer to the provided - // buffer string containing the escaped version otherwise. + // Quote and escape the specified command line argument. If batch is true + // then also quote the equal (`=`), comma (`,`) and semicolon (`;`) + // characters which are treated as argument separators in batch file. + // Return the original string if neither is necessary and a pointer to the + // provided buffer string containing the escaped version otherwise. // #ifdef _WIN32 static const char* - quote_argument (const char*, std::string& buffer); + quote_argument (const char*, std::string& buffer, bool batch); #endif public: @@ -522,13 +629,16 @@ LIBBUTL_MODEXPORT namespace butl public: handle_type handle; + static handle_type + current_handle (); + // Absence means that the exit information is not (yet) known. This can be // because you haven't called wait() yet or because wait() failed. // optional<process_exit> exit; - // Use the following file descriptors to communicate with the new process's - // standard streams. + // Use the following file descriptors to communicate with the new + // process's standard streams (if redirected to pipes; see above). // auto_fd out_fd; // Write to it to send to stdin. auto_fd in_ofd; // Read from it to receive from stdout. @@ -642,8 +752,8 @@ LIBBUTL_MODEXPORT namespace butl // Moveable-only type. 
// - process_env (process_env&&); - process_env& operator= (process_env&&); + process_env (process_env&&) noexcept; + process_env& operator= (process_env&&) noexcept; process_env (const process_env&) = delete; process_env& operator= (const process_env&) = delete; @@ -679,7 +789,7 @@ LIBBUTL_MODEXPORT namespace butl // command line or similar. It should be callable with the following // signature: // - // void (const char*[], std::size_t) + // void (const char* const*, std::size_t) // template <typename C, typename I, @@ -720,6 +830,15 @@ LIBBUTL_MODEXPORT namespace butl const process_env&, A&&... args); + // Call the callback without actually running/starting anything. + // + template <typename C, + typename... A> + void + process_print_callback (const C&, + const process_env&, + A&&... args); + // Conversion of types to their C string representations. Can be overloaded // (including via ADL) for custom types. The default implementation calls // to_string() which covers all the numeric values via std::to_string () and diff --git a/libbutl/process.ixx b/libbutl/process.ixx index 7676ce3..e4db474 100644 --- a/libbutl/process.ixx +++ b/libbutl/process.ixx @@ -1,6 +1,9 @@ // file : libbutl/process.ixx -*- C++ -*- // license : MIT; see accompanying LICENSE file +#include <cassert> +#include <utility> // move() + namespace butl { // process_path @@ -32,7 +35,7 @@ namespace butl args0_ (nullptr) {} inline process_path:: - process_path (process_path&& p) + process_path (process_path&& p) noexcept : effect (std::move (p.effect)), args0_ (p.args0_) { @@ -45,7 +48,7 @@ namespace butl } inline process_path& process_path:: - operator= (process_path&& p) + operator= (process_path&& p) noexcept { if (this != &p) { @@ -121,6 +124,42 @@ namespace butl } #endif + // process::pipe + // + inline process::pipe:: + pipe (pipe&& p) noexcept + : in (p.in), out (p.out), own_in (p.own_in), own_out (p.own_out) + { + p.in = p.out = -1; + } + + inline process::pipe& process::pipe:: + operator= 
(pipe&& p) noexcept + { + if (this != &p) + { + int d (own_in ? in : own_out ? out : -1); + if (d != -1) + fdclose (d); + + in = p.in; + out = p.out; + own_in = p.own_in; + own_out = p.own_out; + + p.in = p.out = -1; + } + return *this; + } + + inline process::pipe:: + ~pipe () + { + int d (own_in ? in : own_out ? out : -1); + if (d != -1) + fdclose (d); + } + // process // #ifndef _WIN32 @@ -175,21 +214,37 @@ namespace butl inline process:: process (optional<process_exit> e) - : handle (0), - exit (std::move (e)), - out_fd (-1), - in_ofd (-1), - in_efd (-1) + : handle (0), exit (std::move (e)) + { + } + + inline process:: + process (const process_path& pp, const char* const* args, + int in, int out, int err, + const char* cwd, + const char* const* envvars) + : process (pp, args, + pipe (in, -1), pipe (-1, out), pipe (-1, err), + cwd, + envvars) + { + } + + inline process:: + process (const char** args, + int in, int out, int err, + const char* cwd, + const char* const* envvars) + : process (path_search (args[0]), args, in, out, err, cwd, envvars) { } inline process:: - process (const process_path& pp, const char* args[], + process (const process_path& pp, const std::vector<const char*>& args, int in, int out, int err, const char* cwd, const char* const* envvars) - : process (pp, - args, + : process (pp, args.data (), pipe (in, -1), pipe (-1, out), pipe (-1, err), cwd, envvars) @@ -197,32 +252,166 @@ namespace butl } inline process:: - process (const char* args[], + process (std::vector<const char*>& args, int in, int out, int err, const char* cwd, const char* const* envvars) - : process (path_search (args[0]), args, in, out, err, cwd, envvars) {} + : process (path_search (args[0]), args.data (), + in, out, err, + cwd, + envvars) + { + } + + inline process:: + process (const char** args, + pipe in, pipe out, pipe err, + const char* cwd, + const char* const* envvars) + : process (path_search (args[0]), args, + std::move (in), std::move (out), std::move (err), + cwd, 
envvars) + { + } + + inline process:: + process (const char** args, + int in, int out, pipe err, + const char* cwd, + const char* const* envvars) + : process (path_search (args[0]), args, + pipe (in, -1), pipe (-1, out), std::move (err), + cwd, envvars) + { + } + + inline process:: + process (const process_path& pp, const char* const* args, + int in, int out, pipe err, + const char* cwd, + const char* const* envvars) + : process (pp, args, + pipe (in, -1), pipe (-1, out), std::move (err), + cwd, + envvars) + { + } + + inline process:: + process (std::vector<const char*>& args, + pipe in, pipe out, pipe err, + const char* cwd, + const char* const* envvars) + : process (path_search (args[0]), args.data (), + std::move (in), std::move (out), std::move (err), + cwd, + envvars) + { + } + + inline process:: + process (std::vector<const char*>& args, + int in, int out, pipe err, + const char* cwd, + const char* const* envvars) + : process (path_search (args[0]), args.data (), + pipe (in, -1), pipe (-1, out), std::move (err), + cwd, + envvars) + { + } + + inline process:: + process (const process_path& pp, const std::vector<const char*>& args, + pipe in, pipe out, pipe err, + const char* cwd, + const char* const* envvars) + : process (pp, args.data (), + std::move (in), std::move (out), std::move (err), + cwd, + envvars) + { + } + + inline process:: + process (const process_path& pp, const std::vector<const char*>& args, + int in, int out, pipe err, + const char* cwd, + const char* const* envvars) + : process (pp, args.data (), + pipe (in, -1), pipe (-1, out), std::move (err), + cwd, + envvars) + { + } + + inline process:: + process (const process_path& pp, const char* const* args, + process& in, pipe out, pipe err, + const char* cwd, + const char* const* envvars) + : process (pp, args, + [&in] () + { + assert (in.in_ofd != nullfd); // Should be a pipe. 
+ return process::pipe (std::move (in.in_ofd), -1); + } (), + std::move (out), std::move (err), + cwd, envvars) + { + } inline process:: - process (const process_path& pp, const char* args[], + process (const process_path& pp, const char* const* args, process& in, int out, int err, const char* cwd, const char* const* envvars) - : process (pp, args, in.in_ofd.get (), out, err, cwd, envvars) + : process (pp, args, in, pipe (-1, out), pipe (-1, err), cwd, envvars) { - assert (in.in_ofd.get () != -1); // Should be a pipe. - in.in_ofd.reset (); // Close it on our side. } inline process:: - process (const char* args[], + process (const char** args, process& in, int out, int err, const char* cwd, const char* const* envvars) - : process (path_search (args[0]), args, in, out, err, cwd, envvars) {} + : process (path_search (args[0]), args, in, out, err, cwd, envvars) + { + } + + inline process:: + process (const char** args, + process& in, pipe out, pipe err, + const char* cwd, + const char* const* envvars) + : process (path_search (args[0]), args, + in, std::move (out), std::move (err), + cwd, envvars) + { + } + + inline process:: + process (const char** args, + process& in, int out, pipe err, + const char* cwd, + const char* const* envvars) + : process (path_search (args[0]), args, + in, pipe (-1, out), std::move (err), + cwd, envvars) + { + } + + inline process:: + process (const process_path& pp, const char* const* args, + process& in, int out, pipe err, + const char* cwd, + const char* const* envvars) + : process (pp, args, in, pipe (-1, out), std::move (err), cwd, envvars) + { + } inline process:: - process (process&& p) + process (process&& p) noexcept : handle (p.handle), exit (std::move (p.exit)), out_fd (std::move (p.out_fd)), @@ -233,7 +422,7 @@ namespace butl } inline process& process:: - operator= (process&& p) + operator= (process&& p) noexcept (false) { if (this != &p) { @@ -270,13 +459,13 @@ namespace butl // process_env // inline process_env:: - process_env 
(process_env&& e) + process_env (process_env&& e) noexcept { *this = std::move (e); } inline process_env& process_env:: - operator= (process_env&& e) + operator= (process_env&& e) noexcept { if (this != &e) { diff --git a/libbutl/project-name.cxx b/libbutl/project-name.cxx index 7a14b49..a7ed8a8 100644 --- a/libbutl/project-name.cxx +++ b/libbutl/project-name.cxx @@ -1,38 +1,16 @@ // file : libbutl/project-name.cxx -*- C++ -*- // license : MIT; see accompanying LICENSE file -#ifndef __cpp_modules_ts -#include <libbutl/project-name.mxx> -#endif +#include <libbutl/project-name.hxx> -#ifndef __cpp_lib_modules_ts #include <string> #include <vector> #include <utility> // move() #include <algorithm> // find() #include <stdexcept> // invalid_argument -#endif - -// Other includes. - -#ifdef __cpp_modules_ts -module butl.project_name; - -// Only imports additional to interface. -#ifdef __clang__ -#ifdef __cpp_lib_modules_ts -import std.core; -import std.io; -#endif -import butl.utility; -#endif - -import butl.path; // path::traits -import butl.utility; // alpha(), alnum() -#else -#include <libbutl/path.mxx> -#include <libbutl/utility.mxx> -#endif + +#include <libbutl/path.hxx> // path::traits +#include <libbutl/utility.hxx> // alpha(), alnum() using namespace std; diff --git a/libbutl/project-name.mxx b/libbutl/project-name.hxx index 1117e28..6e1f925 100644 --- a/libbutl/project-name.mxx +++ b/libbutl/project-name.hxx @@ -1,34 +1,17 @@ -// file : libbutl/project-name.mxx -*- C++ -*- +// file : libbutl/project-name.hxx -*- C++ -*- // license : MIT; see accompanying LICENSE file -#ifndef __cpp_modules_ts #pragma once -#endif -// C includes. - -#ifndef __cpp_lib_modules_ts #include <string> #include <utility> // move() #include <ostream> -#endif - -// Other includes. 
-#ifdef __cpp_modules_ts -export module butl.project_name; -#ifdef __cpp_lib_modules_ts -import std.core; -import std.io; -#endif -import butl.utility; // icasecmp(), sanitize_identifier() -#else -#include <libbutl/utility.mxx> -#endif +#include <libbutl/utility.hxx> // icasecmp(), sanitize_identifier() #include <libbutl/export.hxx> -LIBBUTL_MODEXPORT namespace butl +namespace butl { // Build system project name. // diff --git a/libbutl/prompt.cxx b/libbutl/prompt.cxx index 1c0820a..154522c 100644 --- a/libbutl/prompt.cxx +++ b/libbutl/prompt.cxx @@ -1,33 +1,11 @@ // file : libbutl/prompt.cxx -*- C++ -*- // license : MIT; see accompanying LICENSE file -#ifndef __cpp_modules_ts -#include <libbutl/prompt.mxx> -#endif - -#ifndef __cpp_lib_modules_ts -#include <string> +#include <libbutl/prompt.hxx> #include <iostream> -#endif - -// Other includes. - -#ifdef __cpp_modules_ts -module butl.prompt; - -// Only imports additional to interface. -#ifdef __clang__ -#ifdef __cpp_lib_modules_ts -import std.core; -import std.io; -#endif -#endif -import butl.diagnostics; -#else -#include <libbutl/diagnostics.mxx> // diag_stream -#endif +#include <libbutl/diagnostics.hxx> // diag_stream using namespace std; @@ -66,8 +44,8 @@ namespace butl if (!e) a = def; } - } while (a != "y" && a != "n"); + } while (a != "y" && a != "Y" && a != "n" && a != "N"); - return a == "y"; + return a == "y" || a == "Y"; } } diff --git a/libbutl/prompt.mxx b/libbutl/prompt.hxx index 2489b2f..2a07708 100644 --- a/libbutl/prompt.mxx +++ b/libbutl/prompt.hxx @@ -1,28 +1,13 @@ -// file : libbutl/prompt.mxx -*- C++ -*- +// file : libbutl/prompt.hxx -*- C++ -*- // license : MIT; see accompanying LICENSE file -#ifndef __cpp_modules_ts #pragma once -#endif -// C includes. - -#ifndef __cpp_lib_modules_ts #include <string> -#endif - -// Other includes. 
- -#ifdef __cpp_modules_ts -export module butl.prompt; -#ifdef __cpp_lib_modules_ts -import std.core; -#endif -#endif #include <libbutl/export.hxx> -LIBBUTL_MODEXPORT namespace butl +namespace butl { // The Y/N prompt. The def argument, if specified, should be either 'y' or // 'n'. It is used as the default answer, in case the user just hits enter. @@ -30,6 +15,10 @@ LIBBUTL_MODEXPORT namespace butl // Write the prompt to diag_stream. Throw ios_base::failure if no answer // could be extracted from stdin (for example, because it was closed). // + // Note that the implementation accepts both lower and upper case y/n as + // valid answers (apparently the capitalized default answer confuses some + // users into answering with capital letters). + // LIBBUTL_SYMEXPORT bool yn_prompt (const std::string&, char def = '\0'); } diff --git a/libbutl/regex.cxx b/libbutl/regex.cxx index 83e296c..34536f2 100644 --- a/libbutl/regex.cxx +++ b/libbutl/regex.cxx @@ -1,42 +1,17 @@ // file : libbutl/regex.cxx -*- C++ -*- // license : MIT; see accompanying LICENSE file -#ifndef __cpp_modules_ts -#include <libbutl/regex.mxx> -#endif - -// C includes. - -#ifndef __cpp_lib_modules_ts -#include <regex> -#include <string> +#include <libbutl/regex.hxx> #include <ostream> #include <sstream> #include <stdexcept> // runtime_error + #if defined(_MSC_VER) && _MSC_VER < 2000 # include <cstring> // strstr() #endif -#endif - -// Other includes. -#ifdef __cpp_modules_ts -module butl.regex; - -// Only imports additional to interface. 
-#ifdef __clang__ -#ifdef __cpp_lib_modules_ts -import std.core; -import std.io; -import std.regex; -#endif -#endif - -import butl.utility; // operator<<(ostream, exception) -#else -#include <libbutl/utility.mxx> -#endif +#include <libbutl/utility.hxx> // operator<<(ostream, exception) namespace std { diff --git a/libbutl/regex.mxx b/libbutl/regex.hxx index 84b024f..9b31075 100644 --- a/libbutl/regex.mxx +++ b/libbutl/regex.hxx @@ -1,22 +1,13 @@ -// file : libbutl/regex.mxx -*- C++ -*- +// file : libbutl/regex.hxx -*- C++ -*- // license : MIT; see accompanying LICENSE file -#ifndef __cpp_modules_ts #pragma once -#endif - -// C includes. -#ifndef __cpp_lib_modules_ts #include <regex> #include <iosfwd> #include <string> #include <utility> // pair - -#include <locale> #include <cstddef> // size_t -#include <utility> // move(), make_pair() -#endif #if defined(__clang__) # if __has_include(<__config>) @@ -24,20 +15,9 @@ # endif #endif -// Other includes. - -#ifdef __cpp_modules_ts -export module butl.regex; -#ifdef __cpp_lib_modules_ts -import std.core; -import std.io; -import std.regex; // @@ MOD TODO should probably be re-exported. -#endif -#endif - #include <libbutl/export.hxx> -LIBBUTL_MODEXPORT namespace butl +namespace butl { // The regex semantics for the following functions is like that of // std::regex_replace() extended the standard ECMA-262 substitution escape @@ -93,9 +73,54 @@ LIBBUTL_MODEXPORT namespace butl regex_replace_match (const std::basic_string<C>&, const std::basic_regex<C>&, const std::basic_string<C>& fmt); + + // As above but using match_results. 
+ // + template <typename C> + std::basic_string<C> + regex_replace_match_results ( + const std::match_results<typename std::basic_string<C>::const_iterator>&, + const std::basic_string<C>& fmt); + + template <typename C> + std::basic_string<C> + regex_replace_match_results ( + const std::match_results<typename std::basic_string<C>::const_iterator>&, + const C* fmt, std::size_t fmt_n); + + // Parse the '/<regex>/<format>/' replacement string into the regex/format + // pair. Other character can be used as a delimiter instead of '/'. Throw + // std::invalid_argument or std::regex_error on parsing error. + // + // Note: escaping of the delimiter character is not (yet) supported. + // + template <typename C> + std::pair<std::basic_regex<C>, std::basic_string<C>> + regex_replace_parse (const std::basic_string<C>&, + std::regex_constants::syntax_option_type = + std::regex_constants::ECMAScript); + + template <typename C> + std::pair<std::basic_regex<C>, std::basic_string<C>> + regex_replace_parse (const C*, + std::regex_constants::syntax_option_type = + std::regex_constants::ECMAScript); + + template <typename C> + std::pair<std::basic_regex<C>, std::basic_string<C>> + regex_replace_parse (const C*, size_t, + std::regex_constants::syntax_option_type = + std::regex_constants::ECMAScript); + + // As above but return string instead of regex and do not fail if there is + // text after the last delimiter instead returning its position. + // + template <typename C> + std::pair<std::basic_string<C>, std::basic_string<C>> + regex_replace_parse (const C*, size_t, size_t& end); } -LIBBUTL_MODEXPORT namespace std +namespace std { // Print regex error description but only if it is meaningful (this is also // why we have to print leading colon). 
diff --git a/libbutl/regex.ixx b/libbutl/regex.ixx index dec15d1..08962cf 100644 --- a/libbutl/regex.ixx +++ b/libbutl/regex.ixx @@ -1,7 +1,9 @@ // file : libbutl/regex.ixx -*- C++ -*- // license : MIT; see accompanying LICENSE file -LIBBUTL_MODEXPORT namespace butl //@@ MOD Clang needs this for some reason. +#include <utility> // move(), make_pair() + +namespace butl { template <typename C> inline std::pair<std::basic_string<C>, bool> @@ -21,4 +23,30 @@ LIBBUTL_MODEXPORT namespace butl //@@ MOD Clang needs this for some reason. return make_pair (move (r), match); } + + template <typename C> + inline std::pair<std::basic_regex<C>, std::basic_string<C>> + regex_replace_parse (const std::basic_string<C>& s, + std::regex_constants::syntax_option_type f) + { + return regex_replace_parse (s.c_str (), s.size (), f); + } + + template <typename C> + inline std::pair<std::basic_regex<C>, std::basic_string<C>> + regex_replace_parse (const C* s, + std::regex_constants::syntax_option_type f) + { + return regex_replace_parse ( + s, std::basic_string<C>::traits_type::length (s), f); + } + + template <typename C> + inline std::basic_string<C> + regex_replace_match_results ( + const std::match_results<typename std::basic_string<C>::const_iterator>& m, + const std::basic_string<C>& fmt) + { + return regex_replace_match_results (m, fmt.c_str (), fmt.size ()); + } } diff --git a/libbutl/regex.txx b/libbutl/regex.txx index b785708..214d949 100644 --- a/libbutl/regex.txx +++ b/libbutl/regex.txx @@ -1,15 +1,16 @@ // file : libbutl/regex.txx -*- C++ -*- // license : MIT; see accompanying LICENSE file -LIBBUTL_MODEXPORT namespace butl //@@ MOD Clang needs this for some reason. +#include <locale> +#include <stdexcept> // invalid_argument + +namespace butl { - // Replace the regex match results using the format string. 
- // template <typename C> std::basic_string<C> regex_replace_match_results ( const std::match_results<typename std::basic_string<C>::const_iterator>& m, - const std::basic_string<C>& fmt) + const C* fmt, std::size_t n) { using namespace std; @@ -60,7 +61,6 @@ LIBBUTL_MODEXPORT namespace butl //@@ MOD Clang needs this for some reason. } }; - size_t n (fmt.size ()); for (size_t i (0); i < n; ++i) { C c (fmt[i]); @@ -278,4 +278,71 @@ LIBBUTL_MODEXPORT namespace butl //@@ MOD Clang needs this for some reason. return match; } + + template <typename C> + std::pair<std::basic_regex<C>, std::basic_string<C>> + regex_replace_parse (const C* s, size_t n, + std::regex_constants::syntax_option_type f) + { + using namespace std; + + using string_type = basic_string<C>; + + size_t e; + pair<string_type, string_type> r (regex_replace_parse (s, n, e)); + + if (e != n) + throw invalid_argument ("junk after trailing delimiter"); + + return make_pair (basic_regex<C> (r.first, f), move (r.second)); + } + + template <typename C> + std::pair<std::basic_string<C>, std::basic_string<C>> + regex_replace_parse (const C* s, size_t n, size_t& e) + { + using namespace std; + + using string_type = basic_string<C>; + + if (n == 0) + throw invalid_argument ("no leading delimiter"); + + const C* b (s); // Save the beginning of the string. + + char delim (s[0]); + + // Position to the regex first character and find the regex-terminating + // delimiter. + // + --n; + ++s; + + const C* p (string_type::traits_type::find (s, n, delim)); + + if (p == nullptr) + throw invalid_argument ("no delimiter after regex"); + + // Empty regex matches nothing, so not of much use. + // + if (p == s) + throw invalid_argument ("empty regex"); + + // Save the regex. + // + string_type re (s, p - s); + + // Position to the format first character and find the trailing delimiter. 
+ // + n -= p - s + 1; + s = p + 1; + + p = string_type::traits_type::find (s, n, delim); + + if (p == nullptr) + throw invalid_argument ("no delimiter after replacement"); + + e = p - b + 1; + return make_pair (move (re), string_type (s, p - s)); + } } diff --git a/libbutl/semantic-version.cxx b/libbutl/semantic-version.cxx index eaf709d..9e0a1ef 100644 --- a/libbutl/semantic-version.cxx +++ b/libbutl/semantic-version.cxx @@ -1,39 +1,12 @@ // file : libbutl/semantic-version.cxx -*- C++ -*- // license : MIT; see accompanying LICENSE file -#ifndef __cpp_modules_ts -#include <libbutl/semantic-version.mxx> -#endif +#include <libbutl/semantic-version.hxx> #include <cassert> - -#ifndef __cpp_lib_modules_ts -#include <string> -#include <cstddef> -#include <cstdint> -#include <ostream> - #include <cstring> // strchr() -#include <cstdlib> // strtoull() #include <utility> // move() #include <stdexcept> // invalid_argument -#endif - -// Other includes. - -#ifdef __cpp_modules_ts -module butl.semantic_version; - -// Only imports additional to interface. 
-#ifdef __clang__ -#ifdef __cpp_lib_modules_ts -import std.core; -import std.io; -#endif -import butl.optional; -#endif -#else -#endif using namespace std; @@ -80,9 +53,9 @@ namespace butl } semantic_version:: - semantic_version (const std::string& s, size_t p, const char* bs) + semantic_version (const std::string& s, size_t p, flags fs, const char* bs) { - semantic_version_result r (parse_semantic_version_impl (s, p, bs)); + semantic_version_result r (parse_semantic_version_impl (s, p, fs, bs)); if (r.version) *this = move (*r.version); @@ -98,8 +71,27 @@ namespace butl uint64_t min = 0, uint64_t max = uint64_t (~0)); semantic_version_result - parse_semantic_version_impl (const string& s, size_t p, const char* bs) + parse_semantic_version_impl (const string& s, size_t p, + semantic_version::flags fs, + const char* bs) { + bool allow_build ((fs & semantic_version::allow_build) != 0); + + // If build separators are specified, then the allow_build flag must be + // specified explicitly. + // + assert (bs == nullptr || allow_build); + + if (allow_build && bs == nullptr) + bs = "-+"; + + bool require_minor ((fs & semantic_version::allow_omit_minor) == 0); + + if (!require_minor) + fs |= semantic_version::allow_omit_patch; + + bool require_patch ((fs & semantic_version::allow_omit_patch) == 0); + auto bail = [] (string m) { return semantic_version_result {nullopt, move (m)}; @@ -110,31 +102,47 @@ namespace butl if (!parse_uint64 (s, p, r.major)) return bail ("invalid major version"); - if (s[p] != '.') - return bail ("'.' expected after major version"); - - if (!parse_uint64 (s, ++p, r.minor)) - return bail ("invalid minor version"); - - if (s[p] == '.') + if (s[p] == '.') // Is there a minor version? { - // Treat it as build if failed to parse as patch (e.g., 1.2.alpha). + // Try to parse the minor version and treat it as build on failure + // (e.g., 1.alpha). 
// - if (!parse_uint64 (s, ++p, r.patch)) + if (parse_uint64 (s, ++p, r.minor)) + { + if (s[p] == '.') // Is there a patch version? + { + // Try to parse the patch version and treat it as build on failure + // (e.g., 1.2.alpha). + // + if (parse_uint64 (s, ++p, r.patch)) + ; + else + { + if (require_patch) + return bail ("invalid patch version"); + + --p; + // Fall through. + } + } + else if (require_patch) + return bail ("'.' expected after minor version"); + } + else { - //if (require_patch) - // return bail ("invalid patch version"); + if (require_minor) + return bail ("invalid minor version"); --p; // Fall through. } } - //else if (require_patch) - // return bail ("'.' expected after minor version"); + else if (require_minor) + return bail ("'.' expected after major version"); if (char c = s[p]) { - if (bs == nullptr || (*bs != '\0' && strchr (bs, c) == nullptr)) + if (!allow_build || (*bs != '\0' && strchr (bs, c) == nullptr)) return bail ("junk after version"); r.build.assign (s, p, string::npos); diff --git a/libbutl/semantic-version.mxx b/libbutl/semantic-version.hxx index 566d192..4eba38a 100644 --- a/libbutl/semantic-version.mxx +++ b/libbutl/semantic-version.hxx @@ -1,32 +1,15 @@ -// file : libbutl/semantic-version.mxx -*- C++ -*- +// file : libbutl/semantic-version.hxx -*- C++ -*- // license : MIT; see accompanying LICENSE file -#ifndef __cpp_modules_ts #pragma once -#endif - -// C includes. -#ifndef __cpp_lib_modules_ts #include <string> #include <cstddef> // size_t #include <cstdint> // uint*_t #include <utility> // move() #include <ostream> -#endif -// Other includes. 
- -#ifdef __cpp_modules_ts -export module butl.semantic_version; -#ifdef __cpp_lib_modules_ts -import std.core; -import std.io; -#endif -import butl.optional; -#else -#include <libbutl/optional.mxx> -#endif +#include <libbutl/optional.hxx> #include <libbutl/export.hxx> @@ -40,19 +23,13 @@ import butl.optional; # undef minor #endif -LIBBUTL_MODEXPORT namespace butl +namespace butl { // Semantic or semantic-like version. // - // <major>.<minor>[.<patch>][<build>] + // <major>[.<minor>[.<patch>]][<build>] // - // If the patch component is absent, then it defaults to 0. - // - // @@ Currently there is no way to enforce the three-component version. - // Supporting this will require changing allow_build to a bit-wise - // flag. See parse_semantic_version_impl() for some sketched code. - // We may also want to pass these flags to string() to not print - // 0 patch. + // If the minor and patch components are absent, then they default to 0. // // By default, a version containing the <build> component is considered // valid only if separated from <patch> with '-' (semver pre-release) or '+' @@ -80,23 +57,36 @@ LIBBUTL_MODEXPORT namespace butl std::uint64_t patch, std::string build = ""); - // The build_separators argument can be NULL (no build component allowed), - // empty (any build component allowed), or a string of characters to allow - // as separators. When allow_build is true build_separators defaults to - // "-+". + // If the allow_build flag is specified, then build_separators argument + // can be a string of characters to allow as separators, empty (any build + // component allowed), or NULL (defaults to "-+"). // - explicit - semantic_version (const std::string&, bool allow_build = true); + // Note: allow_omit_minor implies allow_omit_patch. + // + enum flags + { + none = 0, // Exact <major>.<minor>.<patch> form. + allow_omit_minor = 0x01, // Allow <major> form. + allow_omit_patch = 0x02, // Allow <major>.<minor> form. 
+ allow_build = 0x04, // Allow <major>.<minor>.<patch>-<build> form. + }; - semantic_version (const std::string&, const char* build_separators); + explicit + semantic_version (const std::string&, + flags = none, + const char* build_separators = nullptr); // As above but parse from the specified position until the end of the // string. // - semantic_version (const std::string&, std::size_t pos, bool = true); - - semantic_version (const std::string&, std::size_t pos, const char*); + semantic_version (const std::string&, + std::size_t pos, + flags = none, + const char* = nullptr); + // @@ We may also want to pass allow_* flags not to print 0 minor/patch or + // maybe invent ignore_* flags. + // std::string string (bool ignore_build = false) const; @@ -133,16 +123,15 @@ LIBBUTL_MODEXPORT namespace butl // Try to parse a string as a semantic version returning nullopt if invalid. // optional<semantic_version> - parse_semantic_version (const std::string&, bool allow_build = true); - - optional<semantic_version> - parse_semantic_version (const std::string&, const char* build_separators); - - optional<semantic_version> - parse_semantic_version (const std::string&, std::size_t pos, bool = true); + parse_semantic_version (const std::string&, + semantic_version::flags = semantic_version::none, + const char* build_separators = nullptr); optional<semantic_version> - parse_semantic_version (const std::string&, std::size_t pos, const char*); + parse_semantic_version (const std::string&, + std::size_t pos, + semantic_version::flags = semantic_version::none, + const char* = nullptr); // NOTE: comparison operators take the build component into account. 
// @@ -187,6 +176,18 @@ LIBBUTL_MODEXPORT namespace butl { return o << x.string (); } + + semantic_version::flags + operator& (semantic_version::flags, semantic_version::flags); + + semantic_version::flags + operator| (semantic_version::flags, semantic_version::flags); + + semantic_version::flags + operator&= (semantic_version::flags&, semantic_version::flags); + + semantic_version::flags + operator|= (semantic_version::flags&, semantic_version::flags); } #include <libbutl/semantic-version.ixx> diff --git a/libbutl/semantic-version.ixx b/libbutl/semantic-version.ixx index 6bf7584..8de1554 100644 --- a/libbutl/semantic-version.ixx +++ b/libbutl/semantic-version.ixx @@ -15,23 +15,9 @@ namespace butl { } - // Note: the order is important to MinGW GCC (DLL linkage). - // inline semantic_version:: - semantic_version (const std::string& s, std::size_t p, bool ab) - : semantic_version (s, p, ab ? "-+" : nullptr) - { - } - - inline semantic_version:: - semantic_version (const std::string& s, const char* bs) - : semantic_version (s, 0, bs) - { - } - - inline semantic_version:: - semantic_version (const std::string& s, bool ab) - : semantic_version (s, ab ? "-+" : nullptr) + semantic_version (const std::string& s, flags fs, const char* bs) + : semantic_version (s, 0, fs, bs) { } @@ -42,29 +28,53 @@ namespace butl }; LIBBUTL_SYMEXPORT semantic_version_result - parse_semantic_version_impl (const std::string&, std::size_t, const char*); + parse_semantic_version_impl (const std::string&, + std::size_t, + semantic_version::flags, + const char*); inline optional<semantic_version> - parse_semantic_version (const std::string& s, bool ab) + parse_semantic_version (const std::string& s, + semantic_version::flags fs, + const char* bs) { - return parse_semantic_version (s, ab ? 
"-+" : nullptr); + return parse_semantic_version_impl (s, 0, fs, bs).version; } inline optional<semantic_version> - parse_semantic_version (const std::string& s, const char* bs) + parse_semantic_version (const std::string& s, + std::size_t p, + semantic_version::flags fs, + const char* bs) { - return parse_semantic_version_impl (s, 0, bs).version; + return parse_semantic_version_impl (s, p, fs, bs).version; } - inline optional<semantic_version> - parse_semantic_version (const std::string& s, std::size_t p, bool ab) + inline semantic_version::flags + operator&= (semantic_version::flags& x, semantic_version::flags y) { - return parse_semantic_version (s, p, ab ? "-+" : nullptr); + return x = static_cast<semantic_version::flags> ( + static_cast<std::uint16_t> (x) & + static_cast<std::uint16_t> (y)); } - inline optional<semantic_version> - parse_semantic_version (const std::string& s, std::size_t p, const char* bs) + inline semantic_version::flags + operator|= (semantic_version::flags& x, semantic_version::flags y) + { + return x = static_cast<semantic_version::flags> ( + static_cast<std::uint16_t> (x) | + static_cast<std::uint16_t> (y)); + } + + inline semantic_version::flags + operator& (semantic_version::flags x, semantic_version::flags y) + { + return x &= y; + } + + inline semantic_version::flags + operator| (semantic_version::flags x, semantic_version::flags y) { - return parse_semantic_version_impl (s, p, bs).version; + return x |= y; } } diff --git a/libbutl/sendmail.cxx b/libbutl/sendmail.cxx index 1038cf4..5fec1a6 100644 --- a/libbutl/sendmail.cxx +++ b/libbutl/sendmail.cxx @@ -1,32 +1,7 @@ // file : libbutl/sendmail.cxx -*- C++ -*- // license : MIT; see accompanying LICENSE file -#ifndef __cpp_modules_ts -#include <libbutl/sendmail.mxx> -#endif - -// C includes. - -#ifndef __cpp_lib_modules_ts -#include <string> -#endif - -// Other includes. - -#ifdef __cpp_modules_ts -module butl.sendmail; - -// Only imports additional to interface. 
-#ifdef __clang__ -#ifdef __cpp_lib_modules_ts -import std.core; -#endif -import butl.process; -import butl.fdstream; -import butl.small_vector; -#endif - -#endif +#include <libbutl/sendmail.hxx> using namespace std; diff --git a/libbutl/sendmail.mxx b/libbutl/sendmail.hxx index 0d5b239..97a4d82 100644 --- a/libbutl/sendmail.mxx +++ b/libbutl/sendmail.hxx @@ -1,38 +1,17 @@ -// file : libbutl/sendmail.mxx -*- C++ -*- +// file : libbutl/sendmail.hxx -*- C++ -*- // license : MIT; see accompanying LICENSE file -#ifndef __cpp_modules_ts #pragma once -#endif -// C includes. - -#ifndef __cpp_lib_modules_ts #include <string> -#include <cstddef> // size_t -#include <utility> // move(), forward() -#endif - -// Other includes. - -#ifdef __cpp_modules_ts -export module butl.sendmail; -#ifdef __cpp_lib_modules_ts -import std.core; -#endif -import butl.process; -import butl.fdstream; -import butl.small_vector; -#else -#include <libbutl/process.mxx> -#include <libbutl/fdstream.mxx> -#include <libbutl/small-vector.mxx> -#endif +#include <libbutl/process.hxx> +#include <libbutl/fdstream.hxx> +#include <libbutl/small-vector.hxx> #include <libbutl/export.hxx> -LIBBUTL_MODEXPORT namespace butl +namespace butl { // Send email using the sendmail(1) program. // diff --git a/libbutl/sendmail.ixx b/libbutl/sendmail.ixx index 105c1af..35b5c47 100644 --- a/libbutl/sendmail.ixx +++ b/libbutl/sendmail.ixx @@ -1,7 +1,10 @@ // file : libbutl/sendmail.ixx -*- C++ -*- // license : MIT; see accompanying LICENSE file -LIBBUTL_MODEXPORT namespace butl //@@ MOD Clang needs this for some reason. +#include <cstddef> // size_t +#include <utility> // move(), forward() + +namespace butl { template <typename E, typename... O> inline sendmail:: diff --git a/libbutl/sha1.c b/libbutl/sha1.c index 37e862e..98fce5e 100644 --- a/libbutl/sha1.c +++ b/libbutl/sha1.c @@ -121,11 +121,17 @@ main () #include <string.h> +/* Assume if bzero/bcopy are defined as macros, then they do what we need. 
*/ + /* void bzero(void *s, size_t n); */ -#define bzero(s, n) memset((s), 0, (n)) +#ifndef bzero +# define bzero(s, n) memset((s), 0, (n)) +#endif /* void bcopy(const void *s1, void *s2, size_t n); */ -#define bcopy(s1, s2, n) memmove((s2), (s1), (n)) +#ifndef bcopy +# define bcopy(s1, s2, n) memmove((s2), (s1), (n)) +#endif /* The rest is the unmodified (except for adjusting function declarations and adding a few explicit casts to make compilable in C++ without warnings) diff --git a/libbutl/sha1.cxx b/libbutl/sha1.cxx index 6a5e9db..e546922 100644 --- a/libbutl/sha1.cxx +++ b/libbutl/sha1.cxx @@ -1,9 +1,7 @@ // file : libbutl/sha1.cxx -*- C++ -*- // license : MIT; see accompanying LICENSE file -#ifndef __cpp_modules_ts -#include <libbutl/sha1.mxx> -#endif +#include <libbutl/sha1.hxx> // C interface for sha1c. // @@ -42,29 +40,9 @@ extern "C" #define SHA1_Final(x, y) sha1_result((y), (char(&)[20])(x)) #include <cassert> +#include <istream> -#ifndef __cpp_lib_modules_ts -#include <string> -#include <cstddef> -#include <cstdint> -#endif - -// Other includes. - -#ifdef __cpp_modules_ts -module butl.sha1; - -// Only imports additional to interface. 
-#ifdef __clang__ -#ifdef __cpp_lib_modules_ts -import std.core; -#endif -#endif - -import butl.fdstream; -#else -#include <libbutl/fdstream.mxx> -#endif +#include <libbutl/bufstreambuf.hxx> using namespace std; @@ -91,12 +69,12 @@ namespace butl } void sha1:: - append (ifdstream& is) + append (istream& is) { - fdbuf* buf (dynamic_cast<fdbuf*> (is.rdbuf ())); + bufstreambuf* buf (dynamic_cast<bufstreambuf*> (is.rdbuf ())); assert (buf != nullptr); - while (is.peek () != ifdstream::traits_type::eof () && is.good ()) + while (is.peek () != istream::traits_type::eof () && is.good ()) { size_t n (buf->egptr () - buf->gptr ()); append (buf->gptr (), n); diff --git a/libbutl/sha1.mxx b/libbutl/sha1.hxx index 07c469c..62710f4 100644 --- a/libbutl/sha1.mxx +++ b/libbutl/sha1.hxx @@ -1,34 +1,18 @@ -// file : libbutl/sha1.mxx -*- C++ -*- +// file : libbutl/sha1.hxx -*- C++ -*- // license : MIT; see accompanying LICENSE file -#ifndef __cpp_modules_ts #pragma once -#endif -// C includes. - -#ifndef __cpp_lib_modules_ts +#include <iosfwd> // istream #include <string> -#include <cstddef> // size_t +#include <cstddef> // size_t #include <cstdint> -#include <cstring> // strlen() -#endif - -// Other includes. - -#ifdef __cpp_modules_ts -export module butl.sha1; -#ifdef __cpp_lib_modules_ts -import std.core; -#endif -#endif +#include <cstring> // strlen() #include <libbutl/export.hxx> -LIBBUTL_MODEXPORT namespace butl +namespace butl { - class ifdstream; - // SHA1 checksum calculator. // // For a single chunk of data a sum can be obtained in one line, for @@ -67,11 +51,14 @@ LIBBUTL_MODEXPORT namespace butl // Append stream. // + // Note that currently the stream is expected to be bufstreambuf-based + // (e.g., ifdstream). + // void - append (ifdstream&); + append (std::istream&); explicit - sha1 (ifdstream& i): sha1 () {append (i);} + sha1 (std::istream& i): sha1 () {append (i);} // Check if any data has been hashed. 
// diff --git a/libbutl/sha256.cxx b/libbutl/sha256.cxx index 2528693..95987ec 100644 --- a/libbutl/sha256.cxx +++ b/libbutl/sha256.cxx @@ -1,9 +1,7 @@ // file : libbutl/sha256.cxx -*- C++ -*- // license : MIT; see accompanying LICENSE file -#ifndef __cpp_modules_ts -#include <libbutl/sha256.mxx> -#endif +#include <libbutl/sha256.hxx> // C interface for sha256c. // @@ -26,39 +24,13 @@ extern "C" #include "sha256c.c" } -#include <cassert> - -#ifndef __cpp_lib_modules_ts -#include <string> -#include <cstddef> -#include <cstdint> - #include <cctype> // isxdigit() +#include <cassert> +#include <istream> #include <stdexcept> // invalid_argument -#endif - -// Other includes. - -#ifdef __cpp_modules_ts -module butl.sha256; - -// Only imports additional to interface. -#ifdef __cpp_lib_modules_ts -import std.io; -#endif - -#ifdef __clang__ -#ifdef __cpp_lib_modules_ts -import std.core; -#endif -#endif -import butl.utility; // *case() -import butl.fdstream; -#else -#include <libbutl/utility.mxx> -#include <libbutl/fdstream.mxx> -#endif +#include <libbutl/utility.hxx> // *case() +#include <libbutl/bufstreambuf.hxx> using namespace std; @@ -85,12 +57,12 @@ namespace butl } void sha256:: - append (ifdstream& is) + append (istream& is) { - fdbuf* buf (dynamic_cast<fdbuf*> (is.rdbuf ())); + bufstreambuf* buf (dynamic_cast<bufstreambuf*> (is.rdbuf ())); assert (buf != nullptr); - while (is.peek () != ifdstream::traits_type::eof () && is.good ()) + while (is.peek () != istream::traits_type::eof () && is.good ()) { size_t n (buf->egptr () - buf->gptr ()); append (buf->gptr (), n); diff --git a/libbutl/sha256.mxx b/libbutl/sha256.hxx index 9bc0971..566068f 100644 --- a/libbutl/sha256.mxx +++ b/libbutl/sha256.hxx @@ -1,35 +1,19 @@ -// file : libbutl/sha256.mxx -*- C++ -*- +// file : libbutl/sha256.hxx -*- C++ -*- // license : MIT; see accompanying LICENSE file -#ifndef __cpp_modules_ts #pragma once -#endif -// C includes. 
- -#ifndef __cpp_lib_modules_ts #include <string> +#include <iosfwd> // istream #include <cstddef> // size_t #include <cstdint> #include <cstring> // strlen(), memcpy() #include <type_traits> // enable_if, is_integral -#endif - -// Other includes. - -#ifdef __cpp_modules_ts -export module butl.sha256; -#ifdef __cpp_lib_modules_ts -import std.core; -#endif -#endif #include <libbutl/export.hxx> -LIBBUTL_MODEXPORT namespace butl +namespace butl { - class ifdstream; - // SHA256 checksum calculator. // // For a single chunk of data a sum can be obtained in one line, for @@ -101,11 +85,14 @@ LIBBUTL_MODEXPORT namespace butl // Append stream. // + // Note that currently the stream is expected to be bufstreambuf-based + // (e.g., ifdstream). + // void - append (ifdstream&); + append (std::istream&); explicit - sha256 (ifdstream& i): sha256 () {append (i);} + sha256 (std::istream& i): sha256 () {append (i);} // Check if any data has been hashed. // diff --git a/libbutl/small-allocator.mxx b/libbutl/small-allocator.hxx index 5ef74be..429ba41 100644 --- a/libbutl/small-allocator.mxx +++ b/libbutl/small-allocator.hxx @@ -1,30 +1,16 @@ -// file : libbutl/small-allocator.mxx -*- C++ -*- +// file : libbutl/small-allocator.hxx -*- C++ -*- // license : MIT; see accompanying LICENSE file -#ifndef __cpp_modules_ts #pragma once -#endif #include <cassert> - -#ifndef __cpp_lib_modules_ts #include <cstddef> // size_t #include <utility> // move() #include <type_traits> // true_type, is_same -#endif - -// Other includes. - -#ifdef __cpp_modules_ts -export module butl.small_allocator; -#ifdef __cpp_lib_modules_ts -import std.core; -#endif -#endif #include <libbutl/export.hxx> -LIBBUTL_MODEXPORT namespace butl +namespace butl { // Implementation of the allocator (and its buffer) for small containers. 
// diff --git a/libbutl/small-forward-list.mxx b/libbutl/small-forward-list.hxx index 6aa4986..8d1cf68 100644 --- a/libbutl/small-forward-list.mxx +++ b/libbutl/small-forward-list.hxx @@ -1,31 +1,18 @@ -// file : libbutl/small-forward-list.mxx -*- C++ -*- +// file : libbutl/small-forward-list.hxx -*- C++ -*- // license : MIT; see accompanying LICENSE file -#ifndef __cpp_modules_ts #pragma once -#endif -#ifndef __cpp_lib_modules_ts #include <cstddef> // size_t #include <utility> // move() +#include <type_traits> // is_nothrow_move_constructible #include <forward_list> -#endif -// Other includes. - -#ifdef __cpp_modules_ts -export module butl.small_forward_list; -#ifdef __cpp_lib_modules_ts -import std.core; -#endif -import butl.small_allocator; -#else -#include <libbutl/small-allocator.mxx> -#endif +#include <libbutl/small-allocator.hxx> #include <libbutl/export.hxx> -LIBBUTL_MODEXPORT namespace butl +namespace butl { // Issues and limitations. // @@ -115,14 +102,20 @@ LIBBUTL_MODEXPORT namespace butl return *this; } + // See small_vector for the move-constructor/assignment noexept + // expressions reasoning. + // small_forward_list (small_forward_list&& v) +#if !defined(_MSC_VER) || _MSC_VER > 1900 + noexcept (std::is_nothrow_move_constructible<T>::value) +#endif : base_type (allocator_type (this)) { *this = std::move (v); // Delegate to operator=(&&). } small_forward_list& - operator= (small_forward_list&& v) + operator= (small_forward_list&& v) noexcept (false) { // VC14's implementation of operator=(&&) swaps pointers without regard // for allocator (fixed in 15). 
diff --git a/libbutl/small-list.mxx b/libbutl/small-list.hxx index ff62192..7cb51fd 100644 --- a/libbutl/small-list.mxx +++ b/libbutl/small-list.hxx @@ -1,31 +1,18 @@ -// file : libbutl/small-list.mxx -*- C++ -*- +// file : libbutl/small-list.hxx -*- C++ -*- // license : MIT; see accompanying LICENSE file -#ifndef __cpp_modules_ts #pragma once -#endif -#ifndef __cpp_lib_modules_ts #include <list> #include <cstddef> // size_t #include <utility> // move() -#endif +#include <type_traits> // is_nothrow_move_constructible -// Other includes. - -#ifdef __cpp_modules_ts -export module butl.small_list; -#ifdef __cpp_lib_modules_ts -import std.core; -#endif -import butl.small_allocator; -#else -#include <libbutl/small-allocator.mxx> -#endif +#include <libbutl/small-allocator.hxx> #include <libbutl/export.hxx> -LIBBUTL_MODEXPORT namespace butl +namespace butl { // Issues and limitations. // @@ -117,14 +104,20 @@ LIBBUTL_MODEXPORT namespace butl return *this; } + // See small_vector for the move-constructor/assignment noexept + // expressions reasoning. + // small_list (small_list&& v) +#if !defined(__GLIBCXX__) && (!defined(_MSC_VER) || _MSC_VER > 1900) + noexcept (std::is_nothrow_move_constructible<T>::value) +#endif : base_type (allocator_type (this)) { *this = std::move (v); // Delegate to operator=(&&). } small_list& - operator= (small_list&& v) + operator= (small_list&& v) noexcept (false) { // libstdc++'s implementation prior to GCC 6 is broken (calls swap()). // Since there is no easy way to determine this library's version, for @@ -136,7 +129,7 @@ LIBBUTL_MODEXPORT namespace butl #if defined(__GLIBCXX__) || (defined(_MSC_VER) && _MSC_VER <= 1900) this->clear (); for (T& x: v) - this->push_back (std::move (x)); + this->push_back (std::move (x)); // Note: can throw bad_alloc. 
v.clear (); #else // Note: propagate_on_container_move_assignment = false diff --git a/libbutl/small-vector-odb.hxx b/libbutl/small-vector-odb.hxx index af9d96c..289ca38 100644 --- a/libbutl/small-vector-odb.hxx +++ b/libbutl/small-vector-odb.hxx @@ -5,7 +5,7 @@ #include <odb/pre.hxx> -#include <libbutl/small-vector.mxx> +#include <libbutl/small-vector.hxx> #include <odb/container-traits.hxx> diff --git a/libbutl/small-vector.mxx b/libbutl/small-vector.hxx index 2a92182..44a3ef5 100644 --- a/libbutl/small-vector.mxx +++ b/libbutl/small-vector.hxx @@ -1,31 +1,18 @@ -// file : libbutl/small-vector.mxx -*- C++ -*- +// file : libbutl/small-vector.hxx -*- C++ -*- // license : MIT; see accompanying LICENSE file -#ifndef __cpp_modules_ts #pragma once -#endif -#ifndef __cpp_lib_modules_ts #include <vector> #include <cstddef> // size_t #include <utility> // move() -#endif - -// Other includes. +#include <type_traits> // is_nothrow_move_constructible -#ifdef __cpp_modules_ts -export module butl.small_vector; -#ifdef __cpp_lib_modules_ts -import std.core; -#endif -import butl.small_allocator; -#else -#include <libbutl/small-allocator.mxx> -#endif +#include <libbutl/small-allocator.hxx> #include <libbutl/export.hxx> -LIBBUTL_MODEXPORT namespace butl +namespace butl { // Issues and limitations. // @@ -38,6 +25,9 @@ LIBBUTL_MODEXPORT namespace butl // // - swap() is deleted (see notes below). // + // - In contrast to std::vector, the references, pointers, and iterators + // referring to elements are invalidated after moving from it. + // template <typename T, std::size_t N> class small_vector: private small_allocator_buffer<T, N>, public std::vector<T, small_allocator<T, N>> @@ -118,17 +108,46 @@ LIBBUTL_MODEXPORT namespace butl return *this; } + // Note that while the move constructor is implemented via the move + // assignment it may not throw if the value type is no-throw move + // constructible. 
+ // + // Specifically, if v.size() > N then allocators evaluate as equal and the + // buffer ownership is transferred. Otherwise, the allocators do not + // evaluate as equal and the individual elements are move-constructed in + // the preallocated buffer. + // + // Also note that this constructor ends up calling + // base_type::operator=(base_type&&) whose noexcept expression evaluates + // to false (propagate_on_container_move_assignment and is_always_equal + // are false for small_allocator; see std::vector documentation for + // details). We, however, assume that the noexcept expression we use here + // is strict enough for all "sane" std::vector implementations since + // small_allocator never throws directly. + // small_vector (small_vector&& v) + noexcept (std::is_nothrow_move_constructible<T>::value) : base_type (allocator_type (this)) { if (v.size () <= N) reserve (); *this = std::move (v); // Delegate to operator=(&&). + + // Note that in contrast to the move assignment operator, the + // constructor must clear the other vector. + // + v.clear (); } + // Note that when size() <= N and v.size() > N, then allocators of this + // and other containers do not evaluate as equal. Thus, the memory for the + // new elements is allocated on the heap and so std::bad_alloc can be + // thrown. @@ TODO: maybe we could re-implement this case in terms of + // swap()? + // small_vector& - operator= (small_vector&& v) + operator= (small_vector&& v) noexcept (false) { // VC's implementation of operator=(&&) (both 14 and 15) frees the // memory and then reallocated with capacity equal to v.size(). 
This is diff --git a/libbutl/standard-version.cxx b/libbutl/standard-version.cxx index a9f5eb8..36f4830 100644 --- a/libbutl/standard-version.cxx +++ b/libbutl/standard-version.cxx @@ -1,41 +1,14 @@ // file : libbutl/standard-version.cxx -*- C++ -*- // license : MIT; see accompanying LICENSE file -#ifndef __cpp_modules_ts -#include <libbutl/standard-version.mxx> -#endif +#include <libbutl/standard-version.hxx> #include <cassert> - -#ifndef __cpp_lib_modules_ts -#include <string> -#include <cstdint> -#include <cstddef> -#include <ostream> - #include <cstdlib> // strtoull() #include <utility> // move() #include <stdexcept> // invalid_argument -#endif - -// Other includes. - -#ifdef __cpp_modules_ts -module butl.standard_version; - -// Only imports additional to interface. -#ifdef __clang__ -#ifdef __cpp_lib_modules_ts -import std.core; -import std.io; -#endif -import butl.optional; -#endif -import butl.utility; -#else -#include <libbutl/utility.mxx> // alnum() -#endif +#include <libbutl/utility.hxx> // alnum() using namespace std; @@ -60,6 +33,7 @@ namespace butl const char* b (s.c_str () + p); char* e (nullptr); + errno = 0; // We must clear it according to POSIX. uint64_t v (strtoull (b, &e, 10)); // Can't throw. if (errno == ERANGE || b == e || v < min || v > max) diff --git a/libbutl/standard-version.mxx b/libbutl/standard-version.hxx index b86e3a9..e973352 100644 --- a/libbutl/standard-version.mxx +++ b/libbutl/standard-version.hxx @@ -1,31 +1,14 @@ -// file : libbutl/standard-version.mxx -*- C++ -*- +// file : libbutl/standard-version.hxx -*- C++ -*- // license : MIT; see accompanying LICENSE file -#ifndef __cpp_modules_ts #pragma once -#endif - -// C includes. -#ifndef __cpp_lib_modules_ts #include <string> #include <cstdint> // uint*_t #include <cstddef> // size_t #include <ostream> -#endif -// Other includes. 
- -#ifdef __cpp_modules_ts -export module butl.standard_version; -#ifdef __cpp_lib_modules_ts -import std.core; -import std.io; -#endif -import butl.optional; -#else -#include <libbutl/optional.mxx> -#endif +#include <libbutl/optional.hxx> #include <libbutl/export.hxx> @@ -39,7 +22,7 @@ import butl.optional; # undef minor #endif -LIBBUTL_MODEXPORT namespace butl +namespace butl { // The build2 "standard version" (normal, earliest, and stub): // @@ -221,7 +204,7 @@ LIBBUTL_MODEXPORT namespace butl // Create empty version. // - standard_version () {} // = default; @@ MOD VC + standard_version () = default; }; // Try to parse a string as a standard version returning nullopt if invalid. diff --git a/libbutl/string-parser.cxx b/libbutl/string-parser.cxx index 5d5ec47..af5c1b3 100644 --- a/libbutl/string-parser.cxx +++ b/libbutl/string-parser.cxx @@ -1,33 +1,7 @@ // file : libbutl/string-parser.cxx -*- C++ -*- // license : MIT; see accompanying LICENSE file -#ifndef __cpp_modules_ts -#include <libbutl/string-parser.mxx> -#endif - -// C includes. - -#ifndef __cpp_lib_modules_ts -#include <string> -#include <vector> -#include <cstddef> -#include <utility> // move() -#include <stdexcept> -#endif - -// Other includes. - -#ifdef __cpp_modules_ts -module butl.string_parser; - -// Only imports additional to interface. 
-#ifdef __clang__ -#ifdef __cpp_lib_modules_ts -import std.core; -#endif -#endif - -#endif +#include <libbutl/string-parser.hxx> using namespace std; @@ -40,7 +14,7 @@ namespace butl inline static bool space (char c) noexcept { - return c == ' ' || c == '\t'; + return c == ' ' || c == '\t' || c == '\n' || c == '\r'; } vector<pair<string, size_t>> diff --git a/libbutl/string-parser.mxx b/libbutl/string-parser.hxx index 4ff1590..9fc20c0 100644 --- a/libbutl/string-parser.mxx +++ b/libbutl/string-parser.hxx @@ -1,32 +1,17 @@ -// file : libbutl/string-parser.mxx -*- C++ -*- +// file : libbutl/string-parser.hxx -*- C++ -*- // license : MIT; see accompanying LICENSE file -#ifndef __cpp_modules_ts #pragma once -#endif -// C includes. - -#ifndef __cpp_lib_modules_ts #include <string> #include <vector> #include <cstddef> // size_t #include <utility> // pair #include <stdexcept> // invalid_argument -#endif - -// Other includes. - -#ifdef __cpp_modules_ts -export module butl.string_parser; -#ifdef __cpp_lib_modules_ts -import std.core; -#endif -#endif #include <libbutl/export.hxx> -LIBBUTL_MODEXPORT namespace butl +namespace butl { namespace string_parser { diff --git a/libbutl/string-table.mxx b/libbutl/string-table.hxx index 78c6cd6..010fb01 100644 --- a/libbutl/string-table.mxx +++ b/libbutl/string-table.hxx @@ -1,36 +1,18 @@ -// file : libbutl/string-table.mxx -*- C++ -*- +// file : libbutl/string-table.hxx -*- C++ -*- // license : MIT; see accompanying LICENSE file -#ifndef __cpp_modules_ts #pragma once -#endif -#include <cassert> - -#ifndef __cpp_lib_modules_ts #include <string> #include <vector> +#include <cassert> #include <unordered_map> -#include <limits> // numeric_limits -#include <cstddef> // size_t -#endif - -// Other includes. 
- -#ifdef __cpp_modules_ts -export module butl.string_table; -#ifdef __cpp_lib_modules_ts -import std.core; -#endif -import butl.multi_index; -#else -#include <libbutl/multi-index.mxx> -#endif +#include <libbutl/multi-index.hxx> #include <libbutl/export.hxx> -LIBBUTL_MODEXPORT namespace butl +namespace butl { // A pool of strings and, optionally, other accompanying data in which each // entry is assigned an individual index (or id) of type I (e.g., uint8_t, diff --git a/libbutl/string-table.txx b/libbutl/string-table.txx index 4db0a6b..8416b48 100644 --- a/libbutl/string-table.txx +++ b/libbutl/string-table.txx @@ -1,6 +1,9 @@ // file : libbutl/string-table.txx -*- C++ -*- // license : MIT; see accompanying LICENSE file +#include <limits> // numeric_limits +#include <cstddef> // size_t + namespace butl { template <typename I, typename D> diff --git a/libbutl/tab-parser.cxx b/libbutl/tab-parser.cxx index cca2792..d7e5a14 100644 --- a/libbutl/tab-parser.cxx +++ b/libbutl/tab-parser.cxx @@ -1,39 +1,12 @@ // file : libbutl/tab-parser.cxx -*- C++ -*- // license : MIT; see accompanying LICENSE file -#ifndef __cpp_modules_ts -#include <libbutl/tab-parser.mxx> -#endif - -#include <cassert> - -#ifndef __cpp_lib_modules_ts -#include <string> -#include <vector> -#include <cstdint> -#include <stdexcept> +#include <libbutl/tab-parser.hxx> #include <istream> #include <sstream> -#endif - -// Other includes. - -#ifdef __cpp_modules_ts -module butl.tab_parser; - -// Only imports additional to interface. 
-#ifdef __clang__ -#ifdef __cpp_lib_modules_ts -import std.core; -import std.io; -#endif -#endif -import butl.string_parser; -#else -#include <libbutl/string-parser.mxx> -#endif +#include <libbutl/string-parser.hxx> using namespace std; diff --git a/libbutl/tab-parser.mxx b/libbutl/tab-parser.hxx index a7f7e01..2dc612b 100644 --- a/libbutl/tab-parser.mxx +++ b/libbutl/tab-parser.hxx @@ -1,33 +1,17 @@ -// file : libbutl/tab-parser.mxx -*- C++ -*- +// file : libbutl/tab-parser.hxx -*- C++ -*- // license : MIT; see accompanying LICENSE file -#ifndef __cpp_modules_ts #pragma once -#endif -// C includes. - -#ifndef __cpp_lib_modules_ts #include <iosfwd> #include <string> #include <vector> #include <cstdint> // uint64_t #include <stdexcept> // runtime_error -#endif - -// Other includes. - -#ifdef __cpp_modules_ts -export module butl.tab_parser; -#ifdef __cpp_lib_modules_ts -import std.core; -import std.io; -#endif -#endif #include <libbutl/export.hxx> -LIBBUTL_MODEXPORT namespace butl +namespace butl { class LIBBUTL_SYMEXPORT tab_parsing: public std::runtime_error { diff --git a/libbutl/target-triplet.cxx b/libbutl/target-triplet.cxx index db71e3c..e28f119 100644 --- a/libbutl/target-triplet.cxx +++ b/libbutl/target-triplet.cxx @@ -1,33 +1,9 @@ // file : libbutl/target-triplet.cxx -*- C++ -*- // license : MIT; see accompanying LICENSE file -#ifndef __cpp_modules_ts -#include <libbutl/target-triplet.mxx> -#endif - -// C includes. - -#ifndef __cpp_lib_modules_ts -#include <string> -#include <ostream> +#include <libbutl/target-triplet.hxx> #include <stdexcept> // invalid_argument -#endif - -// Other includes. - -#ifdef __cpp_modules_ts -module butl.target_triplet; - -// Only imports additional to interface. 
-#ifdef __clang__ -#ifdef __cpp_lib_modules_ts -import std.core; -import std.io; -#endif -#endif - -#endif using namespace std; @@ -112,6 +88,13 @@ namespace butl if (system.front () == '-' || system.back () == '-') bad ("invalid os/kernel/abi"); + // Canonicalize SYSTEM. + // + if (system == "linux") + system = "linux-gnu"; // Per config.sub. + else if (system == "windows-gnu" && vendor == "w64") // Clang's innovation. + system = "mingw32"; + // Extract VERSION for some recognized systems. // string::size_type v (0); @@ -129,6 +112,14 @@ namespace butl version.assign (system, v, string::npos); system.resize (system.size () - version.size ()); } + else if (vendor == "apple" && system.compare (0, 3, "ios") == 0) + { + // Handle iosNN[-...]. + // + string::size_type p (system.find ('-')); + version.assign (system, 3, p == string::npos ? p : p - 3); + system.erase (3, version.size ()); + } // Determine class for some recognized systems. // @@ -136,6 +127,8 @@ namespace butl class_ = "linux"; else if (vendor == "apple" && system == "darwin") class_ = "macos"; + else if (vendor == "apple" && system.compare (0, 3, "ios") == 0) + class_ = "ios"; else if (system == "freebsd" || system == "openbsd" || system == "netbsd") @@ -167,7 +160,10 @@ namespace butl if (!version.empty ()) { - r += version; + if (vendor == "apple" && system.compare (0, 3, "ios") == 0) + r.insert (r.size () - system.size () + 3, version); + else + r += version; } return r; @@ -191,7 +187,10 @@ namespace butl if (!version.empty ()) { - r += version; + if (vendor == "apple" && system.compare (0, 3, "ios") == 0) + r.insert (r.size () - system.size () + 3, version); + else + r += version; } return r; diff --git a/libbutl/target-triplet.mxx b/libbutl/target-triplet.hxx index 1ecc7e5..bfb2c00 100644 --- a/libbutl/target-triplet.mxx +++ b/libbutl/target-triplet.hxx @@ -1,30 +1,14 @@ -// file : libbutl/target-triplet.mxx -*- C++ -*- +// file : libbutl/target-triplet.hxx -*- C++ -*- // license : MIT; see 
accompanying LICENSE file -#ifndef __cpp_modules_ts #pragma once -#endif -// C includes. - -#ifndef __cpp_lib_modules_ts #include <string> #include <ostream> -#endif - -// Other includes. - -#ifdef __cpp_modules_ts -export module butl.target_triplet; -#ifdef __cpp_lib_modules_ts -import std.core; -import std.io; -#endif -#endif #include <libbutl/export.hxx> -LIBBUTL_MODEXPORT namespace butl +namespace butl { // This is the ubiquitous 'target triplet' that loosely has the CPU-VENDOR-OS // form which, these days, quite often takes the CPU-VENDOR-OS-ABI form. Plus @@ -91,14 +75,19 @@ LIBBUTL_MODEXPORT namespace butl // arm-softfloat-linux-gnu arm softfloat linux-gnu // i686-pc-mingw32 i686 mingw32 // i686-w64-mingw32 i686 w64 mingw32 + // i686-w64-windows-gnu i686 w64 mingw32 // i686-lfs-linux-gnu i686 lfs linux-gnu // x86_64-unknown-linux-gnu x86_64 linux-gnu + // x86_64-redhat-linux x86_64 redhat linux-gnu // x86_64-linux-gnux32 x86_64 linux-gnux32 // x86_64-microsoft-win32-msvc14.0 x86_64 microsoft win32-msvc 14.0 // x86_64-pc-windows-msvc x86_64 windows-msvc // x86_64-pc-windows-msvc19.11.25547 x86_64 windows-msvc 19.11.25547 // wasm32-unknown-emscripten wasm32 emscripten // arm64-apple-darwin20.1.0 aarch64 apple darwin 20.1.0 + // arm64-apple-ios14.4 aarch64 apple ios 14.4 + // arm64-apple-ios14.4-simulator aarch64 apple ios-simulator 14.4 + // x86_64-apple-ios14.4-macabi x86_64 apple ios-macabi 14.4 // // Similar to version splitting, for certain commonly-used targets we also // derive the "target class" which can be used as a shorthand, more @@ -110,6 +99,9 @@ LIBBUTL_MODEXPORT namespace butl // macos *-apple-darwin* // bsd *-*-(freebsd|openbsd|netbsd)* // windows *-*-win32-* | *-*-windows-* | *-*-mingw32 + // ios *-apple-ios* + // + // NOTE: see also os_release if adding anything new here. 
// // References: // @@ -160,7 +152,7 @@ LIBBUTL_MODEXPORT namespace butl explicit target_triplet (const std::string&); - target_triplet () {} // = default; @@ MOD VC + target_triplet () = default; }; inline bool diff --git a/libbutl/timestamp.cxx b/libbutl/timestamp.cxx index 9be2a82..260fbef 100644 --- a/libbutl/timestamp.cxx +++ b/libbutl/timestamp.cxx @@ -1,9 +1,7 @@ // file : libbutl/timestamp.cxx -*- C++ -*- // license : MIT; see accompanying LICENSE file -#ifndef __cpp_modules_ts -#include <libbutl/timestamp.mxx> -#endif +#include <libbutl/timestamp.hxx> #include <time.h> // localtime_{r,s}(), gmtime_{r,s}(), strptime(), timegm() #include <errno.h> // EINVAL @@ -25,22 +23,18 @@ #ifdef __GLIBCXX__ extern "C" { -#include "strptime.c" +# include "strptime.c" } #else -#include <locale.h> // LC_ALL +# include <locale.h> // LC_ALL #endif #endif -#ifndef __cpp_lib_modules_ts -#include <string> -#include <chrono> - -#include <ctime> // tm, time_t, mktime(), strftime()[__GLIBCXX__] +#include <ctime> // tm, time_t, mktime(), strftime()[libstdc++] #include <cstdlib> // strtoull() -#include <sstream> +#include <sstream> // ostringstream, stringstream[VC] #include <iomanip> // put_time(), setw(), dec, right -#include <cstring> // strlen(), memcpy() +#include <cstring> // strlen(), memcpy(), strchr()[VC] #include <ostream> #include <utility> // pair, make_pair() #include <stdexcept> // runtime_error @@ -49,30 +43,14 @@ extern "C" // #ifdef _WIN32 #ifndef __GLIBCXX__ -#include <locale> -#include <clocale> -#include <iomanip> -#endif +# include <ios> +# include <locale> +# include <clocale> +# include <iomanip> #endif #endif -// Other includes. - -#ifdef __cpp_modules_ts -module butl.timestamp; - -// Only imports additional to interface. 
-#ifdef __clang__ -#ifdef __cpp_lib_modules_ts -import std.core; -import std.io; -#endif -#endif - -import butl.utility; -#else -#include <libbutl/utility.mxx> // throw_generic_error() -#endif +#include <libbutl/utility.hxx> // throw_generic_error() using namespace std; @@ -180,24 +158,85 @@ strptime (const char* input, const char* format, tm* time) { // VC std::get_time()-based implementation. // - istringstream is (input); + // Note that the major difference in semantics of strptime() and + // std::get_time() is that the former always fails if the format string is + // not fully processed, while the latter can succeed in such a case, + // specifically if the end of the stream is reached after a conversion + // specifier was successfully applied. See this post for some background: + // + // https://stackoverflow.com/questions/67060956/what-is-the-correct-behavior-of-stdget-time-for-short-input + // + // The consequence of this fact is that there is no easy way to detect if + // the format was fully processed when the end of input is reached. It seems + // that the only way to resolve this ambiguity is to append some end marker + // to both the input and format and re-parse. We can dedicate some character + // that is unlikely to be used in the time format/input (for example '\x1') + // to serve as an end marker. + // + // Alternatively, we can abandon the use of std::get_time() altogether and, + // for example, use a FreeBSD-based strptime() implementation. This feels a + // bit too radical at the moment, though. + // + const char em ('\x1'); + + if (strchr (input, em) != nullptr || strchr (format, em) != nullptr) + return nullptr; + + stringstream ss (input); // Input/output stream. // The original strptime() function behaves according to the process' C // locale (set with std::setlocale()), which can differ from the process C++ // locale (set with std::locale::global()). 
// - is.imbue (locale (setlocale (LC_ALL, nullptr))); + ss.imbue (locale (setlocale (LC_ALL, nullptr))); - if (!(is >> get_time (time, format))) + // Bail out on the parsing error. + // + if (!(ss >> get_time (time, format))) return nullptr; - else - // tellg() behaves as UnformattedInputFunction, so returns failure status - // if eofbit is set. - // - return const_cast<char*> ( - input + (is.eof () - ? strlen (input) - : static_cast<size_t> (is.tellg ()))); + + // If the end of input is not reached then the format string is fully + // processed. + // + if (!ss.eof ()) + return const_cast<char*> (input + static_cast<size_t> (ss.tellg ())); + + // Since eof is reached, we cannot say if the format string was fully + // processed or not. For example: + // + // %b %Y - format + // Feb 2016 - eofbit is set with a format fully processed + // Feb - eofbit is set with a format partially processed + // + // So append the end marker character to both input and format and re-parse. + // + ss.clear (); // Clear eof. + ss.seekp (0, ios_base::end); // Position to the end for writing. + ss.put (em); // Append the end marker. + ss.seekg (0); // Rewind for reading. + + string fm (format); + fm += em; // Append the end marker. + + // Fail if the input is "shorter" than the format. For example: + // + // %b %Y\x1 - format + // Feb\x1 - stream + // + // Note that we can reuse the time object for re-parsing, since on success + // its fields will be overwritten with the same values. + // + if (!(ss >> get_time (time, fm.c_str ()))) + return nullptr; + + // We would fail earlier otherwise. + // + assert (ss.eof () || ss.get () == stringstream::traits_type::eof ()); + + // tellg() behaves as UnformattedInputFunction, so returns failure status if + // eofbit is set. 
+ // + return const_cast<char*> (input + strlen (input)); } #endif diff --git a/libbutl/timestamp.mxx b/libbutl/timestamp.hxx index 141e13d..2714a0d 100644 --- a/libbutl/timestamp.mxx +++ b/libbutl/timestamp.hxx @@ -1,34 +1,15 @@ -// file : libbutl/timestamp.mxx -*- C++ -*- +// file : libbutl/timestamp.hxx -*- C++ -*- // license : MIT; see accompanying LICENSE file -#ifndef __cpp_modules_ts #pragma once -#endif -// C includes. - -#ifndef __cpp_lib_modules_ts #include <iosfwd> #include <string> #include <chrono> -#endif - -// Other includes. - -#ifdef __cpp_modules_ts -export module butl.timestamp; -#ifdef __cpp_lib_modules_ts -import std.core; -import std.io; -#endif -#endif - -//@@ MOD TODO: should't we re-export chrono (for somparison operator, etc)? -// or ADL should kick in? #include <libbutl/export.hxx> -LIBBUTL_MODEXPORT namespace butl +namespace butl { // On all three main platforms that we target (GNU/Linux, Windows (both // VC++ and GCC/MinGW64), and MacOS X) with recent C++ runtimes, @@ -61,21 +42,12 @@ LIBBUTL_MODEXPORT namespace butl // unreal and all of them are less than any non-special value (strictly // speaking unreal is no greater (older) than any real value). // -#if defined(__cpp_modules_ts) && defined(__clang__) //@@ MOD Clang duplicate sym. 
- inline const timestamp::rep timestamp_unknown_rep = -1; - inline const timestamp timestamp_unknown = timestamp (duration (-1)); - inline const timestamp::rep timestamp_nonexistent_rep = 0; - inline const timestamp timestamp_nonexistent = timestamp (duration (0)); - inline const timestamp::rep timestamp_unreal_rep = 1; - inline const timestamp timestamp_unreal = timestamp (duration (1)); -#else const timestamp::rep timestamp_unknown_rep = -1; const timestamp timestamp_unknown = timestamp (duration (-1)); const timestamp::rep timestamp_nonexistent_rep = 0; const timestamp timestamp_nonexistent = timestamp (duration (0)); const timestamp::rep timestamp_unreal_rep = 1; const timestamp timestamp_unreal = timestamp (duration (1)); -#endif // Print human-readable representation of the timestamp. // diff --git a/libbutl/unicode.cxx b/libbutl/unicode.cxx index 4219846..294bb3f 100644 --- a/libbutl/unicode.cxx +++ b/libbutl/unicode.cxx @@ -1,32 +1,11 @@ // file : libbutl/unicode.cxx -*- C++ -*- // license : MIT; see accompanying LICENSE file -#ifndef __cpp_modules_ts -#include <libbutl/unicode.mxx> -#endif - -#ifndef __cpp_lib_modules_ts -#include <string> -#include <ostream> -#include <cstdint> +#include <libbutl/unicode.hxx> #include <cstddef> // size_t #include <utility> // pair #include <algorithm> // lower_bound() -#endif - -#ifdef __cpp_modules_ts -module butl.unicode; - -// Only imports additional to interface. -#ifdef __clang__ -#ifdef __cpp_lib_modules_ts -import std.core; -import std.io; -#endif -#endif - -#endif using namespace std; diff --git a/libbutl/unicode.mxx b/libbutl/unicode.hxx index b846476..8d99d0e 100644 --- a/libbutl/unicode.mxx +++ b/libbutl/unicode.hxx @@ -1,31 +1,15 @@ -// file : libbutl/unicode.mxx -*- C++ -*- +// file : libbutl/unicode.hxx -*- C++ -*- // license : MIT; see accompanying LICENSE file -#ifndef __cpp_modules_ts #pragma once -#endif -// C includes. 
- -#ifndef __cpp_lib_modules_ts #include <string> #include <ostream> #include <cstdint> // uint16_t -#endif - -// Other includes. - -#ifdef __cpp_modules_ts -export module butl.unicode; -#ifdef __cpp_lib_modules_ts -import std.core; -import std.io; -#endif -#endif #include <libbutl/export.hxx> -LIBBUTL_MODEXPORT namespace butl +namespace butl { // Note that the Unicode Standard requires the surrogates ([D800 DFFF]) to // only be used in the context of the UTF-16 character encoding form. Thus, diff --git a/libbutl/url.mxx b/libbutl/url.hxx index 713bc3e..5721cfd 100644 --- a/libbutl/url.mxx +++ b/libbutl/url.hxx @@ -1,50 +1,23 @@ -// file : libbutl/url.mxx -*- C++ -*- +// file : libbutl/url.hxx -*- C++ -*- // license : MIT; see accompanying LICENSE file -#ifndef __cpp_modules_ts #pragma once -#endif -// C includes. - -#include <cassert> - -#ifndef __cpp_lib_modules_ts #include <string> +#include <cassert> +#include <cstddef> // size_t #include <cstdint> // uint*_t #include <utility> // move() #include <ostream> #include <iterator> // back_inserter -#include <cstddef> // size_t -#include <stdexcept> // invalid_argument -#include <algorithm> // find(), find_if() -#endif - -// Other includes. - -#ifdef __cpp_modules_ts -export module butl.url; -#ifdef __cpp_lib_modules_ts -import std.core; -import std.io; -#endif -import butl.path; -import butl.utility; -import butl.optional; - -import butl.small_vector; -#else -#include <libbutl/path.mxx> -#include <libbutl/utility.mxx> -#include <libbutl/optional.mxx> - -#include <libbutl/small-vector.mxx> -#endif +#include <libbutl/path.hxx> +#include <libbutl/utility.hxx> +#include <libbutl/optional.hxx> #include <libbutl/export.hxx> -LIBBUTL_MODEXPORT namespace butl +namespace butl { // RFC3986 Uniform Resource Locator (URL). 
// diff --git a/libbutl/url.ixx b/libbutl/url.ixx index b823ee7..19d54c7 100644 --- a/libbutl/url.ixx +++ b/libbutl/url.ixx @@ -1,7 +1,7 @@ // file : libbutl/url.ixx -*- C++ -*- // license : MIT; see accompanying LICENSE file -LIBBUTL_MODEXPORT namespace butl //@@ MOD Clang needs this for some reason. +namespace butl { // url_traits // diff --git a/libbutl/url.txx b/libbutl/url.txx index 0951e80..b2caa37 100644 --- a/libbutl/url.txx +++ b/libbutl/url.txx @@ -1,7 +1,12 @@ // file : libbutl/url.txx -*- C++ -*- // license : MIT; see accompanying LICENSE file -LIBBUTL_MODEXPORT namespace butl //@@ MOD Clang needs this for some reason. +#include <stdexcept> // invalid_argument +#include <algorithm> // find(), find_if() + +#include <libbutl/small-vector.hxx> + +namespace butl { // Convenience functions. // diff --git a/libbutl/utf8.mxx b/libbutl/utf8.hxx index 15e8ded..697f77a 100644 --- a/libbutl/utf8.mxx +++ b/libbutl/utf8.hxx @@ -1,33 +1,17 @@ -// file : libbutl/utf8.mxx -*- C++ -*- +// file : libbutl/utf8.hxx -*- C++ -*- // license : MIT; see accompanying LICENSE file -#ifndef __cpp_modules_ts #pragma once -#endif -// C includes. - -#ifndef __cpp_lib_modules_ts #include <string> #include <cstdint> // uint8_t #include <utility> // pair -#endif - -// Other includes. -#ifdef __cpp_modules_ts -export module butl.utf8; -#ifdef __cpp_lib_modules_ts -import std.core; -#endif -import butl.unicode; -#else -#include <libbutl/unicode.mxx> -#endif +#include <libbutl/unicode.hxx> #include <libbutl/export.hxx> -LIBBUTL_MODEXPORT namespace butl +namespace butl { // Here and below we will refer to bytes that encode a single Unicode // codepoint as "UTF-8 byte sequence" ("UTF-8 sequence" or "byte sequence" diff --git a/libbutl/utf8.ixx b/libbutl/utf8.ixx index 3d2e092..10624f8 100644 --- a/libbutl/utf8.ixx +++ b/libbutl/utf8.ixx @@ -116,7 +116,7 @@ namespace butl { if (b < 0xFE) { - *what = b < 0xFC ? "5" : "6"; + *what = b < 0xFC ? 
'5' : '6'; *what += "-byte length UTF-8 sequence"; } else diff --git a/libbutl/utility.cxx b/libbutl/utility.cxx index bbeafd2..b03a8f8 100644 --- a/libbutl/utility.cxx +++ b/libbutl/utility.cxx @@ -1,44 +1,23 @@ // file : libbutl/utility.cxx -*- C++ -*- // license : MIT; see accompanying LICENSE file -#ifndef __cpp_modules_ts -#include <libbutl/utility.mxx> -#endif +#include <libbutl/utility.hxx> #ifdef _WIN32 #include <libbutl/win32-utility.hxx> #endif -#include <stdlib.h> // setenv(), unsetenv(), _putenv() - -#ifndef __cpp_lib_modules_ts -#include <string> -#include <cstddef> -#include <utility> +#include <stdlib.h> // getenv(), setenv(), unsetenv(), _putenv() +#include <cstring> // strncmp(), strlen() #include <ostream> #include <type_traits> // enable_if, is_base_of #include <system_error> -#endif #include <libbutl/ft/lang.hxx> #include <libbutl/ft/exception.hxx> -#ifdef __cpp_modules_ts -module butl.utility; - -// Only imports additional to interface. -#ifdef __clang__ -#ifdef __cpp_lib_modules_ts -import std.core; -import std.io; -#endif -#endif - -import butl.utf8; -#else -#include <libbutl/utf8.mxx> -#endif +#include <libbutl/utf8.hxx> namespace butl { @@ -192,13 +171,42 @@ namespace butl for (; i != n && ws (l[i]); ++i) ; for (; n != i && ws (l[n - 1]); --n) ; - if (i != 0) + if (n != l.size ()) l.resize (n); + if (i != 0) l.erase (0, i); + + return l; + } + + string& + trim_left (string& l) + { + auto ws = [] (char c ) { - string s (l, i, n - i); - l.swap (s); - } - else if (n != l.size ()) - l.resize (n); + return c == ' ' || c == '\t' || c == '\n' || c == '\r'; + }; + + size_t i (0), n (l.size ()); + + for (; i != n && ws (l[i]); ++i) ; + + if (i != 0) l.erase (0, i); + + return l; + } + + string& + trim_right (string& l) + { + auto ws = [] (char c ) + { + return c == ' ' || c == '\t' || c == '\n' || c == '\r'; + }; + + size_t i (0), n (l.size ()); + + for (; n != i && ws (l[n - 1]); --n) ; + + if (n != l.size ()) l.resize (n); return l; } @@ -332,6 
+340,55 @@ namespace butl s.resize (d - s.begin ()); } +#ifdef __cpp_thread_local + thread_local +#else + __thread +#endif + const char* const* thread_env_ = nullptr; + +#ifdef _WIN32 + const char* const* + thread_env () {return thread_env_;} + + void + thread_env (const char* const* v) {thread_env_ = v;} +#endif + + optional<std::string> + getenv (const char* name) + { + if (const char* const* vs = thread_env_) + { + size_t n (strlen (name)); + + for (; *vs != nullptr; ++vs) + { + const char* v (*vs); + + // Note that on Windows variable names are case-insensitive. + // +#ifdef _WIN32 + if (icasecmp (name, v, n) == 0) +#else + if (strncmp (name, v, n) == 0) +#endif + { + switch (v[n]) + { + case '=': return string (v + n + 1); + case '\0': return nullopt; + } + } + } + } + + if (const char* r = ::getenv (name)) + return std::string (r); + + return nullopt; + } + void setenv (const string& name, const string& value) { diff --git a/libbutl/utility.mxx b/libbutl/utility.hxx index 8a0059a..9eb052d 100644 --- a/libbutl/utility.mxx +++ b/libbutl/utility.hxx @@ -1,9 +1,7 @@ -// file : libbutl/utility.mxx -*- C++ -*- +// file : libbutl/utility.hxx -*- C++ -*- // license : MIT; see accompanying LICENSE file -#ifndef __cpp_modules_ts #pragma once -#endif #ifndef _WIN32 # include <strings.h> // strcasecmp(), strncasecmp() @@ -11,7 +9,6 @@ # include <string.h> // _stricmp(), _strnicmp() #endif -#ifndef __cpp_lib_modules_ts #include <string> #include <iosfwd> // ostream #include <istream> @@ -20,29 +17,17 @@ #include <cstring> // strcmp(), strlen() #include <exception> // exception, uncaught_exception[s]() //#include <functional> // hash -#endif #include <libbutl/ft/lang.hxx> // thread_local #include <libbutl/ft/exception.hxx> // uncaught_exceptions -#ifdef __cpp_modules_ts -export module butl.utility; -#ifdef __cpp_lib_modules_ts -import std.core; -import std.io; -#endif -import butl.utf8; -import butl.unicode; -import butl.optional; -#else -#include <libbutl/utf8.mxx> 
-#include <libbutl/unicode.mxx> -#include <libbutl/optional.mxx> -#endif +#include <libbutl/utf8.hxx> +#include <libbutl/unicode.hxx> +#include <libbutl/optional.hxx> #include <libbutl/export.hxx> -LIBBUTL_MODEXPORT namespace butl +namespace butl { // Throw std::system_error with generic_category or system_category, // respectively. @@ -147,11 +132,13 @@ LIBBUTL_MODEXPORT namespace butl bool digit (char); bool alnum (char); bool xdigit (char); + bool wspace (char); bool alpha (wchar_t); bool digit (wchar_t); bool alnum (wchar_t); bool xdigit (wchar_t); + bool wspace (wchar_t); // Basic string utilities. // @@ -161,13 +148,31 @@ LIBBUTL_MODEXPORT namespace butl LIBBUTL_SYMEXPORT std::string& trim (std::string&); + LIBBUTL_SYMEXPORT std::string& + trim_left (std::string&); + + LIBBUTL_SYMEXPORT std::string& + trim_right (std::string&); + inline std::string trim (std::string&& s) { return move (trim (s)); } - // Find the beginning and end poistions of the next word. Return the size + inline std::string + trim_left (std::string&& s) + { + return move (trim_left (s)); + } + + inline std::string + trim_right (std::string&& s) + { + return move (trim_right (s)); + } + + // Find the beginning and end positions of the next word. Return the size // of the word or 0 and set b = e = n if there are no more words. For // example: // @@ -185,6 +190,24 @@ LIBBUTL_MODEXPORT namespace butl // // The second version examines up to the n'th character in the string. // + // The third version, instead of skipping consecutive delimiters, treats + // them as separating empty words. The additional m variable contains an + // unspecified internal state and should be initialized to 0. Note that in + // this case you should use the (b == n) condition to detect the end. Note + // also that a leading delimiter is considered as separating an empty word + // from the rest and the trailing delimiter is considered as separating the + // rest from an empty word. 
For example, this is how to parse lines while + // observing blanks: + // + // for (size_t b (0), e (0), m (0), n (s.size ()); + // next_word (s, n, b, e, m, '\n', '\r'), b != n; ) + // { + // string l (s, b, e - b); + // } + // + // For string "\na\n" this code will observe the {"", "a", ""} words. And + // for just "\n" it will observe the {"", ""} words. + // std::size_t next_word (const std::string&, std::size_t& b, std::size_t& e, char d1 = ' ', char d2 = '\0'); @@ -193,6 +216,11 @@ LIBBUTL_MODEXPORT namespace butl next_word (const std::string&, std::size_t n, std::size_t& b, std::size_t& e, char d1 = ' ', char d2 = '\0'); + std::size_t + next_word (const std::string&, std::size_t n, + std::size_t& b, std::size_t& e, std::size_t& m, + char d1 = ' ', char d2 = '\0'); + // Sanitize a string to only contain characters valid in an identifier // (ASCII alphanumeric plus `_`) replacing all others with `_`. // @@ -266,17 +294,82 @@ LIBBUTL_MODEXPORT namespace butl // Environment variables. // - optional<std::string> - getenv (const std::string&); + // Our getenv() wrapper (as well as the relevant process startup functions) + // have a notion of a "thread environment", that is, thread-specific + // environment variables. However, unlike the process environment (in the + // form of the environ array), the thread environment is specified as a set + // of overrides over the process environment (sets and unsets), the same as + // for the process startup. + // + // See also path_traits::thread_current_directory(). + // + extern +#ifdef __cpp_thread_local + thread_local +#else + __thread +#endif + const char* const* thread_env_; + + // On Windows one cannot export a thread-local variable so we have to + // use wrapper functions. 
+ // +#ifdef _WIN32 + LIBBUTL_SYMEXPORT const char* const* + thread_env (); + + LIBBUTL_SYMEXPORT void + thread_env (const char* const*); +#else + const char* const* + thread_env (); + + void + thread_env (const char* const*); +#endif + + struct auto_thread_env + { + optional<const char* const*> prev_env; + + auto_thread_env () = default; + + explicit + auto_thread_env (const char* const*); + + // Move-to-empty-only type. + // + auto_thread_env (auto_thread_env&&) noexcept; + auto_thread_env& operator= (auto_thread_env&&) noexcept; + + auto_thread_env (const auto_thread_env&) = delete; + auto_thread_env& operator= (const auto_thread_env&) = delete; + + ~auto_thread_env (); + }; + + // Get the environment variables taking into account the current thread's + // overrides (thread_env). + // + LIBBUTL_SYMEXPORT optional<std::string> + getenv (const char*); + + inline optional<std::string> + getenv (const std::string& n) + { + return getenv (n.c_str ()); + } - // Throw system_error on failure. + // Set the process environment variable. Best done before starting any + // threads (see thread_env). Throw system_error on failure. // // Note that on Windows setting an empty value unsets the variable. // LIBBUTL_SYMEXPORT void setenv (const std::string& name, const std::string& value); - // Throw system_error on failure. + // Unset the process environment variable. Best done before starting any + // threads (see thread_env). Throw system_error on failure. // LIBBUTL_SYMEXPORT void unsetenv (const std::string&); @@ -477,7 +570,7 @@ LIBBUTL_MODEXPORT namespace butl #endif } -LIBBUTL_MODEXPORT namespace std +namespace std { // Sanitize the exception description before printing. 
This includes: // diff --git a/libbutl/utility.ixx b/libbutl/utility.ixx index fa37a14..fda1ce5 100644 --- a/libbutl/utility.ixx +++ b/libbutl/utility.ixx @@ -1,13 +1,10 @@ // file : libbutl/utility.ixx -*- C++ -*- // license : MIT; see accompanying LICENSE file -#ifndef __cpp_lib_modules_ts #include <cctype> // toupper(), tolower(), is*() #include <cwctype> // isw*() -#include <cstdlib> // getenv() #include <algorithm> // for_each() #include <stdexcept> // invalid_argument -#endif namespace butl { @@ -146,6 +143,12 @@ namespace butl } inline bool + wspace (char c) + { + return std::isspace (c); + } + + inline bool alpha (wchar_t c) { return std::iswalpha (c); @@ -169,6 +172,12 @@ namespace butl return std::iswxdigit (c); } + inline bool + wspace (wchar_t c) + { + return std::iswspace (c); + } + inline std::size_t next_word (const std::string& s, std::size_t& b, std::size_t& e, char d1, char d2) @@ -176,7 +185,7 @@ namespace butl return next_word (s, s.size (), b, e, d1, d2); } - inline size_t + inline std::size_t next_word (const std::string& s, std::size_t n, std::size_t& b, std::size_t& e, char d1, char d2) @@ -201,6 +210,66 @@ namespace butl return e - b; } + inline std::size_t + next_word (const std::string& s, + std::size_t n, std::size_t& b, std::size_t& e, std::size_t& m, + char d1, char d2) + { + // An empty word will necessarily be represented as b and e being the + // position of a delimiter. Consider these corner cases (in all three we + // should produce two words): + // + // \n + // a\n + // \na + // + // It feels sensible to represent an empty word as the position of the + // trailing delimiter except if it is the last character (the first two + // cases). Thus the additional m state, which, if 0 or 1 indicates the + // number of delimiters to skip before parsing the next word and 2 if + // this is a trailing delimiter for which we need to fake an empty word + // with the leading delimiter. 
+ + if (b != e) + b = e; + + if (m > 1) + { + --m; + return 0; + } + + // Skip the leading delimiter, if any. + // + b += m; + + if (b == n) + { + e = n; + return 0; + } + + // Find first trailing delimiter. + // + m = 0; + for (e = b; e != n; ++e) + { + if (s[e] == d1 || s[e] == d2) + { + m = 1; + + // Handle the special delimiter as the last character case. + // + if (e + 1 == n) + ++m; + + break; + } + } + + return e - b; + } + inline std::string& sanitize_identifier (std::string& s) { @@ -228,7 +297,7 @@ namespace butl inline void sanitize_strlit (const std::string& s, std::string& o) { - for (size_t i (0), j;; i = j + 1) + for (std::size_t i (0), j;; i = j + 1) { j = s.find_first_of ("\\\"\n", i); o.append (s.c_str () + i, (j == std::string::npos ? s.size () : j) - i); @@ -333,13 +402,58 @@ namespace butl return utf8_length_impl (s, nullptr, ts, wl).has_value (); } - inline optional<std::string> - getenv (const std::string& name) +#ifndef _WIN32 + inline const char* const* + thread_env () + { + return thread_env_; + } + + inline void + thread_env (const char* const* v) + { + thread_env_ = v; + } +#endif + + // auto_thread_env + // + inline auto_thread_env:: + auto_thread_env (const char* const* new_env) { - if (const char* r = std::getenv (name.c_str ())) - return std::string (r); + const char* const* cur_env (thread_env ()); - return nullopt; + if (cur_env != new_env) + { + prev_env = cur_env; + thread_env (new_env); + } + } + + inline auto_thread_env:: + auto_thread_env (auto_thread_env&& x) noexcept + : prev_env (std::move (x.prev_env)) + { + x.prev_env = nullopt; + } + + inline auto_thread_env& auto_thread_env:: + operator= (auto_thread_env&& x) noexcept + { + if (this != &x) + { + prev_env = std::move (x.prev_env); + x.prev_env = nullopt; + } + + return *this; + } + + inline auto_thread_env:: + ~auto_thread_env () + { + if (prev_env) + thread_env (*prev_env); } template <typename F, typename P> diff --git a/libbutl/uuid-linux.cxx b/libbutl/uuid-linux.cxx 
index 7689088..82af2e9 100644 --- a/libbutl/uuid-linux.cxx +++ b/libbutl/uuid-linux.cxx @@ -13,7 +13,7 @@ #include <utility> // move() #include <system_error> -#include <libbutl/utility.mxx> // function_cast() +#include <libbutl/utility.hxx> // function_cast() using namespace std; diff --git a/libbutl/uuid-openbsd.cxx b/libbutl/uuid-openbsd.cxx new file mode 100644 index 0000000..b64436b --- /dev/null +++ b/libbutl/uuid-openbsd.cxx @@ -0,0 +1,80 @@ +// file : libbutl/uuid-openbsd.cxx -*- C++ -*- +// license : MIT; see accompanying LICENSE file + +#ifndef BUILD2_BOOTSTRAP + +#include <libbutl/uuid.hxx> + +#include <uuid.h> + +#include <errno.h> + +#include <cassert> +#include <cstring> // memcpy() +#include <system_error> + +using namespace std; + +namespace butl +{ + void + uuid_throw_weak (); // uuid.cxx + + uuid uuid_system_generator:: + generate (bool strong) + { + // The OpenBSD uuid_*() (<uuid.h>, uuid_compare(3)) API generates version + // 4 UUIDs (i.e. randomly generated) at least from version 6.4. For now we + // will assume that only random ones are strong. + // + // Here we assume uuid_t has the same definition as in FreeBSD/NetBSD (it + // is defined in <sys/uuid.h>). + // + uuid_t d; + uint32_t s; + uuid_create (&d, &s); + + // None of the uuid_s_* errors seem plausible for this function so let's + // return the generic "not supported" error code. + // + if (s != uuid_s_ok) + throw system_error (ENOSYS, system_category ()); + + uuid r; + + // This is effectively just memcpy() but let's reference the member names + // in case anything changes on either side. + // + r.time_low = d.time_low; + r.time_mid = d.time_mid; + r.time_hiv = d.time_hi_and_version; + r.clock_seq_hir = d.clock_seq_hi_and_reserved; + r.clock_seq_low = d.clock_seq_low; + memcpy (r.node, d.node, 6); + + assert (r.variant () == uuid_variant::dce); // Sanity check. 
+ + if (strong) + { + switch (r.version ()) + { + case uuid_version::random: break; + default: uuid_throw_weak (); + } + } + + return r; + } + + void uuid_system_generator:: + initialize () + { + } + + void uuid_system_generator:: + terminate () + { + } +} + +#endif // BUILD2_BOOTSTRAP diff --git a/libbutl/uuid.cxx b/libbutl/uuid.cxx index 377afb7..2132808 100644 --- a/libbutl/uuid.cxx +++ b/libbutl/uuid.cxx @@ -5,7 +5,7 @@ #include <errno.h> // ENOTSUP -#include <cstdio> // sprintf() scanf() +#include <cstdio> // snprintf() sscanf() #include <cstring> // strlen() #include <stdexcept> #include <system_error> @@ -19,16 +19,17 @@ namespace butl { array<char, 37> r; - sprintf (r.data (), - (upper - ? "%08X-%04X-%04X-%02X%02X-%02X%02X%02X%02X%02X%02X" - : "%08x-%04x-%04x-%02x%02x-%02x%02x%02x%02x%02x%02x"), - time_low, - time_mid, - time_hiv, - clock_seq_hir, - clock_seq_low, - node[0], node[1], node[2], node[3], node[4], node[5]); + snprintf (r.data (), + 37, + (upper + ? "%08X-%04X-%04X-%02X%02X-%02X%02X%02X%02X%02X%02X" + : "%08x-%04x-%04x-%02x%02x-%02x%02x%02x%02x%02x%02x"), + time_low, + time_mid, + time_hiv, + clock_seq_hir, + clock_seq_low, + node[0], node[1], node[2], node[3], node[4], node[5]); return r; } diff --git a/libbutl/uuid.hxx b/libbutl/uuid.hxx index 2361640..862f02d 100644 --- a/libbutl/uuid.hxx +++ b/libbutl/uuid.hxx @@ -48,12 +48,12 @@ namespace butl { // Normally not accessed directly (see RFC4122 Section 4.1.2). // - std::uint32_t time_low = 0; - std::uint16_t time_mid = 0; - std::uint16_t time_hiv = 0; // hi_and_version - std::uint8_t clock_seq_hir = 0; // hi_and_reserved - std::uint8_t clock_seq_low = 0; - std::uint8_t node[6] = {0, 0, 0, 0, 0, 0}; + std::uint32_t time_low = 0; + std::uint16_t time_mid = 0; + std::uint16_t time_hiv = 0; // hi_and_version + std::uint8_t clock_seq_hir = 0; // hi_and_reserved + std::uint8_t clock_seq_low = 0; + std::uint8_t node[6] = {0, 0, 0, 0, 0, 0}; // System UUID generator. 
See the uuid_generator interface for details. // @@ -158,10 +158,10 @@ namespace butl void swap (uuid&); - uuid (uuid&&); + uuid (uuid&&) noexcept; uuid (const uuid&) = default; - uuid& operator= (uuid&&); + uuid& operator= (uuid&&) noexcept; uuid& operator= (const uuid&) = default; }; @@ -183,7 +183,7 @@ namespace butl ~uuid_generator () = default; // Generate a UUID. If strong is true (default), generate a strongly- - // unique UUID. Throw std::runtime_error to report errors, including if + // unique UUID. Throw std::system_error to report errors, including if // strong uniqueness cannot be guaranteed. // // A weak UUID is not guaranteed to be unique, neither universally nor @@ -207,7 +207,7 @@ namespace butl // Optional explicit initialization and termination. Note that it is not // thread-safe and must only be performed once (normally from main()) // before/after any calls to generate(), respectively. Both functions may - // throw std::runtime_error to report errors. + // throw std::system_error to report errors. // static void initialize (); diff --git a/libbutl/uuid.ixx b/libbutl/uuid.ixx index 6744af7..6115be1 100644 --- a/libbutl/uuid.ixx +++ b/libbutl/uuid.ixx @@ -39,14 +39,14 @@ namespace butl } inline uuid:: - uuid (uuid&& u) + uuid (uuid&& u) noexcept : uuid () // nil { swap (u); } inline uuid& uuid:: - operator= (uuid&& u) + operator= (uuid&& u) noexcept { if (this != &u) { diff --git a/libbutl/vector-view.mxx b/libbutl/vector-view.hxx index 7924371..16ab08e 100644 --- a/libbutl/vector-view.mxx +++ b/libbutl/vector-view.hxx @@ -1,32 +1,17 @@ -// file : libbutl/vector-view.mxx -*- C++ -*- +// file : libbutl/vector-view.hxx -*- C++ -*- // license : MIT; see accompanying LICENSE file -#ifndef __cpp_modules_ts #pragma once -#endif -// C includes. 
- -#ifndef __cpp_lib_modules_ts #include <vector> #include <cstddef> // size_t, ptrdiff_t #include <utility> // swap() #include <iterator> // reverse_iterator #include <stdexcept> // out_of_range -#endif - -// Other includes. - -#ifdef __cpp_modules_ts -export module butl.vector_view; -#ifdef __cpp_lib_modules_ts -import std.core; -#endif -#endif #include <libbutl/export.hxx> -LIBBUTL_MODEXPORT namespace butl +namespace butl { // In our version a const view allows the modification of the elements // unless T is made const (the same semantics as in smart pointers). diff --git a/libbutl/win32-utility.cxx b/libbutl/win32-utility.cxx index 3b44d60..c69842b 100644 --- a/libbutl/win32-utility.cxx +++ b/libbutl/win32-utility.cxx @@ -8,16 +8,9 @@ // #ifdef _WIN32 -#ifndef __cpp_lib_modules_ts -#include <string> #include <memory> // unique_ptr -#include <libbutl/utility.mxx> // throw_system_error() -#else -import std.core; - -import butl.utility; -#endif +#include <libbutl/utility.hxx> // throw_system_error() using namespace std; diff --git a/libbutl/win32-utility.hxx b/libbutl/win32-utility.hxx index b71eb1a..9bed647 100644 --- a/libbutl/win32-utility.hxx +++ b/libbutl/win32-utility.hxx @@ -31,11 +31,7 @@ # endif #endif -#ifndef __cpp_lib_modules_ts #include <string> -#else -import std.core; -#endif #include <libbutl/export.hxx> diff --git a/libbutl/xxhash.c b/libbutl/xxhash.c new file mode 100644 index 0000000..ff28749 --- /dev/null +++ b/libbutl/xxhash.c @@ -0,0 +1,1030 @@ +/* +* xxHash - Fast Hash algorithm +* Copyright (C) 2012-2016, Yann Collet +* +* BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) +* +* Redistribution and use in source and binary forms, with or without +* modification, are permitted provided that the following conditions are +* met: +* +* * Redistributions of source code must retain the above copyright +* notice, this list of conditions and the following disclaimer. 
+* * Redistributions in binary form must reproduce the above +* copyright notice, this list of conditions and the following disclaimer +* in the documentation and/or other materials provided with the +* distribution. +* +* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +* +* You can contact the author at : +* - xxHash homepage: http://www.xxhash.com +* - xxHash source repository : https://github.com/Cyan4973/xxHash +*/ + + +/* ************************************* +* Tuning parameters +***************************************/ +/*!XXH_FORCE_MEMORY_ACCESS : + * By default, access to unaligned memory is controlled by `memcpy()`, which is safe and portable. + * Unfortunately, on some target/compiler combinations, the generated assembly is sub-optimal. + * The below switch allow to select different access method for improved performance. + * Method 0 (default) : use `memcpy()`. Safe and portable. + * Method 1 : `__packed` statement. It depends on compiler extension (ie, not portable). + * This method is safe if your compiler supports it, and *generally* as fast or faster than `memcpy`. + * Method 2 : direct access. This method doesn't depend on compiler but violate C standard. 
+ * It can generate buggy code on targets which do not support unaligned memory accesses. + * But in some circumstances, it's the only known way to get the most performance (ie GCC + ARMv6) + * See http://stackoverflow.com/a/32095106/646947 for details. + * Prefer these methods in priority order (0 > 1 > 2) + */ +#ifndef XXH_FORCE_MEMORY_ACCESS /* can be defined externally, on command line for example */ +# if defined(__GNUC__) && ( defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) \ + || defined(__ARM_ARCH_6K__) || defined(__ARM_ARCH_6Z__) \ + || defined(__ARM_ARCH_6ZK__) || defined(__ARM_ARCH_6T2__) ) +# define XXH_FORCE_MEMORY_ACCESS 2 +# elif (defined(__INTEL_COMPILER) && !defined(_WIN32)) || \ + (defined(__GNUC__) && ( defined(__ARM_ARCH_7__) || defined(__ARM_ARCH_7A__) \ + || defined(__ARM_ARCH_7R__) || defined(__ARM_ARCH_7M__) \ + || defined(__ARM_ARCH_7S__) )) +# define XXH_FORCE_MEMORY_ACCESS 1 +# endif +#endif + +/*!XXH_ACCEPT_NULL_INPUT_POINTER : + * If input pointer is NULL, xxHash default behavior is to dereference it, triggering a segfault. + * When this macro is enabled, xxHash actively checks input for null pointer. + * If it is, result for null input pointers is the same as a null-length input. + */ +#ifndef XXH_ACCEPT_NULL_INPUT_POINTER /* can be defined externally */ +# define XXH_ACCEPT_NULL_INPUT_POINTER 0 +#endif + +/*!XXH_FORCE_NATIVE_FORMAT : + * By default, xxHash library provides endian-independent Hash values, based on little-endian convention. + * Results are therefore identical for little-endian and big-endian CPU. + * This comes at a performance cost for big-endian CPU, since some swapping is required to emulate little-endian format. + * Should endian-independence be of no importance for your application, you may set the #define below to 1, + * to improve speed for Big-endian CPU. + * This option has no impact on Little_Endian CPU. 
+ */ +#ifndef XXH_FORCE_NATIVE_FORMAT /* can be defined externally */ +# define XXH_FORCE_NATIVE_FORMAT 0 +#endif + +/*!XXH_FORCE_ALIGN_CHECK : + * This is a minor performance trick, only useful with lots of very small keys. + * It means : check for aligned/unaligned input. + * The check costs one initial branch per hash; + * set it to 0 when the input is guaranteed to be aligned, + * or when alignment doesn't matter for performance. + */ +#ifndef XXH_FORCE_ALIGN_CHECK /* can be defined externally */ +# if defined(__i386) || defined(_M_IX86) || defined(__x86_64__) || defined(_M_X64) +# define XXH_FORCE_ALIGN_CHECK 0 +# else +# define XXH_FORCE_ALIGN_CHECK 1 +# endif +#endif + + +/* ************************************* +* Includes & Memory related functions +***************************************/ +/*! Modify the local functions below should you wish to use some other memory routines +* for malloc(), free() */ +#include <stdlib.h> +static void* XXH_malloc(size_t s) { return malloc(s); } +static void XXH_free (void* p) { free(p); } +/*! 
and for memcpy() */ +#include <string.h> +static void* XXH_memcpy(void* dest, const void* src, size_t size) { return memcpy(dest,src,size); } + +#include <assert.h> /* assert */ + +#define XXH_STATIC_LINKING_ONLY +#include "xxhash.h" + + +/* ************************************* +* Compiler Specific Options +***************************************/ +#ifdef _MSC_VER /* Visual Studio */ +# pragma warning(disable : 4127) /* disable: C4127: conditional expression is constant */ +# define FORCE_INLINE static __forceinline +#else +# if defined (__cplusplus) || defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L /* C99 */ +# ifdef __GNUC__ +# define FORCE_INLINE static inline __attribute__((always_inline)) +# else +# define FORCE_INLINE static inline +# endif +# else +# define FORCE_INLINE static +# endif /* __STDC_VERSION__ */ +#endif + + +/* ************************************* +* Basic Types +***************************************/ +#ifndef MEM_MODULE +# if !defined (__VMS) \ + && (defined (__cplusplus) \ + || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) ) +# include <stdint.h> + typedef uint8_t BYTE; + typedef uint16_t U16; + typedef uint32_t U32; +# else + typedef unsigned char BYTE; + typedef unsigned short U16; + typedef unsigned int U32; +# endif +#endif + +#if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==2)) + +/* Force direct memory access. Only works on CPU which support unaligned memory access in hardware */ +static U32 XXH_read32(const void* memPtr) { return *(const U32*) memPtr; } + +#elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==1)) + +/* __pack instructions are safer, but compiler specific, hence potentially problematic for some compilers */ +/* currently only defined for gcc and icc */ +typedef union { U32 u32; } __attribute__((packed)) unalign; +static U32 XXH_read32(const void* ptr) { return ((const unalign*)ptr)->u32; } + +#else + +/* portable and safe solution. 
Generally efficient. + * see : http://stackoverflow.com/a/32095106/646947 + */ +static U32 XXH_read32(const void* memPtr) +{ + U32 val; + memcpy(&val, memPtr, sizeof(val)); + return val; +} + +#endif /* XXH_FORCE_DIRECT_MEMORY_ACCESS */ + + +/* **************************************** +* Compiler-specific Functions and Macros +******************************************/ +#define XXH_GCC_VERSION (__GNUC__ * 100 + __GNUC_MINOR__) + +/* Note : although _rotl exists for minGW (GCC under windows), performance seems poor */ +#if defined(_MSC_VER) +# define XXH_rotl32(x,r) _rotl(x,r) +# define XXH_rotl64(x,r) _rotl64(x,r) +#else +# define XXH_rotl32(x,r) ((x << r) | (x >> (32 - r))) +# define XXH_rotl64(x,r) ((x << r) | (x >> (64 - r))) +#endif + +#if defined(_MSC_VER) /* Visual Studio */ +# define XXH_swap32 _byteswap_ulong +#elif XXH_GCC_VERSION >= 403 +# define XXH_swap32 __builtin_bswap32 +#else +static U32 XXH_swap32 (U32 x) +{ + return ((x << 24) & 0xff000000 ) | + ((x << 8) & 0x00ff0000 ) | + ((x >> 8) & 0x0000ff00 ) | + ((x >> 24) & 0x000000ff ); +} +#endif + + +/* ************************************* +* Architecture Macros +***************************************/ +typedef enum { XXH_bigEndian=0, XXH_littleEndian=1 } XXH_endianess; + +/* XXH_CPU_LITTLE_ENDIAN can be defined externally, for example on the compiler command line */ +#ifndef XXH_CPU_LITTLE_ENDIAN +static int XXH_isLittleEndian(void) +{ + const union { U32 u; BYTE c[4]; } one = { 1 }; /* don't use static : performance detrimental */ + return one.c[0]; +} +# define XXH_CPU_LITTLE_ENDIAN XXH_isLittleEndian() +#endif + + +/* *************************** +* Memory reads +*****************************/ +typedef enum { XXH_aligned, XXH_unaligned } XXH_alignment; + +FORCE_INLINE U32 XXH_readLE32_align(const void* ptr, XXH_endianess endian, XXH_alignment align) +{ + if (align==XXH_unaligned) + return endian==XXH_littleEndian ? 
XXH_read32(ptr) : XXH_swap32(XXH_read32(ptr)); + else + return endian==XXH_littleEndian ? *(const U32*)ptr : XXH_swap32(*(const U32*)ptr); +} + +FORCE_INLINE U32 XXH_readLE32(const void* ptr, XXH_endianess endian) +{ + return XXH_readLE32_align(ptr, endian, XXH_unaligned); +} + +static U32 XXH_readBE32(const void* ptr) +{ + return XXH_CPU_LITTLE_ENDIAN ? XXH_swap32(XXH_read32(ptr)) : XXH_read32(ptr); +} + + +/* ************************************* +* Macros +***************************************/ +#define XXH_STATIC_ASSERT(c) { enum { XXH_sa = 1/(int)(!!(c)) }; } /* use after variable declarations */ +XXH_PUBLIC_API unsigned XXH_versionNumber (void) { return XXH_VERSION_NUMBER; } + + +/* ******************************************************************* +* 32-bit hash functions +*********************************************************************/ +static const U32 PRIME32_1 = 2654435761U; +static const U32 PRIME32_2 = 2246822519U; +static const U32 PRIME32_3 = 3266489917U; +static const U32 PRIME32_4 = 668265263U; +static const U32 PRIME32_5 = 374761393U; + +static U32 XXH32_round(U32 seed, U32 input) +{ + seed += input * PRIME32_2; + seed = XXH_rotl32(seed, 13); + seed *= PRIME32_1; + return seed; +} + +/* mix all bits */ +static U32 XXH32_avalanche(U32 h32) +{ + h32 ^= h32 >> 15; + h32 *= PRIME32_2; + h32 ^= h32 >> 13; + h32 *= PRIME32_3; + h32 ^= h32 >> 16; + return(h32); +} + +#define XXH_get32bits(p) XXH_readLE32_align(p, endian, align) + +static U32 +XXH32_finalize(U32 h32, const void* ptr, size_t len, + XXH_endianess endian, XXH_alignment align) + +{ + const BYTE* p = (const BYTE*)ptr; + +#define PROCESS1 \ + h32 += (*p++) * PRIME32_5; \ + h32 = XXH_rotl32(h32, 11) * PRIME32_1 ; + +#define PROCESS4 \ + h32 += XXH_get32bits(p) * PRIME32_3; \ + p+=4; \ + h32 = XXH_rotl32(h32, 17) * PRIME32_4 ; + + switch(len&15) /* or switch(bEnd - p) */ + { + case 12: PROCESS4; + /* fallthrough */ + case 8: PROCESS4; + /* fallthrough */ + case 4: PROCESS4; + return 
XXH32_avalanche(h32); + + case 13: PROCESS4; + /* fallthrough */ + case 9: PROCESS4; + /* fallthrough */ + case 5: PROCESS4; + PROCESS1; + return XXH32_avalanche(h32); + + case 14: PROCESS4; + /* fallthrough */ + case 10: PROCESS4; + /* fallthrough */ + case 6: PROCESS4; + PROCESS1; + PROCESS1; + return XXH32_avalanche(h32); + + case 15: PROCESS4; + /* fallthrough */ + case 11: PROCESS4; + /* fallthrough */ + case 7: PROCESS4; + /* fallthrough */ + case 3: PROCESS1; + /* fallthrough */ + case 2: PROCESS1; + /* fallthrough */ + case 1: PROCESS1; + /* fallthrough */ + case 0: return XXH32_avalanche(h32); + } + assert(0); + return h32; /* reaching this point is deemed impossible */ +} + + +FORCE_INLINE U32 +XXH32_endian_align(const void* input, size_t len, U32 seed, + XXH_endianess endian, XXH_alignment align) +{ + const BYTE* p = (const BYTE*)input; + const BYTE* bEnd = p + len; + U32 h32; + +#if defined(XXH_ACCEPT_NULL_INPUT_POINTER) && (XXH_ACCEPT_NULL_INPUT_POINTER>=1) + if (p==NULL) { + len=0; + bEnd=p=(const BYTE*)(size_t)16; + } +#endif + + if (len>=16) { + const BYTE* const limit = bEnd - 15; + U32 v1 = seed + PRIME32_1 + PRIME32_2; + U32 v2 = seed + PRIME32_2; + U32 v3 = seed + 0; + U32 v4 = seed - PRIME32_1; + + do { + v1 = XXH32_round(v1, XXH_get32bits(p)); p+=4; + v2 = XXH32_round(v2, XXH_get32bits(p)); p+=4; + v3 = XXH32_round(v3, XXH_get32bits(p)); p+=4; + v4 = XXH32_round(v4, XXH_get32bits(p)); p+=4; + } while (p < limit); + + h32 = XXH_rotl32(v1, 1) + XXH_rotl32(v2, 7) + + XXH_rotl32(v3, 12) + XXH_rotl32(v4, 18); + } else { + h32 = seed + PRIME32_5; + } + + h32 += (U32)len; + + return XXH32_finalize(h32, p, len&15, endian, align); +} + + +XXH_PUBLIC_API unsigned int XXH32 (const void* input, size_t len, unsigned int seed) +{ +#if 0 + /* Simple version, good for code maintenance, but unfortunately slow for small inputs */ + XXH32_state_t state; + XXH32_reset(&state, seed); + XXH32_update(&state, input, len); + return XXH32_digest(&state); +#else + 
XXH_endianess endian_detected = (XXH_endianess)XXH_CPU_LITTLE_ENDIAN; + + if (XXH_FORCE_ALIGN_CHECK) { + if ((((size_t)input) & 3) == 0) { /* Input is 4-bytes aligned, leverage the speed benefit */ + if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT) + return XXH32_endian_align(input, len, seed, XXH_littleEndian, XXH_aligned); + else + return XXH32_endian_align(input, len, seed, XXH_bigEndian, XXH_aligned); + } } + + if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT) + return XXH32_endian_align(input, len, seed, XXH_littleEndian, XXH_unaligned); + else + return XXH32_endian_align(input, len, seed, XXH_bigEndian, XXH_unaligned); +#endif +} + + + +/*====== Hash streaming ======*/ + +XXH_PUBLIC_API XXH32_state_t* XXH32_createState(void) +{ + return (XXH32_state_t*)XXH_malloc(sizeof(XXH32_state_t)); +} +XXH_PUBLIC_API XXH_errorcode XXH32_freeState(XXH32_state_t* statePtr) +{ + XXH_free(statePtr); + return XXH_OK; +} + +XXH_PUBLIC_API void XXH32_copyState(XXH32_state_t* dstState, const XXH32_state_t* srcState) +{ + memcpy(dstState, srcState, sizeof(*dstState)); +} + +XXH_PUBLIC_API XXH_errorcode XXH32_reset(XXH32_state_t* statePtr, unsigned int seed) +{ + XXH32_state_t state; /* using a local state to memcpy() in order to avoid strict-aliasing warnings */ + memset(&state, 0, sizeof(state)); + state.v1 = seed + PRIME32_1 + PRIME32_2; + state.v2 = seed + PRIME32_2; + state.v3 = seed + 0; + state.v4 = seed - PRIME32_1; + /* do not write into reserved, planned to be removed in a future version */ + memcpy(statePtr, &state, sizeof(state) - sizeof(state.reserved)); + return XXH_OK; +} + + +FORCE_INLINE XXH_errorcode +XXH32_update_endian(XXH32_state_t* state, const void* input, size_t len, XXH_endianess endian) +{ + if (input==NULL) +#if defined(XXH_ACCEPT_NULL_INPUT_POINTER) && (XXH_ACCEPT_NULL_INPUT_POINTER>=1) + return XXH_OK; +#else + return XXH_ERROR; +#endif + + { const BYTE* p = (const BYTE*)input; + const BYTE* const bEnd = p + len; + 
+ state->total_len_32 += (unsigned)len; + state->large_len |= (len>=16) | (state->total_len_32>=16); + + if (state->memsize + len < 16) { /* fill in tmp buffer */ + XXH_memcpy((BYTE*)(state->mem32) + state->memsize, input, len); + state->memsize += (unsigned)len; + return XXH_OK; + } + + if (state->memsize) { /* some data left from previous update */ + XXH_memcpy((BYTE*)(state->mem32) + state->memsize, input, 16-state->memsize); + { const U32* p32 = state->mem32; + state->v1 = XXH32_round(state->v1, XXH_readLE32(p32, endian)); p32++; + state->v2 = XXH32_round(state->v2, XXH_readLE32(p32, endian)); p32++; + state->v3 = XXH32_round(state->v3, XXH_readLE32(p32, endian)); p32++; + state->v4 = XXH32_round(state->v4, XXH_readLE32(p32, endian)); + } + p += 16-state->memsize; + state->memsize = 0; + } + + if (p <= bEnd-16) { + const BYTE* const limit = bEnd - 16; + U32 v1 = state->v1; + U32 v2 = state->v2; + U32 v3 = state->v3; + U32 v4 = state->v4; + + do { + v1 = XXH32_round(v1, XXH_readLE32(p, endian)); p+=4; + v2 = XXH32_round(v2, XXH_readLE32(p, endian)); p+=4; + v3 = XXH32_round(v3, XXH_readLE32(p, endian)); p+=4; + v4 = XXH32_round(v4, XXH_readLE32(p, endian)); p+=4; + } while (p<=limit); + + state->v1 = v1; + state->v2 = v2; + state->v3 = v3; + state->v4 = v4; + } + + if (p < bEnd) { + XXH_memcpy(state->mem32, p, (size_t)(bEnd-p)); + state->memsize = (unsigned)(bEnd-p); + } + } + + return XXH_OK; +} + + +XXH_PUBLIC_API XXH_errorcode XXH32_update (XXH32_state_t* state_in, const void* input, size_t len) +{ + XXH_endianess endian_detected = (XXH_endianess)XXH_CPU_LITTLE_ENDIAN; + + if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT) + return XXH32_update_endian(state_in, input, len, XXH_littleEndian); + else + return XXH32_update_endian(state_in, input, len, XXH_bigEndian); +} + + +FORCE_INLINE U32 +XXH32_digest_endian (const XXH32_state_t* state, XXH_endianess endian) +{ + U32 h32; + + if (state->large_len) { + h32 = XXH_rotl32(state->v1, 1) + + 
XXH_rotl32(state->v2, 7) + + XXH_rotl32(state->v3, 12) + + XXH_rotl32(state->v4, 18); + } else { + h32 = state->v3 /* == seed */ + PRIME32_5; + } + + h32 += state->total_len_32; + + return XXH32_finalize(h32, state->mem32, state->memsize, endian, XXH_aligned); +} + + +XXH_PUBLIC_API unsigned int XXH32_digest (const XXH32_state_t* state_in) +{ + XXH_endianess endian_detected = (XXH_endianess)XXH_CPU_LITTLE_ENDIAN; + + if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT) + return XXH32_digest_endian(state_in, XXH_littleEndian); + else + return XXH32_digest_endian(state_in, XXH_bigEndian); +} + + +/*====== Canonical representation ======*/ + +/*! Default XXH result types are basic unsigned 32 and 64 bits. +* The canonical representation follows human-readable write convention, aka big-endian (large digits first). +* These functions allow transformation of hash result into and from its canonical format. +* This way, hash values can be written into a file or buffer, remaining comparable across different systems. 
+*/ + +XXH_PUBLIC_API void XXH32_canonicalFromHash(XXH32_canonical_t* dst, XXH32_hash_t hash) +{ + XXH_STATIC_ASSERT(sizeof(XXH32_canonical_t) == sizeof(XXH32_hash_t)); + if (XXH_CPU_LITTLE_ENDIAN) hash = XXH_swap32(hash); + memcpy(dst, &hash, sizeof(*dst)); +} + +XXH_PUBLIC_API XXH32_hash_t XXH32_hashFromCanonical(const XXH32_canonical_t* src) +{ + return XXH_readBE32(src); +} + + +#ifndef XXH_NO_LONG_LONG + +/* ******************************************************************* +* 64-bit hash functions +*********************************************************************/ + +/*====== Memory access ======*/ + +#ifndef MEM_MODULE +# define MEM_MODULE +# if !defined (__VMS) \ + && (defined (__cplusplus) \ + || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) ) +# include <stdint.h> + typedef uint64_t U64; +# else + /* if compiler doesn't support unsigned long long, replace by another 64-bit type */ + typedef unsigned long long U64; +# endif +#endif + + +#if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==2)) + +/* Force direct memory access. Only works on CPU which support unaligned memory access in hardware */ +static U64 XXH_read64(const void* memPtr) { return *(const U64*) memPtr; } + +#elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==1)) + +/* __pack instructions are safer, but compiler specific, hence potentially problematic for some compilers */ +/* currently only defined for gcc and icc */ +typedef union { U32 u32; U64 u64; } __attribute__((packed)) unalign64; +static U64 XXH_read64(const void* ptr) { return ((const unalign64*)ptr)->u64; } + +#else + +/* portable and safe solution. Generally efficient. 
+ * see : http://stackoverflow.com/a/32095106/646947 + */ + +static U64 XXH_read64(const void* memPtr) +{ + U64 val; + memcpy(&val, memPtr, sizeof(val)); + return val; +} + +#endif /* XXH_FORCE_DIRECT_MEMORY_ACCESS */ + +#if defined(_MSC_VER) /* Visual Studio */ +# define XXH_swap64 _byteswap_uint64 +#elif XXH_GCC_VERSION >= 403 +# define XXH_swap64 __builtin_bswap64 +#else +static U64 XXH_swap64 (U64 x) +{ + return ((x << 56) & 0xff00000000000000ULL) | + ((x << 40) & 0x00ff000000000000ULL) | + ((x << 24) & 0x0000ff0000000000ULL) | + ((x << 8) & 0x000000ff00000000ULL) | + ((x >> 8) & 0x00000000ff000000ULL) | + ((x >> 24) & 0x0000000000ff0000ULL) | + ((x >> 40) & 0x000000000000ff00ULL) | + ((x >> 56) & 0x00000000000000ffULL); +} +#endif + +FORCE_INLINE U64 XXH_readLE64_align(const void* ptr, XXH_endianess endian, XXH_alignment align) +{ + if (align==XXH_unaligned) + return endian==XXH_littleEndian ? XXH_read64(ptr) : XXH_swap64(XXH_read64(ptr)); + else + return endian==XXH_littleEndian ? *(const U64*)ptr : XXH_swap64(*(const U64*)ptr); +} + +FORCE_INLINE U64 XXH_readLE64(const void* ptr, XXH_endianess endian) +{ + return XXH_readLE64_align(ptr, endian, XXH_unaligned); +} + +static U64 XXH_readBE64(const void* ptr) +{ + return XXH_CPU_LITTLE_ENDIAN ? 
XXH_swap64(XXH_read64(ptr)) : XXH_read64(ptr); +} + + +/*====== xxh64 ======*/ + +static const U64 PRIME64_1 = 11400714785074694791ULL; +static const U64 PRIME64_2 = 14029467366897019727ULL; +static const U64 PRIME64_3 = 1609587929392839161ULL; +static const U64 PRIME64_4 = 9650029242287828579ULL; +static const U64 PRIME64_5 = 2870177450012600261ULL; + +static U64 XXH64_round(U64 acc, U64 input) +{ + acc += input * PRIME64_2; + acc = XXH_rotl64(acc, 31); + acc *= PRIME64_1; + return acc; +} + +static U64 XXH64_mergeRound(U64 acc, U64 val) +{ + val = XXH64_round(0, val); + acc ^= val; + acc = acc * PRIME64_1 + PRIME64_4; + return acc; +} + +static U64 XXH64_avalanche(U64 h64) +{ + h64 ^= h64 >> 33; + h64 *= PRIME64_2; + h64 ^= h64 >> 29; + h64 *= PRIME64_3; + h64 ^= h64 >> 32; + return h64; +} + + +#define XXH_get64bits(p) XXH_readLE64_align(p, endian, align) + +static U64 +XXH64_finalize(U64 h64, const void* ptr, size_t len, + XXH_endianess endian, XXH_alignment align) +{ + const BYTE* p = (const BYTE*)ptr; + +#define PROCESS1_64 \ + h64 ^= (*p++) * PRIME64_5; \ + h64 = XXH_rotl64(h64, 11) * PRIME64_1; + +#define PROCESS4_64 \ + h64 ^= (U64)(XXH_get32bits(p)) * PRIME64_1; \ + p+=4; \ + h64 = XXH_rotl64(h64, 23) * PRIME64_2 + PRIME64_3; + +#define PROCESS8_64 { \ + U64 const k1 = XXH64_round(0, XXH_get64bits(p)); \ + p+=8; \ + h64 ^= k1; \ + h64 = XXH_rotl64(h64,27) * PRIME64_1 + PRIME64_4; \ +} + + switch(len&31) { + case 24: PROCESS8_64; + /* fallthrough */ + case 16: PROCESS8_64; + /* fallthrough */ + case 8: PROCESS8_64; + return XXH64_avalanche(h64); + + case 28: PROCESS8_64; + /* fallthrough */ + case 20: PROCESS8_64; + /* fallthrough */ + case 12: PROCESS8_64; + /* fallthrough */ + case 4: PROCESS4_64; + return XXH64_avalanche(h64); + + case 25: PROCESS8_64; + /* fallthrough */ + case 17: PROCESS8_64; + /* fallthrough */ + case 9: PROCESS8_64; + PROCESS1_64; + return XXH64_avalanche(h64); + + case 29: PROCESS8_64; + /* fallthrough */ + case 21: PROCESS8_64; + 
/* fallthrough */ + case 13: PROCESS8_64; + /* fallthrough */ + case 5: PROCESS4_64; + PROCESS1_64; + return XXH64_avalanche(h64); + + case 26: PROCESS8_64; + /* fallthrough */ + case 18: PROCESS8_64; + /* fallthrough */ + case 10: PROCESS8_64; + PROCESS1_64; + PROCESS1_64; + return XXH64_avalanche(h64); + + case 30: PROCESS8_64; + /* fallthrough */ + case 22: PROCESS8_64; + /* fallthrough */ + case 14: PROCESS8_64; + /* fallthrough */ + case 6: PROCESS4_64; + PROCESS1_64; + PROCESS1_64; + return XXH64_avalanche(h64); + + case 27: PROCESS8_64; + /* fallthrough */ + case 19: PROCESS8_64; + /* fallthrough */ + case 11: PROCESS8_64; + PROCESS1_64; + PROCESS1_64; + PROCESS1_64; + return XXH64_avalanche(h64); + + case 31: PROCESS8_64; + /* fallthrough */ + case 23: PROCESS8_64; + /* fallthrough */ + case 15: PROCESS8_64; + /* fallthrough */ + case 7: PROCESS4_64; + /* fallthrough */ + case 3: PROCESS1_64; + /* fallthrough */ + case 2: PROCESS1_64; + /* fallthrough */ + case 1: PROCESS1_64; + /* fallthrough */ + case 0: return XXH64_avalanche(h64); + } + + /* impossible to reach */ + assert(0); + return 0; /* unreachable, but some compilers complain without it */ +} + +FORCE_INLINE U64 +XXH64_endian_align(const void* input, size_t len, U64 seed, + XXH_endianess endian, XXH_alignment align) +{ + const BYTE* p = (const BYTE*)input; + const BYTE* bEnd = p + len; + U64 h64; + +#if defined(XXH_ACCEPT_NULL_INPUT_POINTER) && (XXH_ACCEPT_NULL_INPUT_POINTER>=1) + if (p==NULL) { + len=0; + bEnd=p=(const BYTE*)(size_t)32; + } +#endif + + if (len>=32) { + const BYTE* const limit = bEnd - 32; + U64 v1 = seed + PRIME64_1 + PRIME64_2; + U64 v2 = seed + PRIME64_2; + U64 v3 = seed + 0; + U64 v4 = seed - PRIME64_1; + + do { + v1 = XXH64_round(v1, XXH_get64bits(p)); p+=8; + v2 = XXH64_round(v2, XXH_get64bits(p)); p+=8; + v3 = XXH64_round(v3, XXH_get64bits(p)); p+=8; + v4 = XXH64_round(v4, XXH_get64bits(p)); p+=8; + } while (p<=limit); + + h64 = XXH_rotl64(v1, 1) + XXH_rotl64(v2, 7) + 
XXH_rotl64(v3, 12) + XXH_rotl64(v4, 18); + h64 = XXH64_mergeRound(h64, v1); + h64 = XXH64_mergeRound(h64, v2); + h64 = XXH64_mergeRound(h64, v3); + h64 = XXH64_mergeRound(h64, v4); + + } else { + h64 = seed + PRIME64_5; + } + + h64 += (U64) len; + + return XXH64_finalize(h64, p, len, endian, align); +} + + +XXH_PUBLIC_API unsigned long long XXH64 (const void* input, size_t len, unsigned long long seed) +{ +#if 0 + /* Simple version, good for code maintenance, but unfortunately slow for small inputs */ + XXH64_state_t state; + XXH64_reset(&state, seed); + XXH64_update(&state, input, len); + return XXH64_digest(&state); +#else + XXH_endianess endian_detected = (XXH_endianess)XXH_CPU_LITTLE_ENDIAN; + + if (XXH_FORCE_ALIGN_CHECK) { + if ((((size_t)input) & 7)==0) { /* Input is aligned, let's leverage the speed advantage */ + if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT) + return XXH64_endian_align(input, len, seed, XXH_littleEndian, XXH_aligned); + else + return XXH64_endian_align(input, len, seed, XXH_bigEndian, XXH_aligned); + } } + + if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT) + return XXH64_endian_align(input, len, seed, XXH_littleEndian, XXH_unaligned); + else + return XXH64_endian_align(input, len, seed, XXH_bigEndian, XXH_unaligned); +#endif +} + +/*====== Hash Streaming ======*/ + +XXH_PUBLIC_API XXH64_state_t* XXH64_createState(void) +{ + return (XXH64_state_t*)XXH_malloc(sizeof(XXH64_state_t)); +} +XXH_PUBLIC_API XXH_errorcode XXH64_freeState(XXH64_state_t* statePtr) +{ + XXH_free(statePtr); + return XXH_OK; +} + +XXH_PUBLIC_API void XXH64_copyState(XXH64_state_t* dstState, const XXH64_state_t* srcState) +{ + memcpy(dstState, srcState, sizeof(*dstState)); +} + +XXH_PUBLIC_API XXH_errorcode XXH64_reset(XXH64_state_t* statePtr, unsigned long long seed) +{ + XXH64_state_t state; /* using a local state to memcpy() in order to avoid strict-aliasing warnings */ + memset(&state, 0, sizeof(state)); + state.v1 = seed + 
PRIME64_1 + PRIME64_2; + state.v2 = seed + PRIME64_2; + state.v3 = seed + 0; + state.v4 = seed - PRIME64_1; + /* do not write into reserved, planned to be removed in a future version */ + memcpy(statePtr, &state, sizeof(state) - sizeof(state.reserved)); + return XXH_OK; +} + +FORCE_INLINE XXH_errorcode +XXH64_update_endian (XXH64_state_t* state, const void* input, size_t len, XXH_endianess endian) +{ + if (input==NULL) +#if defined(XXH_ACCEPT_NULL_INPUT_POINTER) && (XXH_ACCEPT_NULL_INPUT_POINTER>=1) + return XXH_OK; +#else + return XXH_ERROR; +#endif + + { const BYTE* p = (const BYTE*)input; + const BYTE* const bEnd = p + len; + + state->total_len += len; + + if (state->memsize + len < 32) { /* fill in tmp buffer */ + XXH_memcpy(((BYTE*)state->mem64) + state->memsize, input, len); + state->memsize += (U32)len; + return XXH_OK; + } + + if (state->memsize) { /* tmp buffer is full */ + XXH_memcpy(((BYTE*)state->mem64) + state->memsize, input, 32-state->memsize); + state->v1 = XXH64_round(state->v1, XXH_readLE64(state->mem64+0, endian)); + state->v2 = XXH64_round(state->v2, XXH_readLE64(state->mem64+1, endian)); + state->v3 = XXH64_round(state->v3, XXH_readLE64(state->mem64+2, endian)); + state->v4 = XXH64_round(state->v4, XXH_readLE64(state->mem64+3, endian)); + p += 32-state->memsize; + state->memsize = 0; + } + + if (p+32 <= bEnd) { + const BYTE* const limit = bEnd - 32; + U64 v1 = state->v1; + U64 v2 = state->v2; + U64 v3 = state->v3; + U64 v4 = state->v4; + + do { + v1 = XXH64_round(v1, XXH_readLE64(p, endian)); p+=8; + v2 = XXH64_round(v2, XXH_readLE64(p, endian)); p+=8; + v3 = XXH64_round(v3, XXH_readLE64(p, endian)); p+=8; + v4 = XXH64_round(v4, XXH_readLE64(p, endian)); p+=8; + } while (p<=limit); + + state->v1 = v1; + state->v2 = v2; + state->v3 = v3; + state->v4 = v4; + } + + if (p < bEnd) { + XXH_memcpy(state->mem64, p, (size_t)(bEnd-p)); + state->memsize = (unsigned)(bEnd-p); + } + } + + return XXH_OK; +} + +XXH_PUBLIC_API XXH_errorcode XXH64_update 
(XXH64_state_t* state_in, const void* input, size_t len) +{ + XXH_endianess endian_detected = (XXH_endianess)XXH_CPU_LITTLE_ENDIAN; + + if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT) + return XXH64_update_endian(state_in, input, len, XXH_littleEndian); + else + return XXH64_update_endian(state_in, input, len, XXH_bigEndian); +} + +FORCE_INLINE U64 XXH64_digest_endian (const XXH64_state_t* state, XXH_endianess endian) +{ + U64 h64; + + if (state->total_len >= 32) { + U64 const v1 = state->v1; + U64 const v2 = state->v2; + U64 const v3 = state->v3; + U64 const v4 = state->v4; + + h64 = XXH_rotl64(v1, 1) + XXH_rotl64(v2, 7) + XXH_rotl64(v3, 12) + XXH_rotl64(v4, 18); + h64 = XXH64_mergeRound(h64, v1); + h64 = XXH64_mergeRound(h64, v2); + h64 = XXH64_mergeRound(h64, v3); + h64 = XXH64_mergeRound(h64, v4); + } else { + h64 = state->v3 /*seed*/ + PRIME64_5; + } + + h64 += (U64) state->total_len; + + return XXH64_finalize(h64, state->mem64, (size_t)state->total_len, endian, XXH_aligned); +} + +XXH_PUBLIC_API unsigned long long XXH64_digest (const XXH64_state_t* state_in) +{ + XXH_endianess endian_detected = (XXH_endianess)XXH_CPU_LITTLE_ENDIAN; + + if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT) + return XXH64_digest_endian(state_in, XXH_littleEndian); + else + return XXH64_digest_endian(state_in, XXH_bigEndian); +} + + +/*====== Canonical representation ======*/ + +XXH_PUBLIC_API void XXH64_canonicalFromHash(XXH64_canonical_t* dst, XXH64_hash_t hash) +{ + XXH_STATIC_ASSERT(sizeof(XXH64_canonical_t) == sizeof(XXH64_hash_t)); + if (XXH_CPU_LITTLE_ENDIAN) hash = XXH_swap64(hash); + memcpy(dst, &hash, sizeof(*dst)); +} + +XXH_PUBLIC_API XXH64_hash_t XXH64_hashFromCanonical(const XXH64_canonical_t* src) +{ + return XXH_readBE64(src); +} + +#endif /* XXH_NO_LONG_LONG */ diff --git a/libbutl/xxhash.h b/libbutl/xxhash.h new file mode 100644 index 0000000..d6bad94 --- /dev/null +++ b/libbutl/xxhash.h @@ -0,0 +1,328 @@ +/* + xxHash - 
Extremely Fast Hash algorithm + Header File + Copyright (C) 2012-2016, Yann Collet. + + BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following disclaimer + in the documentation and/or other materials provided with the + distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + You can contact the author at : + - xxHash source repository : https://github.com/Cyan4973/xxHash +*/ + +/* Notice extracted from xxHash homepage : + +xxHash is an extremely fast Hash algorithm, running at RAM speed limits. +It also successfully passes all tests from the SMHasher suite. 
+ +Comparison (single thread, Windows Seven 32 bits, using SMHasher on a Core 2 Duo @3GHz) + +Name Speed Q.Score Author +xxHash 5.4 GB/s 10 +CrapWow 3.2 GB/s 2 Andrew +MumurHash 3a 2.7 GB/s 10 Austin Appleby +SpookyHash 2.0 GB/s 10 Bob Jenkins +SBox 1.4 GB/s 9 Bret Mulvey +Lookup3 1.2 GB/s 9 Bob Jenkins +SuperFastHash 1.2 GB/s 1 Paul Hsieh +CityHash64 1.05 GB/s 10 Pike & Alakuijala +FNV 0.55 GB/s 5 Fowler, Noll, Vo +CRC32 0.43 GB/s 9 +MD5-32 0.33 GB/s 10 Ronald L. Rivest +SHA1-32 0.28 GB/s 10 + +Q.Score is a measure of quality of the hash function. +It depends on successfully passing SMHasher test set. +10 is a perfect score. + +A 64-bit version, named XXH64, is available since r35. +It offers much better speed, but for 64-bit applications only. +Name Speed on 64 bits Speed on 32 bits +XXH64 13.8 GB/s 1.9 GB/s +XXH32 6.8 GB/s 6.0 GB/s +*/ + +#ifndef XXHASH_H_5627135585666179 +#define XXHASH_H_5627135585666179 1 + +#if defined (__cplusplus) +extern "C" { +#endif + + +/* **************************** +* Definitions +******************************/ +#include <stddef.h> /* size_t */ +typedef enum { XXH_OK=0, XXH_ERROR } XXH_errorcode; + + +/* **************************** + * API modifier + ******************************/ +/** XXH_INLINE_ALL (and XXH_PRIVATE_API) + * This is useful to include xxhash functions in `static` mode + * in order to inline them, and remove their symbol from the public list. + * Inlining can offer dramatic performance improvement on small keys. + * Methodology : + * #define XXH_INLINE_ALL + * #include "xxhash.h" + * `xxhash.c` is automatically included. + * It's not useful to compile and link it as a separate module. 
+ */ +#if defined(XXH_INLINE_ALL) || defined(XXH_PRIVATE_API) +# ifndef XXH_STATIC_LINKING_ONLY +# define XXH_STATIC_LINKING_ONLY +# endif +# if defined(__GNUC__) +# define XXH_PUBLIC_API static __inline __attribute__((unused)) +# elif defined (__cplusplus) || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) +# define XXH_PUBLIC_API static inline +# elif defined(_MSC_VER) +# define XXH_PUBLIC_API static __inline +# else + /* this version may generate warnings for unused static functions */ +# define XXH_PUBLIC_API static +# endif +#else +# define XXH_PUBLIC_API /* do nothing */ +#endif /* XXH_INLINE_ALL || XXH_PRIVATE_API */ + +/*! XXH_NAMESPACE, aka Namespace Emulation : + * + * If you want to include _and expose_ xxHash functions from within your own library, + * but also want to avoid symbol collisions with other libraries which may also include xxHash, + * + * you can use XXH_NAMESPACE, to automatically prefix any public symbol from xxhash library + * with the value of XXH_NAMESPACE (therefore, avoid NULL and numeric values). + * + * Note that no change is required within the calling program as long as it includes `xxhash.h` : + * regular symbol name will be automatically translated by this header. 
+ */ +#ifdef XXH_NAMESPACE +# define XXH_CAT(A,B) A##B +# define XXH_NAME2(A,B) XXH_CAT(A,B) +# define XXH_versionNumber XXH_NAME2(XXH_NAMESPACE, XXH_versionNumber) +# define XXH32 XXH_NAME2(XXH_NAMESPACE, XXH32) +# define XXH32_createState XXH_NAME2(XXH_NAMESPACE, XXH32_createState) +# define XXH32_freeState XXH_NAME2(XXH_NAMESPACE, XXH32_freeState) +# define XXH32_reset XXH_NAME2(XXH_NAMESPACE, XXH32_reset) +# define XXH32_update XXH_NAME2(XXH_NAMESPACE, XXH32_update) +# define XXH32_digest XXH_NAME2(XXH_NAMESPACE, XXH32_digest) +# define XXH32_copyState XXH_NAME2(XXH_NAMESPACE, XXH32_copyState) +# define XXH32_canonicalFromHash XXH_NAME2(XXH_NAMESPACE, XXH32_canonicalFromHash) +# define XXH32_hashFromCanonical XXH_NAME2(XXH_NAMESPACE, XXH32_hashFromCanonical) +# define XXH64 XXH_NAME2(XXH_NAMESPACE, XXH64) +# define XXH64_createState XXH_NAME2(XXH_NAMESPACE, XXH64_createState) +# define XXH64_freeState XXH_NAME2(XXH_NAMESPACE, XXH64_freeState) +# define XXH64_reset XXH_NAME2(XXH_NAMESPACE, XXH64_reset) +# define XXH64_update XXH_NAME2(XXH_NAMESPACE, XXH64_update) +# define XXH64_digest XXH_NAME2(XXH_NAMESPACE, XXH64_digest) +# define XXH64_copyState XXH_NAME2(XXH_NAMESPACE, XXH64_copyState) +# define XXH64_canonicalFromHash XXH_NAME2(XXH_NAMESPACE, XXH64_canonicalFromHash) +# define XXH64_hashFromCanonical XXH_NAME2(XXH_NAMESPACE, XXH64_hashFromCanonical) +#endif + + +/* ************************************* +* Version +***************************************/ +#define XXH_VERSION_MAJOR 0 +#define XXH_VERSION_MINOR 6 +#define XXH_VERSION_RELEASE 5 +#define XXH_VERSION_NUMBER (XXH_VERSION_MAJOR *100*100 + XXH_VERSION_MINOR *100 + XXH_VERSION_RELEASE) +XXH_PUBLIC_API unsigned XXH_versionNumber (void); + + +/*-********************************************************************** +* 32-bit hash +************************************************************************/ +typedef unsigned int XXH32_hash_t; + +/*! 
XXH32() : + Calculate the 32-bit hash of sequence "length" bytes stored at memory address "input". + The memory between input & input+length must be valid (allocated and read-accessible). + "seed" can be used to alter the result predictably. + Speed on Core 2 Duo @ 3 GHz (single thread, SMHasher benchmark) : 5.4 GB/s */ +XXH_PUBLIC_API XXH32_hash_t XXH32 (const void* input, size_t length, unsigned int seed); + +/*====== Streaming ======*/ +typedef struct XXH32_state_s XXH32_state_t; /* incomplete type */ +XXH_PUBLIC_API XXH32_state_t* XXH32_createState(void); +XXH_PUBLIC_API XXH_errorcode XXH32_freeState(XXH32_state_t* statePtr); +XXH_PUBLIC_API void XXH32_copyState(XXH32_state_t* dst_state, const XXH32_state_t* src_state); + +XXH_PUBLIC_API XXH_errorcode XXH32_reset (XXH32_state_t* statePtr, unsigned int seed); +XXH_PUBLIC_API XXH_errorcode XXH32_update (XXH32_state_t* statePtr, const void* input, size_t length); +XXH_PUBLIC_API XXH32_hash_t XXH32_digest (const XXH32_state_t* statePtr); + +/* + * Streaming functions generate the xxHash of an input provided in multiple segments. + * Note that, for small input, they are slower than single-call functions, due to state management. + * For small inputs, prefer `XXH32()` and `XXH64()`, which are better optimized. + * + * XXH state must first be allocated, using XXH*_createState() . + * + * Start a new hash by initializing state with a seed, using XXH*_reset(). + * + * Then, feed the hash state by calling XXH*_update() as many times as necessary. + * The function returns an error code, with 0 meaning OK, and any other value meaning there is an error. + * + * Finally, a hash value can be produced anytime, by using XXH*_digest(). + * This function returns the nn-bits hash as an int or long long. + * + * It's still possible to continue inserting input into the hash state after a digest, + * and generate some new hashes later on, by calling again XXH*_digest(). 
+ * + * When done, free XXH state space if it was allocated dynamically. + */ + +/*====== Canonical representation ======*/ + +typedef struct { unsigned char digest[4]; } XXH32_canonical_t; +XXH_PUBLIC_API void XXH32_canonicalFromHash(XXH32_canonical_t* dst, XXH32_hash_t hash); +XXH_PUBLIC_API XXH32_hash_t XXH32_hashFromCanonical(const XXH32_canonical_t* src); + +/* Default result type for XXH functions are primitive unsigned 32 and 64 bits. + * The canonical representation uses human-readable write convention, aka big-endian (large digits first). + * These functions allow transformation of hash result into and from its canonical format. + * This way, hash values can be written into a file / memory, and remain comparable on different systems and programs. + */ + + +#ifndef XXH_NO_LONG_LONG +/*-********************************************************************** +* 64-bit hash +************************************************************************/ +typedef unsigned long long XXH64_hash_t; + +/*! XXH64() : + Calculate the 64-bit hash of sequence of length "len" stored at memory address "input". + "seed" can be used to alter the result predictably. + This function runs faster on 64-bit systems, but slower on 32-bit systems (see benchmark). 
+*/ +XXH_PUBLIC_API XXH64_hash_t XXH64 (const void* input, size_t length, unsigned long long seed); + +/*====== Streaming ======*/ +typedef struct XXH64_state_s XXH64_state_t; /* incomplete type */ +XXH_PUBLIC_API XXH64_state_t* XXH64_createState(void); +XXH_PUBLIC_API XXH_errorcode XXH64_freeState(XXH64_state_t* statePtr); +XXH_PUBLIC_API void XXH64_copyState(XXH64_state_t* dst_state, const XXH64_state_t* src_state); + +XXH_PUBLIC_API XXH_errorcode XXH64_reset (XXH64_state_t* statePtr, unsigned long long seed); +XXH_PUBLIC_API XXH_errorcode XXH64_update (XXH64_state_t* statePtr, const void* input, size_t length); +XXH_PUBLIC_API XXH64_hash_t XXH64_digest (const XXH64_state_t* statePtr); + +/*====== Canonical representation ======*/ +typedef struct { unsigned char digest[8]; } XXH64_canonical_t; +XXH_PUBLIC_API void XXH64_canonicalFromHash(XXH64_canonical_t* dst, XXH64_hash_t hash); +XXH_PUBLIC_API XXH64_hash_t XXH64_hashFromCanonical(const XXH64_canonical_t* src); +#endif /* XXH_NO_LONG_LONG */ + + + +#ifdef XXH_STATIC_LINKING_ONLY + +/* ================================================================================================ + This section contains declarations which are not guaranteed to remain stable. + They may change in future versions, becoming incompatible with a different version of the library. + These declarations should only be used with static linking. + Never use them in association with dynamic linking ! +=================================================================================================== */ + +/* These definitions are only present to allow + * static allocation of XXH state, on stack or in a struct for example. + * Never **ever** use members directly. 
*/ + +#if !defined (__VMS) \ + && (defined (__cplusplus) \ + || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) ) +# include <stdint.h> + +struct XXH32_state_s { + uint32_t total_len_32; + uint32_t large_len; + uint32_t v1; + uint32_t v2; + uint32_t v3; + uint32_t v4; + uint32_t mem32[4]; + uint32_t memsize; + uint32_t reserved; /* never read nor write, might be removed in a future version */ +}; /* typedef'd to XXH32_state_t */ + +struct XXH64_state_s { + uint64_t total_len; + uint64_t v1; + uint64_t v2; + uint64_t v3; + uint64_t v4; + uint64_t mem64[4]; + uint32_t memsize; + uint32_t reserved[2]; /* never read nor write, might be removed in a future version */ +}; /* typedef'd to XXH64_state_t */ + +# else + +struct XXH32_state_s { + unsigned total_len_32; + unsigned large_len; + unsigned v1; + unsigned v2; + unsigned v3; + unsigned v4; + unsigned mem32[4]; + unsigned memsize; + unsigned reserved; /* never read nor write, might be removed in a future version */ +}; /* typedef'd to XXH32_state_t */ + +# ifndef XXH_NO_LONG_LONG /* remove 64-bit support */ +struct XXH64_state_s { + unsigned long long total_len; + unsigned long long v1; + unsigned long long v2; + unsigned long long v3; + unsigned long long v4; + unsigned long long mem64[4]; + unsigned memsize; + unsigned reserved[2]; /* never read nor write, might be removed in a future version */ +}; /* typedef'd to XXH64_state_t */ +# endif + +# endif + + +#if defined(XXH_INLINE_ALL) || defined(XXH_PRIVATE_API) +# include "xxhash.c" /* include xxhash function bodies as `static`, for inlining */ +#endif + +#endif /* XXH_STATIC_LINKING_ONLY */ + + +#if defined (__cplusplus) +} +#endif + +#endif /* XXHASH_H_5627135585666179 */ |