From 3efc32d82de64216410936b746d11ab4aaf561ec Mon Sep 17 00:00:00 2001 From: Boris Kolpackov Date: Tue, 9 Mar 2021 12:58:34 +0200 Subject: Add support for compressing/decompressing fdstreams with LZ4 Importing LZ4 code from version 1.9.3. --- libbutl/lz4.cxx | 411 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 411 insertions(+) create mode 100644 libbutl/lz4.cxx (limited to 'libbutl/lz4.cxx') diff --git a/libbutl/lz4.cxx b/libbutl/lz4.cxx new file mode 100644 index 0000000..54a1d82 --- /dev/null +++ b/libbutl/lz4.cxx @@ -0,0 +1,411 @@ +// file : libbutl/lz4.cxx -*- C++ -*- +// license : MIT; see accompanying LICENSE file + +#include + +// This careful macro dance makes sure that all the LZ4 C API functions are +// made static while making sure we include the headers in the same way as the +// implementation files that we include below. +// +#define LZ4LIB_VISIBILITY static +#define LZ4_STATIC_LINKING_ONLY +#define LZ4_PUBLISH_STATIC_FUNCTIONS +#define LZ4_DISABLE_DEPRECATE_WARNINGS +#include "lz4.h" +#include "lz4hc.h" + +#define LZ4FLIB_VISIBILITY static +#define LZ4F_STATIC_LINKING_ONLY +#define LZ4F_PUBLISH_STATIC_FUNCTIONS +#define LZ4F_DISABLE_DEPRECATE_WARNINGS +#include "lz4frame.h" + +#include // bad_alloc +#include // unique_ptr +#include // memcpy() +#include +#include // invalid_argument, logic_error + +#include // eos() + +using namespace std; + +namespace butl +{ + namespace lz4 + { + struct cctx + { + LZ4F_cctx* ctx; + + operator LZ4F_cctx* () const {return ctx;}; + + cctx () + { + if (LZ4F_isError (LZ4F_createCompressionContext (&ctx, LZ4F_VERSION))) + throw bad_alloc (); + } + + ~cctx () + { + LZ4F_errorCode_t e (LZ4F_freeCompressionContext (ctx)); + assert (!LZ4F_isError (e)); + } + }; + + struct dctx + { + LZ4F_dctx* ctx; + + operator LZ4F_dctx* () const {return ctx;}; + + dctx () + { + if (LZ4F_isError (LZ4F_createDecompressionContext (&ctx, LZ4F_VERSION))) + throw bad_alloc (); + } + + ~dctx () + { + LZ4F_errorCode_t e (LZ4F_freeDecompressionContext (ctx)); + assert (!LZ4F_isError (e)); + } + }; + + static inline size_t + block_size (LZ4F_blockSizeID_t id) + { + return (id == LZ4F_max4MB ? 4 * 1024 * 1024 : + id == LZ4F_max1MB ? 1 * 1024 * 1024 : + id == LZ4F_max256KB ? 256 * 1024 : + id == LZ4F_max64KB ? 64 * 1024 : 0); + } + + [[noreturn]] static void + throw_exception (LZ4F_errorCodes c) + { + using i = invalid_argument; + + switch (c) + { + case LZ4F_ERROR_GENERIC: throw i ("generic error"); + case LZ4F_ERROR_maxBlockSize_invalid: throw i ("invalid block size"); + case LZ4F_ERROR_blockMode_invalid: throw i ("invalid block mode"); + case LZ4F_ERROR_contentChecksumFlag_invalid: throw i ("invalid content checksum flag"); + case LZ4F_ERROR_compressionLevel_invalid: throw i ("invalid compression level"); + case LZ4F_ERROR_headerVersion_wrong: throw i ("wrong header version"); + case LZ4F_ERROR_blockChecksum_invalid: throw i ("invalid block checksum"); + case LZ4F_ERROR_reservedFlag_set: throw i ("reserved flag set"); + case LZ4F_ERROR_srcSize_tooLarge: throw i ("input too large"); + case LZ4F_ERROR_dstMaxSize_tooSmall: throw i ("output too small"); + case LZ4F_ERROR_frameHeader_incomplete: throw i ("incomplete frame header"); + case LZ4F_ERROR_frameType_unknown: throw i ("unknown frame type"); + case LZ4F_ERROR_frameSize_wrong: throw i ("wrong frame size"); + case LZ4F_ERROR_decompressionFailed: throw i ("invalid compressed content"); + case LZ4F_ERROR_headerChecksum_invalid: throw i ("invalid header checksum"); + case LZ4F_ERROR_contentChecksum_invalid: throw i ("invalid content checksum"); + + case LZ4F_ERROR_allocation_failed: throw bad_alloc (); + + // These seem to be programming errors. + // + case LZ4F_ERROR_srcPtr_wrong: // NULL pointer. + case LZ4F_ERROR_frameDecoding_alreadyStarted: // Incorrect call seq. + + // We should never get these. + // + case LZ4F_OK_NoError: + case LZ4F_ERROR_maxCode: + case _LZ4F_dummy_error_enum_for_c89_never_used: + break; + } + + assert (false); + throw logic_error (LZ4F_getErrorName ((LZ4F_errorCode_t)(-c))); + } + + // As above but for erroneous LZ4F_*() function result. + // + [[noreturn]] static inline void + throw_exception (size_t r) + { + throw_exception (LZ4F_getErrorCode (r)); + } + + // Return the compressed size. + // + uint64_t + compress (ofdstream& os, ifdstream& is, + int level, + int block_id, + optional content_size) + { + assert (block_id >= 4 && block_id <= 7); + + LZ4F_preferences_t prefs = LZ4F_INIT_PREFERENCES; + prefs.autoFlush = 1; + prefs.favorDecSpeed = 0; + prefs.compressionLevel = level; + prefs.frameInfo.blockMode = LZ4F_blockLinked; + prefs.frameInfo.blockSizeID = static_cast (block_id); + prefs.frameInfo.blockChecksumFlag = LZ4F_noBlockChecksum; + prefs.frameInfo.contentChecksumFlag = LZ4F_contentChecksumEnabled; + prefs.frameInfo.contentSize = + content_size ? static_cast (*content_size) : 0; + + // Input/output buffer capacities. + // + size_t ic (block_size (prefs.frameInfo.blockSizeID)); + size_t oc; + + // Input/output buffers. + // + unique_ptr ibg (new char[ic]); char* ib (ibg.get ()); + unique_ptr obg; char* ob; + + // Read into the input buffer returning the number of bytes read and + // updating the total read and the eof flag. + // + // Note that we could try to do direct fd read/write but that would + // complicate things quite a bit (error handling, stream state, etc). + // + uint64_t it (0); + bool eof (false); + auto read = [&is, ib, ic, &it, &eof] () -> size_t + { + eof = butl::eof (is.read (ib, ic)); + size_t n (static_cast (is.gcount ())); + it += n; + return n; + }; + + // Write the specified number of bytes from the output buffer updating + // the total written. + // + uint64_t ot (0); + auto write = [&os, &ob, &ot] (size_t n) + { + os.write (ob, static_cast (n)); + ot += n; + }; + + // Unlike the decompression case below, compression cannot fail due to + // invalid content. So any LZ4F_*() function failure is either due to a + // programming bug or argument inconsistencies (e.g., content size does + // not match actual). + + // To be binary compatible with the lz4 utility we have to compress + // files that fit into the block with a single LZ4F_compressFrame() + // call. + // + size_t in (read ()); + size_t on; + + if (eof && in < ic) // Should be really <= but that's not lz4-compatible. + { + oc = LZ4F_compressFrameBound (in, &prefs); + obg.reset ((ob = new char[oc])); + + on = LZ4F_compressFrame (ob, oc, ib, in, &prefs); + if (LZ4F_isError (on)) + throw_exception (on); + + write (on); + + // Verify specified and actual content sizes match. + // + // LZ4F_compressFrame() does not fail if it doesn't match instead + // replacing it with the actual value. + // + if (content_size && *content_size != it) + throw_exception (LZ4F_ERROR_frameSize_wrong); + } + else + { + cctx ctx; + + oc = LZ4F_compressBound (ic, &prefs); + obg.reset ((ob = new char[oc])); + + // Write the header. + // + on = LZ4F_compressBegin (ctx, ob, oc, &prefs); + if (LZ4F_isError (on)) + throw_exception (on); + + write (on); + + // Keep compressing, writing, and reading chunks of content. + // + for (;;) + { + on = LZ4F_compressUpdate (ctx, ob, oc, ib, in, nullptr); + if (LZ4F_isError (on)) + throw_exception (on); + + if (on != 0) // LZ4F_compressUpdate() may just buffer the data. + write (on); + + if (eof) + break; + + in = read (); + } + + // Write the end marker. + // + // Note that this call also verifies specified and actual content + // sizes match. + // + on = LZ4F_compressEnd (ctx, ob, oc, nullptr); + if (LZ4F_isError (on)) + throw_exception (on); + + write (on); + } + + return ot; + } + + uint64_t + decompress (ofdstream& os, ifdstream& is) + { + // The LZ4F_*() decompression functions return a hint of how much data + // they want on the next call. So the plan is to allocate the input + // buffer large enough to hold anything that can be asked for and then + // fill it in in the asked chunks. This way we avoid having to shift the + // unread data, etc. + // + dctx ctx; + + // Input/output buffer capacities and sizes. + // + size_t ic, oc; + size_t in, on; + + // Input/output buffers. + // + unique_ptr ibg; char* ib; + unique_ptr obg; char* ob; + + // Read into the specified buffer returning the number of bytes read and + // updating the eof flag. + // + bool eof (false); + auto read = [&is, &eof] (char* b, size_t c) -> size_t + { + size_t n (0); + do + { + eof = butl::eof (is.read (b + n, c - n)); + n += static_cast (is.gcount ()); + } + while (!eof && n != c); + + return n; + }; + + // Write the specified number of bytes from the output buffer updating + // the total written. + // + uint64_t ot (0); + auto write = [&os, &ob, &ot] (size_t n) + { + os.write (ob, static_cast (n)); + ot += n; + }; + + // Input hint and end as signalled by the LZ4F_*() functions. + // + size_t ih, ie; + + // Read the header. + // + LZ4F_frameInfo_t info = LZ4F_INIT_FRAMEINFO; + { + char hb[LZ4F_HEADER_SIZE_MAX]; + in = read (hb, sizeof (hb)); + + ih = LZ4F_getFrameInfo (ctx, &info, hb, &(ie = in)); + if (LZ4F_isError (ih)) + throw_exception (ih); + + // Use the block size for the output buffer capacity and compressed + // bound plus the header size for the input. The expectation is that + // LZ4F_decompress() should never hint for more than that. + // + oc = block_size (info.blockSizeID); + ic = LZ4F_compressBound (oc, nullptr) + LZ4F_BLOCK_HEADER_SIZE; + + assert (ih <= ic); + + ibg.reset ((ib = new char[ic])); + obg.reset ((ob = new char[oc])); + + // Copy over whatever is left in the header buffer and read up to + // the hinted size. + // + in -= ie; + memcpy (ib, hb + ie, in); + in += read (ib + in, ih - in); + } + + // Keep decompressing, writing, and reading chunks of compressed + // content. + // + // Note that LZ4F_decompress() verifies specified and actual content + // sizes match (similar to compression). + // + for (;;) + { + ih = LZ4F_decompress (ctx, ob, &(on = oc), ib, &(ie = in), nullptr); + if (LZ4F_isError (ih)) + throw_exception (ih); + + // We expect LZ4F_decompress() to consume what it asked for. + // + assert (ie == in); + + write (on); + + if (ih == 0) + break; + + if (eof) + throw invalid_argument ("incomplete compressed content"); + + assert (ih <= ic); + in = read (ib, ih); + } + + return ot; + } + } +} + +// Include the implementation into our translation unit. Let's keep it last +// since the implementation defines a bunch of macros. +// +#if defined(__clang__) || defined (__GNUC__) +# pragma GCC diagnostic push +# pragma GCC diagnostic ignored "-Wunused-function" +#endif + +// This header is only include in the implementation so we can include it +// here instead of the above. +// +#define XXH_PRIVATE_API // Makes API static and includes xxhash.c. +#include "xxhash.h" + +// Note that the order of inclusion is important (see *_SRC_INCLUDED macros). +// +extern "C" +{ +#include "lz4.c" +#include "lz4hc.c" +#include "lz4frame.c" +} + +#if defined(__clang__) || defined (__GNUC__) +# pragma GCC diagnostic pop +#endif -- cgit v1.1