diff options
Diffstat (limited to 'util/compress/libdeflate/lib/deflate_compress.c')
-rw-r--r-- | util/compress/libdeflate/lib/deflate_compress.c | 2854 |
1 files changed, 0 insertions, 2854 deletions
diff --git a/util/compress/libdeflate/lib/deflate_compress.c b/util/compress/libdeflate/lib/deflate_compress.c deleted file mode 100644 index cf4379824..000000000 --- a/util/compress/libdeflate/lib/deflate_compress.c +++ /dev/null @@ -1,2854 +0,0 @@ -/* - * deflate_compress.c - a compressor for DEFLATE - * - * Originally public domain; changes after 2016-09-07 are copyrighted. - * - * Copyright 2016 Eric Biggers - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -#include "deflate_compress.h" -#include "deflate_constants.h" -#include "unaligned.h" - -#include "libdeflate.h" - -/* - * By default, the near-optimal parsing algorithm is enabled at compression - * level 8 and above. The near-optimal parsing algorithm produces a compression - * ratio significantly better than the greedy and lazy algorithms implemented - * here, and also the algorithm used by zlib at level 9. However, it is slow. - */ -#define SUPPORT_NEAR_OPTIMAL_PARSING 1 - -/* - * Define to 1 to maintain the full map from match offsets to offset slots. - * This slightly speeds up translations of match offsets to offset slots, but it - * uses 32769 bytes of memory rather than the 512 bytes used by the condensed - * map. The speedup provided by the larger map is most helpful when the - * near-optimal parsing algorithm is being used. - */ -#define USE_FULL_OFFSET_SLOT_FAST SUPPORT_NEAR_OPTIMAL_PARSING - -/* - * DEFLATE uses a 32768 byte sliding window; set the matchfinder parameters - * appropriately. - */ -#define MATCHFINDER_WINDOW_ORDER 15 - -#include "hc_matchfinder.h" -#if SUPPORT_NEAR_OPTIMAL_PARSING -# include "bt_matchfinder.h" -#endif - -/* - * The compressor always chooses a block of at least MIN_BLOCK_LENGTH bytes, - * except if the last block has to be shorter. - */ -#define MIN_BLOCK_LENGTH 10000 - -/* - * The compressor attempts to end blocks after SOFT_MAX_BLOCK_LENGTH bytes, but - * the final length might be slightly longer due to matches extending beyond - * this limit. - */ -#define SOFT_MAX_BLOCK_LENGTH 300000 - -/* - * The number of observed matches or literals that represents sufficient data to - * decide whether the current block should be terminated or not. - */ -#define NUM_OBSERVATIONS_PER_BLOCK_CHECK 512 - - -#if SUPPORT_NEAR_OPTIMAL_PARSING -/* Constants specific to the near-optimal parsing algorithm */ - -/* - * The maximum number of matches the matchfinder can find at a single position. - * Since the matchfinder never finds more than one match for the same length, - * presuming one of each possible length is sufficient for an upper bound. - * (This says nothing about whether it is worthwhile to consider so many - * matches; this is just defining the worst case.) - */ -# define MAX_MATCHES_PER_POS (DEFLATE_MAX_MATCH_LEN - DEFLATE_MIN_MATCH_LEN + 1) - -/* - * The number of lz_match structures in the match cache, excluding the extra - * "overflow" entries. This value should be high enough so that nearly the - * time, all matches found in a given block can fit in the match cache. - * However, fallback behavior (immediately terminating the block) on cache - * overflow is still required. - */ -# define CACHE_LENGTH (SOFT_MAX_BLOCK_LENGTH * 5) - -#endif /* SUPPORT_NEAR_OPTIMAL_PARSING */ - -/* - * These are the compressor-side limits on the codeword lengths for each Huffman - * code. To make outputting bits slightly faster, some of these limits are - * lower than the limits defined by the DEFLATE format. This does not - * significantly affect the compression ratio, at least for the block lengths we - * use. - */ -#define MAX_LITLEN_CODEWORD_LEN 14 -#define MAX_OFFSET_CODEWORD_LEN DEFLATE_MAX_OFFSET_CODEWORD_LEN -#define MAX_PRE_CODEWORD_LEN DEFLATE_MAX_PRE_CODEWORD_LEN - -/* Table: length slot => length slot base value */ -static const unsigned deflate_length_slot_base[] = { - 3 , 4 , 5 , 6 , 7 , 8 , 9 , 10 , - 11 , 13 , 15 , 17 , 19 , 23 , 27 , 31 , - 35 , 43 , 51 , 59 , 67 , 83 , 99 , 115 , - 131 , 163 , 195 , 227 , 258 , -}; - -/* Table: length slot => number of extra length bits */ -static const u8 deflate_extra_length_bits[] = { - 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , - 1 , 1 , 1 , 1 , 2 , 2 , 2 , 2 , - 3 , 3 , 3 , 3 , 4 , 4 , 4 , 4 , - 5 , 5 , 5 , 5 , 0 , -}; - -/* Table: offset slot => offset slot base value */ -static const unsigned deflate_offset_slot_base[] = { - 1 , 2 , 3 , 4 , 5 , 7 , 9 , 13 , - 17 , 25 , 33 , 49 , 65 , 97 , 129 , 193 , - 257 , 385 , 513 , 769 , 1025 , 1537 , 2049 , 3073 , - 4097 , 6145 , 8193 , 12289 , 16385 , 24577 , -}; - -/* Table: offset slot => number of extra offset bits */ -static const u8 deflate_extra_offset_bits[] = { - 0 , 0 , 0 , 0 , 1 , 1 , 2 , 2 , - 3 , 3 , 4 , 4 , 5 , 5 , 6 , 6 , - 7 , 7 , 8 , 8 , 9 , 9 , 10 , 10 , - 11 , 11 , 12 , 12 , 13 , 13 , -}; - -/* Table: length => length slot */ -static const u8 deflate_length_slot[DEFLATE_MAX_MATCH_LEN + 1] = { - 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 12, - 12, 13, 13, 13, 13, 14, 14, 14, 14, 15, 15, 15, 15, 16, 16, 16, 16, 16, - 16, 16, 16, 17, 17, 17, 17, 17, 17, 17, 17, 18, 18, 18, 18, 18, 18, 18, - 18, 19, 19, 19, 19, 19, 19, 19, 19, 20, 20, 20, 20, 20, 20, 20, 20, 20, - 20, 20, 20, 20, 20, 20, 20, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, - 21, 21, 21, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, - 22, 22, 22, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, - 23, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, - 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 25, 25, 25, - 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, - 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 26, 26, 26, 26, 26, 26, 26, - 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, - 26, 26, 26, 26, 26, 26, 26, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, - 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, - 27, 27, 28, -}; - -/* The order in which precode codeword lengths are stored */ -static const u8 deflate_precode_lens_permutation[DEFLATE_NUM_PRECODE_SYMS] = { - 16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15 -}; - -/* Codewords for the DEFLATE Huffman codes. */ -struct deflate_codewords { - u32 litlen[DEFLATE_NUM_LITLEN_SYMS]; - u32 offset[DEFLATE_NUM_OFFSET_SYMS]; -}; - -/* Codeword lengths (in bits) for the DEFLATE Huffman codes. - * A zero length means the corresponding symbol had zero frequency. */ -struct deflate_lens { - u8 litlen[DEFLATE_NUM_LITLEN_SYMS]; - u8 offset[DEFLATE_NUM_OFFSET_SYMS]; -}; - -/* Codewords and lengths for the DEFLATE Huffman codes. */ -struct deflate_codes { - struct deflate_codewords codewords; - struct deflate_lens lens; -}; - -/* Symbol frequency counters for the DEFLATE Huffman codes. */ -struct deflate_freqs { - u32 litlen[DEFLATE_NUM_LITLEN_SYMS]; - u32 offset[DEFLATE_NUM_OFFSET_SYMS]; -}; - -#if SUPPORT_NEAR_OPTIMAL_PARSING - -/* Costs for the near-optimal parsing algorithm. */ -struct deflate_costs { - - /* The cost to output each possible literal. */ - u32 literal[DEFLATE_NUM_LITERALS]; - - /* The cost to output each possible match length. */ - u32 length[DEFLATE_MAX_MATCH_LEN + 1]; - - /* The cost to output a match offset of each possible offset slot. */ - u32 offset_slot[DEFLATE_NUM_OFFSET_SYMS]; -}; - -/* - * COST_SHIFT is a scaling factor that makes it possible to consider fractional - * bit costs. A token requiring 'n' bits to represent has cost n << COST_SHIFT. - * - * Note: this is only useful as a statistical trick for when the true costs are - * unknown. In reality, each token in DEFLATE requires a whole number of bits - * to output. - */ -#define COST_SHIFT 3 - -/* - * The NOSTAT_BITS value for a given alphabet is the number of bits assumed to - * be needed to output a symbol that was unused in the previous optimization - * pass. Assigning a default cost allows the symbol to be used in the next - * optimization pass. However, the cost should be relatively high because the - * symbol probably won't be used very many times (if at all). - */ -#define LITERAL_NOSTAT_BITS 13 -#define LENGTH_NOSTAT_BITS 13 -#define OFFSET_NOSTAT_BITS 10 - -#endif /* SUPPORT_NEAR_OPTIMAL_PARSING */ - -/* - * Represents a run of literals followed by a match or end-of-block. This - * struct is needed to temporarily store items chosen by the parser, since items - * cannot be written until all items for the block have been chosen and the - * block's Huffman codes have been computed. - */ -struct deflate_sequence { - - /* Bits 0..22: the number of literals in this run. This may be 0 and - * can be at most about SOFT_MAX_BLOCK_LENGTH. The literals are not - * stored explicitly in this structure; instead, they are read directly - * from the uncompressed data. - * - * Bits 23..31: the length of the match which follows the literals, or 0 - * if this literal run was the last in the block, so there is no match - * which follows it. */ - u32 litrunlen_and_length; - - /* If 'length' doesn't indicate end-of-block, then this is the offset of - * the match which follows the literals. */ - u16 offset; - - /* If 'length' doesn't indicate end-of-block, then this is the offset - * symbol of the match which follows the literals. */ - u8 offset_symbol; - - /* If 'length' doesn't indicate end-of-block, then this is the length - * slot of the match which follows the literals. */ - u8 length_slot; -}; - -#if SUPPORT_NEAR_OPTIMAL_PARSING - -/* - * This structure represents a byte position in the input data and a node in the - * graph of possible match/literal choices for the current block. - * - * Logically, each incoming edge to this node is labeled with a literal or a - * match that can be taken to reach this position from an earlier position; and - * each outgoing edge from this node is labeled with a literal or a match that - * can be taken to advance from this position to a later position. - * - * But these "edges" are actually stored elsewhere (in 'match_cache'). Here we - * associate with each node just two pieces of information: - * - * 'cost_to_end' is the minimum cost to reach the end of the block from - * this position. - * - * 'item' represents the literal or match that must be chosen from here to - * reach the end of the block with the minimum cost. Equivalently, this - * can be interpreted as the label of the outgoing edge on the minimum-cost - * path to the "end of block" node from this node. - */ -struct deflate_optimum_node { - - u32 cost_to_end; - - /* - * Notes on the match/literal representation used here: - * - * The low bits of 'item' are the length: 1 if this is a literal, - * or the match length if this is a match. - * - * The high bits of 'item' are the actual literal byte if this is a - * literal, or the match offset if this is a match. - */ -#define OPTIMUM_OFFSET_SHIFT 9 -#define OPTIMUM_LEN_MASK (((u32)1 << OPTIMUM_OFFSET_SHIFT) - 1) - u32 item; - -}; - -#endif /* SUPPORT_NEAR_OPTIMAL_PARSING */ - -/* Block split statistics. See "Block splitting algorithm" below. */ -#define NUM_LITERAL_OBSERVATION_TYPES 8 -#define NUM_MATCH_OBSERVATION_TYPES 2 -#define NUM_OBSERVATION_TYPES (NUM_LITERAL_OBSERVATION_TYPES + NUM_MATCH_OBSERVATION_TYPES) -struct block_split_stats { - u32 new_observations[NUM_OBSERVATION_TYPES]; - u32 observations[NUM_OBSERVATION_TYPES]; - u32 num_new_observations; - u32 num_observations; -}; - -/* The main DEFLATE compressor structure */ -struct libdeflate_compressor { - - /* Pointer to the compress() implementation chosen at allocation time */ - size_t (*impl)(struct libdeflate_compressor *, - const u8 *, size_t, u8 *, size_t); - - /* Frequency counters for the current block */ - struct deflate_freqs freqs; - - /* Dynamic Huffman codes for the current block */ - struct deflate_codes codes; - - /* Static Huffman codes */ - struct deflate_codes static_codes; - - /* Block split statistics for the currently pending block */ - struct block_split_stats split_stats; - - /* A table for fast lookups of offset slot by match offset. - * - * If the full table is being used, it is a direct mapping from offset - * to offset slot. - * - * If the condensed table is being used, the first 256 entries map - * directly to the offset slots of offsets 1 through 256. The next 256 - * entries map to the offset slots for the remaining offsets, stepping - * through the offsets with a stride of 128. This relies on the fact - * that each of the remaining offset slots contains at least 128 offsets - * and has an offset base that is a multiple of 128. */ -#if USE_FULL_OFFSET_SLOT_FAST - u8 offset_slot_fast[DEFLATE_MAX_MATCH_OFFSET + 1]; -#else - u8 offset_slot_fast[512]; -#endif - - /* The "nice" match length: if a match of this length is found, choose - * it immediately without further consideration. */ - unsigned nice_match_length; - - /* The maximum search depth: consider at most this many potential - * matches at each position. */ - unsigned max_search_depth; - - /* The compression level with which this compressor was created. */ - unsigned compression_level; - - /* Anything smaller than this we won't bother trying to compress. */ - unsigned min_size_to_compress; - - /* Temporary space for Huffman code output */ - u32 precode_freqs[DEFLATE_NUM_PRECODE_SYMS]; - u8 precode_lens[DEFLATE_NUM_PRECODE_SYMS]; - u32 precode_codewords[DEFLATE_NUM_PRECODE_SYMS]; - unsigned precode_items[DEFLATE_NUM_LITLEN_SYMS + DEFLATE_NUM_OFFSET_SYMS]; - unsigned num_litlen_syms; - unsigned num_offset_syms; - unsigned num_explicit_lens; - unsigned num_precode_items; - - union { - /* Data for greedy or lazy parsing */ - struct { - /* Hash chain matchfinder */ - struct hc_matchfinder hc_mf; - - /* The matches and literals that the parser has chosen - * for the current block. The required length of this - * array is limited by the maximum number of matches - * that can ever be chosen for a single block, plus one - * for the special entry at the end. */ - struct deflate_sequence sequences[ - DIV_ROUND_UP(SOFT_MAX_BLOCK_LENGTH, - DEFLATE_MIN_MATCH_LEN) + 1]; - } g; /* (g)reedy */ - - #if SUPPORT_NEAR_OPTIMAL_PARSING - /* Data for near-optimal parsing */ - struct { - - /* Binary tree matchfinder */ - struct bt_matchfinder bt_mf; - - /* - * Cached matches for the current block. This array - * contains the matches that were found at each position - * in the block. Specifically, for each position, there - * is a list of matches found at that position, if any, - * sorted by strictly increasing length. In addition, - * following the matches for each position, there is a - * special 'struct lz_match' whose 'length' member - * contains the number of matches found at that - * position, and whose 'offset' member contains the - * literal at that position. - * - * Note: in rare cases, there will be a very high number - * of matches in the block and this array will overflow. - * If this happens, we force the end of the current - * block. CACHE_LENGTH is the length at which we - * actually check for overflow. The extra slots beyond - * this are enough to absorb the worst case overflow, - * which occurs if starting at &match_cache[CACHE_LENGTH - * - 1], we write MAX_MATCHES_PER_POS matches and a - * match count header, then skip searching for matches - * at 'DEFLATE_MAX_MATCH_LEN - 1' positions and write - * the match count header for each. - */ - struct lz_match match_cache[CACHE_LENGTH + - MAX_MATCHES_PER_POS + - DEFLATE_MAX_MATCH_LEN - 1]; - - /* - * Array of nodes, one per position, for running the - * minimum-cost path algorithm. - * - * This array must be large enough to accommodate the - * worst-case number of nodes, which occurs if we find a - * match of length DEFLATE_MAX_MATCH_LEN at position - * SOFT_MAX_BLOCK_LENGTH - 1, producing a block of - * length SOFT_MAX_BLOCK_LENGTH - 1 + - * DEFLATE_MAX_MATCH_LEN. Add one for the end-of-block - * node. - */ - struct deflate_optimum_node optimum_nodes[SOFT_MAX_BLOCK_LENGTH - 1 + - DEFLATE_MAX_MATCH_LEN + 1]; - - /* The current cost model being used. */ - struct deflate_costs costs; - - unsigned num_optim_passes; - } n; /* (n)ear-optimal */ - #endif /* SUPPORT_NEAR_OPTIMAL_PARSING */ - - } p; /* (p)arser */ -}; - -/* - * The type for the bitbuffer variable, which temporarily holds bits that are - * being packed into bytes and written to the output buffer. For best - * performance, this should have size equal to a machine word. - */ -typedef machine_word_t bitbuf_t; -#define BITBUF_NBITS (8 * sizeof(bitbuf_t)) - -/* Can the specified number of bits always be added to 'bitbuf' after any - * pending bytes have been flushed? */ -#define CAN_BUFFER(n) ((n) <= BITBUF_NBITS - 7) - -/* - * Structure to keep track of the current state of sending bits to the - * compressed output buffer. - */ -struct deflate_output_bitstream { - - /* Bits that haven't yet been written to the output buffer. */ - bitbuf_t bitbuf; - - /* Number of bits currently held in @bitbuf. */ - unsigned bitcount; - - /* Pointer to the beginning of the output buffer. */ - u8 *begin; - - /* Pointer to the position in the output buffer at which the next byte - * should be written. */ - u8 *next; - - /* Pointer just past the end of the output buffer. */ - u8 *end; -}; - -/* - * OUTPUT_END_PADDING is the size, in bytes, of the extra space that must be - * present following os->end, in order to not overrun the buffer when generating - * output. When UNALIGNED_ACCESS_IS_FAST, we need at least sizeof(bitbuf_t) - * bytes for put_unaligned_leword(). Otherwise we need only 1 byte. However, - * to make the compression algorithm produce the same result on all CPU - * architectures (which is sometimes desirable), we have to unconditionally use - * the maximum for any CPU, which is sizeof(bitbuf_t) == 8. - */ -#define OUTPUT_END_PADDING 8 - -/* Initialize the output bitstream. 'size' is assumed to be at least - * OUTPUT_END_PADDING. */ -static void -deflate_init_output(struct deflate_output_bitstream *os, - void *buffer, size_t size) -{ - os->bitbuf = 0; - os->bitcount = 0; - os->begin = buffer; - os->next = os->begin; - os->end = os->begin + size - OUTPUT_END_PADDING; -} - -/* Add some bits to the bitbuffer variable of the output bitstream. The caller - * must make sure there is enough room. */ -static forceinline void -deflate_add_bits(struct deflate_output_bitstream *os, - const bitbuf_t bits, const unsigned num_bits) -{ - os->bitbuf |= bits << os->bitcount; - os->bitcount += num_bits; -} - -/* Flush bits from the bitbuffer variable to the output buffer. */ -static forceinline void -deflate_flush_bits(struct deflate_output_bitstream *os) -{ - if (UNALIGNED_ACCESS_IS_FAST) { - /* Flush a whole word (branchlessly). */ - put_unaligned_leword(os->bitbuf, os->next); - os->bitbuf >>= os->bitcount & ~7; - os->next += MIN(os->end - os->next, os->bitcount >> 3); - os->bitcount &= 7; - } else { - /* Flush a byte at a time. */ - while (os->bitcount >= 8) { - *os->next = os->bitbuf; - if (os->next != os->end) - os->next++; - os->bitcount -= 8; - os->bitbuf >>= 8; - } - } -} - -/* Align the bitstream on a byte boundary. */ -static forceinline void -deflate_align_bitstream(struct deflate_output_bitstream *os) -{ - os->bitcount += -os->bitcount & 7; - deflate_flush_bits(os); -} - -/* - * Flush any remaining bits to the output buffer if needed. Return the total - * number of bytes written to the output buffer, or 0 if an overflow occurred. - */ -static size_t -deflate_flush_output(struct deflate_output_bitstream *os) -{ - if (os->next == os->end) /* overflow? */ - return 0; - - while ((int)os->bitcount > 0) { - *os->next++ = os->bitbuf; - os->bitcount -= 8; - os->bitbuf >>= 8; - } - - return os->next - os->begin; -} - -/* Given the binary tree node A[subtree_idx] whose children already - * satisfy the maxheap property, swap the node with its greater child - * until it is greater than both its children, so that the maxheap - * property is satisfied in the subtree rooted at A[subtree_idx]. */ -static void -heapify_subtree(u32 A[], unsigned length, unsigned subtree_idx) -{ - unsigned parent_idx; - unsigned child_idx; - u32 v; - - v = A[subtree_idx]; - parent_idx = subtree_idx; - while ((child_idx = parent_idx * 2) <= length) { - if (child_idx < length && A[child_idx + 1] > A[child_idx]) - child_idx++; - if (v >= A[child_idx]) - break; - A[parent_idx] = A[child_idx]; - parent_idx = child_idx; - } - A[parent_idx] = v; -} - -/* Rearrange the array 'A' so that it satisfies the maxheap property. - * 'A' uses 1-based indices, so the children of A[i] are A[i*2] and A[i*2 + 1]. - */ -static void -heapify_array(u32 A[], unsigned length) -{ - unsigned subtree_idx; - - for (subtree_idx = length / 2; subtree_idx >= 1; subtree_idx--) - heapify_subtree(A, length, subtree_idx); -} - -/* - * Sort the array 'A', which contains 'length' unsigned 32-bit integers. - * - * Note: name this function heap_sort() instead of heapsort() to avoid colliding - * with heapsort() from stdlib.h on BSD-derived systems --- though this isn't - * necessary when compiling with -D_ANSI_SOURCE, which is the better solution. - */ -static void -heap_sort(u32 A[], unsigned length) -{ - A--; /* Use 1-based indices */ - - heapify_array(A, length); - - while (length >= 2) { - u32 tmp = A[length]; - A[length] = A[1]; - A[1] = tmp; - length--; - heapify_subtree(A, length, 1); - } -} - -#define NUM_SYMBOL_BITS 10 -#define SYMBOL_MASK ((1 << NUM_SYMBOL_BITS) - 1) - -#define GET_NUM_COUNTERS(num_syms) ((((num_syms) + 3 / 4) + 3) & ~3) -/* - * Sort the symbols primarily by frequency and secondarily by symbol - * value. Discard symbols with zero frequency and fill in an array with - * the remaining symbols, along with their frequencies. The low - * NUM_SYMBOL_BITS bits of each array entry will contain the symbol - * value, and the remaining bits will contain the frequency. - * - * @num_syms - * Number of symbols in the alphabet. - * Can't be greater than (1 << NUM_SYMBOL_BITS). - * - * @freqs[num_syms] - * The frequency of each symbol. - * - * @lens[num_syms] - * An array that eventually will hold the length of each codeword. - * This function only fills in the codeword lengths for symbols that - * have zero frequency, which are not well defined per se but will - * be set to 0. - * - * @symout[num_syms] - * The output array, described above. - * - * Returns the number of entries in 'symout' that were filled. This is - * the number of symbols that have nonzero frequency. - */ -static unsigned -sort_symbols(unsigned num_syms, const u32 freqs[restrict], - u8 lens[restrict], u32 symout[restrict]) -{ - unsigned sym; - unsigned i; - unsigned num_used_syms; - unsigned num_counters; - unsigned counters[GET_NUM_COUNTERS(DEFLATE_MAX_NUM_SYMS)]; - - /* We rely on heapsort, but with an added optimization. Since - * it's common for most symbol frequencies to be low, we first do - * a count sort using a limited number of counters. High - * frequencies will be counted in the last counter, and only they - * will be sorted with heapsort. - * - * Note: with more symbols, it is generally beneficial to have more - * counters. About 1 counter per 4 symbols seems fast. - * - * Note: I also tested radix sort, but even for large symbol - * counts (> 255) and frequencies bounded at 16 bits (enabling - * radix sort by just two base-256 digits), it didn't seem any - * faster than the method implemented here. - * - * Note: I tested the optimized quicksort implementation from - * glibc (with indirection overhead removed), but it was only - * marginally faster than the simple heapsort implemented here. - * - * Tests were done with building the codes for LZX. Results may - * vary for different compression algorithms...! */ - - num_counters = GET_NUM_COUNTERS(num_syms); - - memset(counters, 0, num_counters * sizeof(counters[0])); - - /* Count the frequencies. */ - for (sym = 0; sym < num_syms; sym++) - counters[MIN(freqs[sym], num_counters - 1)]++; - - /* Make the counters cumulative, ignoring the zero-th, which - * counted symbols with zero frequency. As a side effect, this - * calculates the number of symbols with nonzero frequency. */ - num_used_syms = 0; - for (i = 1; i < num_counters; i++) { - unsigned count = counters[i]; - counters[i] = num_used_syms; - num_used_syms += count; - } - - /* Sort nonzero-frequency symbols using the counters. At the - * same time, set the codeword lengths of zero-frequency symbols - * to 0. */ - for (sym = 0; sym < num_syms; sym++) { - u32 freq = freqs[sym]; - if (freq != 0) { - symout[counters[MIN(freq, num_counters - 1)]++] = - sym | (freq << NUM_SYMBOL_BITS); - } else { - lens[sym] = 0; - } - } - - /* Sort the symbols counted in the last counter. */ - heap_sort(symout + counters[num_counters - 2], - counters[num_counters - 1] - counters[num_counters - 2]); - - return num_used_syms; -} - -/* - * Build the Huffman tree. - * - * This is an optimized implementation that - * (a) takes advantage of the frequencies being already sorted; - * (b) only generates non-leaf nodes, since the non-leaf nodes of a - * Huffman tree are sufficient to generate a canonical code; - * (c) Only stores parent pointers, not child pointers; - * (d) Produces the nodes in the same memory used for input - * frequency information. - * - * Array 'A', which contains 'sym_count' entries, is used for both input - * and output. For this function, 'sym_count' must be at least 2. - * - * For input, the array must contain the frequencies of the symbols, - * sorted in increasing order. Specifically, each entry must contain a - * frequency left shifted by NUM_SYMBOL_BITS bits. Any data in the low - * NUM_SYMBOL_BITS bits of the entries will be ignored by this function. - * Although these bits will, in fact, contain the symbols that correspond - * to the frequencies, this function is concerned with frequencies only - * and keeps the symbols as-is. - * - * For output, this function will produce the non-leaf nodes of the - * Huffman tree. These nodes will be stored in the first (sym_count - 1) - * entries of the array. Entry A[sym_count - 2] will represent the root - * node. Each other node will contain the zero-based index of its parent - * node in 'A', left shifted by NUM_SYMBOL_BITS bits. The low - * NUM_SYMBOL_BITS bits of each entry in A will be kept as-is. Again, - * note that although these low bits will, in fact, contain a symbol - * value, this symbol will have *no relationship* with the Huffman tree - * node that happens to occupy the same slot. This is because this - * implementation only generates the non-leaf nodes of the tree. - */ -static void -build_tree(u32 A[], unsigned sym_count) -{ - /* Index, in 'A', of next lowest frequency symbol that has not - * yet been processed. */ - unsigned i = 0; - - /* Index, in 'A', of next lowest frequency parentless non-leaf - * node; or, if equal to 'e', then no such node exists yet. */ - unsigned b = 0; - - /* Index, in 'A', of next node to allocate as a non-leaf. */ - unsigned e = 0; - - do { - unsigned m, n; - u32 freq_shifted; - - /* Choose the two next lowest frequency entries. */ - - if (i != sym_count && - (b == e || (A[i] >> NUM_SYMBOL_BITS) <= (A[b] >> NUM_SYMBOL_BITS))) - m = i++; - else - m = b++; - - if (i != sym_count && - (b == e || (A[i] >> NUM_SYMBOL_BITS) <= (A[b] >> NUM_SYMBOL_BITS))) - n = i++; - else - n = b++; - - /* Allocate a non-leaf node and link the entries to it. - * - * If we link an entry that we're visiting for the first - * time (via index 'i'), then we're actually linking a - * leaf node and it will have no effect, since the leaf - * will be overwritten with a non-leaf when index 'e' - * catches up to it. But it's not any slower to - * unconditionally set the parent index. - * - * We also compute the frequency of the non-leaf node as - * the sum of its two children's frequencies. */ - - freq_shifted = (A[m] & ~SYMBOL_MASK) + (A[n] & ~SYMBOL_MASK); - - A[m] = (A[m] & SYMBOL_MASK) | (e << NUM_SYMBOL_BITS); - A[n] = (A[n] & SYMBOL_MASK) | (e << NUM_SYMBOL_BITS); - A[e] = (A[e] & SYMBOL_MASK) | freq_shifted; - e++; - } while (sym_count - e > 1); - /* When just one entry remains, it is a "leaf" that was - * linked to some other node. We ignore it, since the - * rest of the array contains the non-leaves which we - * need. (Note that we're assuming the cases with 0 or 1 - * symbols were handled separately.) */ -} - -/* - * Given the stripped-down Huffman tree constructed by build_tree(), - * determine the number of codewords that should be assigned each - * possible length, taking into account the length-limited constraint. - * - * @A - * The array produced by build_tree(), containing parent index - * information for the non-leaf nodes of the Huffman tree. Each - * entry in this array is a node; a node's parent always has a - * greater index than that node itself. This function will - * overwrite the parent index information in this array, so - * essentially it will destroy the tree. However, the data in the - * low NUM_SYMBOL_BITS of each entry will be preserved. - * - * @root_idx - * The 0-based index of the root node in 'A', and consequently one - * less than the number of tree node entries in 'A'. (Or, really 2 - * less than the actual length of 'A'.) - * - * @len_counts - * An array of length ('max_codeword_len' + 1) in which the number of - * codewords having each length <= max_codeword_len will be - * returned. - * - * @max_codeword_len - * The maximum permissible codeword length. - */ -static void -compute_length_counts(u32 A[restrict], unsigned root_idx, - unsigned len_counts[restrict], unsigned max_codeword_len) -{ - unsigned len; - int node; - - /* The key observations are: - * - * (1) We can traverse the non-leaf nodes of the tree, always - * visiting a parent before its children, by simply iterating - * through the array in reverse order. Consequently, we can - * compute the depth of each node in one pass, overwriting the - * parent indices with depths. - * - * (2) We can initially assume that in the real Huffman tree, - * both children of the root are leaves. This corresponds to two - * codewords of length 1. Then, whenever we visit a (non-leaf) - * node during the traversal, we modify this assumption to - * account for the current node *not* being a leaf, but rather - * its two children being leaves. This causes the loss of one - * codeword for the current depth and the addition of two - * codewords for the current depth plus one. - * - * (3) We can handle the length-limited constraint fairly easily - * by simply using the largest length available when a depth - * exceeds max_codeword_len. - */ - - for (len = 0; len <= max_codeword_len; len++) - len_counts[len] = 0; - len_counts[1] = 2; - - /* Set the root node's depth to 0. */ - A[root_idx] &= SYMBOL_MASK; - - for (node = root_idx - 1; node >= 0; node--) { - - /* Calculate the depth of this node. */ - - unsigned parent = A[node] >> NUM_SYMBOL_BITS; - unsigned parent_depth = A[parent] >> NUM_SYMBOL_BITS; - unsigned depth = parent_depth + 1; - unsigned len = depth; - - /* Set the depth of this node so that it is available - * when its children (if any) are processed. */ - - A[node] = (A[node] & SYMBOL_MASK) | (depth << NUM_SYMBOL_BITS); - - /* If needed, decrease the length to meet the - * length-limited constraint. This is not the optimal - * method for generating length-limited Huffman codes! - * But it should be good enough. */ - if (len >= max_codeword_len) { - len = max_codeword_len; - do { - len--; - } while (len_counts[len] == 0); - } - - /* Account for the fact that we have a non-leaf node at - * the current depth. */ - len_counts[len]--; - len_counts[len + 1] += 2; - } -} - -/* - * Generate the codewords for a canonical Huffman code. - * - * @A - * The output array for codewords. In addition, initially this - * array must contain the symbols, sorted primarily by frequency and - * secondarily by symbol value, in the low NUM_SYMBOL_BITS bits of - * each entry. - * - * @len - * Output array for codeword lengths. - * - * @len_counts - * An array that provides the number of codewords that will have - * each possible length <= max_codeword_len. - * - * @max_codeword_len - * Maximum length, in bits, of each codeword. - * - * @num_syms - * Number of symbols in the alphabet, including symbols with zero - * frequency. This is the length of the 'A' and 'len' arrays. - */ -static void -gen_codewords(u32 A[restrict], u8 lens[restrict], - const unsigned len_counts[restrict], - unsigned max_codeword_len, unsigned num_syms) -{ - u32 next_codewords[DEFLATE_MAX_CODEWORD_LEN + 1]; - unsigned i; - unsigned len; - unsigned sym; - - /* Given the number of codewords that will have each length, - * assign codeword lengths to symbols. We do this by assigning - * the lengths in decreasing order to the symbols sorted - * primarily by increasing frequency and secondarily by - * increasing symbol value. */ - for (i = 0, len = max_codeword_len; len >= 1; len--) { - unsigned count = len_counts[len]; - while (count--) - lens[A[i++] & SYMBOL_MASK] = len; - } - - /* Generate the codewords themselves. We initialize the - * 'next_codewords' array to provide the lexicographically first - * codeword of each length, then assign codewords in symbol - * order. This produces a canonical code. */ - next_codewords[0] = 0; - next_codewords[1] = 0; - for (len = 2; len <= max_codeword_len; len++) - next_codewords[len] = - (next_codewords[len - 1] + len_counts[len - 1]) << 1; - - for (sym = 0; sym < num_syms; sym++) - A[sym] = next_codewords[lens[sym]]++; -} - -/* - * --------------------------------------------------------------------- - * make_canonical_huffman_code() - * --------------------------------------------------------------------- - * - * Given an alphabet and the frequency of each symbol in it, construct a - * length-limited canonical Huffman code. - * - * @num_syms - * The number of symbols in the alphabet. The symbols are the - * integers in the range [0, num_syms - 1]. This parameter must be - * at least 2 and can't be greater than (1 << NUM_SYMBOL_BITS). - * - * @max_codeword_len - * The maximum permissible codeword length. - * - * @freqs - * An array of @num_syms entries, each of which specifies the - * frequency of the corresponding symbol. It is valid for some, - * none, or all of the frequencies to be 0. - * - * @lens - * An array of @num_syms entries in which this function will return - * the length, in bits, of the codeword assigned to each symbol. - * Symbols with 0 frequency will not have codewords per se, but - * their entries in this array will be set to 0. No lengths greater - * than @max_codeword_len will be assigned. - * - * @codewords - * An array of @num_syms entries in which this function will return - * the codeword for each symbol, right-justified and padded on the - * left with zeroes. Codewords for symbols with 0 frequency will be - * undefined. - * - * --------------------------------------------------------------------- - * - * This function builds a length-limited canonical Huffman code. - * - * A length-limited Huffman code contains no codewords longer than some - * specified length, and has exactly (with some algorithms) or - * approximately (with the algorithm used here) the minimum weighted path - * length from the root, given this constraint. - * - * A canonical Huffman code satisfies the properties that a longer - * codeword never lexicographically precedes a shorter codeword, and the - * lexicographic ordering of codewords of the same length is the same as - * the lexicographic ordering of the corresponding symbols. A canonical - * Huffman code, or more generally a canonical prefix code, can be - * reconstructed from only a list containing the codeword length of each - * symbol. - * - * The classic algorithm to generate a Huffman code creates a node for - * each symbol, then inserts these nodes into a min-heap keyed by symbol - * frequency. Then, repeatedly, the two lowest-frequency nodes are - * removed from the min-heap and added as the children of a new node - * having frequency equal to the sum of its two children, which is then - * inserted into the min-heap. When only a single node remains in the - * min-heap, it is the root of the Huffman tree. The codeword for each - * symbol is determined by the path needed to reach the corresponding - * node from the root. Descending to the left child appends a 0 bit, - * whereas descending to the right child appends a 1 bit. - * - * The classic algorithm is relatively easy to understand, but it is - * subject to a number of inefficiencies. In practice, it is fastest to - * first sort the symbols by frequency. (This itself can be subject to - * an optimization based on the fact that most frequencies tend to be - * low.) At the same time, we sort secondarily by symbol value, which - * aids the process of generating a canonical code. Then, during tree - * construction, no heap is necessary because both the leaf nodes and the - * unparented non-leaf nodes can be easily maintained in sorted order. - * Consequently, there can never be more than two possibilities for the - * next-lowest-frequency node. - * - * In addition, because we're generating a canonical code, we actually - * don't need the leaf nodes of the tree at all, only the non-leaf nodes. - * This is because for canonical code generation we don't need to know - * where the symbols are in the tree. Rather, we only need to know how - * many leaf nodes have each depth (codeword length). And this - * information can, in fact, be quickly generated from the tree of - * non-leaves only. - * - * Furthermore, we can build this stripped-down Huffman tree directly in - * the array in which the codewords are to be generated, provided that - * these array slots are large enough to hold a symbol and frequency - * value. - * - * Still furthermore, we don't even need to maintain explicit child - * pointers. We only need the parent pointers, and even those can be - * overwritten in-place with depth information as part of the process of - * extracting codeword lengths from the tree. So in summary, we do NOT - * need a big structure like: - * - * struct huffman_tree_node { - * unsigned int symbol; - * unsigned int frequency; - * unsigned int depth; - * struct huffman_tree_node *left_child; - * struct huffman_tree_node *right_child; - * }; - * - * - * ... which often gets used in "naive" implementations of Huffman code - * generation. - * - * Many of these optimizations are based on the implementation in 7-Zip - * (source file: C/HuffEnc.c), which has been placed in the public domain - * by Igor Pavlov. - */ -static void -make_canonical_huffman_code(unsigned num_syms, unsigned max_codeword_len, - const u32 freqs[restrict], - u8 lens[restrict], u32 codewords[restrict]) -{ - u32 *A = codewords; - unsigned num_used_syms; - - STATIC_ASSERT(DEFLATE_MAX_NUM_SYMS <= 1 << NUM_SYMBOL_BITS); - - /* We begin by sorting the symbols primarily by frequency and - * secondarily by symbol value. As an optimization, the array - * used for this purpose ('A') shares storage with the space in - * which we will eventually return the codewords. */ - - num_used_syms = sort_symbols(num_syms, freqs, lens, A); - - /* 'num_used_syms' is the number of symbols with nonzero - * frequency. This may be less than @num_syms. 'num_used_syms' - * is also the number of entries in 'A' that are valid. Each - * entry consists of a distinct symbol and a nonzero frequency - * packed into a 32-bit integer. */ - - /* Handle special cases where only 0 or 1 symbols were used (had - * nonzero frequency). */ - - if (unlikely(num_used_syms == 0)) { - /* Code is empty. sort_symbols() already set all lengths - * to 0, so there is nothing more to do. */ - return; - } - - if (unlikely(num_used_syms == 1)) { - /* Only one symbol was used, so we only need one - * codeword. But two codewords are needed to form the - * smallest complete Huffman code, which uses codewords 0 - * and 1. Therefore, we choose another symbol to which - * to assign a codeword. We use 0 (if the used symbol is - * not 0) or 1 (if the used symbol is 0). In either - * case, the lesser-valued symbol must be assigned - * codeword 0 so that the resulting code is canonical. */ - - unsigned sym = A[0] & SYMBOL_MASK; - unsigned nonzero_idx = sym ? sym : 1; - - codewords[0] = 0; - lens[0] = 1; - codewords[nonzero_idx] = 1; - lens[nonzero_idx] = 1; - return; - } - - /* Build a stripped-down version of the Huffman tree, sharing the - * array 'A' with the symbol values. Then extract length counts - * from the tree and use them to generate the final codewords. */ - - build_tree(A, num_used_syms); - - { - unsigned len_counts[DEFLATE_MAX_CODEWORD_LEN + 1]; - - compute_length_counts(A, num_used_syms - 2, - len_counts, max_codeword_len); - - gen_codewords(A, lens, len_counts, max_codeword_len, num_syms); - } -} - -/* - * Clear the Huffman symbol frequency counters. - * This must be called when starting a new DEFLATE block. - */ -static void -deflate_reset_symbol_frequencies(struct libdeflate_compressor *c) -{ - memset(&c->freqs, 0, sizeof(c->freqs)); -} - -/* Reverse the Huffman codeword 'codeword', which is 'len' bits in length. */ -static u32 -deflate_reverse_codeword(u32 codeword, u8 len) -{ - /* The following branchless algorithm is faster than going bit by bit. - * Note: since no codewords are longer than 16 bits, we only need to - * reverse the low 16 bits of the 'u32'. */ - STATIC_ASSERT(DEFLATE_MAX_CODEWORD_LEN <= 16); - - /* Flip adjacent 1-bit fields */ - codeword = ((codeword & 0x5555) << 1) | ((codeword & 0xAAAA) >> 1); - - /* Flip adjacent 2-bit fields */ - codeword = ((codeword & 0x3333) << 2) | ((codeword & 0xCCCC) >> 2); - - /* Flip adjacent 4-bit fields */ - codeword = ((codeword & 0x0F0F) << 4) | ((codeword & 0xF0F0) >> 4); - - /* Flip adjacent 8-bit fields */ - codeword = ((codeword & 0x00FF) << 8) | ((codeword & 0xFF00) >> 8); - - /* Return the high 'len' bits of the bit-reversed 16 bit value. */ - return codeword >> (16 - len); -} - -/* Make a canonical Huffman code with bit-reversed codewords. */ -static void -deflate_make_huffman_code(unsigned num_syms, unsigned max_codeword_len, - const u32 freqs[], u8 lens[], u32 codewords[]) -{ - unsigned sym; - - make_canonical_huffman_code(num_syms, max_codeword_len, - freqs, lens, codewords); - - for (sym = 0; sym < num_syms; sym++) - codewords[sym] = deflate_reverse_codeword(codewords[sym], lens[sym]); -} - -/* - * Build the literal/length and offset Huffman codes for a DEFLATE block. - * - * This takes as input the frequency tables for each code and produces as output - * a set of tables that map symbols to codewords and codeword lengths. - */ -static void -deflate_make_huffman_codes(const struct deflate_freqs *freqs, - struct deflate_codes *codes) -{ - STATIC_ASSERT(MAX_LITLEN_CODEWORD_LEN <= DEFLATE_MAX_LITLEN_CODEWORD_LEN); - STATIC_ASSERT(MAX_OFFSET_CODEWORD_LEN <= DEFLATE_MAX_OFFSET_CODEWORD_LEN); - - deflate_make_huffman_code(DEFLATE_NUM_LITLEN_SYMS, - MAX_LITLEN_CODEWORD_LEN, - freqs->litlen, - codes->lens.litlen, - codes->codewords.litlen); - - deflate_make_huffman_code(DEFLATE_NUM_OFFSET_SYMS, - MAX_OFFSET_CODEWORD_LEN, - freqs->offset, - codes->lens.offset, - codes->codewords.offset); -} - -/* Initialize c->static_codes. */ -static void -deflate_init_static_codes(struct libdeflate_compressor *c) -{ - unsigned i; - - for (i = 0; i < 144; i++) - c->freqs.litlen[i] = 1 << (9 - 8); - for (; i < 256; i++) - c->freqs.litlen[i] = 1 << (9 - 9); - for (; i < 280; i++) - c->freqs.litlen[i] = 1 << (9 - 7); - for (; i < 288; i++) - c->freqs.litlen[i] = 1 << (9 - 8); - - for (i = 0; i < 32; i++) - c->freqs.offset[i] = 1 << (5 - 5); - - deflate_make_huffman_codes(&c->freqs, &c->static_codes); -} - -/* Return the offset slot for the specified match offset. */ -static forceinline unsigned -deflate_get_offset_slot(struct libdeflate_compressor *c, unsigned offset) -{ -#if USE_FULL_OFFSET_SLOT_FAST - return c->offset_slot_fast[offset]; -#else - if (offset <= 256) - return c->offset_slot_fast[offset - 1]; - else - return c->offset_slot_fast[256 + ((offset - 1) >> 7)]; -#endif -} - -/* Write the header fields common to all DEFLATE block types. */ -static void -deflate_write_block_header(struct deflate_output_bitstream *os, - bool is_final_block, unsigned block_type) -{ - deflate_add_bits(os, is_final_block, 1); - deflate_add_bits(os, block_type, 2); - deflate_flush_bits(os); -} - -static unsigned -deflate_compute_precode_items(const u8 lens[restrict], - const unsigned num_lens, - u32 precode_freqs[restrict], - unsigned precode_items[restrict]) -{ - unsigned *itemptr; - unsigned run_start; - unsigned run_end; - unsigned extra_bits; - u8 len; - - memset(precode_freqs, 0, - DEFLATE_NUM_PRECODE_SYMS * sizeof(precode_freqs[0])); - - itemptr = precode_items; - run_start = 0; - do { - /* Find the next run of codeword lengths. */ - - /* len = the length being repeated */ - len = lens[run_start]; - - /* Extend the run. */ - run_end = run_start; - do { - run_end++; - } while (run_end != num_lens && len == lens[run_end]); - - if (len == 0) { - /* Run of zeroes. */ - - /* Symbol 18: RLE 11 to 138 zeroes at a time. */ - while ((run_end - run_start) >= 11) { - extra_bits = MIN((run_end - run_start) - 11, 0x7F); - precode_freqs[18]++; - *itemptr++ = 18 | (extra_bits << 5); - run_start += 11 + extra_bits; - } - - /* Symbol 17: RLE 3 to 10 zeroes at a time. */ - if ((run_end - run_start) >= 3) { - extra_bits = MIN((run_end - run_start) - 3, 0x7); - precode_freqs[17]++; - *itemptr++ = 17 | (extra_bits << 5); - run_start += 3 + extra_bits; - } - } else { - - /* A run of nonzero lengths. */ - - /* Symbol 16: RLE 3 to 6 of the previous length. */ - if ((run_end - run_start) >= 4) { - precode_freqs[len]++; - *itemptr++ = len; - run_start++; - do { - extra_bits = MIN((run_end - run_start) - 3, 0x3); - precode_freqs[16]++; - *itemptr++ = 16 | (extra_bits << 5); - run_start += 3 + extra_bits; - } while ((run_end - run_start) >= 3); - } - } - - /* Output any remaining lengths without RLE. */ - while (run_start != run_end) { - precode_freqs[len]++; - *itemptr++ = len; - run_start++; - } - } while (run_start != num_lens); - - return itemptr - precode_items; -} - -/* - * Huffman codeword lengths for dynamic Huffman blocks are compressed using a - * separate Huffman code, the "precode", which contains a symbol for each - * possible codeword length in the larger code as well as several special - * symbols to represent repeated codeword lengths (a form of run-length - * encoding). The precode is itself constructed in canonical form, and its - * codeword lengths are represented literally in 19 3-bit fields that - * immediately precede the compressed codeword lengths of the larger code. - */ - -/* Precompute the information needed to output Huffman codes. */ -static void -deflate_precompute_huffman_header(struct libdeflate_compressor *c) -{ - /* Compute how many litlen and offset symbols are needed. */ - - for (c->num_litlen_syms = DEFLATE_NUM_LITLEN_SYMS; - c->num_litlen_syms > 257; - c->num_litlen_syms--) - if (c->codes.lens.litlen[c->num_litlen_syms - 1] != 0) - break; - - for (c->num_offset_syms = DEFLATE_NUM_OFFSET_SYMS; - c->num_offset_syms > 1; - c->num_offset_syms--) - if (c->codes.lens.offset[c->num_offset_syms - 1] != 0) - break; - - /* If we're not using the full set of literal/length codeword lengths, - * then temporarily move the offset codeword lengths over so that the - * literal/length and offset codeword lengths are contiguous. */ - - STATIC_ASSERT(offsetof(struct deflate_lens, offset) == - DEFLATE_NUM_LITLEN_SYMS); - - if (c->num_litlen_syms != DEFLATE_NUM_LITLEN_SYMS) { - memmove((u8 *)&c->codes.lens + c->num_litlen_syms, - (u8 *)&c->codes.lens + DEFLATE_NUM_LITLEN_SYMS, - c->num_offset_syms); - } - - /* Compute the "items" (RLE / literal tokens and extra bits) with which - * the codeword lengths in the larger code will be output. */ - c->num_precode_items = - deflate_compute_precode_items((u8 *)&c->codes.lens, - c->num_litlen_syms + - c->num_offset_syms, - c->precode_freqs, - c->precode_items); - - /* Build the precode. */ - STATIC_ASSERT(MAX_PRE_CODEWORD_LEN <= DEFLATE_MAX_PRE_CODEWORD_LEN); - deflate_make_huffman_code(DEFLATE_NUM_PRECODE_SYMS, - MAX_PRE_CODEWORD_LEN, - c->precode_freqs, c->precode_lens, - c->precode_codewords); - - /* Count how many precode lengths we actually need to output. */ - for (c->num_explicit_lens = DEFLATE_NUM_PRECODE_SYMS; - c->num_explicit_lens > 4; - c->num_explicit_lens--) - if (c->precode_lens[deflate_precode_lens_permutation[ - c->num_explicit_lens - 1]] != 0) - break; - - /* Restore the offset codeword lengths if needed. */ - if (c->num_litlen_syms != DEFLATE_NUM_LITLEN_SYMS) { - memmove((u8 *)&c->codes.lens + DEFLATE_NUM_LITLEN_SYMS, - (u8 *)&c->codes.lens + c->num_litlen_syms, - c->num_offset_syms); - } -} - -/* Output the Huffman codes. */ -static void -deflate_write_huffman_header(struct libdeflate_compressor *c, - struct deflate_output_bitstream *os) -{ - unsigned i; - - deflate_add_bits(os, c->num_litlen_syms - 257, 5); - deflate_add_bits(os, c->num_offset_syms - 1, 5); - deflate_add_bits(os, c->num_explicit_lens - 4, 4); - deflate_flush_bits(os); - - /* Output the lengths of the codewords in the precode. */ - for (i = 0; i < c->num_explicit_lens; i++) { - deflate_add_bits(os, c->precode_lens[ - deflate_precode_lens_permutation[i]], 3); - deflate_flush_bits(os); - } - - /* Output the encoded lengths of the codewords in the larger code. */ - for (i = 0; i < c->num_precode_items; i++) { - unsigned precode_item = c->precode_items[i]; - unsigned precode_sym = precode_item & 0x1F; - deflate_add_bits(os, c->precode_codewords[precode_sym], - c->precode_lens[precode_sym]); - if (precode_sym >= 16) { - if (precode_sym == 16) - deflate_add_bits(os, precode_item >> 5, 2); - else if (precode_sym == 17) - deflate_add_bits(os, precode_item >> 5, 3); - else - deflate_add_bits(os, precode_item >> 5, 7); - } - STATIC_ASSERT(CAN_BUFFER(DEFLATE_MAX_PRE_CODEWORD_LEN + 7)); - deflate_flush_bits(os); - } -} - -static void -deflate_write_sequences(struct deflate_output_bitstream * restrict os, - const struct deflate_codes * restrict codes, - const struct deflate_sequence sequences[restrict], - const u8 * restrict in_next) -{ - const struct deflate_sequence *seq = sequences; - - for (;;) { - u32 litrunlen = seq->litrunlen_and_length & 0x7FFFFF; - unsigned length = seq->litrunlen_and_length >> 23; - unsigned length_slot; - unsigned litlen_symbol; - unsigned offset_symbol; - - if (litrunlen) { - #if 1 - while (litrunlen >= 4) { - unsigned lit0 = in_next[0]; - unsigned lit1 = in_next[1]; - unsigned lit2 = in_next[2]; - unsigned lit3 = in_next[3]; - - deflate_add_bits(os, codes->codewords.litlen[lit0], - codes->lens.litlen[lit0]); - if (!CAN_BUFFER(2 * MAX_LITLEN_CODEWORD_LEN)) - deflate_flush_bits(os); - - deflate_add_bits(os, codes->codewords.litlen[lit1], - codes->lens.litlen[lit1]); - if (!CAN_BUFFER(4 * MAX_LITLEN_CODEWORD_LEN)) - deflate_flush_bits(os); - - deflate_add_bits(os, codes->codewords.litlen[lit2], - codes->lens.litlen[lit2]); - if (!CAN_BUFFER(2 * MAX_LITLEN_CODEWORD_LEN)) - deflate_flush_bits(os); - - deflate_add_bits(os, codes->codewords.litlen[lit3], - codes->lens.litlen[lit3]); - deflate_flush_bits(os); - in_next += 4; - litrunlen -= 4; - } - if (litrunlen-- != 0) { - deflate_add_bits(os, codes->codewords.litlen[*in_next], - codes->lens.litlen[*in_next]); - if (!CAN_BUFFER(3 * MAX_LITLEN_CODEWORD_LEN)) - deflate_flush_bits(os); - in_next++; - if (litrunlen-- != 0) { - deflate_add_bits(os, codes->codewords.litlen[*in_next], - codes->lens.litlen[*in_next]); - if (!CAN_BUFFER(3 * MAX_LITLEN_CODEWORD_LEN)) - deflate_flush_bits(os); - in_next++; - if (litrunlen-- != 0) { - deflate_add_bits(os, codes->codewords.litlen[*in_next], - codes->lens.litlen[*in_next]); - if (!CAN_BUFFER(3 * MAX_LITLEN_CODEWORD_LEN)) - deflate_flush_bits(os); - in_next++; - } - } - if (CAN_BUFFER(3 * MAX_LITLEN_CODEWORD_LEN)) - deflate_flush_bits(os); - } - #else - do { - unsigned lit = *in_next++; - deflate_add_bits(os, codes->codewords.litlen[lit], - codes->lens.litlen[lit]); - deflate_flush_bits(os); - } while (--litrunlen); - #endif - } - - if (length == 0) - return; - - in_next += length; - - length_slot = seq->length_slot; - litlen_symbol = 257 + length_slot; - - /* Litlen symbol */ - deflate_add_bits(os, codes->codewords.litlen[litlen_symbol], - codes->lens.litlen[litlen_symbol]); - - /* Extra length bits */ - STATIC_ASSERT(CAN_BUFFER(MAX_LITLEN_CODEWORD_LEN + - DEFLATE_MAX_EXTRA_LENGTH_BITS)); - deflate_add_bits(os, length - deflate_length_slot_base[length_slot], - deflate_extra_length_bits[length_slot]); - - if (!CAN_BUFFER(MAX_LITLEN_CODEWORD_LEN + - DEFLATE_MAX_EXTRA_LENGTH_BITS + - MAX_OFFSET_CODEWORD_LEN + - DEFLATE_MAX_EXTRA_OFFSET_BITS)) - deflate_flush_bits(os); - - /* Offset symbol */ - offset_symbol = seq->offset_symbol; - deflate_add_bits(os, codes->codewords.offset[offset_symbol], - codes->lens.offset[offset_symbol]); - - if (!CAN_BUFFER(MAX_OFFSET_CODEWORD_LEN + - DEFLATE_MAX_EXTRA_OFFSET_BITS)) - deflate_flush_bits(os); - - /* Extra offset bits */ - deflate_add_bits(os, seq->offset - deflate_offset_slot_base[offset_symbol], - deflate_extra_offset_bits[offset_symbol]); - - deflate_flush_bits(os); - - seq++; - } -} - -#if SUPPORT_NEAR_OPTIMAL_PARSING -/* - * Follow the minimum-cost path in the graph of possible match/literal choices - * for the current block and write out the matches/literals using the specified - * Huffman codes. - * - * Note: this is slightly duplicated with deflate_write_sequences(), the reason - * being that we don't want to waste time translating between intermediate - * match/literal representations. - */ -static void -deflate_write_item_list(struct deflate_output_bitstream *os, - const struct deflate_codes *codes, - struct libdeflate_compressor *c, - u32 block_length) -{ - struct deflate_optimum_node *cur_node = &c->p.n.optimum_nodes[0]; - struct deflate_optimum_node * const end_node = &c->p.n.optimum_nodes[block_length]; - do { - unsigned length = cur_node->item & OPTIMUM_LEN_MASK; - unsigned offset = cur_node->item >> OPTIMUM_OFFSET_SHIFT; - unsigned litlen_symbol; - unsigned length_slot; - unsigned offset_slot; - - if (length == 1) { - /* Literal */ - litlen_symbol = offset; - deflate_add_bits(os, codes->codewords.litlen[litlen_symbol], - codes->lens.litlen[litlen_symbol]); - deflate_flush_bits(os); - } else { - /* Match length */ - length_slot = deflate_length_slot[length]; - litlen_symbol = 257 + length_slot; - deflate_add_bits(os, codes->codewords.litlen[litlen_symbol], - codes->lens.litlen[litlen_symbol]); - - deflate_add_bits(os, length - deflate_length_slot_base[length_slot], - deflate_extra_length_bits[length_slot]); - - if (!CAN_BUFFER(MAX_LITLEN_CODEWORD_LEN + - DEFLATE_MAX_EXTRA_LENGTH_BITS + - MAX_OFFSET_CODEWORD_LEN + - DEFLATE_MAX_EXTRA_OFFSET_BITS)) - deflate_flush_bits(os); - - - /* Match offset */ - offset_slot = deflate_get_offset_slot(c, offset); - deflate_add_bits(os, codes->codewords.offset[offset_slot], - codes->lens.offset[offset_slot]); - - if (!CAN_BUFFER(MAX_OFFSET_CODEWORD_LEN + - DEFLATE_MAX_EXTRA_OFFSET_BITS)) - deflate_flush_bits(os); - - deflate_add_bits(os, offset - deflate_offset_slot_base[offset_slot], - deflate_extra_offset_bits[offset_slot]); - - deflate_flush_bits(os); - } - cur_node += length; - } while (cur_node != end_node); -} -#endif /* SUPPORT_NEAR_OPTIMAL_PARSING */ - -/* Output the end-of-block symbol. */ -static void -deflate_write_end_of_block(struct deflate_output_bitstream *os, - const struct deflate_codes *codes) -{ - deflate_add_bits(os, codes->codewords.litlen[DEFLATE_END_OF_BLOCK], - codes->lens.litlen[DEFLATE_END_OF_BLOCK]); - deflate_flush_bits(os); -} - -static void -deflate_write_uncompressed_block(struct deflate_output_bitstream *os, - const u8 *data, u16 len, - bool is_final_block) -{ - deflate_write_block_header(os, is_final_block, - DEFLATE_BLOCKTYPE_UNCOMPRESSED); - deflate_align_bitstream(os); - - if (4 + (u32)len >= os->end - os->next) { - os->next = os->end; - return; - } - - put_unaligned_le16(len, os->next); - os->next += 2; - put_unaligned_le16(~len, os->next); - os->next += 2; - memcpy(os->next, data, len); - os->next += len; -} - -static void -deflate_write_uncompressed_blocks(struct deflate_output_bitstream *os, - const u8 *data, size_t data_length, - bool is_final_block) -{ - do { - u16 len = MIN(data_length, UINT16_MAX); - - deflate_write_uncompressed_block(os, data, len, - is_final_block && len == data_length); - data += len; - data_length -= len; - } while (data_length != 0); -} - -/* - * Choose the best type of block to use (dynamic Huffman, static Huffman, or - * uncompressed), then output it. - */ -static void -deflate_flush_block(struct libdeflate_compressor * restrict c, - struct deflate_output_bitstream * restrict os, - const u8 * restrict block_begin, u32 block_length, - bool is_final_block, bool use_item_list) -{ - static const u8 deflate_extra_precode_bits[DEFLATE_NUM_PRECODE_SYMS] = { - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 3, 7, - }; - - /* Costs are measured in bits */ - u32 dynamic_cost = 0; - u32 static_cost = 0; - u32 uncompressed_cost = 0; - struct deflate_codes *codes; - int block_type; - unsigned sym; - - /* Tally the end-of-block symbol. */ - c->freqs.litlen[DEFLATE_END_OF_BLOCK]++; - - /* Build dynamic Huffman codes. */ - deflate_make_huffman_codes(&c->freqs, &c->codes); - - /* Account for the cost of sending dynamic Huffman codes. */ - deflate_precompute_huffman_header(c); - dynamic_cost += 5 + 5 + 4 + (3 * c->num_explicit_lens); - for (sym = 0; sym < DEFLATE_NUM_PRECODE_SYMS; sym++) { - u32 extra = deflate_extra_precode_bits[sym]; - dynamic_cost += c->precode_freqs[sym] * - (extra + c->precode_lens[sym]); - } - - /* Account for the cost of encoding literals. */ - for (sym = 0; sym < 256; sym++) { - dynamic_cost += c->freqs.litlen[sym] * - c->codes.lens.litlen[sym]; - } - for (sym = 0; sym < 144; sym++) - static_cost += c->freqs.litlen[sym] * 8; - for (; sym < 256; sym++) - static_cost += c->freqs.litlen[sym] * 9; - - /* Account for the cost of encoding the end-of-block symbol. */ - dynamic_cost += c->codes.lens.litlen[256]; - static_cost += 7; - - /* Account for the cost of encoding lengths. */ - for (sym = 257; sym < 257 + ARRAY_LEN(deflate_extra_length_bits); sym++) { - u32 extra = deflate_extra_length_bits[sym - 257]; - dynamic_cost += c->freqs.litlen[sym] * - (extra + c->codes.lens.litlen[sym]); - static_cost += c->freqs.litlen[sym] * - (extra + c->static_codes.lens.litlen[sym]); - } - - /* Account for the cost of encoding offsets. */ - for (sym = 0; sym < ARRAY_LEN(deflate_extra_offset_bits); sym++) { - u32 extra = deflate_extra_offset_bits[sym]; - dynamic_cost += c->freqs.offset[sym] * - (extra + c->codes.lens.offset[sym]); - static_cost += c->freqs.offset[sym] * (extra + 5); - } - - /* Compute the cost of using uncompressed blocks. */ - uncompressed_cost += (-(os->bitcount + 3) & 7) + 32 + - (40 * (DIV_ROUND_UP(block_length, - UINT16_MAX) - 1)) + - (8 * block_length); - - /* Choose the cheapest block type. */ - if (dynamic_cost < MIN(static_cost, uncompressed_cost)) { - block_type = DEFLATE_BLOCKTYPE_DYNAMIC_HUFFMAN; - codes = &c->codes; - } else if (static_cost < uncompressed_cost) { - block_type = DEFLATE_BLOCKTYPE_STATIC_HUFFMAN; - codes = &c->static_codes; - } else { - block_type = DEFLATE_BLOCKTYPE_UNCOMPRESSED; - } - - /* Now actually output the block. */ - - if (block_type == DEFLATE_BLOCKTYPE_UNCOMPRESSED) { - /* Note: the length being flushed may exceed the maximum length - * of an uncompressed block (65535 bytes). Therefore, more than - * one uncompressed block might be needed. */ - deflate_write_uncompressed_blocks(os, block_begin, block_length, - is_final_block); - } else { - /* Output the block header. */ - deflate_write_block_header(os, is_final_block, block_type); - - /* Output the Huffman codes (dynamic Huffman blocks only). */ - if (block_type == DEFLATE_BLOCKTYPE_DYNAMIC_HUFFMAN) - deflate_write_huffman_header(c, os); - - /* Output the literals, matches, and end-of-block symbol. */ - #if SUPPORT_NEAR_OPTIMAL_PARSING - if (use_item_list) - deflate_write_item_list(os, codes, c, block_length); - else - #endif - deflate_write_sequences(os, codes, c->p.g.sequences, - block_begin); - deflate_write_end_of_block(os, codes); - } -} - -static forceinline void -deflate_choose_literal(struct libdeflate_compressor *c, unsigned literal, - u32 *litrunlen_p) -{ - c->freqs.litlen[literal]++; - ++*litrunlen_p; -} - -static forceinline void -deflate_choose_match(struct libdeflate_compressor *c, - unsigned length, unsigned offset, - u32 *litrunlen_p, struct deflate_sequence **next_seq_p) -{ - struct deflate_sequence *seq = *next_seq_p; - unsigned length_slot = deflate_length_slot[length]; - unsigned offset_slot = deflate_get_offset_slot(c, offset); - - c->freqs.litlen[257 + length_slot]++; - c->freqs.offset[offset_slot]++; - - seq->litrunlen_and_length = ((u32)length << 23) | *litrunlen_p; - seq->offset = offset; - seq->length_slot = length_slot; - seq->offset_symbol = offset_slot; - - *litrunlen_p = 0; - *next_seq_p = seq + 1; -} - -static forceinline void -deflate_finish_sequence(struct deflate_sequence *seq, u32 litrunlen) -{ - seq->litrunlen_and_length = litrunlen; /* length = 0 */ -} - -/******************************************************************************/ - -/* - * Block splitting algorithm. The problem is to decide when it is worthwhile to - * start a new block with new Huffman codes. There is a theoretically optimal - * solution: recursively consider every possible block split, considering the - * exact cost of each block, and choose the minimum cost approach. But this is - * far too slow. Instead, as an approximation, we can count symbols and after - * every N symbols, compare the expected distribution of symbols based on the - * previous data with the actual distribution. If they differ "by enough", then - * start a new block. - * - * As an optimization and heuristic, we don't distinguish between every symbol - * but rather we combine many symbols into a single "observation type". For - * literals we only look at the high bits and low bits, and for matches we only - * look at whether the match is long or not. The assumption is that for typical - * "real" data, places that are good block boundaries will tend to be noticeable - * based only on changes in these aggregate frequencies, without looking for - * subtle differences in individual symbols. For example, a change from ASCII - * bytes to non-ASCII bytes, or from few matches (generally less compressible) - * to many matches (generally more compressible), would be easily noticed based - * on the aggregates. - * - * For determining whether the frequency distributions are "different enough" to - * start a new block, the simply heuristic of splitting when the sum of absolute - * differences exceeds a constant seems to be good enough. We also add a number - * proportional to the block length so that the algorithm is more likely to end - * long blocks than short blocks. This reflects the general expectation that it - * will become increasingly beneficial to start a new block as the current - * block grows longer. - * - * Finally, for an approximation, it is not strictly necessary that the exact - * symbols being used are considered. With "near-optimal parsing", for example, - * the actual symbols that will be used are unknown until after the block - * boundary is chosen and the block has been optimized. Since the final choices - * cannot be used, we can use preliminary "greedy" choices instead. - */ - -/* Initialize the block split statistics when starting a new block. */ -static void -init_block_split_stats(struct block_split_stats *stats) -{ - int i; - - for (i = 0; i < NUM_OBSERVATION_TYPES; i++) { - stats->new_observations[i] = 0; - stats->observations[i] = 0; - } - stats->num_new_observations = 0; - stats->num_observations = 0; -} - -/* Literal observation. Heuristic: use the top 2 bits and low 1 bits of the - * literal, for 8 possible literal observation types. */ -static forceinline void -observe_literal(struct block_split_stats *stats, u8 lit) -{ - stats->new_observations[((lit >> 5) & 0x6) | (lit & 1)]++; - stats->num_new_observations++; -} - -/* Match observation. Heuristic: use one observation type for "short match" and - * one observation type for "long match". */ -static forceinline void -observe_match(struct block_split_stats *stats, unsigned length) -{ - stats->new_observations[NUM_LITERAL_OBSERVATION_TYPES + (length >= 9)]++; - stats->num_new_observations++; -} - -static bool -do_end_block_check(struct block_split_stats *stats, u32 block_length) -{ - int i; - - if (stats->num_observations > 0) { - - /* Note: to avoid slow divisions, we do not divide by - * 'num_observations', but rather do all math with the numbers - * multiplied by 'num_observations'. */ - u32 total_delta = 0; - for (i = 0; i < NUM_OBSERVATION_TYPES; i++) { - u32 expected = stats->observations[i] * stats->num_new_observations; - u32 actual = stats->new_observations[i] * stats->num_observations; - u32 delta = (actual > expected) ? actual - expected : - expected - actual; - total_delta += delta; - } - - /* Ready to end the block? */ - if (total_delta + (block_length / 4096) * stats->num_observations >= - NUM_OBSERVATIONS_PER_BLOCK_CHECK * 200 / 512 * stats->num_observations) - return true; - } - - for (i = 0; i < NUM_OBSERVATION_TYPES; i++) { - stats->num_observations += stats->new_observations[i]; - stats->observations[i] += stats->new_observations[i]; - stats->new_observations[i] = 0; - } - stats->num_new_observations = 0; - return false; -} - -static forceinline bool -should_end_block(struct block_split_stats *stats, - const u8 *in_block_begin, const u8 *in_next, const u8 *in_end) -{ - /* Ready to check block split statistics? */ - if (stats->num_new_observations < NUM_OBSERVATIONS_PER_BLOCK_CHECK || - in_next - in_block_begin < MIN_BLOCK_LENGTH || - in_end - in_next < MIN_BLOCK_LENGTH) - return false; - - return do_end_block_check(stats, in_next - in_block_begin); -} - -/******************************************************************************/ - -/* - * This is the level 0 "compressor". It always outputs uncompressed blocks. - */ -static size_t -deflate_compress_none(struct libdeflate_compressor * restrict c, - const u8 * restrict in, size_t in_nbytes, - u8 * restrict out, size_t out_nbytes_avail) -{ - struct deflate_output_bitstream os; - - deflate_init_output(&os, out, out_nbytes_avail); - - deflate_write_uncompressed_blocks(&os, in, in_nbytes, true); - - return deflate_flush_output(&os); -} - -/* - * This is the "greedy" DEFLATE compressor. It always chooses the longest match. - */ -static size_t -deflate_compress_greedy(struct libdeflate_compressor * restrict c, - const u8 * restrict in, size_t in_nbytes, - u8 * restrict out, size_t out_nbytes_avail) -{ - const u8 *in_next = in; - const u8 *in_end = in_next + in_nbytes; - struct deflate_output_bitstream os; - const u8 *in_cur_base = in_next; - unsigned max_len = DEFLATE_MAX_MATCH_LEN; - unsigned nice_len = MIN(c->nice_match_length, max_len); - u32 next_hashes[2] = {0, 0}; - - deflate_init_output(&os, out, out_nbytes_avail); - hc_matchfinder_init(&c->p.g.hc_mf); - - do { - /* Starting a new DEFLATE block. */ - - const u8 * const in_block_begin = in_next; - const u8 * const in_max_block_end = - in_next + MIN(in_end - in_next, SOFT_MAX_BLOCK_LENGTH); - u32 litrunlen = 0; - struct deflate_sequence *next_seq = c->p.g.sequences; - - init_block_split_stats(&c->split_stats); - deflate_reset_symbol_frequencies(c); - - do { - u32 length; - u32 offset; - - /* Decrease the maximum and nice match lengths if we're - * approaching the end of the input buffer. */ - if (unlikely(max_len > in_end - in_next)) { - max_len = in_end - in_next; - nice_len = MIN(nice_len, max_len); - } - - length = hc_matchfinder_longest_match(&c->p.g.hc_mf, - &in_cur_base, - in_next, - DEFLATE_MIN_MATCH_LEN - 1, - max_len, - nice_len, - c->max_search_depth, - next_hashes, - &offset); - - if (length >= DEFLATE_MIN_MATCH_LEN) { - /* Match found. */ - deflate_choose_match(c, length, offset, - &litrunlen, &next_seq); - observe_match(&c->split_stats, length); - in_next = hc_matchfinder_skip_positions(&c->p.g.hc_mf, - &in_cur_base, - in_next + 1, - in_end, - length - 1, - next_hashes); - } else { - /* No match found. */ - deflate_choose_literal(c, *in_next, &litrunlen); - observe_literal(&c->split_stats, *in_next); - in_next++; - } - - /* Check if it's time to output another block. */ - } while (in_next < in_max_block_end && - !should_end_block(&c->split_stats, in_block_begin, in_next, in_end)); - - deflate_finish_sequence(next_seq, litrunlen); - deflate_flush_block(c, &os, in_block_begin, - in_next - in_block_begin, - in_next == in_end, false); - } while (in_next != in_end); - - return deflate_flush_output(&os); -} - -/* - * This is the "lazy" DEFLATE compressor. Before choosing a match, it checks to - * see if there's a longer match at the next position. If yes, it outputs a - * literal and continues to the next position. If no, it outputs the match. - */ -static size_t -deflate_compress_lazy(struct libdeflate_compressor * restrict c, - const u8 * restrict in, size_t in_nbytes, - u8 * restrict out, size_t out_nbytes_avail) -{ - const u8 *in_next = in; - const u8 *in_end = in_next + in_nbytes; - struct deflate_output_bitstream os; - const u8 *in_cur_base = in_next; - unsigned max_len = DEFLATE_MAX_MATCH_LEN; - unsigned nice_len = MIN(c->nice_match_length, max_len); - u32 next_hashes[2] = {0, 0}; - - deflate_init_output(&os, out, out_nbytes_avail); - hc_matchfinder_init(&c->p.g.hc_mf); - - do { - /* Starting a new DEFLATE block. */ - - const u8 * const in_block_begin = in_next; - const u8 * const in_max_block_end = - in_next + MIN(in_end - in_next, SOFT_MAX_BLOCK_LENGTH); - u32 litrunlen = 0; - struct deflate_sequence *next_seq = c->p.g.sequences; - - init_block_split_stats(&c->split_stats); - deflate_reset_symbol_frequencies(c); - - do { - unsigned cur_len; - unsigned cur_offset; - unsigned next_len; - unsigned next_offset; - - if (unlikely(in_end - in_next < DEFLATE_MAX_MATCH_LEN)) { - max_len = in_end - in_next; - nice_len = MIN(nice_len, max_len); - } - - /* Find the longest match at the current position. */ - cur_len = hc_matchfinder_longest_match(&c->p.g.hc_mf, - &in_cur_base, - in_next, - DEFLATE_MIN_MATCH_LEN - 1, - max_len, - nice_len, - c->max_search_depth, - next_hashes, - &cur_offset); - in_next += 1; - - if (cur_len < DEFLATE_MIN_MATCH_LEN) { - /* No match found. Choose a literal. */ - deflate_choose_literal(c, *(in_next - 1), &litrunlen); - observe_literal(&c->split_stats, *(in_next - 1)); - continue; - } - - have_cur_match: - observe_match(&c->split_stats, cur_len); - - /* We have a match at the current position. */ - - /* If the current match is very long, choose it - * immediately. */ - if (cur_len >= nice_len) { - deflate_choose_match(c, cur_len, cur_offset, - &litrunlen, &next_seq); - in_next = hc_matchfinder_skip_positions(&c->p.g.hc_mf, - &in_cur_base, - in_next, - in_end, - cur_len - 1, - next_hashes); - continue; - } - - /* - * Try to find a match at the next position. - * - * Note: since we already have a match at the *current* - * position, we use only half the 'max_search_depth' - * when checking the *next* position. This is a useful - * trade-off because it's more worthwhile to use a - * greater search depth on the initial match. - * - * Note: it's possible to structure the code such that - * there's only one call to longest_match(), which - * handles both the "find the initial match" and "try to - * find a longer match" cases. However, it is faster to - * have two call sites, with longest_match() inlined at - * each. - */ - if (unlikely(in_end - in_next < DEFLATE_MAX_MATCH_LEN)) { - max_len = in_end - in_next; - nice_len = MIN(nice_len, max_len); - } - next_len = hc_matchfinder_longest_match(&c->p.g.hc_mf, - &in_cur_base, - in_next, - cur_len, - max_len, - nice_len, - c->max_search_depth / 2, - next_hashes, - &next_offset); - in_next += 1; - - if (next_len > cur_len) { - /* Found a longer match at the next position. - * Output a literal. Then the next match - * becomes the current match. */ - deflate_choose_literal(c, *(in_next - 2), &litrunlen); - cur_len = next_len; - cur_offset = next_offset; - goto have_cur_match; - } - - /* No longer match at the next position. - * Output the current match. */ - deflate_choose_match(c, cur_len, cur_offset, - &litrunlen, &next_seq); - in_next = hc_matchfinder_skip_positions(&c->p.g.hc_mf, - &in_cur_base, - in_next, - in_end, - cur_len - 2, - next_hashes); - - /* Check if it's time to output another block. */ - } while (in_next < in_max_block_end && - !should_end_block(&c->split_stats, in_block_begin, in_next, in_end)); - - deflate_finish_sequence(next_seq, litrunlen); - deflate_flush_block(c, &os, in_block_begin, - in_next - in_block_begin, - in_next == in_end, false); - } while (in_next != in_end); - - return deflate_flush_output(&os); -} - -#if SUPPORT_NEAR_OPTIMAL_PARSING - -/* - * Follow the minimum-cost path in the graph of possible match/literal choices - * for the current block and compute the frequencies of the Huffman symbols that - * would be needed to output those matches and literals. - */ -static void -deflate_tally_item_list(struct libdeflate_compressor *c, u32 block_length) -{ - struct deflate_optimum_node *cur_node = &c->p.n.optimum_nodes[0]; - struct deflate_optimum_node *end_node = &c->p.n.optimum_nodes[block_length]; - do { - unsigned length = cur_node->item & OPTIMUM_LEN_MASK; - unsigned offset = cur_node->item >> OPTIMUM_OFFSET_SHIFT; - - if (length == 1) { - /* Literal */ - c->freqs.litlen[offset]++; - } else { - /* Match */ - c->freqs.litlen[257 + deflate_length_slot[length]]++; - c->freqs.offset[deflate_get_offset_slot(c, offset)]++; - } - cur_node += length; - } while (cur_node != end_node); -} - -/* Set the current cost model from the codeword lengths specified in @lens. */ -static void -deflate_set_costs_from_codes(struct libdeflate_compressor *c, - const struct deflate_lens *lens) -{ - unsigned i; - - /* Literals */ - for (i = 0; i < DEFLATE_NUM_LITERALS; i++) { - u32 bits = (lens->litlen[i] ? lens->litlen[i] : LITERAL_NOSTAT_BITS); - c->p.n.costs.literal[i] = bits << COST_SHIFT; - } - - /* Lengths */ - for (i = DEFLATE_MIN_MATCH_LEN; i <= DEFLATE_MAX_MATCH_LEN; i++) { - unsigned length_slot = deflate_length_slot[i]; - unsigned litlen_sym = 257 + length_slot; - u32 bits = (lens->litlen[litlen_sym] ? lens->litlen[litlen_sym] : LENGTH_NOSTAT_BITS); - bits += deflate_extra_length_bits[length_slot]; - c->p.n.costs.length[i] = bits << COST_SHIFT; - } - - /* Offset slots */ - for (i = 0; i < ARRAY_LEN(deflate_offset_slot_base); i++) { - u32 bits = (lens->offset[i] ? lens->offset[i] : OFFSET_NOSTAT_BITS); - bits += deflate_extra_offset_bits[i]; - c->p.n.costs.offset_slot[i] = bits << COST_SHIFT; - } -} - -static forceinline u32 -deflate_default_literal_cost(unsigned literal) -{ - STATIC_ASSERT(COST_SHIFT == 3); - /* 66 is 8.25 bits/symbol */ - return 66; -} - -static forceinline u32 -deflate_default_length_slot_cost(unsigned length_slot) -{ - STATIC_ASSERT(COST_SHIFT == 3); - /* 60 is 7.5 bits/symbol */ - return 60 + ((u32)deflate_extra_length_bits[length_slot] << COST_SHIFT); -} - -static forceinline u32 -deflate_default_offset_slot_cost(unsigned offset_slot) -{ - STATIC_ASSERT(COST_SHIFT == 3); - /* 39 is 4.875 bits/symbol */ - return 39 + ((u32)deflate_extra_offset_bits[offset_slot] << COST_SHIFT); -} - -/* - * Set default symbol costs for the first block's first optimization pass. - * - * It works well to assume that each symbol is equally probable. This results - * in each symbol being assigned a cost of (-log2(1.0/num_syms) * (1 << - * COST_SHIFT)) where 'num_syms' is the number of symbols in the corresponding - * alphabet. However, we intentionally bias the parse towards matches rather - * than literals by using a slightly lower default cost for length symbols than - * for literals. This often improves the compression ratio slightly. - */ -static void -deflate_set_default_costs(struct libdeflate_compressor *c) -{ - unsigned i; - - /* Literals */ - for (i = 0; i < DEFLATE_NUM_LITERALS; i++) - c->p.n.costs.literal[i] = deflate_default_literal_cost(i); - - /* Lengths */ - for (i = DEFLATE_MIN_MATCH_LEN; i <= DEFLATE_MAX_MATCH_LEN; i++) - c->p.n.costs.length[i] = deflate_default_length_slot_cost( - deflate_length_slot[i]); - - /* Offset slots */ - for (i = 0; i < ARRAY_LEN(deflate_offset_slot_base); i++) - c->p.n.costs.offset_slot[i] = deflate_default_offset_slot_cost(i); -} - -static forceinline void -deflate_adjust_cost(u32 *cost_p, u32 default_cost) -{ - *cost_p += ((s32)default_cost - (s32)*cost_p) >> 1; -} - -/* - * Adjust the costs when beginning a new block. - * - * Since the current costs have been optimized for the data, it's undesirable to - * throw them away and start over with the default costs. At the same time, we - * don't want to bias the parse by assuming that the next block will be similar - * to the current block. As a compromise, make the costs closer to the - * defaults, but don't simply set them to the defaults. - */ -static void -deflate_adjust_costs(struct libdeflate_compressor *c) -{ - unsigned i; - - /* Literals */ - for (i = 0; i < DEFLATE_NUM_LITERALS; i++) - deflate_adjust_cost(&c->p.n.costs.literal[i], - deflate_default_literal_cost(i)); - - /* Lengths */ - for (i = DEFLATE_MIN_MATCH_LEN; i <= DEFLATE_MAX_MATCH_LEN; i++) - deflate_adjust_cost(&c->p.n.costs.length[i], - deflate_default_length_slot_cost( - deflate_length_slot[i])); - - /* Offset slots */ - for (i = 0; i < ARRAY_LEN(deflate_offset_slot_base); i++) - deflate_adjust_cost(&c->p.n.costs.offset_slot[i], - deflate_default_offset_slot_cost(i)); -} - -/* - * Find the minimum-cost path through the graph of possible match/literal - * choices for this block. - * - * We find the minimum cost path from 'c->p.n.optimum_nodes[0]', which - * represents the node at the beginning of the block, to - * 'c->p.n.optimum_nodes[block_length]', which represents the node at the end of - * the block. Edge costs are evaluated using the cost model 'c->p.n.costs'. - * - * The algorithm works backwards, starting at the end node and proceeding - * backwards one node at a time. At each node, the minimum cost to reach the - * end node is computed and the match/literal choice that begins that path is - * saved. - */ -static void -deflate_find_min_cost_path(struct libdeflate_compressor *c, - const u32 block_length, - const struct lz_match *cache_ptr) -{ - struct deflate_optimum_node *end_node = &c->p.n.optimum_nodes[block_length]; - struct deflate_optimum_node *cur_node = end_node; - - cur_node->cost_to_end = 0; - do { - unsigned num_matches; - unsigned literal; - u32 best_cost_to_end; - - cur_node--; - cache_ptr--; - - num_matches = cache_ptr->length; - literal = cache_ptr->offset; - - /* It's always possible to choose a literal. */ - best_cost_to_end = c->p.n.costs.literal[literal] + - (cur_node + 1)->cost_to_end; - cur_node->item = ((u32)literal << OPTIMUM_OFFSET_SHIFT) | 1; - - /* Also consider matches if there are any. */ - if (num_matches) { - const struct lz_match *match; - unsigned len; - unsigned offset; - unsigned offset_slot; - u32 offset_cost; - u32 cost_to_end; - - /* - * Consider each length from the minimum - * (DEFLATE_MIN_MATCH_LEN) to the length of the longest - * match found at this position. For each length, we - * consider only the smallest offset for which that - * length is available. Although this is not guaranteed - * to be optimal due to the possibility of a larger - * offset costing less than a smaller offset to code, - * this is a very useful heuristic. - */ - match = cache_ptr - num_matches; - len = DEFLATE_MIN_MATCH_LEN; - do { - offset = match->offset; - offset_slot = deflate_get_offset_slot(c, offset); - offset_cost = c->p.n.costs.offset_slot[offset_slot]; - do { - cost_to_end = offset_cost + - c->p.n.costs.length[len] + - (cur_node + len)->cost_to_end; - if (cost_to_end < best_cost_to_end) { - best_cost_to_end = cost_to_end; - cur_node->item = ((u32)offset << OPTIMUM_OFFSET_SHIFT) | len; - } - } while (++len <= match->length); - } while (++match != cache_ptr); - cache_ptr -= num_matches; - } - cur_node->cost_to_end = best_cost_to_end; - } while (cur_node != &c->p.n.optimum_nodes[0]); -} - -/* - * Choose the literal/match sequence to use for the current block. The basic - * algorithm finds a minimum-cost path through the block's graph of - * literal/match choices, given a cost model. However, the cost of each symbol - * is unknown until the Huffman codes have been built, but at the same time the - * Huffman codes depend on the frequencies of chosen symbols. Consequently, - * multiple passes must be used to try to approximate an optimal solution. The - * first pass uses default costs, mixed with the costs from the previous block - * if any. Later passes use the Huffman codeword lengths from the previous pass - * as the costs. - */ -static void -deflate_optimize_block(struct libdeflate_compressor *c, u32 block_length, - const struct lz_match *cache_ptr, bool is_first_block) -{ - unsigned num_passes_remaining = c->p.n.num_optim_passes; - u32 i; - - /* Force the block to really end at the desired length, even if some - * matches extend beyond it. */ - for (i = block_length; i <= MIN(block_length - 1 + DEFLATE_MAX_MATCH_LEN, - ARRAY_LEN(c->p.n.optimum_nodes) - 1); i++) - c->p.n.optimum_nodes[i].cost_to_end = 0x80000000; - - /* Set the initial costs. */ - if (is_first_block) - deflate_set_default_costs(c); - else - deflate_adjust_costs(c); - - for (;;) { - /* Find the minimum cost path for this pass. */ - deflate_find_min_cost_path(c, block_length, cache_ptr); - - /* Compute frequencies of the chosen symbols. */ - deflate_reset_symbol_frequencies(c); - deflate_tally_item_list(c, block_length); - - if (--num_passes_remaining == 0) - break; - - /* At least one optimization pass remains; update the costs. */ - deflate_make_huffman_codes(&c->freqs, &c->codes); - deflate_set_costs_from_codes(c, &c->codes.lens); - } -} - -/* - * This is the "near-optimal" DEFLATE compressor. It computes the optimal - * representation of each DEFLATE block using a minimum-cost path search over - * the graph of possible match/literal choices for that block, assuming a - * certain cost for each Huffman symbol. - * - * For several reasons, the end result is not guaranteed to be optimal: - * - * - Nonoptimal choice of blocks - * - Heuristic limitations on which matches are actually considered - * - Symbol costs are unknown until the symbols have already been chosen - * (so iterative optimization must be used) - */ -static size_t -deflate_compress_near_optimal(struct libdeflate_compressor * restrict c, - const u8 * restrict in, size_t in_nbytes, - u8 * restrict out, size_t out_nbytes_avail) -{ - const u8 *in_next = in; - const u8 *in_end = in_next + in_nbytes; - struct deflate_output_bitstream os; - const u8 *in_cur_base = in_next; - const u8 *in_next_slide = in_next + MIN(in_end - in_next, MATCHFINDER_WINDOW_SIZE); - unsigned max_len = DEFLATE_MAX_MATCH_LEN; - unsigned nice_len = MIN(c->nice_match_length, max_len); - u32 next_hashes[2] = {0, 0}; - - deflate_init_output(&os, out, out_nbytes_avail); - bt_matchfinder_init(&c->p.n.bt_mf); - - do { - /* Starting a new DEFLATE block. */ - - struct lz_match *cache_ptr = c->p.n.match_cache; - const u8 * const in_block_begin = in_next; - const u8 * const in_max_block_end = - in_next + MIN(in_end - in_next, SOFT_MAX_BLOCK_LENGTH); - const u8 *next_observation = in_next; - - init_block_split_stats(&c->split_stats); - - /* - * Find matches until we decide to end the block. We end the - * block if any of the following is true: - * - * (1) Maximum block length has been reached - * (2) Match catch may overflow. - * (3) Block split heuristic says to split now. - */ - do { - struct lz_match *matches; - unsigned best_len; - - /* Slide the window forward if needed. */ - if (in_next == in_next_slide) { - bt_matchfinder_slide_window(&c->p.n.bt_mf); - in_cur_base = in_next; - in_next_slide = in_next + MIN(in_end - in_next, - MATCHFINDER_WINDOW_SIZE); - } - - /* Decrease the maximum and nice match lengths if we're - * approaching the end of the input buffer. */ - if (unlikely(max_len > in_end - in_next)) { - max_len = in_end - in_next; - nice_len = MIN(nice_len, max_len); - } - - /* - * Find matches with the current position using the - * binary tree matchfinder and save them in - * 'match_cache'. - * - * Note: the binary tree matchfinder is more suited for - * optimal parsing than the hash chain matchfinder. The - * reasons for this include: - * - * - The binary tree matchfinder can find more matches - * in the same number of steps. - * - One of the major advantages of hash chains is that - * skipping positions (not searching for matches at - * them) is faster; however, with optimal parsing we - * search for matches at almost all positions, so this - * advantage of hash chains is negated. - */ - matches = cache_ptr; - best_len = 0; - if (likely(max_len >= BT_MATCHFINDER_REQUIRED_NBYTES)) { - cache_ptr = bt_matchfinder_get_matches(&c->p.n.bt_mf, - in_cur_base, - in_next - in_cur_base, - max_len, - nice_len, - c->max_search_depth, - next_hashes, - &best_len, - matches); - } - - if (in_next >= next_observation) { - if (best_len >= 4) { - observe_match(&c->split_stats, best_len); - next_observation = in_next + best_len; - } else { - observe_literal(&c->split_stats, *in_next); - next_observation = in_next + 1; - } - } - - cache_ptr->length = cache_ptr - matches; - cache_ptr->offset = *in_next; - in_next++; - cache_ptr++; - - /* - * If there was a very long match found, don't cache any - * matches for the bytes covered by that match. This - * avoids degenerate behavior when compressing highly - * redundant data, where the number of matches can be - * very large. - * - * This heuristic doesn't actually hurt the compression - * ratio very much. If there's a long match, then the - * data must be highly compressible, so it doesn't - * matter much what we do. - */ - if (best_len >= DEFLATE_MIN_MATCH_LEN && best_len >= nice_len) { - --best_len; - do { - if (in_next == in_next_slide) { - bt_matchfinder_slide_window(&c->p.n.bt_mf); - in_cur_base = in_next; - in_next_slide = in_next + MIN(in_end - in_next, - MATCHFINDER_WINDOW_SIZE); - } - if (unlikely(max_len > in_end - in_next)) { - max_len = in_end - in_next; - nice_len = MIN(nice_len, max_len); - } - if (max_len >= BT_MATCHFINDER_REQUIRED_NBYTES) { - bt_matchfinder_skip_position(&c->p.n.bt_mf, - in_cur_base, - in_next - in_cur_base, - nice_len, - c->max_search_depth, - next_hashes); - } - cache_ptr->length = 0; - cache_ptr->offset = *in_next; - in_next++; - cache_ptr++; - } while (--best_len); - } - } while (in_next < in_max_block_end && - cache_ptr < &c->p.n.match_cache[CACHE_LENGTH] && - !should_end_block(&c->split_stats, in_block_begin, in_next, in_end)); - - /* All the matches for this block have been cached. Now choose - * the sequence of items to output and flush the block. */ - deflate_optimize_block(c, in_next - in_block_begin, cache_ptr, - in_block_begin == in); - deflate_flush_block(c, &os, in_block_begin, in_next - in_block_begin, - in_next == in_end, true); - } while (in_next != in_end); - - return deflate_flush_output(&os); -} - -#endif /* SUPPORT_NEAR_OPTIMAL_PARSING */ - -/* Initialize c->offset_slot_fast. */ -static void -deflate_init_offset_slot_fast(struct libdeflate_compressor *c) -{ - unsigned offset_slot; - unsigned offset; - unsigned offset_end; - - for (offset_slot = 0; - offset_slot < ARRAY_LEN(deflate_offset_slot_base); - offset_slot++) - { - offset = deflate_offset_slot_base[offset_slot]; - #if USE_FULL_OFFSET_SLOT_FAST - offset_end = offset + (1 << deflate_extra_offset_bits[offset_slot]); - do { - c->offset_slot_fast[offset] = offset_slot; - } while (++offset != offset_end); - #else - if (offset <= 256) { - offset_end = offset + (1 << deflate_extra_offset_bits[offset_slot]); - do { - c->offset_slot_fast[offset - 1] = offset_slot; - } while (++offset != offset_end); - } else { - offset_end = offset + (1 << deflate_extra_offset_bits[offset_slot]); - do { - c->offset_slot_fast[256 + ((offset - 1) >> 7)] = offset_slot; - } while ((offset += (1 << 7)) != offset_end); - } - #endif - } -} - -LIBDEFLATEEXPORT struct libdeflate_compressor * LIBDEFLATEAPI -libdeflate_alloc_compressor(int compression_level) -{ - struct libdeflate_compressor *c; - size_t size = offsetof(struct libdeflate_compressor, p); - - if (compression_level < 0 || compression_level > 12) - return NULL; - -#if SUPPORT_NEAR_OPTIMAL_PARSING - if (compression_level >= 8) - size += sizeof(c->p.n); - else if (compression_level >= 1) - size += sizeof(c->p.g); -#else - if (compression_level >= 1) - size += sizeof(c->p.g); -#endif - - c = libdeflate_aligned_malloc(MATCHFINDER_MEM_ALIGNMENT, size); - if (!c) - return NULL; - - c->compression_level = compression_level; - - /* - * The higher the compression level, the more we should bother trying to - * compress very small inputs. - */ - c->min_size_to_compress = 56 - (compression_level * 4); - - switch (compression_level) { - case 0: - c->impl = deflate_compress_none; - break; - case 1: - c->impl = deflate_compress_greedy; - c->max_search_depth = 2; - c->nice_match_length = 8; - break; - case 2: - c->impl = deflate_compress_greedy; - c->max_search_depth = 6; - c->nice_match_length = 10; - break; - case 3: - c->impl = deflate_compress_greedy; - c->max_search_depth = 12; - c->nice_match_length = 14; - break; - case 4: - c->impl = deflate_compress_greedy; - c->max_search_depth = 24; - c->nice_match_length = 24; - break; - case 5: - c->impl = deflate_compress_lazy; - c->max_search_depth = 20; - c->nice_match_length = 30; - break; - case 6: - c->impl = deflate_compress_lazy; - c->max_search_depth = 40; - c->nice_match_length = 65; - break; - case 7: - c->impl = deflate_compress_lazy; - c->max_search_depth = 100; - c->nice_match_length = 130; - break; -#if SUPPORT_NEAR_OPTIMAL_PARSING - case 8: - c->impl = deflate_compress_near_optimal; - c->max_search_depth = 12; - c->nice_match_length = 20; - c->p.n.num_optim_passes = 1; - break; - case 9: - c->impl = deflate_compress_near_optimal; - c->max_search_depth = 16; - c->nice_match_length = 26; - c->p.n.num_optim_passes = 2; - break; - case 10: - c->impl = deflate_compress_near_optimal; - c->max_search_depth = 30; - c->nice_match_length = 50; - c->p.n.num_optim_passes = 2; - break; - case 11: - c->impl = deflate_compress_near_optimal; - c->max_search_depth = 60; - c->nice_match_length = 80; - c->p.n.num_optim_passes = 3; - break; - default: - c->impl = deflate_compress_near_optimal; - c->max_search_depth = 100; - c->nice_match_length = 133; - c->p.n.num_optim_passes = 4; - break; -#else - case 8: - c->impl = deflate_compress_lazy; - c->max_search_depth = 150; - c->nice_match_length = 200; - break; - default: - c->impl = deflate_compress_lazy; - c->max_search_depth = 200; - c->nice_match_length = DEFLATE_MAX_MATCH_LEN; - break; -#endif - } - - deflate_init_offset_slot_fast(c); - deflate_init_static_codes(c); - - return c; -} - -LIBDEFLATEEXPORT size_t LIBDEFLATEAPI -libdeflate_deflate_compress(struct libdeflate_compressor *c, - const void *in, size_t in_nbytes, - void *out, size_t out_nbytes_avail) -{ - if (unlikely(out_nbytes_avail < OUTPUT_END_PADDING)) - return 0; - - /* For extremely small inputs just use a single uncompressed block. */ - if (unlikely(in_nbytes < c->min_size_to_compress)) { - struct deflate_output_bitstream os; - deflate_init_output(&os, out, out_nbytes_avail); - if (in_nbytes == 0) - in = &os; /* Avoid passing NULL to memcpy() */ - deflate_write_uncompressed_block(&os, in, in_nbytes, true); - return deflate_flush_output(&os); - } - - return (*c->impl)(c, in, in_nbytes, out, out_nbytes_avail); -} - -LIBDEFLATEEXPORT void LIBDEFLATEAPI -libdeflate_free_compressor(struct libdeflate_compressor *c) -{ - libdeflate_aligned_free(c); -} - -unsigned int -deflate_get_compression_level(struct libdeflate_compressor *c) -{ - return c->compression_level; -} - -LIBDEFLATEEXPORT size_t LIBDEFLATEAPI -libdeflate_deflate_compress_bound(struct libdeflate_compressor *c, - size_t in_nbytes) -{ - /* - * The worst case is all uncompressed blocks where one block has length - * <= MIN_BLOCK_LENGTH and the others have length MIN_BLOCK_LENGTH. - * Each uncompressed block has 5 bytes of overhead: 1 for BFINAL, BTYPE, - * and alignment to a byte boundary; 2 for LEN; and 2 for NLEN. - */ - size_t max_num_blocks = MAX(DIV_ROUND_UP(in_nbytes, MIN_BLOCK_LENGTH), 1); - return (5 * max_num_blocks) + in_nbytes + 1 + OUTPUT_END_PADDING; -} |