diff options
Diffstat (limited to 'util/compress/libdeflate/lib/deflate_decompress.c')
-rw-r--r-- | util/compress/libdeflate/lib/deflate_decompress.c | 1000 |
1 files changed, 0 insertions, 1000 deletions
diff --git a/util/compress/libdeflate/lib/deflate_decompress.c b/util/compress/libdeflate/lib/deflate_decompress.c deleted file mode 100644 index 1990e74d6..000000000 --- a/util/compress/libdeflate/lib/deflate_decompress.c +++ /dev/null @@ -1,1000 +0,0 @@ -/* - * deflate_decompress.c - a decompressor for DEFLATE - * - * Copyright 2016 Eric Biggers - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - * - * --------------------------------------------------------------------------- - * - * This is a highly optimized DEFLATE decompressor. When compiled with gcc on - * x86_64, it decompresses data in about 52% of the time of zlib (48% if BMI2 - * instructions are available). On other architectures it should still be - * significantly faster than zlib, but the difference may be smaller. - * - * Why this is faster than zlib's implementation: - * - * - Word accesses rather than byte accesses when reading input - * - Word accesses rather than byte accesses when copying matches - * - Faster Huffman decoding combined with various DEFLATE-specific tricks - * - Larger bitbuffer variable that doesn't need to be filled as often - * - Other optimizations to remove unnecessary branches - * - Only full-buffer decompression is supported, so the code doesn't need to - * support stopping and resuming decompression. - * - On x86_64, compile a version of the decompression routine using BMI2 - * instructions and use it automatically at runtime when supported. - */ - -#include <limits.h> - -#include "deflate_constants.h" -#include "unaligned.h" - -#include "libdeflate.h" - -/* - * If the expression passed to SAFETY_CHECK() evaluates to false, then the - * decompression routine immediately returns LIBDEFLATE_BAD_DATA, indicating the - * compressed data is invalid. - * - * Theoretically, these checks could be disabled for specialized applications - * where all input to the decompressor will be trusted. - */ -#if 0 -# pragma message("UNSAFE DECOMPRESSION IS ENABLED. THIS MUST ONLY BE USED IF THE DECOMPRESSOR INPUT WILL ALWAYS BE TRUSTED!") -# define SAFETY_CHECK(expr) (void)(expr) -#else -# define SAFETY_CHECK(expr) if (unlikely(!(expr))) return LIBDEFLATE_BAD_DATA -#endif - -/* - * Each TABLEBITS number is the base-2 logarithm of the number of entries in the - * main portion of the corresponding decode table. Each number should be large - * enough to ensure that for typical data, the vast majority of symbols can be - * decoded by a direct lookup of the next TABLEBITS bits of compressed data. - * However, this must be balanced against the fact that a larger table requires - * more memory and requires more time to fill. - * - * Note: you cannot change a TABLEBITS number without also changing the - * corresponding ENOUGH number! - */ -#define PRECODE_TABLEBITS 7 -#define LITLEN_TABLEBITS 10 -#define OFFSET_TABLEBITS 8 - -/* - * Each ENOUGH number is the maximum number of decode table entries that may be - * required for the corresponding Huffman code, including the main table and all - * subtables. Each number depends on three parameters: - * - * (1) the maximum number of symbols in the code (DEFLATE_NUM_*_SYMS) - * (2) the number of main table bits (the TABLEBITS numbers defined above) - * (3) the maximum allowed codeword length (DEFLATE_MAX_*_CODEWORD_LEN) - * - * The ENOUGH numbers were computed using the utility program 'enough' from - * zlib. This program enumerates all possible relevant Huffman codes to find - * the worst-case usage of decode table entries. - */ -#define PRECODE_ENOUGH 128 /* enough 19 7 7 */ -#define LITLEN_ENOUGH 1334 /* enough 288 10 15 */ -#define OFFSET_ENOUGH 402 /* enough 32 8 15 */ - -/* - * Type for codeword lengths. - */ -typedef u8 len_t; - -/* - * The main DEFLATE decompressor structure. Since this implementation only - * supports full buffer decompression, this structure does not store the entire - * decompression state, but rather only some arrays that are too large to - * comfortably allocate on the stack. - */ -struct libdeflate_decompressor { - - /* - * The arrays aren't all needed at the same time. 'precode_lens' and - * 'precode_decode_table' are unneeded after 'lens' has been filled. - * Furthermore, 'lens' need not be retained after building the litlen - * and offset decode tables. In fact, 'lens' can be in union with - * 'litlen_decode_table' provided that 'offset_decode_table' is separate - * and is built first. - */ - - union { - len_t precode_lens[DEFLATE_NUM_PRECODE_SYMS]; - - struct { - len_t lens[DEFLATE_NUM_LITLEN_SYMS + - DEFLATE_NUM_OFFSET_SYMS + - DEFLATE_MAX_LENS_OVERRUN]; - - u32 precode_decode_table[PRECODE_ENOUGH]; - } l; - - u32 litlen_decode_table[LITLEN_ENOUGH]; - } u; - - u32 offset_decode_table[OFFSET_ENOUGH]; - - /* used only during build_decode_table() */ - u16 sorted_syms[DEFLATE_MAX_NUM_SYMS]; - - bool static_codes_loaded; -}; - -/***************************************************************************** - * Input bitstream * - *****************************************************************************/ - -/* - * The state of the "input bitstream" consists of the following variables: - * - * - in_next: pointer to the next unread byte in the input buffer - * - * - in_end: pointer just past the end of the input buffer - * - * - bitbuf: a word-sized variable containing bits that have been read from - * the input buffer. The buffered bits are right-aligned - * (they're the low-order bits). - * - * - bitsleft: number of bits in 'bitbuf' that are valid. - * - * To make it easier for the compiler to optimize the code by keeping variables - * in registers, these are declared as normal variables and manipulated using - * macros. - */ - -/* - * The type for the bitbuffer variable ('bitbuf' described above). For best - * performance, this should have size equal to a machine word. - * - * 64-bit platforms have a significant advantage: they get a bigger bitbuffer - * which they have to fill less often. - */ -typedef machine_word_t bitbuf_t; - -/* - * Number of bits the bitbuffer variable can hold. - * - * This is one less than the obvious value because of the optimized arithmetic - * in FILL_BITS_WORDWISE() that leaves 'bitsleft' in the range - * [WORDBITS - 8, WORDBITS - 1] rather than [WORDBITS - 7, WORDBITS]. - */ -#define BITBUF_NBITS (8 * sizeof(bitbuf_t) - 1) - -/* - * The maximum number of bits that can be ensured in the bitbuffer variable, - * i.e. the maximum value of 'n' that can be passed ENSURE_BITS(n). The decoder - * only reads whole bytes from memory, so this is the lowest value of 'bitsleft' - * at which another byte cannot be read without first consuming some bits. - */ -#define MAX_ENSURE (BITBUF_NBITS - 7) - -/* - * Evaluates to true if 'n' is a valid argument to ENSURE_BITS(n), or false if - * 'n' is too large to be passed to ENSURE_BITS(n). Note: if 'n' is a compile - * time constant, then this expression will be a compile-type constant. - * Therefore, CAN_ENSURE() can be used choose between alternative - * implementations at compile time. - */ -#define CAN_ENSURE(n) ((n) <= MAX_ENSURE) - -/* - * Fill the bitbuffer variable, reading one byte at a time. - * - * If we would overread the input buffer, we just don't read anything, leaving - * the bits zeroed but marking them filled. This simplifies the decompressor - * because it removes the need to distinguish between real overreads and - * overreads that occur only because of the decompressor's own lookahead. - * - * The disadvantage is that real overreads are not detected immediately. - * However, this is safe because the decompressor is still guaranteed to make - * forward progress when presented never-ending 0 bits. In an existing block - * output will be getting generated, whereas new blocks can only be uncompressed - * (since the type code for uncompressed blocks is 0), for which we check for - * previous overread. But even if we didn't check, uncompressed blocks would - * fail to validate because LEN would not equal ~NLEN. So the decompressor will - * eventually either detect that the output buffer is full, or detect invalid - * input, or finish the final block. - */ -#define FILL_BITS_BYTEWISE() \ -do { \ - if (likely(in_next != in_end)) \ - bitbuf |= (bitbuf_t)*in_next++ << bitsleft; \ - else \ - overrun_count++; \ - bitsleft += 8; \ -} while (bitsleft <= BITBUF_NBITS - 8) - -/* - * Fill the bitbuffer variable by reading the next word from the input buffer - * and branchlessly updating 'in_next' and 'bitsleft' based on how many bits - * were filled. This can be significantly faster than FILL_BITS_BYTEWISE(). - * However, for this to work correctly, the word must be interpreted in - * little-endian format. In addition, the memory access may be unaligned. - * Therefore, this method is most efficient on little-endian architectures that - * support fast unaligned access, such as x86 and x86_64. - * - * For faster updating of 'bitsleft', we consider the bitbuffer size in bits to - * be 1 less than the word size and therefore be all 1 bits. Then the number of - * bits filled is the value of the 0 bits in position >= 3 when changed to 1. - * E.g. if words are 64 bits and bitsleft = 16 = b010000 then we refill b101000 - * = 40 bits = 5 bytes. This uses only 4 operations to update 'in_next' and - * 'bitsleft': one each of +, ^, >>, and |. (Not counting operations the - * compiler optimizes out.) In contrast, the alternative of: - * - * in_next += (BITBUF_NBITS - bitsleft) >> 3; - * bitsleft += (BITBUF_NBITS - bitsleft) & ~7; - * - * (where BITBUF_NBITS would be WORDBITS rather than WORDBITS - 1) would on - * average refill an extra bit, but uses 5 operations: two +, and one each of - * -, >>, and &. Also the - and & must be completed before 'bitsleft' can be - * updated, while the current solution updates 'bitsleft' with no dependencies. - */ -#define FILL_BITS_WORDWISE() \ -do { \ - /* BITBUF_NBITS must be all 1's in binary, see above */ \ - STATIC_ASSERT((BITBUF_NBITS & (BITBUF_NBITS + 1)) == 0);\ - \ - bitbuf |= get_unaligned_leword(in_next) << bitsleft; \ - in_next += (bitsleft ^ BITBUF_NBITS) >> 3; \ - bitsleft |= BITBUF_NBITS & ~7; \ -} while (0) - -/* - * Does the bitbuffer variable currently contain at least 'n' bits? - */ -#define HAVE_BITS(n) (bitsleft >= (n)) - -/* - * Load more bits from the input buffer until the specified number of bits is - * present in the bitbuffer variable. 'n' cannot be too large; see MAX_ENSURE - * and CAN_ENSURE(). - */ -#define ENSURE_BITS(n) \ -if (!HAVE_BITS(n)) { \ - if (CPU_IS_LITTLE_ENDIAN() && \ - UNALIGNED_ACCESS_IS_FAST && \ - likely(in_end - in_next >= sizeof(bitbuf_t))) \ - FILL_BITS_WORDWISE(); \ - else \ - FILL_BITS_BYTEWISE(); \ -} - -/* - * Return the next 'n' bits from the bitbuffer variable without removing them. - */ -#define BITS(n) ((u32)bitbuf & (((u32)1 << (n)) - 1)) - -/* - * Remove the next 'n' bits from the bitbuffer variable. - */ -#define REMOVE_BITS(n) (bitbuf >>= (n), bitsleft -= (n)) - -/* - * Remove and return the next 'n' bits from the bitbuffer variable. - */ -#define POP_BITS(n) (tmp32 = BITS(n), REMOVE_BITS(n), tmp32) - -/* - * Verify that the input buffer hasn't been overread, then align the input to - * the next byte boundary, discarding any remaining bits in the current byte. - * - * Note that if the bitbuffer variable currently contains more than 7 bits, then - * we must rewind 'in_next', effectively putting those bits back. Only the bits - * in what would be the "current" byte if we were reading one byte at a time can - * be actually discarded. - */ -#define ALIGN_INPUT() \ -do { \ - SAFETY_CHECK(overrun_count <= (bitsleft >> 3)); \ - in_next -= (bitsleft >> 3) - overrun_count; \ - overrun_count = 0; \ - bitbuf = 0; \ - bitsleft = 0; \ -} while(0) - -/* - * Read a 16-bit value from the input. This must have been preceded by a call - * to ALIGN_INPUT(), and the caller must have already checked for overrun. - */ -#define READ_U16() (tmp16 = get_unaligned_le16(in_next), in_next += 2, tmp16) - -/***************************************************************************** - * Huffman decoding * - *****************************************************************************/ - -/* - * A decode table for order TABLEBITS consists of a main table of (1 << - * TABLEBITS) entries followed by a variable number of subtables. - * - * The decoding algorithm takes the next TABLEBITS bits of compressed data and - * uses them as an index into the decode table. The resulting entry is either a - * "direct entry", meaning that it contains the value desired, or a "subtable - * pointer", meaning that the entry references a subtable that must be indexed - * using more bits of the compressed data to decode the symbol. - * - * Each decode table (a main table along with its subtables, if any) is - * associated with a Huffman code. Logically, the result of a decode table - * lookup is a symbol from the alphabet from which the corresponding Huffman - * code was constructed. A symbol with codeword length n <= TABLEBITS is - * associated with 2**(TABLEBITS - n) direct entries in the table, whereas a - * symbol with codeword length n > TABLEBITS is associated with one or more - * subtable entries. - * - * On top of this basic design, we implement several optimizations: - * - * - We store the length of each codeword directly in each of its decode table - * entries. This allows the codeword length to be produced without indexing - * an additional table. - * - * - When beneficial, we don't store the Huffman symbol itself, but instead data - * generated from it. For example, when decoding an offset symbol in DEFLATE, - * it's more efficient if we can decode the offset base and number of extra - * offset bits directly rather than decoding the offset symbol and then - * looking up both of those values in an additional table or tables. - * - * The size of each decode table entry is 32 bits, which provides slightly - * better performance than 16-bit entries on 32 and 64 bit processers, provided - * that the table doesn't get so large that it takes up too much memory and - * starts generating cache misses. The bits of each decode table entry are - * defined as follows: - * - * - Bits 30 -- 31: flags (see below) - * - Bits 8 -- 29: decode result: a Huffman symbol or related data - * - Bits 0 -- 7: codeword length - */ - -/* - * This flag is set in all main decode table entries that represent subtable - * pointers. - */ -#define HUFFDEC_SUBTABLE_POINTER 0x80000000 - -/* - * This flag is set in all entries in the litlen decode table that represent - * literals. - */ -#define HUFFDEC_LITERAL 0x40000000 - -/* Mask for extracting the codeword length from a decode table entry. */ -#define HUFFDEC_LENGTH_MASK 0xFF - -/* Shift to extract the decode result from a decode table entry. */ -#define HUFFDEC_RESULT_SHIFT 8 - -/* Shift a decode result into its position in the decode table entry. */ -#define HUFFDEC_RESULT_ENTRY(result) ((u32)(result) << HUFFDEC_RESULT_SHIFT) - -/* The decode result for each precode symbol. There is no special optimization - * for the precode; the decode result is simply the symbol value. */ -static const u32 precode_decode_results[DEFLATE_NUM_PRECODE_SYMS] = { -#define ENTRY(presym) HUFFDEC_RESULT_ENTRY(presym) - ENTRY(0) , ENTRY(1) , ENTRY(2) , ENTRY(3) , - ENTRY(4) , ENTRY(5) , ENTRY(6) , ENTRY(7) , - ENTRY(8) , ENTRY(9) , ENTRY(10) , ENTRY(11) , - ENTRY(12) , ENTRY(13) , ENTRY(14) , ENTRY(15) , - ENTRY(16) , ENTRY(17) , ENTRY(18) , -#undef ENTRY -}; - -/* The decode result for each litlen symbol. For literals, this is the literal - * value itself and the HUFFDEC_LITERAL flag. For lengths, this is the length - * base and the number of extra length bits. */ -static const u32 litlen_decode_results[DEFLATE_NUM_LITLEN_SYMS] = { - - /* Literals */ -#define ENTRY(literal) (HUFFDEC_LITERAL | HUFFDEC_RESULT_ENTRY(literal)) - ENTRY(0) , ENTRY(1) , ENTRY(2) , ENTRY(3) , - ENTRY(4) , ENTRY(5) , ENTRY(6) , ENTRY(7) , - ENTRY(8) , ENTRY(9) , ENTRY(10) , ENTRY(11) , - ENTRY(12) , ENTRY(13) , ENTRY(14) , ENTRY(15) , - ENTRY(16) , ENTRY(17) , ENTRY(18) , ENTRY(19) , - ENTRY(20) , ENTRY(21) , ENTRY(22) , ENTRY(23) , - ENTRY(24) , ENTRY(25) , ENTRY(26) , ENTRY(27) , - ENTRY(28) , ENTRY(29) , ENTRY(30) , ENTRY(31) , - ENTRY(32) , ENTRY(33) , ENTRY(34) , ENTRY(35) , - ENTRY(36) , ENTRY(37) , ENTRY(38) , ENTRY(39) , - ENTRY(40) , ENTRY(41) , ENTRY(42) , ENTRY(43) , - ENTRY(44) , ENTRY(45) , ENTRY(46) , ENTRY(47) , - ENTRY(48) , ENTRY(49) , ENTRY(50) , ENTRY(51) , - ENTRY(52) , ENTRY(53) , ENTRY(54) , ENTRY(55) , - ENTRY(56) , ENTRY(57) , ENTRY(58) , ENTRY(59) , - ENTRY(60) , ENTRY(61) , ENTRY(62) , ENTRY(63) , - ENTRY(64) , ENTRY(65) , ENTRY(66) , ENTRY(67) , - ENTRY(68) , ENTRY(69) , ENTRY(70) , ENTRY(71) , - ENTRY(72) , ENTRY(73) , ENTRY(74) , ENTRY(75) , - ENTRY(76) , ENTRY(77) , ENTRY(78) , ENTRY(79) , - ENTRY(80) , ENTRY(81) , ENTRY(82) , ENTRY(83) , - ENTRY(84) , ENTRY(85) , ENTRY(86) , ENTRY(87) , - ENTRY(88) , ENTRY(89) , ENTRY(90) , ENTRY(91) , - ENTRY(92) , ENTRY(93) , ENTRY(94) , ENTRY(95) , - ENTRY(96) , ENTRY(97) , ENTRY(98) , ENTRY(99) , - ENTRY(100) , ENTRY(101) , ENTRY(102) , ENTRY(103) , - ENTRY(104) , ENTRY(105) , ENTRY(106) , ENTRY(107) , - ENTRY(108) , ENTRY(109) , ENTRY(110) , ENTRY(111) , - ENTRY(112) , ENTRY(113) , ENTRY(114) , ENTRY(115) , - ENTRY(116) , ENTRY(117) , ENTRY(118) , ENTRY(119) , - ENTRY(120) , ENTRY(121) , ENTRY(122) , ENTRY(123) , - ENTRY(124) , ENTRY(125) , ENTRY(126) , ENTRY(127) , - ENTRY(128) , ENTRY(129) , ENTRY(130) , ENTRY(131) , - ENTRY(132) , ENTRY(133) , ENTRY(134) , ENTRY(135) , - ENTRY(136) , ENTRY(137) , ENTRY(138) , ENTRY(139) , - ENTRY(140) , ENTRY(141) , ENTRY(142) , ENTRY(143) , - ENTRY(144) , ENTRY(145) , ENTRY(146) , ENTRY(147) , - ENTRY(148) , ENTRY(149) , ENTRY(150) , ENTRY(151) , - ENTRY(152) , ENTRY(153) , ENTRY(154) , ENTRY(155) , - ENTRY(156) , ENTRY(157) , ENTRY(158) , ENTRY(159) , - ENTRY(160) , ENTRY(161) , ENTRY(162) , ENTRY(163) , - ENTRY(164) , ENTRY(165) , ENTRY(166) , ENTRY(167) , - ENTRY(168) , ENTRY(169) , ENTRY(170) , ENTRY(171) , - ENTRY(172) , ENTRY(173) , ENTRY(174) , ENTRY(175) , - ENTRY(176) , ENTRY(177) , ENTRY(178) , ENTRY(179) , - ENTRY(180) , ENTRY(181) , ENTRY(182) , ENTRY(183) , - ENTRY(184) , ENTRY(185) , ENTRY(186) , ENTRY(187) , - ENTRY(188) , ENTRY(189) , ENTRY(190) , ENTRY(191) , - ENTRY(192) , ENTRY(193) , ENTRY(194) , ENTRY(195) , - ENTRY(196) , ENTRY(197) , ENTRY(198) , ENTRY(199) , - ENTRY(200) , ENTRY(201) , ENTRY(202) , ENTRY(203) , - ENTRY(204) , ENTRY(205) , ENTRY(206) , ENTRY(207) , - ENTRY(208) , ENTRY(209) , ENTRY(210) , ENTRY(211) , - ENTRY(212) , ENTRY(213) , ENTRY(214) , ENTRY(215) , - ENTRY(216) , ENTRY(217) , ENTRY(218) , ENTRY(219) , - ENTRY(220) , ENTRY(221) , ENTRY(222) , ENTRY(223) , - ENTRY(224) , ENTRY(225) , ENTRY(226) , ENTRY(227) , - ENTRY(228) , ENTRY(229) , ENTRY(230) , ENTRY(231) , - ENTRY(232) , ENTRY(233) , ENTRY(234) , ENTRY(235) , - ENTRY(236) , ENTRY(237) , ENTRY(238) , ENTRY(239) , - ENTRY(240) , ENTRY(241) , ENTRY(242) , ENTRY(243) , - ENTRY(244) , ENTRY(245) , ENTRY(246) , ENTRY(247) , - ENTRY(248) , ENTRY(249) , ENTRY(250) , ENTRY(251) , - ENTRY(252) , ENTRY(253) , ENTRY(254) , ENTRY(255) , -#undef ENTRY - -#define HUFFDEC_EXTRA_LENGTH_BITS_MASK 0xFF -#define HUFFDEC_LENGTH_BASE_SHIFT 8 -#define HUFFDEC_END_OF_BLOCK_LENGTH 0 - -#define ENTRY(length_base, num_extra_bits) HUFFDEC_RESULT_ENTRY( \ - ((u32)(length_base) << HUFFDEC_LENGTH_BASE_SHIFT) | (num_extra_bits)) - - /* End of block */ - ENTRY(HUFFDEC_END_OF_BLOCK_LENGTH, 0), - - /* Lengths */ - ENTRY(3 , 0) , ENTRY(4 , 0) , ENTRY(5 , 0) , ENTRY(6 , 0), - ENTRY(7 , 0) , ENTRY(8 , 0) , ENTRY(9 , 0) , ENTRY(10 , 0), - ENTRY(11 , 1) , ENTRY(13 , 1) , ENTRY(15 , 1) , ENTRY(17 , 1), - ENTRY(19 , 2) , ENTRY(23 , 2) , ENTRY(27 , 2) , ENTRY(31 , 2), - ENTRY(35 , 3) , ENTRY(43 , 3) , ENTRY(51 , 3) , ENTRY(59 , 3), - ENTRY(67 , 4) , ENTRY(83 , 4) , ENTRY(99 , 4) , ENTRY(115, 4), - ENTRY(131, 5) , ENTRY(163, 5) , ENTRY(195, 5) , ENTRY(227, 5), - ENTRY(258, 0) , ENTRY(258, 0) , ENTRY(258, 0) , -#undef ENTRY -}; - -/* The decode result for each offset symbol. This is the offset base and the - * number of extra offset bits. */ -static const u32 offset_decode_results[DEFLATE_NUM_OFFSET_SYMS] = { - -#define HUFFDEC_EXTRA_OFFSET_BITS_SHIFT 16 -#define HUFFDEC_OFFSET_BASE_MASK (((u32)1 << HUFFDEC_EXTRA_OFFSET_BITS_SHIFT) - 1) - -#define ENTRY(offset_base, num_extra_bits) HUFFDEC_RESULT_ENTRY( \ - ((u32)(num_extra_bits) << HUFFDEC_EXTRA_OFFSET_BITS_SHIFT) | \ - (offset_base)) - ENTRY(1 , 0) , ENTRY(2 , 0) , ENTRY(3 , 0) , ENTRY(4 , 0) , - ENTRY(5 , 1) , ENTRY(7 , 1) , ENTRY(9 , 2) , ENTRY(13 , 2) , - ENTRY(17 , 3) , ENTRY(25 , 3) , ENTRY(33 , 4) , ENTRY(49 , 4) , - ENTRY(65 , 5) , ENTRY(97 , 5) , ENTRY(129 , 6) , ENTRY(193 , 6) , - ENTRY(257 , 7) , ENTRY(385 , 7) , ENTRY(513 , 8) , ENTRY(769 , 8) , - ENTRY(1025 , 9) , ENTRY(1537 , 9) , ENTRY(2049 , 10) , ENTRY(3073 , 10) , - ENTRY(4097 , 11) , ENTRY(6145 , 11) , ENTRY(8193 , 12) , ENTRY(12289 , 12) , - ENTRY(16385 , 13) , ENTRY(24577 , 13) , ENTRY(32769 , 14) , ENTRY(49153 , 14) , -#undef ENTRY -}; - -/* - * Build a table for fast decoding of symbols from a Huffman code. As input, - * this function takes the codeword length of each symbol which may be used in - * the code. As output, it produces a decode table for the canonical Huffman - * code described by the codeword lengths. The decode table is built with the - * assumption that it will be indexed with "bit-reversed" codewords, where the - * low-order bit is the first bit of the codeword. This format is used for all - * Huffman codes in DEFLATE. - * - * @decode_table - * The array in which the decode table will be generated. This array must - * have sufficient length; see the definition of the ENOUGH numbers. - * @lens - * An array which provides, for each symbol, the length of the - * corresponding codeword in bits, or 0 if the symbol is unused. This may - * alias @decode_table, since nothing is written to @decode_table until all - * @lens have been consumed. All codeword lengths are assumed to be <= - * @max_codeword_len but are otherwise considered untrusted. If they do - * not form a valid Huffman code, then the decode table is not built and - * %false is returned. - * @num_syms - * The number of symbols in the code, including all unused symbols. - * @decode_results - * An array which provides, for each symbol, the actual value to store into - * the decode table. This value will be directly produced as the result of - * decoding that symbol, thereby moving the indirection out of the decode - * loop and into the table initialization. - * @table_bits - * The log base-2 of the number of main table entries to use. - * @max_codeword_len - * The maximum allowed codeword length for this Huffman code. - * Must be <= DEFLATE_MAX_CODEWORD_LEN. - * @sorted_syms - * A temporary array of length @num_syms. - * - * Returns %true if successful; %false if the codeword lengths do not form a - * valid Huffman code. - */ -static bool -build_decode_table(u32 decode_table[], - const len_t lens[], - const unsigned num_syms, - const u32 decode_results[], - const unsigned table_bits, - const unsigned max_codeword_len, - u16 *sorted_syms) -{ - unsigned len_counts[DEFLATE_MAX_CODEWORD_LEN + 1]; - unsigned offsets[DEFLATE_MAX_CODEWORD_LEN + 1]; - unsigned sym; /* current symbol */ - unsigned codeword; /* current codeword, bit-reversed */ - unsigned len; /* current codeword length in bits */ - unsigned count; /* num codewords remaining with this length */ - u32 codespace_used; /* codespace used out of '2^max_codeword_len' */ - unsigned cur_table_end; /* end index of current table */ - unsigned subtable_prefix; /* codeword prefix of current subtable */ - unsigned subtable_start; /* start index of current subtable */ - unsigned subtable_bits; /* log2 of current subtable length */ - - /* Count how many codewords have each length, including 0. */ - for (len = 0; len <= max_codeword_len; len++) - len_counts[len] = 0; - for (sym = 0; sym < num_syms; sym++) - len_counts[lens[sym]]++; - - /* - * Sort the symbols primarily by increasing codeword length and - * secondarily by increasing symbol value; or equivalently by their - * codewords in lexicographic order, since a canonical code is assumed. - * - * For efficiency, also compute 'codespace_used' in the same pass over - * 'len_counts[]' used to build 'offsets[]' for sorting. - */ - - /* Ensure that 'codespace_used' cannot overflow. */ - STATIC_ASSERT(sizeof(codespace_used) == 4); - STATIC_ASSERT(UINT32_MAX / (1U << (DEFLATE_MAX_CODEWORD_LEN - 1)) >= - DEFLATE_MAX_NUM_SYMS); - - offsets[0] = 0; - offsets[1] = len_counts[0]; - codespace_used = 0; - for (len = 1; len < max_codeword_len; len++) { - offsets[len + 1] = offsets[len] + len_counts[len]; - codespace_used = (codespace_used << 1) + len_counts[len]; - } - codespace_used = (codespace_used << 1) + len_counts[len]; - - for (sym = 0; sym < num_syms; sym++) - sorted_syms[offsets[lens[sym]]++] = sym; - - sorted_syms += offsets[0]; /* Skip unused symbols */ - - /* lens[] is done being used, so we can write to decode_table[] now. */ - - /* - * Check whether the lengths form a complete code (exactly fills the - * codespace), an incomplete code (doesn't fill the codespace), or an - * overfull code (overflows the codespace). A codeword of length 'n' - * uses proportion '1/(2^n)' of the codespace. An overfull code is - * nonsensical, so is considered invalid. An incomplete code is - * considered valid only in two specific cases; see below. - */ - - /* overfull code? */ - if (unlikely(codespace_used > (1U << max_codeword_len))) - return false; - - /* incomplete code? */ - if (unlikely(codespace_used < (1U << max_codeword_len))) { - u32 entry; - unsigned i; - - if (codespace_used == 0) { - /* - * An empty code is allowed. This can happen for the - * offset code in DEFLATE, since a dynamic Huffman block - * need not contain any matches. - */ - - /* sym=0, len=1 (arbitrary) */ - entry = decode_results[0] | 1; - } else { - /* - * Allow codes with a single used symbol, with codeword - * length 1. The DEFLATE RFC is unclear regarding this - * case. What zlib's decompressor does is permit this - * for the litlen and offset codes and assume the - * codeword is '0' rather than '1'. We do the same - * except we allow this for precodes too, since there's - * no convincing reason to treat the codes differently. - * We also assign both codewords '0' and '1' to the - * symbol to avoid having to handle '1' specially. - */ - if (codespace_used != (1U << (max_codeword_len - 1)) || - len_counts[1] != 1) - return false; - entry = decode_results[*sorted_syms] | 1; - } - /* - * Note: the decode table still must be fully initialized, in - * case the stream is malformed and contains bits from the part - * of the codespace the incomplete code doesn't use. - */ - for (i = 0; i < (1U << table_bits); i++) - decode_table[i] = entry; - return true; - } - - /* - * The lengths form a complete code. Now, enumerate the codewords in - * lexicographic order and fill the decode table entries for each one. - * - * First, process all codewords with len <= table_bits. Each one gets - * '2^(table_bits-len)' direct entries in the table. - * - * Since DEFLATE uses bit-reversed codewords, these entries aren't - * consecutive but rather are spaced '2^len' entries apart. This makes - * filling them naively somewhat awkward and inefficient, since strided - * stores are less cache-friendly and preclude the use of word or - * vector-at-a-time stores to fill multiple entries per instruction. - * - * To optimize this, we incrementally double the table size. When - * processing codewords with length 'len', the table is treated as - * having only '2^len' entries, so each codeword uses just one entry. - * Then, each time 'len' is incremented, the table size is doubled and - * the first half is copied to the second half. This significantly - * improves performance over naively doing strided stores. - * - * Note that some entries copied for each table doubling may not have - * been initialized yet, but it doesn't matter since they're guaranteed - * to be initialized later (because the Huffman code is complete). - */ - codeword = 0; - len = 1; - while ((count = len_counts[len]) == 0) - len++; - cur_table_end = 1U << len; - while (len <= table_bits) { - /* Process all 'count' codewords with length 'len' bits. */ - do { - unsigned bit; - - /* Fill the first entry for the current codeword. */ - decode_table[codeword] = - decode_results[*sorted_syms++] | len; - - if (codeword == cur_table_end - 1) { - /* Last codeword (all 1's) */ - for (; len < table_bits; len++) { - memcpy(&decode_table[cur_table_end], - decode_table, - cur_table_end * - sizeof(decode_table[0])); - cur_table_end <<= 1; - } - return true; - } - /* - * To advance to the lexicographically next codeword in - * the canonical code, the codeword must be incremented, - * then 0's must be appended to the codeword as needed - * to match the next codeword's length. - * - * Since the codeword is bit-reversed, appending 0's is - * a no-op. However, incrementing it is nontrivial. To - * do so efficiently, use the 'bsr' instruction to find - * the last (highest order) 0 bit in the codeword, set - * it, and clear any later (higher order) 1 bits. But - * 'bsr' actually finds the highest order 1 bit, so to - * use it first flip all bits in the codeword by XOR'ing - * it with (1U << len) - 1 == cur_table_end - 1. - */ - bit = 1U << bsr32(codeword ^ (cur_table_end - 1)); - codeword &= bit - 1; - codeword |= bit; - } while (--count); - - /* Advance to the next codeword length. */ - do { - if (++len <= table_bits) { - memcpy(&decode_table[cur_table_end], - decode_table, - cur_table_end * sizeof(decode_table[0])); - cur_table_end <<= 1; - } - } while ((count = len_counts[len]) == 0); - } - - /* Process codewords with len > table_bits. These require subtables. */ - cur_table_end = 1U << table_bits; - subtable_prefix = -1; - subtable_start = 0; - for (;;) { - u32 entry; - unsigned i; - unsigned stride; - unsigned bit; - - /* - * Start a new subtable if the first 'table_bits' bits of the - * codeword don't match the prefix of the current subtable. - */ - if ((codeword & ((1U << table_bits) - 1)) != subtable_prefix) { - subtable_prefix = (codeword & ((1U << table_bits) - 1)); - subtable_start = cur_table_end; - /* - * Calculate the subtable length. If the codeword has - * length 'table_bits + n', then the subtable needs - * '2^n' entries. But it may need more; if fewer than - * '2^n' codewords of length 'table_bits + n' remain, - * then the length will need to be incremented to bring - * in longer codewords until the subtable can be - * completely filled. Note that because the Huffman - * code is complete, it will always be possible to fill - * the subtable eventually. - */ - subtable_bits = len - table_bits; - codespace_used = count; - while (codespace_used < (1U << subtable_bits)) { - subtable_bits++; - codespace_used = (codespace_used << 1) + - len_counts[table_bits + subtable_bits]; - } - cur_table_end = subtable_start + (1U << subtable_bits); - - /* - * Create the entry that points from the main table to - * the subtable. This entry contains the index of the - * start of the subtable and the number of bits with - * which the subtable is indexed (the log base 2 of the - * number of entries it contains). - */ - decode_table[subtable_prefix] = - HUFFDEC_SUBTABLE_POINTER | - HUFFDEC_RESULT_ENTRY(subtable_start) | - subtable_bits; - } - - /* Fill the subtable entries for the current codeword. */ - entry = decode_results[*sorted_syms++] | (len - table_bits); - i = subtable_start + (codeword >> table_bits); - stride = 1U << (len - table_bits); - do { - decode_table[i] = entry; - i += stride; - } while (i < cur_table_end); - - /* Advance to the next codeword. */ - if (codeword == (1U << len) - 1) /* last codeword (all 1's)? */ - return true; - bit = 1U << bsr32(codeword ^ ((1U << len) - 1)); - codeword &= bit - 1; - codeword |= bit; - count--; - while (count == 0) - count = len_counts[++len]; - } -} - -/* Build the decode table for the precode. */ -static bool -build_precode_decode_table(struct libdeflate_decompressor *d) -{ - /* When you change TABLEBITS, you must change ENOUGH, and vice versa! */ - STATIC_ASSERT(PRECODE_TABLEBITS == 7 && PRECODE_ENOUGH == 128); - - return build_decode_table(d->u.l.precode_decode_table, - d->u.precode_lens, - DEFLATE_NUM_PRECODE_SYMS, - precode_decode_results, - PRECODE_TABLEBITS, - DEFLATE_MAX_PRE_CODEWORD_LEN, - d->sorted_syms); -} - -/* Build the decode table for the literal/length code. */ -static bool -build_litlen_decode_table(struct libdeflate_decompressor *d, - unsigned num_litlen_syms, unsigned num_offset_syms) -{ - /* When you change TABLEBITS, you must change ENOUGH, and vice versa! */ - STATIC_ASSERT(LITLEN_TABLEBITS == 10 && LITLEN_ENOUGH == 1334); - - return build_decode_table(d->u.litlen_decode_table, - d->u.l.lens, - num_litlen_syms, - litlen_decode_results, - LITLEN_TABLEBITS, - DEFLATE_MAX_LITLEN_CODEWORD_LEN, - d->sorted_syms); -} - -/* Build the decode table for the offset code. */ -static bool -build_offset_decode_table(struct libdeflate_decompressor *d, - unsigned num_litlen_syms, unsigned num_offset_syms) -{ - /* When you change TABLEBITS, you must change ENOUGH, and vice versa! */ - STATIC_ASSERT(OFFSET_TABLEBITS == 8 && OFFSET_ENOUGH == 402); - - return build_decode_table(d->offset_decode_table, - d->u.l.lens + num_litlen_syms, - num_offset_syms, - offset_decode_results, - OFFSET_TABLEBITS, - DEFLATE_MAX_OFFSET_CODEWORD_LEN, - d->sorted_syms); -} - -static forceinline machine_word_t -repeat_byte(u8 b) -{ - machine_word_t v; - - STATIC_ASSERT(WORDBITS == 32 || WORDBITS == 64); - - v = b; - v |= v << 8; - v |= v << 16; - v |= v << ((WORDBITS == 64) ? 32 : 0); - return v; -} - -static forceinline void -copy_word_unaligned(const void *src, void *dst) -{ - store_word_unaligned(load_word_unaligned(src), dst); -} - -/***************************************************************************** - * Main decompression routine - *****************************************************************************/ - -typedef enum libdeflate_result (*decompress_func_t) - (struct libdeflate_decompressor * restrict d, - const void * restrict in, size_t in_nbytes, - void * restrict out, size_t out_nbytes_avail, - size_t *actual_in_nbytes_ret, size_t *actual_out_nbytes_ret); - -#undef DEFAULT_IMPL -#undef DISPATCH -#if defined(__i386__) || defined(__x86_64__) -# include "x86/decompress_impl.h" -#endif - -#ifndef DEFAULT_IMPL -# define FUNCNAME deflate_decompress_default -# define ATTRIBUTES -# include "decompress_template.h" -# define DEFAULT_IMPL deflate_decompress_default -#endif - -#ifdef DISPATCH -static enum libdeflate_result -dispatch(struct libdeflate_decompressor * restrict d, - const void * restrict in, size_t in_nbytes, - void * restrict out, size_t out_nbytes_avail, - size_t *actual_in_nbytes_ret, size_t *actual_out_nbytes_ret); - -static volatile decompress_func_t decompress_impl = dispatch; - -/* Choose the fastest implementation at runtime */ -static enum libdeflate_result -dispatch(struct libdeflate_decompressor * restrict d, - const void * restrict in, size_t in_nbytes, - void * restrict out, size_t out_nbytes_avail, - size_t *actual_in_nbytes_ret, size_t *actual_out_nbytes_ret) -{ - decompress_func_t f = arch_select_decompress_func(); - - if (f == NULL) - f = DEFAULT_IMPL; - - decompress_impl = f; - return (*f)(d, in, in_nbytes, out, out_nbytes_avail, - actual_in_nbytes_ret, actual_out_nbytes_ret); -} -#else -# define decompress_impl DEFAULT_IMPL /* only one implementation, use it */ -#endif - - -/* - * This is the main DEFLATE decompression routine. See libdeflate.h for the - * documentation. - * - * Note that the real code is in decompress_template.h. The part here just - * handles calling the appropriate implementation depending on the CPU features - * at runtime. - */ -LIBDEFLATEEXPORT enum libdeflate_result LIBDEFLATEAPI -libdeflate_deflate_decompress_ex(struct libdeflate_decompressor * restrict d, - const void * restrict in, size_t in_nbytes, - void * restrict out, size_t out_nbytes_avail, - size_t *actual_in_nbytes_ret, - size_t *actual_out_nbytes_ret) -{ - return decompress_impl(d, in, in_nbytes, out, out_nbytes_avail, - actual_in_nbytes_ret, actual_out_nbytes_ret); -} - -LIBDEFLATEEXPORT enum libdeflate_result LIBDEFLATEAPI -libdeflate_deflate_decompress(struct libdeflate_decompressor * restrict d, - const void * restrict in, size_t in_nbytes, - void * restrict out, size_t out_nbytes_avail, - size_t *actual_out_nbytes_ret) -{ - return libdeflate_deflate_decompress_ex(d, in, in_nbytes, - out, out_nbytes_avail, - NULL, actual_out_nbytes_ret); -} - -LIBDEFLATEEXPORT struct libdeflate_decompressor * LIBDEFLATEAPI -libdeflate_alloc_decompressor(void) -{ - /* - * Note that only certain parts of the decompressor actually must be - * initialized here: - * - * - 'static_codes_loaded' must be initialized to false. - * - * - The first half of the main portion of each decode table must be - * initialized to any value, to avoid reading from uninitialized - * memory during table expansion in build_decode_table(). (Although, - * this is really just to avoid warnings with dynamic tools like - * valgrind, since build_decode_table() is guaranteed to initialize - * all entries eventually anyway.) - * - * But for simplicity, we currently just zero the whole decompressor. - */ - struct libdeflate_decompressor *d = libdeflate_malloc(sizeof(*d)); - - if (d == NULL) - return NULL; - memset(d, 0, sizeof(*d)); - return d; -} - -LIBDEFLATEEXPORT void LIBDEFLATEAPI -libdeflate_free_decompressor(struct libdeflate_decompressor *d) -{ - libdeflate_free(d); -} |