diff options
Diffstat (limited to 'util/compress/libdeflate/lib/x86/adler32_impl.h')
-rw-r--r-- | util/compress/libdeflate/lib/x86/adler32_impl.h | 337 |
1 files changed, 0 insertions, 337 deletions
diff --git a/util/compress/libdeflate/lib/x86/adler32_impl.h b/util/compress/libdeflate/lib/x86/adler32_impl.h deleted file mode 100644 index f89bde585..000000000 --- a/util/compress/libdeflate/lib/x86/adler32_impl.h +++ /dev/null @@ -1,337 +0,0 @@ -/* - * x86/adler32_impl.h - x86 implementations of Adler-32 checksum algorithm - * - * Copyright 2016 Eric Biggers - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -#ifndef LIB_X86_ADLER32_IMPL_H -#define LIB_X86_ADLER32_IMPL_H - -#include "cpu_features.h" - -/* - * The following macros horizontally sum the s1 counters and add them to the - * real s1, and likewise for s2. They do this via a series of reductions, each - * of which halves the vector length, until just one counter remains. - * - * The s1 reductions don't depend on the s2 reductions and vice versa, so for - * efficiency they are interleaved. Also, every other s1 counter is 0 due to - * the 'psadbw' instruction (_mm_sad_epu8) summing groups of 8 bytes rather than - * 4; hence, one of the s1 reductions is skipped when going from 128 => 32 bits. - */ - -#define ADLER32_FINISH_VEC_CHUNK_128(s1, s2, v_s1, v_s2) \ -{ \ - __v4su s1_last = (v_s1), s2_last = (v_s2); \ - \ - /* 128 => 32 bits */ \ - s2_last += (__v4su)_mm_shuffle_epi32((__m128i)s2_last, 0x31); \ - s1_last += (__v4su)_mm_shuffle_epi32((__m128i)s1_last, 0x02); \ - s2_last += (__v4su)_mm_shuffle_epi32((__m128i)s2_last, 0x02); \ - \ - *(s1) += (u32)_mm_cvtsi128_si32((__m128i)s1_last); \ - *(s2) += (u32)_mm_cvtsi128_si32((__m128i)s2_last); \ -} - -#define ADLER32_FINISH_VEC_CHUNK_256(s1, s2, v_s1, v_s2) \ -{ \ - __v4su s1_128bit, s2_128bit; \ - \ - /* 256 => 128 bits */ \ - s1_128bit = (__v4su)_mm256_extracti128_si256((__m256i)(v_s1), 0) + \ - (__v4su)_mm256_extracti128_si256((__m256i)(v_s1), 1); \ - s2_128bit = (__v4su)_mm256_extracti128_si256((__m256i)(v_s2), 0) + \ - (__v4su)_mm256_extracti128_si256((__m256i)(v_s2), 1); \ - \ - ADLER32_FINISH_VEC_CHUNK_128((s1), (s2), s1_128bit, s2_128bit); \ -} - -#define ADLER32_FINISH_VEC_CHUNK_512(s1, s2, v_s1, v_s2) \ -{ \ - __v8su s1_256bit, s2_256bit; \ - \ - /* 512 => 256 bits */ \ - s1_256bit = (__v8su)_mm512_extracti64x4_epi64((__m512i)(v_s1), 0) + \ - (__v8su)_mm512_extracti64x4_epi64((__m512i)(v_s1), 1); \ - s2_256bit = (__v8su)_mm512_extracti64x4_epi64((__m512i)(v_s2), 0) + \ - (__v8su)_mm512_extracti64x4_epi64((__m512i)(v_s2), 1); \ - \ - ADLER32_FINISH_VEC_CHUNK_256((s1), (s2), s1_256bit, s2_256bit); \ -} - -/* AVX-512BW implementation: like the AVX2 one, but does 64 bytes at a time */ -#undef DISPATCH_AVX512BW -#if !defined(DEFAULT_IMPL) && \ - /* - * clang before v3.9 is missing some AVX-512BW intrinsics including - * _mm512_sad_epu8(), a.k.a. __builtin_ia32_psadbw512. So just make using - * AVX-512BW, even when __AVX512BW__ is defined, conditional on - * COMPILER_SUPPORTS_AVX512BW_TARGET where we check for that builtin. - */ \ - COMPILER_SUPPORTS_AVX512BW_TARGET && \ - (defined(__AVX512BW__) || (X86_CPU_FEATURES_ENABLED && \ - COMPILER_SUPPORTS_AVX512BW_TARGET_INTRINSICS)) -# define FUNCNAME adler32_avx512bw -# define FUNCNAME_CHUNK adler32_avx512bw_chunk -# define IMPL_ALIGNMENT 64 -# define IMPL_SEGMENT_SIZE 64 -# define IMPL_MAX_CHUNK_SIZE MAX_CHUNK_SIZE -# ifdef __AVX512BW__ -# define ATTRIBUTES -# define DEFAULT_IMPL adler32_avx512bw -# else -# define ATTRIBUTES __attribute__((target("avx512bw"))) -# define DISPATCH 1 -# define DISPATCH_AVX512BW 1 -# endif -# include <immintrin.h> -static forceinline ATTRIBUTES void -adler32_avx512bw_chunk(const __m512i *p, const __m512i *const end, - u32 *s1, u32 *s2) -{ - const __m512i zeroes = _mm512_setzero_si512(); - const __v64qi multipliers = (__v64qi){ - 64, 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, - 48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, - 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, - 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, - }; - const __v32hi ones = (__v32hi)_mm512_set1_epi16(1); - __v16si v_s1 = (__v16si)zeroes; - __v16si v_s1_sums = (__v16si)zeroes; - __v16si v_s2 = (__v16si)zeroes; - - do { - /* Load the next 64-byte segment */ - __m512i bytes = *p++; - - /* Multiply the bytes by 64...1 (the number of times they need - * to be added to s2) and add adjacent products */ - __v32hi sums = (__v32hi)_mm512_maddubs_epi16( - bytes, (__m512i)multipliers); - - /* Keep sum of all previous s1 counters, for adding to s2 later. - * This allows delaying the multiplication by 64 to the end. */ - v_s1_sums += v_s1; - - /* Add the sum of each group of 8 bytes to the corresponding s1 - * counter */ - v_s1 += (__v16si)_mm512_sad_epu8(bytes, zeroes); - - /* Add the sum of each group of 4 products of the bytes by - * 64...1 to the corresponding s2 counter */ - v_s2 += (__v16si)_mm512_madd_epi16((__m512i)sums, - (__m512i)ones); - } while (p != end); - - /* Finish the s2 counters by adding the sum of the s1 values at the - * beginning of each segment, multiplied by the segment size (64) */ - v_s2 += (__v16si)_mm512_slli_epi32((__m512i)v_s1_sums, 6); - - /* Add the counters to the real s1 and s2 */ - ADLER32_FINISH_VEC_CHUNK_512(s1, s2, v_s1, v_s2); -} -# include "../adler32_vec_template.h" -#endif /* AVX-512BW implementation */ - -/* AVX2 implementation: like the AVX-512BW one, but does 32 bytes at a time */ -#undef DISPATCH_AVX2 -#if !defined(DEFAULT_IMPL) && \ - (defined(__AVX2__) || (X86_CPU_FEATURES_ENABLED && \ - COMPILER_SUPPORTS_AVX2_TARGET_INTRINSICS)) -# define FUNCNAME adler32_avx2 -# define FUNCNAME_CHUNK adler32_avx2_chunk -# define IMPL_ALIGNMENT 32 -# define IMPL_SEGMENT_SIZE 32 -# define IMPL_MAX_CHUNK_SIZE MAX_CHUNK_SIZE -# ifdef __AVX2__ -# define ATTRIBUTES -# define DEFAULT_IMPL adler32_avx2 -# else -# define ATTRIBUTES __attribute__((target("avx2"))) -# define DISPATCH 1 -# define DISPATCH_AVX2 1 -# endif -# include <immintrin.h> -static forceinline ATTRIBUTES void -adler32_avx2_chunk(const __m256i *p, const __m256i *const end, u32 *s1, u32 *s2) -{ - const __m256i zeroes = _mm256_setzero_si256(); - const __v32qu multipliers = (__v32qu){ - 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, - 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, - }; - const __v16hu ones = (__v16hu)_mm256_set1_epi16(1); - __v8su v_s1 = (__v8su)zeroes; - __v8su v_s1_sums = (__v8su)zeroes; - __v8su v_s2 = (__v8su)zeroes; - - do { - /* Load the next 32-byte segment */ - __m256i bytes = *p++; - - /* Multiply the bytes by 32...1 (the number of times they need - * to be added to s2) and add adjacent products */ - __v16hu sums = (__v16hu)_mm256_maddubs_epi16( - bytes, (__m256i)multipliers); - - /* Keep sum of all previous s1 counters, for adding to s2 later. - * This allows delaying the multiplication by 32 to the end. */ - v_s1_sums += v_s1; - - /* Add the sum of each group of 8 bytes to the corresponding s1 - * counter */ - v_s1 += (__v8su)_mm256_sad_epu8(bytes, zeroes); - - /* Add the sum of each group of 4 products of the bytes by - * 32...1 to the corresponding s2 counter */ - v_s2 += (__v8su)_mm256_madd_epi16((__m256i)sums, (__m256i)ones); - } while (p != end); - - /* Finish the s2 counters by adding the sum of the s1 values at the - * beginning of each segment, multiplied by the segment size (32) */ - v_s2 += (__v8su)_mm256_slli_epi32((__m256i)v_s1_sums, 5); - - /* Add the counters to the real s1 and s2 */ - ADLER32_FINISH_VEC_CHUNK_256(s1, s2, v_s1, v_s2); -} -# include "../adler32_vec_template.h" -#endif /* AVX2 implementation */ - -/* SSE2 implementation */ -#undef DISPATCH_SSE2 -#if !defined(DEFAULT_IMPL) && \ - (defined(__SSE2__) || (X86_CPU_FEATURES_ENABLED && \ - COMPILER_SUPPORTS_SSE2_TARGET_INTRINSICS)) -# define FUNCNAME adler32_sse2 -# define FUNCNAME_CHUNK adler32_sse2_chunk -# define IMPL_ALIGNMENT 16 -# define IMPL_SEGMENT_SIZE 32 -/* - * The 16-bit precision byte counters must not be allowed to undergo *signed* - * overflow, otherwise the signed multiplications at the end (_mm_madd_epi16) - * would behave incorrectly. - */ -# define IMPL_MAX_CHUNK_SIZE (32 * (0x7FFF / 0xFF)) -# ifdef __SSE2__ -# define ATTRIBUTES -# define DEFAULT_IMPL adler32_sse2 -# else -# define ATTRIBUTES __attribute__((target("sse2"))) -# define DISPATCH 1 -# define DISPATCH_SSE2 1 -# endif -# include <emmintrin.h> -static forceinline ATTRIBUTES void -adler32_sse2_chunk(const __m128i *p, const __m128i *const end, u32 *s1, u32 *s2) -{ - const __m128i zeroes = _mm_setzero_si128(); - - /* s1 counters: 32-bit, sum of bytes */ - __v4su v_s1 = (__v4su)zeroes; - - /* s2 counters: 32-bit, sum of s1 values */ - __v4su v_s2 = (__v4su)zeroes; - - /* - * Thirty-two 16-bit counters for byte sums. Each accumulates the bytes - * that eventually need to be multiplied by a number 32...1 for addition - * into s2. - */ - __v8hu v_byte_sums_a = (__v8hu)zeroes; - __v8hu v_byte_sums_b = (__v8hu)zeroes; - __v8hu v_byte_sums_c = (__v8hu)zeroes; - __v8hu v_byte_sums_d = (__v8hu)zeroes; - - do { - /* Load the next 32 bytes */ - const __m128i bytes1 = *p++; - const __m128i bytes2 = *p++; - - /* - * Accumulate the previous s1 counters into the s2 counters. - * Logically, this really should be v_s2 += v_s1 * 32, but we - * can do the multiplication (or left shift) later. - */ - v_s2 += v_s1; - - /* - * s1 update: use "Packed Sum of Absolute Differences" to add - * the bytes horizontally with 8 bytes per sum. Then add the - * sums to the s1 counters. - */ - v_s1 += (__v4su)_mm_sad_epu8(bytes1, zeroes); - v_s1 += (__v4su)_mm_sad_epu8(bytes2, zeroes); - - /* - * Also accumulate the bytes into 32 separate counters that have - * 16-bit precision. - */ - v_byte_sums_a += (__v8hu)_mm_unpacklo_epi8(bytes1, zeroes); - v_byte_sums_b += (__v8hu)_mm_unpackhi_epi8(bytes1, zeroes); - v_byte_sums_c += (__v8hu)_mm_unpacklo_epi8(bytes2, zeroes); - v_byte_sums_d += (__v8hu)_mm_unpackhi_epi8(bytes2, zeroes); - - } while (p != end); - - /* Finish calculating the s2 counters */ - v_s2 = (__v4su)_mm_slli_epi32((__m128i)v_s2, 5); - v_s2 += (__v4su)_mm_madd_epi16((__m128i)v_byte_sums_a, - (__m128i)(__v8hu){ 32, 31, 30, 29, 28, 27, 26, 25 }); - v_s2 += (__v4su)_mm_madd_epi16((__m128i)v_byte_sums_b, - (__m128i)(__v8hu){ 24, 23, 22, 21, 20, 19, 18, 17 }); - v_s2 += (__v4su)_mm_madd_epi16((__m128i)v_byte_sums_c, - (__m128i)(__v8hu){ 16, 15, 14, 13, 12, 11, 10, 9 }); - v_s2 += (__v4su)_mm_madd_epi16((__m128i)v_byte_sums_d, - (__m128i)(__v8hu){ 8, 7, 6, 5, 4, 3, 2, 1 }); - - /* Add the counters to the real s1 and s2 */ - ADLER32_FINISH_VEC_CHUNK_128(s1, s2, v_s1, v_s2); -} -# include "../adler32_vec_template.h" -#endif /* SSE2 implementation */ - -#ifdef DISPATCH -static inline adler32_func_t -arch_select_adler32_func(void) -{ - u32 features = get_cpu_features(); - -#ifdef DISPATCH_AVX512BW - if (features & X86_CPU_FEATURE_AVX512BW) - return adler32_avx512bw; -#endif -#ifdef DISPATCH_AVX2 - if (features & X86_CPU_FEATURE_AVX2) - return adler32_avx2; -#endif -#ifdef DISPATCH_SSE2 - if (features & X86_CPU_FEATURE_SSE2) - return adler32_sse2; -#endif - return NULL; -} -#endif /* DISPATCH */ - -#endif /* LIB_X86_ADLER32_IMPL_H */ |