author     Jatin Bhateja <jbhateja@openjdk.org>   2024-01-25 10:07:50 +0000
committer  Jatin Bhateja <jbhateja@openjdk.org>   2024-01-25 10:07:50 +0000
commit     6d36eb78ad781ecd80d66d1319921a8746820394 (patch)
tree       f4158d8c491fc0113e0cd472fd69dbcf58e47990
parent     9d1a6d14846bb1f76ca7258452b3b3f8e3e8b223 (diff)

8322768: Optimize non-subword vector compress and expand APIs for AVX2 target. (tag: jdk-23+7)
Reviewed-by: epeter, sviswanathan
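
The compress and expand operations named in the summary are the cross-lane compaction primitives of the incubating Vector API (jdk.incubator.vector). As a point of reference, here is a minimal sketch of the kind of filtering loop whose vectorized path this change extends to AVX2; the class, array and variable names are illustrative and not part of the patch, and the code assumes --add-modules=jdk.incubator.vector:

import jdk.incubator.vector.IntVector;
import jdk.incubator.vector.VectorMask;
import jdk.incubator.vector.VectorOperators;
import jdk.incubator.vector.VectorSpecies;

public class CompressFilterSketch {
    static final VectorSpecies<Integer> SPECIES = IntVector.SPECIES_256;

    // Copy every element of 'in' greater than 'pivot' into 'out', packed
    // towards index 0; returns the number of elements kept.
    static int filter(int[] in, int[] out, int pivot) {
        int i = 0, j = 0;
        for (; i < SPECIES.loopBound(in.length); i += SPECIES.length()) {
            IntVector vec = IntVector.fromArray(SPECIES, in, i);
            VectorMask<Integer> pred = vec.compare(VectorOperators.GT, pivot);
            vec.compress(pred).intoArray(out, j);   // intrinsified to the CompressV node when supported
            j += pred.trueCount();
        }
        for (; i < in.length; i++) {                // scalar tail
            if (in[i] > pivot) {
                out[j++] = in[i];
            }
        }
        return j;
    }
}

Before this change the CompressV/ExpandV ideal nodes required AVX-512 VL; the patch adds a permute-table based AVX2 path for 4- and 8-byte element types, while subword element types continue to require AVX-512 VBMI2.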
-rw-r--r--  src/hotspot/cpu/x86/assembler_x86.hpp                                         |   6
-rw-r--r--  src/hotspot/cpu/x86/c2_MacroAssembler_x86.cpp                                 |  36
-rw-r--r--  src/hotspot/cpu/x86/c2_MacroAssembler_x86.hpp                                 |   6
-rw-r--r--  src/hotspot/cpu/x86/stubGenerator_x86_64.cpp                                  |  93
-rw-r--r--  src/hotspot/cpu/x86/stubGenerator_x86_64.hpp                                  |   6
-rw-r--r--  src/hotspot/cpu/x86/stubRoutines_x86.cpp                                      |   6
-rw-r--r--  src/hotspot/cpu/x86/stubRoutines_x86.hpp                                      |  12
-rw-r--r--  src/hotspot/cpu/x86/stubRoutines_x86_64.cpp                                   |   1
-rw-r--r--  src/hotspot/cpu/x86/x86.ad                                                    |  31
-rw-r--r--  test/micro/org/openjdk/bench/jdk/incubator/vector/ColumnFilterBenchmark.java | 185
10 files changed, 364 insertions, 18 deletions
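
In the c2_MacroAssembler change below, the AVX2 lowering turns the vector mask into a scalar bitmask (vmovmskps/vmovmskpd), shifts it left by 5 to index a 32-byte row of a precomputed permute table, permutes the source lanes with vpermps, and finally blends lanes whose table entry is -1 with a zero vector. The following scalar model of the net per-lane effect for 32-bit lanes is purely illustrative (the class and method names are hypothetical), not the generated code:

class CompressTableModel {
    // Net effect of the AVX2 compress lowering for an 8-lane int vector:
    // 'mask8' selects one row of the compress permute table, the row drives
    // the permute, and lanes whose row entry is -1 end up as zero.
    static int[] compressModel(int[] src, int mask8) {
        int[] row = new int[8];              // one row of the compress permute table
        int ctr = 0;
        for (int j = 0; j < 8; j++) {
            if ((mask8 & (1 << j)) != 0) {
                row[ctr++] = j;              // permute index for each set mask bit
            }
        }
        for (; ctr < 8; ctr++) {
            row[ctr] = -1;                   // default entry; zeroed by the blend
        }
        int[] dst = new int[8];
        for (int lane = 0; lane < 8; lane++) {
            dst[lane] = (row[lane] < 0) ? 0 : src[row[lane]];
        }
        return dst;
    }
}

Each table row is 32 bytes, which is why a single shlq by 5 suffices to turn the mask value into a row offset.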
diff --git a/src/hotspot/cpu/x86/assembler_x86.hpp b/src/hotspot/cpu/x86/assembler_x86.hpp index 7b907218f35..8b512fac6bc 100644 --- a/src/hotspot/cpu/x86/assembler_x86.hpp +++ b/src/hotspot/cpu/x86/assembler_x86.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 1997, 2023, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1997, 2024, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -816,8 +816,8 @@ private: void check_relocation(RelocationHolder const& rspec, int format); #endif - void emit_data(jint data, relocInfo::relocType rtype, int format); - void emit_data(jint data, RelocationHolder const& rspec, int format); + void emit_data(jint data, relocInfo::relocType rtype, int format = 0); + void emit_data(jint data, RelocationHolder const& rspec, int format = 0); void emit_data64(jlong data, relocInfo::relocType rtype, int format = 0); void emit_data64(jlong data, RelocationHolder const& rspec, int format = 0); diff --git a/src/hotspot/cpu/x86/c2_MacroAssembler_x86.cpp b/src/hotspot/cpu/x86/c2_MacroAssembler_x86.cpp index 3817c38f4ba..7512a366e7e 100644 --- a/src/hotspot/cpu/x86/c2_MacroAssembler_x86.cpp +++ b/src/hotspot/cpu/x86/c2_MacroAssembler_x86.cpp @@ -5282,6 +5282,42 @@ void C2_MacroAssembler::vector_mask_compress(KRegister dst, KRegister src, Regis kmov(dst, rtmp2); } +#ifdef _LP64 +void C2_MacroAssembler::vector_compress_expand_avx2(int opcode, XMMRegister dst, XMMRegister src, + XMMRegister mask, Register rtmp, Register rscratch, + XMMRegister permv, XMMRegister xtmp, BasicType bt, + int vec_enc) { + assert(type2aelembytes(bt) >= 4, ""); + assert(opcode == Op_CompressV || opcode == Op_ExpandV, ""); + address compress_perm_table = nullptr; + address expand_perm_table = nullptr; + if (type2aelembytes(bt) == 8) { + compress_perm_table = StubRoutines::x86::compress_perm_table64(); + expand_perm_table = StubRoutines::x86::expand_perm_table64(); + vmovmskpd(rtmp, mask, vec_enc); + } else { + compress_perm_table = StubRoutines::x86::compress_perm_table32(); + expand_perm_table = StubRoutines::x86::expand_perm_table32(); + vmovmskps(rtmp, mask, vec_enc); + } + shlq(rtmp, 5); // for 32 byte permute row. + if (opcode == Op_CompressV) { + lea(rscratch, ExternalAddress(compress_perm_table)); + } else { + lea(rscratch, ExternalAddress(expand_perm_table)); + } + addptr(rtmp, rscratch); + vmovdqu(permv, Address(rtmp)); + vpermps(dst, permv, src, Assembler::AVX_256bit); + vpxor(xtmp, xtmp, xtmp, vec_enc); + // Blend the result with zero vector using permute mask, each column entry + // in a permute table row contains either a valid permute index or a -1 (default) + // value, this can potentially be used as a blending mask after + // compressing/expanding the source vector lanes. + vblendvps(dst, dst, xtmp, permv, vec_enc, false, permv); +} +#endif + void C2_MacroAssembler::vector_compress_expand(int opcode, XMMRegister dst, XMMRegister src, KRegister mask, bool merge, BasicType bt, int vec_enc) { if (opcode == Op_CompressV) { diff --git a/src/hotspot/cpu/x86/c2_MacroAssembler_x86.hpp b/src/hotspot/cpu/x86/c2_MacroAssembler_x86.hpp index e9e1412957b..151f2148372 100644 --- a/src/hotspot/cpu/x86/c2_MacroAssembler_x86.hpp +++ b/src/hotspot/cpu/x86/c2_MacroAssembler_x86.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, 2023, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2020, 2024, Oracle and/or its affiliates. 
All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -390,6 +390,10 @@ public: void vector_round_float_avx(XMMRegister dst, XMMRegister src, AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc, Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4); + + void vector_compress_expand_avx2(int opcode, XMMRegister dst, XMMRegister src, XMMRegister mask, + Register rtmp, Register rscratch, XMMRegister permv, XMMRegister xtmp, + BasicType bt, int vec_enc); #endif // _LP64 void udivI(Register rax, Register divisor, Register rdx); diff --git a/src/hotspot/cpu/x86/stubGenerator_x86_64.cpp b/src/hotspot/cpu/x86/stubGenerator_x86_64.cpp index cad9e6475c6..71aafdc1cd3 100644 --- a/src/hotspot/cpu/x86/stubGenerator_x86_64.cpp +++ b/src/hotspot/cpu/x86/stubGenerator_x86_64.cpp @@ -951,6 +951,92 @@ address StubGenerator::generate_fp_mask(const char *stub_name, int64_t mask) { return start; } +address StubGenerator::generate_compress_perm_table(const char *stub_name, int32_t esize) { + __ align(CodeEntryAlignment); + StubCodeMark mark(this, "StubRoutines", stub_name); + address start = __ pc(); + if (esize == 32) { + // Loop to generate 256 x 8 int compression permute index table. A row is + // accessed using 8 bit index computed using vector mask. An entry in + // a row holds either a valid permute index corresponding to set bit position + // or a -1 (default) value. + for (int mask = 0; mask < 256; mask++) { + int ctr = 0; + for (int j = 0; j < 8; j++) { + if (mask & (1 << j)) { + __ emit_data(j, relocInfo::none); + ctr++; + } + } + for (; ctr < 8; ctr++) { + __ emit_data(-1, relocInfo::none); + } + } + } else { + assert(esize == 64, ""); + // Loop to generate 16 x 4 long compression permute index table. A row is + // accessed using 4 bit index computed using vector mask. An entry in + // a row holds either a valid permute index pair for a quadword corresponding + // to set bit position or a -1 (default) value. + for (int mask = 0; mask < 16; mask++) { + int ctr = 0; + for (int j = 0; j < 4; j++) { + if (mask & (1 << j)) { + __ emit_data(2 * j, relocInfo::none); + __ emit_data(2 * j + 1, relocInfo::none); + ctr++; + } + } + for (; ctr < 4; ctr++) { + __ emit_data64(-1L, relocInfo::none); + } + } + } + return start; +} + +address StubGenerator::generate_expand_perm_table(const char *stub_name, int32_t esize) { + __ align(CodeEntryAlignment); + StubCodeMark mark(this, "StubRoutines", stub_name); + address start = __ pc(); + if (esize == 32) { + // Loop to generate 256 x 8 int expand permute index table. A row is accessed + // using 8 bit index computed using vector mask. An entry in a row holds either + // a valid permute index (starting from least significant lane) placed at poisition + // corresponding to set bit position or a -1 (default) value. + for (int mask = 0; mask < 256; mask++) { + int ctr = 0; + for (int j = 0; j < 8; j++) { + if (mask & (1 << j)) { + __ emit_data(ctr++, relocInfo::none); + } else { + __ emit_data(-1, relocInfo::none); + } + } + } + } else { + assert(esize == 64, ""); + // Loop to generate 16 x 4 long expand permute index table. A row is accessed + // using 4 bit index computed using vector mask. 
An entry in a row holds either + // a valid doubleword permute index pair representing a quadword index (starting + // from least significant lane) placed at poisition corresponding to set bit + // position or a -1 (default) value. + for (int mask = 0; mask < 16; mask++) { + int ctr = 0; + for (int j = 0; j < 4; j++) { + if (mask & (1 << j)) { + __ emit_data(2 * ctr, relocInfo::none); + __ emit_data(2 * ctr + 1, relocInfo::none); + ctr++; + } else { + __ emit_data64(-1L, relocInfo::none); + } + } + } + } + return start; +} + address StubGenerator::generate_vector_mask(const char *stub_name, int64_t mask) { __ align(CodeEntryAlignment); StubCodeMark mark(this, "StubRoutines", stub_name); @@ -4095,6 +4181,13 @@ void StubGenerator::generate_compiler_stubs() { StubRoutines::x86::_vector_reverse_byte_perm_mask_int = generate_vector_reverse_byte_perm_mask_int("perm_mask_int"); StubRoutines::x86::_vector_reverse_byte_perm_mask_short = generate_vector_reverse_byte_perm_mask_short("perm_mask_short"); + if (VM_Version::supports_avx2() && !VM_Version::supports_avx512vl()) { + StubRoutines::x86::_compress_perm_table32 = generate_compress_perm_table("compress_perm_table32", 32); + StubRoutines::x86::_compress_perm_table64 = generate_compress_perm_table("compress_perm_table64", 64); + StubRoutines::x86::_expand_perm_table32 = generate_expand_perm_table("expand_perm_table32", 32); + StubRoutines::x86::_expand_perm_table64 = generate_expand_perm_table("expand_perm_table64", 64); + } + if (VM_Version::supports_avx2() && !VM_Version::supports_avx512_vpopcntdq()) { // lut implementation influenced by counting 1s algorithm from section 5-1 of Hackers' Delight. StubRoutines::x86::_vector_popcount_lut = generate_popcount_avx_lut("popcount_lut"); diff --git a/src/hotspot/cpu/x86/stubGenerator_x86_64.hpp b/src/hotspot/cpu/x86/stubGenerator_x86_64.hpp index 6b7da718498..db43085d37f 100644 --- a/src/hotspot/cpu/x86/stubGenerator_x86_64.hpp +++ b/src/hotspot/cpu/x86/stubGenerator_x86_64.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2003, 2023, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2003, 2024, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -99,6 +99,10 @@ class StubGenerator: public StubCodeGenerator { address generate_fp_mask(const char *stub_name, int64_t mask); + address generate_compress_perm_table(const char *stub_name, int32_t esize); + + address generate_expand_perm_table(const char *stub_name, int32_t esize); + address generate_vector_mask(const char *stub_name, int64_t mask); address generate_vector_byte_perm_mask(const char *stub_name); diff --git a/src/hotspot/cpu/x86/stubRoutines_x86.cpp b/src/hotspot/cpu/x86/stubRoutines_x86.cpp index cebf661ae75..3be83eed9d2 100644 --- a/src/hotspot/cpu/x86/stubRoutines_x86.cpp +++ b/src/hotspot/cpu/x86/stubRoutines_x86.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2013, 2023, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2013, 2024, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
* * This code is free software; you can redistribute it and/or modify it @@ -82,6 +82,10 @@ address StubRoutines::x86::_join_0_1_base64 = nullptr; address StubRoutines::x86::_join_1_2_base64 = nullptr; address StubRoutines::x86::_join_2_3_base64 = nullptr; address StubRoutines::x86::_decoding_table_base64 = nullptr; +address StubRoutines::x86::_compress_perm_table32 = nullptr; +address StubRoutines::x86::_compress_perm_table64 = nullptr; +address StubRoutines::x86::_expand_perm_table32 = nullptr; +address StubRoutines::x86::_expand_perm_table64 = nullptr; #endif address StubRoutines::x86::_pshuffle_byte_flip_mask_addr = nullptr; diff --git a/src/hotspot/cpu/x86/stubRoutines_x86.hpp b/src/hotspot/cpu/x86/stubRoutines_x86.hpp index 6c602324f3e..cfb91c5c083 100644 --- a/src/hotspot/cpu/x86/stubRoutines_x86.hpp +++ b/src/hotspot/cpu/x86/stubRoutines_x86.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2013, 2023, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2013, 2024, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -37,7 +37,7 @@ enum platform_dependent_constants { _continuation_stubs_code_size = 1000 LP64_ONLY(+1000), // AVX512 intrinsics add more code in 64-bit VM, // Windows have more code to save/restore registers - _compiler_stubs_code_size = 20000 LP64_ONLY(+32000) WINDOWS_ONLY(+2000), + _compiler_stubs_code_size = 20000 LP64_ONLY(+39000) WINDOWS_ONLY(+2000), _final_stubs_code_size = 10000 LP64_ONLY(+20000) WINDOWS_ONLY(+2000) ZGC_ONLY(+20000) }; @@ -58,6 +58,10 @@ class x86 { static address _float_sign_flip; static address _double_sign_mask; static address _double_sign_flip; + static address _compress_perm_table32; + static address _compress_perm_table64; + static address _expand_perm_table32; + static address _expand_perm_table64; public: @@ -338,6 +342,10 @@ class x86 { static address base64_decoding_table_addr() { return _decoding_table_base64; } static address base64_AVX2_decode_tables_addr() { return _avx2_decode_tables_base64; } static address base64_AVX2_decode_LUT_tables_addr() { return _avx2_decode_lut_tables_base64; } + static address compress_perm_table32() { return _compress_perm_table32; } + static address compress_perm_table64() { return _compress_perm_table64; } + static address expand_perm_table32() { return _expand_perm_table32; } + static address expand_perm_table64() { return _expand_perm_table64; } #endif static address pshuffle_byte_flip_mask_addr() { return _pshuffle_byte_flip_mask_addr; } static address arrays_hashcode_powers_of_31() { return (address)_arrays_hashcode_powers_of_31; } diff --git a/src/hotspot/cpu/x86/stubRoutines_x86_64.cpp b/src/hotspot/cpu/x86/stubRoutines_x86_64.cpp index 417b32eb4a6..eb6c11d7167 100644 --- a/src/hotspot/cpu/x86/stubRoutines_x86_64.cpp +++ b/src/hotspot/cpu/x86/stubRoutines_x86_64.cpp @@ -44,4 +44,3 @@ address StubRoutines::x86::_float_sign_mask = nullptr; address StubRoutines::x86::_float_sign_flip = nullptr; address StubRoutines::x86::_double_sign_mask = nullptr; address StubRoutines::x86::_double_sign_flip = nullptr; - diff --git a/src/hotspot/cpu/x86/x86.ad b/src/hotspot/cpu/x86/x86.ad index caa82aab99c..671d3f7d212 100644 --- a/src/hotspot/cpu/x86/x86.ad +++ b/src/hotspot/cpu/x86/x86.ad @@ -1425,6 +1425,8 @@ bool Matcher::match_rule_supported(int opcode) { return false; } break; + case Op_CompressV: + case Op_ExpandV: case Op_PopCountVL: if (UseAVX < 2) { return 
false; @@ -1659,12 +1661,6 @@ bool Matcher::match_rule_supported(int opcode) { return false; } break; - case Op_CompressV: - case Op_ExpandV: - if (!VM_Version::supports_avx512vl()) { - return false; - } - break; case Op_SqrtF: if (UseSSE < 1) { return false; @@ -1952,13 +1948,12 @@ bool Matcher::match_rule_supported_vector(int opcode, int vlen, BasicType bt) { if (is_subword_type(bt) && !VM_Version::supports_avx512_vbmi2()) { return false; } - if (size_in_bits < 128 ) { + if (!is_LP64 && !VM_Version::supports_avx512vl() && size_in_bits < 512) { return false; } - if (size_in_bits < 512 && !VM_Version::supports_avx512vl()) { + if (size_in_bits < 128 ) { return false; } - break; case Op_VectorLongToMask: if (UseAVX < 1 || !is_LP64) { return false; @@ -9178,8 +9173,26 @@ instruct vmask_first_or_last_true_avx(rRegI dst, vec mask, immI size, rRegL tmp, %} // --------------------------------- Compress/Expand Operations --------------------------- +#ifdef _LP64 +instruct vcompress_reg_avx(vec dst, vec src, vec mask, rRegI rtmp, rRegL rscratch, vec perm, vec xtmp, rFlagsReg cr) %{ + predicate(!VM_Version::supports_avx512vl() && Matcher::vector_length_in_bytes(n) <= 32); + match(Set dst (CompressV src mask)); + match(Set dst (ExpandV src mask)); + effect(TEMP_DEF dst, TEMP perm, TEMP xtmp, TEMP rtmp, TEMP rscratch, KILL cr); + format %{ "vector_compress $dst, $src, $mask \t!using $xtmp, $rtmp, $rscratch and $perm as TEMP" %} + ins_encode %{ + int opcode = this->ideal_Opcode(); + int vlen_enc = vector_length_encoding(this); + BasicType bt = Matcher::vector_element_basic_type(this); + __ vector_compress_expand_avx2(opcode, $dst$$XMMRegister, $src$$XMMRegister, $mask$$XMMRegister, $rtmp$$Register, + $rscratch$$Register, $perm$$XMMRegister, $xtmp$$XMMRegister, bt, vlen_enc); + %} + ins_pipe( pipe_slow ); +%} +#endif instruct vcompress_expand_reg_evex(vec dst, vec src, kReg mask) %{ + predicate(VM_Version::supports_avx512vl() || Matcher::vector_length_in_bytes(n) == 64); match(Set dst (CompressV src mask)); match(Set dst (ExpandV src mask)); format %{ "vector_compress_expand $dst, $src, $mask" %} diff --git a/test/micro/org/openjdk/bench/jdk/incubator/vector/ColumnFilterBenchmark.java b/test/micro/org/openjdk/bench/jdk/incubator/vector/ColumnFilterBenchmark.java new file mode 100644 index 00000000000..d41d5404250 --- /dev/null +++ b/test/micro/org/openjdk/bench/jdk/incubator/vector/ColumnFilterBenchmark.java @@ -0,0 +1,185 @@ +/* + * Copyright (c) 2024, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. 
+ * + */ + +package org.openjdk.bench.jdk.incubator.vector; + +import java.util.concurrent.TimeUnit; +import java.util.Random; +import jdk.incubator.vector.*; +import org.openjdk.jmh.annotations.*; +import org.openjdk.jmh.infra.Blackhole; + +@OutputTimeUnit(TimeUnit.MILLISECONDS) +@State(Scope.Thread) +@Fork(jvmArgsPrepend = {"--add-modules=jdk.incubator.vector", "-XX:UseAVX=2"}) +public class ColumnFilterBenchmark { + @Param({"1024", "2047", "4096"}) + int size; + + float [] floatinCol; + float [] floatoutCol; + float fpivot; + + double [] doubleinCol; + double [] doubleoutCol; + double dpivot; + + int [] intinCol; + int [] intoutCol; + int ipivot; + + long [] longinCol; + long [] longoutCol; + long lpivot; + + static final VectorSpecies<Float> fspecies = FloatVector.SPECIES_256; + static final VectorSpecies<Double> dspecies = DoubleVector.SPECIES_256; + static final VectorSpecies<Integer> ispecies = IntVector.SPECIES_256; + static final VectorSpecies<Long> lspecies = LongVector.SPECIES_256; + + @Setup(Level.Trial) + public void BmSetup() { + Random r = new Random(2048); + + floatinCol = new float[size]; + floatoutCol = new float[size]; + fpivot = (float) (size / 2); + doubleinCol = new double[size]; + doubleoutCol = new double[size]; + dpivot = (double) (size / 2); + intinCol = new int[size]; + intoutCol = new int[size]; + ipivot = size / 2; + longinCol = new long[size]; + longoutCol = new long[size]; + lpivot = size / 2; + + for (int i = 4; i < size; i++) { + floatinCol[i] = r.nextFloat() * size; + doubleinCol[i] = r.nextDouble() * size; + intinCol[i] = r.nextInt(size); + longinCol[i] = (long)intinCol[i]; + } + } + + @Benchmark + public void fuzzyFilterIntColumn() { + int i = 0; + int j = 0; + long maskctr = 1; + int endIndex = ispecies.loopBound(size); + for (; i < endIndex; i += ispecies.length()) { + IntVector vec = IntVector.fromArray(ispecies, intinCol, i); + VectorMask<Integer> pred = VectorMask.fromLong(ispecies, maskctr++); + vec.compress(pred).intoArray(intoutCol, j); + j += pred.trueCount(); + } + } + + @Benchmark + public void fuzzyFilterLongColumn() { + int i = 0; + int j = 0; + long maskctr = 1; + int endIndex = lspecies.loopBound(size); + for (; i < endIndex; i += lspecies.length()) { + LongVector vec = LongVector.fromArray(lspecies, longinCol, i); + VectorMask<Long> pred = VectorMask.fromLong(lspecies, maskctr++); + vec.compress(pred).intoArray(longoutCol, j); + j += pred.trueCount(); + } + } + + @Benchmark + public void filterIntColumn() { + int i = 0; + int j = 0; + int endIndex = ispecies.loopBound(size); + for (; i < endIndex; i += ispecies.length()) { + IntVector vec = IntVector.fromArray(ispecies, intinCol, i); + VectorMask<Integer> pred = vec.compare(VectorOperators.GT, ipivot); + vec.compress(pred).intoArray(intoutCol, j); + j += pred.trueCount(); + } + for (; i < endIndex; i++) { + if (intinCol[i] > ipivot) { + intoutCol[j++] = intinCol[i]; + } + } + } + + @Benchmark + public void filterLongColumn() { + int i = 0; + int j = 0; + int endIndex = lspecies.loopBound(size); + for (; i < endIndex; i += lspecies.length()) { + LongVector vec = LongVector.fromArray(lspecies, longinCol, i); + VectorMask<Long> pred = vec.compare(VectorOperators.GT, lpivot); + vec.compress(pred).intoArray(longoutCol, j); + j += pred.trueCount(); + } + for (; i < endIndex; i++) { + if (longinCol[i] > lpivot) { + longoutCol[j++] = longinCol[i]; + } + } + } + + @Benchmark + public void filterFloatColumn() { + int i = 0; + int j = 0; + int endIndex = fspecies.loopBound(size); + for (; i < 
endIndex; i += fspecies.length()) { + FloatVector vec = FloatVector.fromArray(fspecies, floatinCol, i); + VectorMask<Float> pred = vec.compare(VectorOperators.GT, fpivot); + vec.compress(pred).intoArray(floatoutCol, j); + j += pred.trueCount(); + } + for (; i < endIndex; i++) { + if (floatinCol[i] > fpivot) { + floatoutCol[j++] = floatinCol[i]; + } + } + } + + @Benchmark + public void filterDoubleColumn() { + int i = 0; + int j = 0; + int endIndex = dspecies.loopBound(size); + for (; i < endIndex; i += dspecies.length()) { + DoubleVector vec = DoubleVector.fromArray(dspecies, doubleinCol, i); + VectorMask<Double> pred = vec.compare(VectorOperators.GT, dpivot); + vec.compress(pred).intoArray(doubleoutCol, j); + j += pred.trueCount(); + } + for (; i < endIndex; i++) { + if (doubleinCol[i] > dpivot) { + doubleoutCol[j++] = doubleinCol[i]; + } + } + } +} |
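
For the expand direction the tables are the mirror image: increasing source-lane indices are written at the positions of set mask bits and -1 everywhere else, so the packed input lanes are scattered back to the selected positions and all remaining lanes become zero after the blend. A matching scalar model, again with hypothetical names and for 32-bit lanes only:

class ExpandTableModel {
    // Net effect of the AVX2 expand lowering for an 8-lane int vector.
    static int[] expandModel(int[] src, int mask8) {
        int[] row = new int[8];
        int ctr = 0;
        for (int j = 0; j < 8; j++) {
            // valid index at a set bit position, -1 (zero lane) otherwise
            row[j] = ((mask8 & (1 << j)) != 0) ? ctr++ : -1;
        }
        int[] dst = new int[8];
        for (int lane = 0; lane < 8; lane++) {
            dst[lane] = (row[lane] < 0) ? 0 : src[row[lane]];
        }
        return dst;
    }
}

For 64-bit lanes the generated tables hold doubleword index pairs (2*j, 2*j+1) per quadword, and the new ColumnFilterBenchmark pins -XX:UseAVX=2 so the AVX2 path is exercised even on AVX-512 hardware.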