summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJatin Bhateja <jbhateja@openjdk.org>2024-01-25 10:07:50 +0000
committerJatin Bhateja <jbhateja@openjdk.org>2024-01-25 10:07:50 +0000
commit6d36eb78ad781ecd80d66d1319921a8746820394 (patch)
treef4158d8c491fc0113e0cd472fd69dbcf58e47990
parent9d1a6d14846bb1f76ca7258452b3b3f8e3e8b223 (diff)
8322768: Optimize non-subword vector compress and expand APIs for AVX2 target.jdk-23+7
Reviewed-by: epeter, sviswanathan
-rw-r--r--src/hotspot/cpu/x86/assembler_x86.hpp6
-rw-r--r--src/hotspot/cpu/x86/c2_MacroAssembler_x86.cpp36
-rw-r--r--src/hotspot/cpu/x86/c2_MacroAssembler_x86.hpp6
-rw-r--r--src/hotspot/cpu/x86/stubGenerator_x86_64.cpp93
-rw-r--r--src/hotspot/cpu/x86/stubGenerator_x86_64.hpp6
-rw-r--r--src/hotspot/cpu/x86/stubRoutines_x86.cpp6
-rw-r--r--src/hotspot/cpu/x86/stubRoutines_x86.hpp12
-rw-r--r--src/hotspot/cpu/x86/stubRoutines_x86_64.cpp1
-rw-r--r--src/hotspot/cpu/x86/x86.ad31
-rw-r--r--test/micro/org/openjdk/bench/jdk/incubator/vector/ColumnFilterBenchmark.java185
10 files changed, 364 insertions, 18 deletions
diff --git a/src/hotspot/cpu/x86/assembler_x86.hpp b/src/hotspot/cpu/x86/assembler_x86.hpp
index 7b907218f35..8b512fac6bc 100644
--- a/src/hotspot/cpu/x86/assembler_x86.hpp
+++ b/src/hotspot/cpu/x86/assembler_x86.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 1997, 2023, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1997, 2024, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@@ -816,8 +816,8 @@ private:
void check_relocation(RelocationHolder const& rspec, int format);
#endif
- void emit_data(jint data, relocInfo::relocType rtype, int format);
- void emit_data(jint data, RelocationHolder const& rspec, int format);
+ void emit_data(jint data, relocInfo::relocType rtype, int format = 0);
+ void emit_data(jint data, RelocationHolder const& rspec, int format = 0);
void emit_data64(jlong data, relocInfo::relocType rtype, int format = 0);
void emit_data64(jlong data, RelocationHolder const& rspec, int format = 0);
diff --git a/src/hotspot/cpu/x86/c2_MacroAssembler_x86.cpp b/src/hotspot/cpu/x86/c2_MacroAssembler_x86.cpp
index 3817c38f4ba..7512a366e7e 100644
--- a/src/hotspot/cpu/x86/c2_MacroAssembler_x86.cpp
+++ b/src/hotspot/cpu/x86/c2_MacroAssembler_x86.cpp
@@ -5282,6 +5282,42 @@ void C2_MacroAssembler::vector_mask_compress(KRegister dst, KRegister src, Regis
kmov(dst, rtmp2);
}
+#ifdef _LP64
+void C2_MacroAssembler::vector_compress_expand_avx2(int opcode, XMMRegister dst, XMMRegister src,
+ XMMRegister mask, Register rtmp, Register rscratch,
+ XMMRegister permv, XMMRegister xtmp, BasicType bt,
+ int vec_enc) {
+ assert(type2aelembytes(bt) >= 4, "");
+ assert(opcode == Op_CompressV || opcode == Op_ExpandV, "");
+ address compress_perm_table = nullptr;
+ address expand_perm_table = nullptr;
+ if (type2aelembytes(bt) == 8) {
+ compress_perm_table = StubRoutines::x86::compress_perm_table64();
+ expand_perm_table = StubRoutines::x86::expand_perm_table64();
+ vmovmskpd(rtmp, mask, vec_enc);
+ } else {
+ compress_perm_table = StubRoutines::x86::compress_perm_table32();
+ expand_perm_table = StubRoutines::x86::expand_perm_table32();
+ vmovmskps(rtmp, mask, vec_enc);
+ }
+ shlq(rtmp, 5); // for 32 byte permute row.
+ if (opcode == Op_CompressV) {
+ lea(rscratch, ExternalAddress(compress_perm_table));
+ } else {
+ lea(rscratch, ExternalAddress(expand_perm_table));
+ }
+ addptr(rtmp, rscratch);
+ vmovdqu(permv, Address(rtmp));
+ vpermps(dst, permv, src, Assembler::AVX_256bit);
+ vpxor(xtmp, xtmp, xtmp, vec_enc);
+ // Blend the result with zero vector using permute mask, each column entry
+ // in a permute table row contains either a valid permute index or a -1 (default)
+ // value, this can potentially be used as a blending mask after
+ // compressing/expanding the source vector lanes.
+ vblendvps(dst, dst, xtmp, permv, vec_enc, false, permv);
+}
+#endif
+
void C2_MacroAssembler::vector_compress_expand(int opcode, XMMRegister dst, XMMRegister src, KRegister mask,
bool merge, BasicType bt, int vec_enc) {
if (opcode == Op_CompressV) {
diff --git a/src/hotspot/cpu/x86/c2_MacroAssembler_x86.hpp b/src/hotspot/cpu/x86/c2_MacroAssembler_x86.hpp
index e9e1412957b..151f2148372 100644
--- a/src/hotspot/cpu/x86/c2_MacroAssembler_x86.hpp
+++ b/src/hotspot/cpu/x86/c2_MacroAssembler_x86.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2020, 2023, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2020, 2024, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@@ -390,6 +390,10 @@ public:
void vector_round_float_avx(XMMRegister dst, XMMRegister src, AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc,
Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4);
+
+ void vector_compress_expand_avx2(int opcode, XMMRegister dst, XMMRegister src, XMMRegister mask,
+ Register rtmp, Register rscratch, XMMRegister permv, XMMRegister xtmp,
+ BasicType bt, int vec_enc);
#endif // _LP64
void udivI(Register rax, Register divisor, Register rdx);
diff --git a/src/hotspot/cpu/x86/stubGenerator_x86_64.cpp b/src/hotspot/cpu/x86/stubGenerator_x86_64.cpp
index cad9e6475c6..71aafdc1cd3 100644
--- a/src/hotspot/cpu/x86/stubGenerator_x86_64.cpp
+++ b/src/hotspot/cpu/x86/stubGenerator_x86_64.cpp
@@ -951,6 +951,92 @@ address StubGenerator::generate_fp_mask(const char *stub_name, int64_t mask) {
return start;
}
+address StubGenerator::generate_compress_perm_table(const char *stub_name, int32_t esize) {
+ __ align(CodeEntryAlignment);
+ StubCodeMark mark(this, "StubRoutines", stub_name);
+ address start = __ pc();
+ if (esize == 32) {
+ // Loop to generate 256 x 8 int compression permute index table. A row is
+ // accessed using 8 bit index computed using vector mask. An entry in
+ // a row holds either a valid permute index corresponding to set bit position
+ // or a -1 (default) value.
+ for (int mask = 0; mask < 256; mask++) {
+ int ctr = 0;
+ for (int j = 0; j < 8; j++) {
+ if (mask & (1 << j)) {
+ __ emit_data(j, relocInfo::none);
+ ctr++;
+ }
+ }
+ for (; ctr < 8; ctr++) {
+ __ emit_data(-1, relocInfo::none);
+ }
+ }
+ } else {
+ assert(esize == 64, "");
+ // Loop to generate 16 x 4 long compression permute index table. A row is
+ // accessed using 4 bit index computed using vector mask. An entry in
+ // a row holds either a valid permute index pair for a quadword corresponding
+ // to set bit position or a -1 (default) value.
+ for (int mask = 0; mask < 16; mask++) {
+ int ctr = 0;
+ for (int j = 0; j < 4; j++) {
+ if (mask & (1 << j)) {
+ __ emit_data(2 * j, relocInfo::none);
+ __ emit_data(2 * j + 1, relocInfo::none);
+ ctr++;
+ }
+ }
+ for (; ctr < 4; ctr++) {
+ __ emit_data64(-1L, relocInfo::none);
+ }
+ }
+ }
+ return start;
+}
+
+address StubGenerator::generate_expand_perm_table(const char *stub_name, int32_t esize) {
+ __ align(CodeEntryAlignment);
+ StubCodeMark mark(this, "StubRoutines", stub_name);
+ address start = __ pc();
+ if (esize == 32) {
+ // Loop to generate 256 x 8 int expand permute index table. A row is accessed
+ // using 8 bit index computed using vector mask. An entry in a row holds either
+ // a valid permute index (starting from least significant lane) placed at position
+ // corresponding to set bit position or a -1 (default) value.
+ for (int mask = 0; mask < 256; mask++) {
+ int ctr = 0;
+ for (int j = 0; j < 8; j++) {
+ if (mask & (1 << j)) {
+ __ emit_data(ctr++, relocInfo::none);
+ } else {
+ __ emit_data(-1, relocInfo::none);
+ }
+ }
+ }
+ } else {
+ assert(esize == 64, "");
+ // Loop to generate 16 x 4 long expand permute index table. A row is accessed
+ // using 4 bit index computed using vector mask. An entry in a row holds either
+ // a valid doubleword permute index pair representing a quadword index (starting
+ // from least significant lane) placed at position corresponding to set bit
+ // position or a -1 (default) value.
+ for (int mask = 0; mask < 16; mask++) {
+ int ctr = 0;
+ for (int j = 0; j < 4; j++) {
+ if (mask & (1 << j)) {
+ __ emit_data(2 * ctr, relocInfo::none);
+ __ emit_data(2 * ctr + 1, relocInfo::none);
+ ctr++;
+ } else {
+ __ emit_data64(-1L, relocInfo::none);
+ }
+ }
+ }
+ }
+ return start;
+}
+
address StubGenerator::generate_vector_mask(const char *stub_name, int64_t mask) {
__ align(CodeEntryAlignment);
StubCodeMark mark(this, "StubRoutines", stub_name);
@@ -4095,6 +4181,13 @@ void StubGenerator::generate_compiler_stubs() {
StubRoutines::x86::_vector_reverse_byte_perm_mask_int = generate_vector_reverse_byte_perm_mask_int("perm_mask_int");
StubRoutines::x86::_vector_reverse_byte_perm_mask_short = generate_vector_reverse_byte_perm_mask_short("perm_mask_short");
+ if (VM_Version::supports_avx2() && !VM_Version::supports_avx512vl()) {
+ StubRoutines::x86::_compress_perm_table32 = generate_compress_perm_table("compress_perm_table32", 32);
+ StubRoutines::x86::_compress_perm_table64 = generate_compress_perm_table("compress_perm_table64", 64);
+ StubRoutines::x86::_expand_perm_table32 = generate_expand_perm_table("expand_perm_table32", 32);
+ StubRoutines::x86::_expand_perm_table64 = generate_expand_perm_table("expand_perm_table64", 64);
+ }
+
if (VM_Version::supports_avx2() && !VM_Version::supports_avx512_vpopcntdq()) {
// lut implementation influenced by counting 1s algorithm from section 5-1 of Hackers' Delight.
StubRoutines::x86::_vector_popcount_lut = generate_popcount_avx_lut("popcount_lut");
diff --git a/src/hotspot/cpu/x86/stubGenerator_x86_64.hpp b/src/hotspot/cpu/x86/stubGenerator_x86_64.hpp
index 6b7da718498..db43085d37f 100644
--- a/src/hotspot/cpu/x86/stubGenerator_x86_64.hpp
+++ b/src/hotspot/cpu/x86/stubGenerator_x86_64.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2003, 2023, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2003, 2024, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@@ -99,6 +99,10 @@ class StubGenerator: public StubCodeGenerator {
address generate_fp_mask(const char *stub_name, int64_t mask);
+ address generate_compress_perm_table(const char *stub_name, int32_t esize);
+
+ address generate_expand_perm_table(const char *stub_name, int32_t esize);
+
address generate_vector_mask(const char *stub_name, int64_t mask);
address generate_vector_byte_perm_mask(const char *stub_name);
diff --git a/src/hotspot/cpu/x86/stubRoutines_x86.cpp b/src/hotspot/cpu/x86/stubRoutines_x86.cpp
index cebf661ae75..3be83eed9d2 100644
--- a/src/hotspot/cpu/x86/stubRoutines_x86.cpp
+++ b/src/hotspot/cpu/x86/stubRoutines_x86.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2013, 2023, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2013, 2024, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@@ -82,6 +82,10 @@ address StubRoutines::x86::_join_0_1_base64 = nullptr;
address StubRoutines::x86::_join_1_2_base64 = nullptr;
address StubRoutines::x86::_join_2_3_base64 = nullptr;
address StubRoutines::x86::_decoding_table_base64 = nullptr;
+address StubRoutines::x86::_compress_perm_table32 = nullptr;
+address StubRoutines::x86::_compress_perm_table64 = nullptr;
+address StubRoutines::x86::_expand_perm_table32 = nullptr;
+address StubRoutines::x86::_expand_perm_table64 = nullptr;
#endif
address StubRoutines::x86::_pshuffle_byte_flip_mask_addr = nullptr;
diff --git a/src/hotspot/cpu/x86/stubRoutines_x86.hpp b/src/hotspot/cpu/x86/stubRoutines_x86.hpp
index 6c602324f3e..cfb91c5c083 100644
--- a/src/hotspot/cpu/x86/stubRoutines_x86.hpp
+++ b/src/hotspot/cpu/x86/stubRoutines_x86.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2013, 2023, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2013, 2024, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@@ -37,7 +37,7 @@ enum platform_dependent_constants {
_continuation_stubs_code_size = 1000 LP64_ONLY(+1000),
// AVX512 intrinsics add more code in 64-bit VM,
// Windows have more code to save/restore registers
- _compiler_stubs_code_size = 20000 LP64_ONLY(+32000) WINDOWS_ONLY(+2000),
+ _compiler_stubs_code_size = 20000 LP64_ONLY(+39000) WINDOWS_ONLY(+2000),
_final_stubs_code_size = 10000 LP64_ONLY(+20000) WINDOWS_ONLY(+2000) ZGC_ONLY(+20000)
};
@@ -58,6 +58,10 @@ class x86 {
static address _float_sign_flip;
static address _double_sign_mask;
static address _double_sign_flip;
+ static address _compress_perm_table32;
+ static address _compress_perm_table64;
+ static address _expand_perm_table32;
+ static address _expand_perm_table64;
public:
@@ -338,6 +342,10 @@ class x86 {
static address base64_decoding_table_addr() { return _decoding_table_base64; }
static address base64_AVX2_decode_tables_addr() { return _avx2_decode_tables_base64; }
static address base64_AVX2_decode_LUT_tables_addr() { return _avx2_decode_lut_tables_base64; }
+ static address compress_perm_table32() { return _compress_perm_table32; }
+ static address compress_perm_table64() { return _compress_perm_table64; }
+ static address expand_perm_table32() { return _expand_perm_table32; }
+ static address expand_perm_table64() { return _expand_perm_table64; }
#endif
static address pshuffle_byte_flip_mask_addr() { return _pshuffle_byte_flip_mask_addr; }
static address arrays_hashcode_powers_of_31() { return (address)_arrays_hashcode_powers_of_31; }
diff --git a/src/hotspot/cpu/x86/stubRoutines_x86_64.cpp b/src/hotspot/cpu/x86/stubRoutines_x86_64.cpp
index 417b32eb4a6..eb6c11d7167 100644
--- a/src/hotspot/cpu/x86/stubRoutines_x86_64.cpp
+++ b/src/hotspot/cpu/x86/stubRoutines_x86_64.cpp
@@ -44,4 +44,3 @@ address StubRoutines::x86::_float_sign_mask = nullptr;
address StubRoutines::x86::_float_sign_flip = nullptr;
address StubRoutines::x86::_double_sign_mask = nullptr;
address StubRoutines::x86::_double_sign_flip = nullptr;
-
diff --git a/src/hotspot/cpu/x86/x86.ad b/src/hotspot/cpu/x86/x86.ad
index caa82aab99c..671d3f7d212 100644
--- a/src/hotspot/cpu/x86/x86.ad
+++ b/src/hotspot/cpu/x86/x86.ad
@@ -1425,6 +1425,8 @@ bool Matcher::match_rule_supported(int opcode) {
return false;
}
break;
+ case Op_CompressV:
+ case Op_ExpandV:
case Op_PopCountVL:
if (UseAVX < 2) {
return false;
@@ -1659,12 +1661,6 @@ bool Matcher::match_rule_supported(int opcode) {
return false;
}
break;
- case Op_CompressV:
- case Op_ExpandV:
- if (!VM_Version::supports_avx512vl()) {
- return false;
- }
- break;
case Op_SqrtF:
if (UseSSE < 1) {
return false;
@@ -1952,13 +1948,12 @@ bool Matcher::match_rule_supported_vector(int opcode, int vlen, BasicType bt) {
if (is_subword_type(bt) && !VM_Version::supports_avx512_vbmi2()) {
return false;
}
- if (size_in_bits < 128 ) {
+ if (!is_LP64 && !VM_Version::supports_avx512vl() && size_in_bits < 512) {
return false;
}
- if (size_in_bits < 512 && !VM_Version::supports_avx512vl()) {
+ if (size_in_bits < 128 ) {
return false;
}
- break;
case Op_VectorLongToMask:
if (UseAVX < 1 || !is_LP64) {
return false;
@@ -9178,8 +9173,26 @@ instruct vmask_first_or_last_true_avx(rRegI dst, vec mask, immI size, rRegL tmp,
%}
// --------------------------------- Compress/Expand Operations ---------------------------
+#ifdef _LP64
+instruct vcompress_reg_avx(vec dst, vec src, vec mask, rRegI rtmp, rRegL rscratch, vec perm, vec xtmp, rFlagsReg cr) %{
+ predicate(!VM_Version::supports_avx512vl() && Matcher::vector_length_in_bytes(n) <= 32);
+ match(Set dst (CompressV src mask));
+ match(Set dst (ExpandV src mask));
+ effect(TEMP_DEF dst, TEMP perm, TEMP xtmp, TEMP rtmp, TEMP rscratch, KILL cr);
+ format %{ "vector_compress $dst, $src, $mask \t!using $xtmp, $rtmp, $rscratch and $perm as TEMP" %}
+ ins_encode %{
+ int opcode = this->ideal_Opcode();
+ int vlen_enc = vector_length_encoding(this);
+ BasicType bt = Matcher::vector_element_basic_type(this);
+ __ vector_compress_expand_avx2(opcode, $dst$$XMMRegister, $src$$XMMRegister, $mask$$XMMRegister, $rtmp$$Register,
+ $rscratch$$Register, $perm$$XMMRegister, $xtmp$$XMMRegister, bt, vlen_enc);
+ %}
+ ins_pipe( pipe_slow );
+%}
+#endif
instruct vcompress_expand_reg_evex(vec dst, vec src, kReg mask) %{
+ predicate(VM_Version::supports_avx512vl() || Matcher::vector_length_in_bytes(n) == 64);
match(Set dst (CompressV src mask));
match(Set dst (ExpandV src mask));
format %{ "vector_compress_expand $dst, $src, $mask" %}
diff --git a/test/micro/org/openjdk/bench/jdk/incubator/vector/ColumnFilterBenchmark.java b/test/micro/org/openjdk/bench/jdk/incubator/vector/ColumnFilterBenchmark.java
new file mode 100644
index 00000000000..d41d5404250
--- /dev/null
+++ b/test/micro/org/openjdk/bench/jdk/incubator/vector/ColumnFilterBenchmark.java
@@ -0,0 +1,185 @@
+/*
+ * Copyright (c) 2024, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+package org.openjdk.bench.jdk.incubator.vector;
+
+import java.util.concurrent.TimeUnit;
+import java.util.Random;
+import jdk.incubator.vector.*;
+import org.openjdk.jmh.annotations.*;
+import org.openjdk.jmh.infra.Blackhole;
+
+@OutputTimeUnit(TimeUnit.MILLISECONDS)
+@State(Scope.Thread)
+@Fork(jvmArgsPrepend = {"--add-modules=jdk.incubator.vector", "-XX:UseAVX=2"})
+public class ColumnFilterBenchmark {
+ @Param({"1024", "2047", "4096"})
+ int size;
+
+ float [] floatinCol;
+ float [] floatoutCol;
+ float fpivot;
+
+ double [] doubleinCol;
+ double [] doubleoutCol;
+ double dpivot;
+
+ int [] intinCol;
+ int [] intoutCol;
+ int ipivot;
+
+ long [] longinCol;
+ long [] longoutCol;
+ long lpivot;
+
+ static final VectorSpecies<Float> fspecies = FloatVector.SPECIES_256;
+ static final VectorSpecies<Double> dspecies = DoubleVector.SPECIES_256;
+ static final VectorSpecies<Integer> ispecies = IntVector.SPECIES_256;
+ static final VectorSpecies<Long> lspecies = LongVector.SPECIES_256;
+
+ @Setup(Level.Trial)
+ public void BmSetup() {
+ Random r = new Random(2048);
+
+ floatinCol = new float[size];
+ floatoutCol = new float[size];
+ fpivot = (float) (size / 2);
+ doubleinCol = new double[size];
+ doubleoutCol = new double[size];
+ dpivot = (double) (size / 2);
+ intinCol = new int[size];
+ intoutCol = new int[size];
+ ipivot = size / 2;
+ longinCol = new long[size];
+ longoutCol = new long[size];
+ lpivot = size / 2;
+
+ for (int i = 4; i < size; i++) {
+ floatinCol[i] = r.nextFloat() * size;
+ doubleinCol[i] = r.nextDouble() * size;
+ intinCol[i] = r.nextInt(size);
+ longinCol[i] = (long)intinCol[i];
+ }
+ }
+
+ @Benchmark
+ public void fuzzyFilterIntColumn() {
+ int i = 0;
+ int j = 0;
+ long maskctr = 1;
+ int endIndex = ispecies.loopBound(size);
+ for (; i < endIndex; i += ispecies.length()) {
+ IntVector vec = IntVector.fromArray(ispecies, intinCol, i);
+ VectorMask<Integer> pred = VectorMask.fromLong(ispecies, maskctr++);
+ vec.compress(pred).intoArray(intoutCol, j);
+ j += pred.trueCount();
+ }
+ }
+
+ @Benchmark
+ public void fuzzyFilterLongColumn() {
+ int i = 0;
+ int j = 0;
+ long maskctr = 1;
+ int endIndex = lspecies.loopBound(size);
+ for (; i < endIndex; i += lspecies.length()) {
+ LongVector vec = LongVector.fromArray(lspecies, longinCol, i);
+ VectorMask<Long> pred = VectorMask.fromLong(lspecies, maskctr++);
+ vec.compress(pred).intoArray(longoutCol, j);
+ j += pred.trueCount();
+ }
+ }
+
+ @Benchmark
+ public void filterIntColumn() {
+ int i = 0;
+ int j = 0;
+ int endIndex = ispecies.loopBound(size);
+ for (; i < endIndex; i += ispecies.length()) {
+ IntVector vec = IntVector.fromArray(ispecies, intinCol, i);
+ VectorMask<Integer> pred = vec.compare(VectorOperators.GT, ipivot);
+ vec.compress(pred).intoArray(intoutCol, j);
+ j += pred.trueCount();
+ }
+ for (; i < size; i++) { // scalar tail; bound must be size (i == endIndex here, so 'i < endIndex' never runs)
+ if (intinCol[i] > ipivot) {
+ intoutCol[j++] = intinCol[i];
+ }
+ }
+ }
+
+ @Benchmark
+ public void filterLongColumn() {
+ int i = 0;
+ int j = 0;
+ int endIndex = lspecies.loopBound(size);
+ for (; i < endIndex; i += lspecies.length()) {
+ LongVector vec = LongVector.fromArray(lspecies, longinCol, i);
+ VectorMask<Long> pred = vec.compare(VectorOperators.GT, lpivot);
+ vec.compress(pred).intoArray(longoutCol, j);
+ j += pred.trueCount();
+ }
+ for (; i < size; i++) { // scalar tail; bound must be size (i == endIndex here, so 'i < endIndex' never runs)
+ if (longinCol[i] > lpivot) {
+ longoutCol[j++] = longinCol[i];
+ }
+ }
+ }
+
+ @Benchmark
+ public void filterFloatColumn() {
+ int i = 0;
+ int j = 0;
+ int endIndex = fspecies.loopBound(size);
+ for (; i < endIndex; i += fspecies.length()) {
+ FloatVector vec = FloatVector.fromArray(fspecies, floatinCol, i);
+ VectorMask<Float> pred = vec.compare(VectorOperators.GT, fpivot);
+ vec.compress(pred).intoArray(floatoutCol, j);
+ j += pred.trueCount();
+ }
+ for (; i < size; i++) { // scalar tail; bound must be size (i == endIndex here, so 'i < endIndex' never runs)
+ if (floatinCol[i] > fpivot) {
+ floatoutCol[j++] = floatinCol[i];
+ }
+ }
+ }
+
+ @Benchmark
+ public void filterDoubleColumn() {
+ int i = 0;
+ int j = 0;
+ int endIndex = dspecies.loopBound(size);
+ for (; i < endIndex; i += dspecies.length()) {
+ DoubleVector vec = DoubleVector.fromArray(dspecies, doubleinCol, i);
+ VectorMask<Double> pred = vec.compare(VectorOperators.GT, dpivot);
+ vec.compress(pred).intoArray(doubleoutCol, j);
+ j += pred.trueCount();
+ }
+ for (; i < size; i++) { // scalar tail; bound must be size (i == endIndex here, so 'i < endIndex' never runs)
+ if (doubleinCol[i] > dpivot) {
+ doubleoutCol[j++] = doubleinCol[i];
+ }
+ }
+ }
+}