diff options
author | Egor Bogatov <egorbo@gmail.com> | 2024-03-01 23:59:41 +0100 |
---|---|---|
committer | GitHub <noreply@github.com> | 2024-03-01 23:59:41 +0100 |
commit | a604763dc82d382e47f5be9fad0b672dc3b3ad72 (patch) | |
tree | dff5e969bfdca70cfb554c9ffd3441f9d37dd72a | |
parent | 33bf01fa7404be5ca7cd43a2c1b127ea0bf58287 (diff) |
ARM64: Use SIMD to copy nongc gaps in blocks with gc pointers (#99140)
-rw-r--r-- | src/coreclr/jit/codegenarm64.cpp | 66 | ||||
-rw-r--r-- | src/coreclr/jit/lsraarmarch.cpp | 7 |
2 files changed, 57 insertions, 16 deletions
```diff
diff --git a/src/coreclr/jit/codegenarm64.cpp b/src/coreclr/jit/codegenarm64.cpp
index 498f227e48d5..81370e641383 100644
--- a/src/coreclr/jit/codegenarm64.cpp
+++ b/src/coreclr/jit/codegenarm64.cpp
@@ -3626,7 +3626,7 @@ void CodeGen::genCodeForCpObj(GenTreeBlk* cpObjNode)
     unsigned slots = layout->GetSlotCount();
 
     // Temp register(s) used to perform the sequence of loads and stores.
-    regNumber tmpReg = cpObjNode->ExtractTempReg();
+    regNumber tmpReg = cpObjNode->ExtractTempReg(RBM_ALLINT);
     regNumber tmpReg2 = REG_NA;
 
     assert(genIsValidIntReg(tmpReg));
@@ -3635,7 +3635,7 @@ void CodeGen::genCodeForCpObj(GenTreeBlk* cpObjNode)
 
     if (slots > 1)
     {
-        tmpReg2 = cpObjNode->GetSingleTempReg();
+        tmpReg2 = cpObjNode->ExtractTempReg(RBM_ALLINT);
         assert(tmpReg2 != tmpReg);
         assert(genIsValidIntReg(tmpReg2));
         assert(tmpReg2 != REG_WRITE_BARRIER_DST_BYREF);
@@ -3682,26 +3682,60 @@ void CodeGen::genCodeForCpObj(GenTreeBlk* cpObjNode)
     {
         unsigned gcPtrCount = cpObjNode->GetLayout()->GetGCPtrCount();
 
+        // We might also need SIMD regs if we have 4 or more continuous non-gc slots
+        // On ARM64, SIMD loads/stores provide 8-byte atomicity guarantees when aligned to 8 bytes.
+        regNumber tmpSimdReg1 = REG_NA;
+        regNumber tmpSimdReg2 = REG_NA;
+        if ((slots >= 4) && compiler->IsBaselineSimdIsaSupported())
+        {
+            tmpSimdReg1 = cpObjNode->ExtractTempReg(RBM_ALLFLOAT);
+            tmpSimdReg2 = cpObjNode->ExtractTempReg(RBM_ALLFLOAT);
+        }
+
         unsigned i = 0;
         while (i < slots)
         {
             if (!layout->IsGCPtr(i))
             {
-                // Check if the next slot's type is also TYP_GC_NONE and use ldp/stp
-                if ((i + 1 < slots) && !layout->IsGCPtr(i + 1))
+                // How many continuous non-gc slots do we have?
+                unsigned nonGcSlots = 0;
+                do
                 {
-                    emit->emitIns_R_R_R_I(INS_ldp, EA_8BYTE, tmpReg, tmpReg2, REG_WRITE_BARRIER_SRC_BYREF,
-                                          2 * TARGET_POINTER_SIZE, INS_OPTS_POST_INDEX);
-                    emit->emitIns_R_R_R_I(INS_stp, EA_8BYTE, tmpReg, tmpReg2, REG_WRITE_BARRIER_DST_BYREF,
-                                          2 * TARGET_POINTER_SIZE, INS_OPTS_POST_INDEX);
-                    ++i; // extra increment of i, since we are copying two items
-                }
-                else
+                    nonGcSlots++;
+                    i++;
+                } while ((i < slots) && !layout->IsGCPtr(i));
+
+                const regNumber srcReg = REG_WRITE_BARRIER_SRC_BYREF;
+                const regNumber dstReg = REG_WRITE_BARRIER_DST_BYREF;
+                while (nonGcSlots > 0)
                 {
-                    emit->emitIns_R_R_I(INS_ldr, EA_8BYTE, tmpReg, REG_WRITE_BARRIER_SRC_BYREF, TARGET_POINTER_SIZE,
-                                        INS_OPTS_POST_INDEX);
-                    emit->emitIns_R_R_I(INS_str, EA_8BYTE, tmpReg, REG_WRITE_BARRIER_DST_BYREF, TARGET_POINTER_SIZE,
-                                        INS_OPTS_POST_INDEX);
+                    regNumber tmp1 = tmpReg;
+                    regNumber tmp2 = tmpReg2;
+                    emitAttr size = EA_8BYTE;
+                    insOpts opts = INS_OPTS_POST_INDEX;
+
+                    // Copy at least two slots at a time
+                    if (nonGcSlots >= 2)
+                    {
+                        // Do 4 slots at a time if SIMD is supported
+                        if ((nonGcSlots >= 4) && compiler->IsBaselineSimdIsaSupported())
+                        {
+                            // We need SIMD temp regs now
+                            tmp1 = tmpSimdReg1;
+                            tmp2 = tmpSimdReg2;
+                            size = EA_16BYTE;
+                            nonGcSlots -= 2;
+                        }
+                        nonGcSlots -= 2;
+                        emit->emitIns_R_R_R_I(INS_ldp, size, tmp1, tmp2, srcReg, EA_SIZE(size) * 2, opts);
+                        emit->emitIns_R_R_R_I(INS_stp, size, tmp1, tmp2, dstReg, EA_SIZE(size) * 2, opts);
+                    }
+                    else
+                    {
+                        nonGcSlots--;
+                        emit->emitIns_R_R_I(INS_ldr, EA_8BYTE, tmp1, srcReg, EA_SIZE(size), opts);
+                        emit->emitIns_R_R_I(INS_str, EA_8BYTE, tmp1, dstReg, EA_SIZE(size), opts);
+                    }
                 }
             }
             else
@@ -3709,8 +3743,8 @@ void CodeGen::genCodeForCpObj(GenTreeBlk* cpObjNode)
                 // In the case of a GC-Pointer we'll call the ByRef write barrier helper
                 genEmitHelperCall(CORINFO_HELP_ASSIGN_BYREF, 0, EA_PTRSIZE);
                 gcPtrCount--;
+                i++;
             }
-            ++i;
         }
         assert(gcPtrCount == 0);
     }
diff --git a/src/coreclr/jit/lsraarmarch.cpp b/src/coreclr/jit/lsraarmarch.cpp
index 1df68f5f3f57..0f5761c2d033 100644
--- a/src/coreclr/jit/lsraarmarch.cpp
+++ b/src/coreclr/jit/lsraarmarch.cpp
@@ -698,6 +698,13 @@ int LinearScan::BuildBlockStore(GenTreeBlk* blkNode)
                     buildInternalIntRegisterDefForNode(blkNode, internalIntCandidates);
                 }
 
+                if (size >= 4 * REGSIZE_BYTES && compiler->IsBaselineSimdIsaSupported())
+                {
+                    // We can use 128-bit SIMD ldp/stp for larger block sizes
+                    buildInternalFloatRegisterDefForNode(blkNode, internalFloatRegCandidates());
+                    buildInternalFloatRegisterDefForNode(blkNode, internalFloatRegCandidates());
+                }
+
                 // If we have a dest address we want it in RBM_WRITE_BARRIER_DST_BYREF.
                 dstAddrRegMask = RBM_WRITE_BARRIER_DST_BYREF;
 
```