ARM64: Add new String.compareTo intrinsic.
Benchmarked on Nexus6P big, little, and all cores. The new intrinsic
is faster than the pStringCompareTo runtime call for compared string
lengths in [1, 512], so the runtime entrypoint is no longer needed.
Change-Id: If94bfe24d9bf4dddcca648cc0b563709fc407b34
diff --git a/compiler/optimizing/intrinsics_arm64.cc b/compiler/optimizing/intrinsics_arm64.cc
index 55831a0..1c7b57e 100644
--- a/compiler/optimizing/intrinsics_arm64.cc
+++ b/compiler/optimizing/intrinsics_arm64.cc
@@ -47,6 +47,7 @@
using helpers::WRegisterFrom;
using helpers::XRegisterFrom;
using helpers::InputRegisterAt;
+using helpers::OutputRegister;
namespace {
@@ -1173,31 +1174,118 @@
void IntrinsicLocationsBuilderARM64::VisitStringCompareTo(HInvoke* invoke) {
LocationSummary* locations = new (arena_) LocationSummary(invoke,
- LocationSummary::kCall,
+ invoke->InputAt(1)->CanBeNull()
+ ? LocationSummary::kCallOnSlowPath
+ : LocationSummary::kNoCall,
kIntrinsified);
- InvokeRuntimeCallingConvention calling_convention;
- locations->SetInAt(0, LocationFrom(calling_convention.GetRegisterAt(0)));
- locations->SetInAt(1, LocationFrom(calling_convention.GetRegisterAt(1)));
- locations->SetOut(calling_convention.GetReturnLocation(Primitive::kPrimInt));
+ locations->SetInAt(0, Location::RequiresRegister());
+ locations->SetInAt(1, Location::RequiresRegister());
+ locations->AddTemp(Location::RequiresRegister());
+ locations->AddTemp(Location::RequiresRegister());
+ locations->AddTemp(Location::RequiresRegister());
+ locations->SetOut(Location::RequiresRegister(), Location::kOutputOverlap);
}
void IntrinsicCodeGeneratorARM64::VisitStringCompareTo(HInvoke* invoke) {
vixl::MacroAssembler* masm = GetVIXLAssembler();
LocationSummary* locations = invoke->GetLocations();
+ Register str = XRegisterFrom(locations->InAt(0));
+ Register arg = XRegisterFrom(locations->InAt(1));
+ Register out = OutputRegister(invoke);
+
+ Register temp0 = WRegisterFrom(locations->GetTemp(0));
+ Register temp1 = WRegisterFrom(locations->GetTemp(1));
+ Register temp2 = WRegisterFrom(locations->GetTemp(2));
+
+ vixl::Label loop;
+ vixl::Label find_char_diff;
+ vixl::Label end;
+
+ // Get offsets of count and value fields within a string object.
+ const int32_t count_offset = mirror::String::CountOffset().Int32Value();
+ const int32_t value_offset = mirror::String::ValueOffset().Int32Value();
+
// Note that the null check must have been done earlier.
DCHECK(!invoke->CanDoImplicitNullCheckOn(invoke->InputAt(0)));
- Register argument = WRegisterFrom(locations->InAt(1));
- __ Cmp(argument, 0);
- SlowPathCodeARM64* slow_path = new (GetAllocator()) IntrinsicSlowPathARM64(invoke);
- codegen_->AddSlowPath(slow_path);
- __ B(eq, slow_path->GetEntryLabel());
+ // If the argument can be null, take the slow path (which throws NPE) when it is null at runtime.
+ SlowPathCodeARM64* slow_path = nullptr;
+ const bool can_slow_path = invoke->InputAt(1)->CanBeNull();
+ if (can_slow_path) {
+ slow_path = new (GetAllocator()) IntrinsicSlowPathARM64(invoke);
+ codegen_->AddSlowPath(slow_path);
+ __ Cbz(arg, slow_path->GetEntryLabel());
+ }
- __ Ldr(
- lr, MemOperand(tr, QUICK_ENTRYPOINT_OFFSET(kArm64WordSize, pStringCompareTo).Int32Value()));
- __ Blr(lr);
- __ Bind(slow_path->GetExitLabel());
+ // Reference equality check, return 0 if same reference.
+ __ Subs(out, str, arg);
+ __ B(&end, eq);
+ // Load lengths of this and argument strings.
+ __ Ldr(temp0, MemOperand(str.X(), count_offset));
+ __ Ldr(temp1, MemOperand(arg.X(), count_offset));
+ // Return zero if both strings are empty.
+ __ Orr(out, temp0, temp1);
+ __ Cbz(out, &end);
+ // out = length diff.
+ __ Subs(out, temp0, temp1);
+ // temp2 = min(len(str), len(arg)).
+ __ Csel(temp2, temp1, temp0, ge);
+ // If the shorter string is empty, return the length difference already in out.
+ __ Cbz(temp2, &end);
+
+ // Store offset of string value in preparation for comparison loop.
+ __ Mov(temp1, value_offset);
+
+ UseScratchRegisterScope scratch_scope(masm);
+ Register temp4 = scratch_scope.AcquireX();
+
+ // Assertions that must hold in order to compare strings 4 characters at a time.
+ DCHECK_ALIGNED(value_offset, 8);
+ static_assert(IsAligned<8>(kObjectAlignment), "String of odd length is not zero padded");
+
+ const size_t char_size = Primitive::ComponentSize(Primitive::kPrimChar);
+ DCHECK_EQ(char_size, 2u);
+
+ // Promote temp0 to an X reg, ready for LDR.
+ temp0 = temp0.X();
+
+ // Loop to compare 4x16-bit characters at a time (ok because of string data alignment).
+ __ Bind(&loop);
+ __ Ldr(temp4, MemOperand(str.X(), temp1));
+ __ Ldr(temp0, MemOperand(arg.X(), temp1));
+ __ Cmp(temp4, temp0);
+ __ B(ne, &find_char_diff);
+ __ Add(temp1, temp1, char_size * 4);
+ __ Subs(temp2, temp2, 4);
+ __ B(gt, &loop);
+ __ B(&end);
+
+ // Promote temp1 to an X reg, ready for EOR.
+ temp1 = temp1.X();
+
+ // Find the first differing 16-bit character within the mismatching 8-byte chunk.
+ __ Bind(&find_char_diff);
+ // Get the bit position of the first character that differs.
+ __ Eor(temp1, temp0, temp4);
+ __ Rbit(temp1, temp1);
+ __ Clz(temp1, temp1);
+ __ Bic(temp1, temp1, 0xf);
+ // If the number of 16-bit chars remaining <= the index where the difference occurs (0-3), then
+ // the difference occurs outside the remaining string data, so just return length diff (out).
+ __ Cmp(temp2, Operand(temp1, LSR, 4));
+ __ B(le, &end);
+ // Extract the characters and calculate the difference.
+ __ Lsr(temp0, temp0, temp1);
+ __ Lsr(temp4, temp4, temp1);
+ __ And(temp4, temp4, 0xffff);
+ __ Sub(out, temp4, Operand(temp0, UXTH));
+
+ __ Bind(&end);
+
+ if (can_slow_path) {
+ __ Bind(slow_path->GetExitLabel());
+ }
}
void IntrinsicLocationsBuilderARM64::VisitStringEquals(HInvoke* invoke) {
diff --git a/runtime/arch/arm64/entrypoints_init_arm64.cc b/runtime/arch/arm64/entrypoints_init_arm64.cc
index 1618ced..bf0f647 100644
--- a/runtime/arch/arm64/entrypoints_init_arm64.cc
+++ b/runtime/arch/arm64/entrypoints_init_arm64.cc
@@ -80,7 +80,8 @@
// Intrinsics
qpoints->pIndexOf = art_quick_indexof;
- qpoints->pStringCompareTo = art_quick_string_compareto;
+ // The ARM64 StringCompareTo intrinsic does not call the runtime.
+ qpoints->pStringCompareTo = nullptr;
qpoints->pMemcpy = memcpy;
// Read barrier.
diff --git a/runtime/arch/arm64/quick_entrypoints_arm64.S b/runtime/arch/arm64/quick_entrypoints_arm64.S
index 209e7f0..c1fee41 100644
--- a/runtime/arch/arm64/quick_entrypoints_arm64.S
+++ b/runtime/arch/arm64/quick_entrypoints_arm64.S
@@ -2208,108 +2208,3 @@
asr x0, x0, #1
ret
END art_quick_indexof
-
- /*
- * String's compareTo.
- *
- * TODO: Not very optimized.
- *
- * On entry:
- * x0: this object pointer
- * x1: comp object pointer
- *
- */
- .extern __memcmp16
-ENTRY art_quick_string_compareto
- mov x2, x0 // x0 is return, use x2 for first input.
- sub x0, x2, x1 // Same string object?
- cbnz x0,1f
- ret
-1: // Different string objects.
-
- ldr w4, [x2, #MIRROR_STRING_COUNT_OFFSET]
- ldr w3, [x1, #MIRROR_STRING_COUNT_OFFSET]
- add x2, x2, #MIRROR_STRING_VALUE_OFFSET
- add x1, x1, #MIRROR_STRING_VALUE_OFFSET
-
- /*
- * Now: Data* Count
- * first arg x2 w4
- * second arg x1 w3
- */
-
- // x0 := str1.length(w4) - str2.length(w3). ldr zero-extended w3/w4 into x3/x4.
- subs x0, x4, x3
- // Min(count1, count2) into w3.
- csel x3, x3, x4, ge
-
- // TODO: Tune this value.
- // Check for long string, do memcmp16 for them.
- cmp w3, #28 // Constant from arm32.
- bgt .Ldo_memcmp16
-
- /*
- * Now:
- * x2: *first string data
- * x1: *second string data
- * w3: iteration count
- * x0: return value if comparison equal
- * x4, x5, x6, x7: free
- */
-
- // Do a simple unrolled loop.
-.Lloop:
- // At least two more elements?
- subs w3, w3, #2
- b.lt .Lremainder_or_done
-
- ldrh w4, [x2], #2
- ldrh w5, [x1], #2
-
- ldrh w6, [x2], #2
- ldrh w7, [x1], #2
-
- subs w4, w4, w5
- b.ne .Lw4_result
-
- subs w6, w6, w7
- b.ne .Lw6_result
-
- b .Lloop
-
-.Lremainder_or_done:
- adds w3, w3, #1
- b.eq .Lremainder
- ret
-
-.Lremainder:
- ldrh w4, [x2], #2
- ldrh w5, [x1], #2
- subs w4, w4, w5
- b.ne .Lw4_result
- ret
-
-// Result is in w4
-.Lw4_result:
- sxtw x0, w4
- ret
-
-// Result is in w6
-.Lw6_result:
- sxtw x0, w6
- ret
-
-.Ldo_memcmp16:
- mov x14, x0 // Save x0 and LR. __memcmp16 does not use these temps.
- mov x15, xLR // TODO: Codify and check that?
-
- mov x0, x2
- uxtw x2, w3
- bl __memcmp16
-
- mov xLR, x15 // Restore LR.
-
- cmp x0, #0 // Check the memcmp difference.
- csel x0, x0, x14, ne // x0 := x0 != 0 ? x14(prev x0=length diff) : x1.
- ret
-END art_quick_string_compareto
diff --git a/runtime/arch/stub_test.cc b/runtime/arch/stub_test.cc
index 3cdff55..02629e8 100644
--- a/runtime/arch/stub_test.cc
+++ b/runtime/arch/stub_test.cc
@@ -1205,7 +1205,8 @@
TEST_F(StubTest, StringCompareTo) {
-#if defined(__i386__) || defined(__arm__) || defined(__aarch64__) || \
+ // There is no StringCompareTo runtime entrypoint for __aarch64__.
+#if defined(__i386__) || defined(__arm__) || \
defined(__mips__) || (defined(__x86_64__) && !defined(__APPLE__))
// TODO: Check the "Unresolved" allocation stubs