Merge "Use the fast path object size for RosAlloc asm stubs"
diff --git a/runtime/arch/arm/quick_entrypoints_arm.S b/runtime/arch/arm/quick_entrypoints_arm.S
index bc4c999..c51c336 100644
--- a/runtime/arch/arm/quick_entrypoints_arm.S
+++ b/runtime/arch/arm/quick_entrypoints_arm.S
@@ -1086,25 +1086,6 @@
// Load the class (r2)
ldr r2, [r2, r0, lsl #COMPRESSED_REFERENCE_SIZE_SHIFT]
cbz r2, .Lart_quick_alloc_object_rosalloc_slow_path // Check null class
- // Check class status.
- ldr r3, [r2, #MIRROR_CLASS_STATUS_OFFSET]
- cmp r3, #MIRROR_CLASS_STATUS_INITIALIZED
- bne .Lart_quick_alloc_object_rosalloc_slow_path
- // Add a fake dependence from the
- // following access flag and size
- // loads to the status load.
- // This is to prevent those loads
- // from being reordered above the
- // status load and reading wrong
- // values (an alternative is to use
- // a load-acquire for the status).
- eor r3, r3, r3
- add r2, r2, r3
- // Check access flags has
- // kAccClassIsFinalizable
- ldr r3, [r2, #MIRROR_CLASS_ACCESS_FLAGS_OFFSET]
- tst r3, #ACCESS_FLAGS_CLASS_IS_FINALIZABLE
- bne .Lart_quick_alloc_object_rosalloc_slow_path
ldr r3, [r9, #THREAD_LOCAL_ALLOC_STACK_TOP_OFFSET] // Check if the thread local
// allocation stack has room.
@@ -1113,22 +1094,21 @@
cmp r3, r12
bhs .Lart_quick_alloc_object_rosalloc_slow_path
- ldr r3, [r2, #MIRROR_CLASS_OBJECT_SIZE_OFFSET] // Load the object size (r3)
+ ldr r3, [r2, #MIRROR_CLASS_OBJECT_SIZE_ALLOC_FAST_PATH_OFFSET] // Load the object size (r3)
cmp r3, #ROSALLOC_MAX_THREAD_LOCAL_BRACKET_SIZE // Check if the size is for a thread
- // local allocation
+ // local allocation. Also does the
+ // initialized and finalizable checks.
bhs .Lart_quick_alloc_object_rosalloc_slow_path
// Compute the rosalloc bracket index
- // from the size.
- // Align up the size by the rosalloc
- // bracket quantum size and divide
- // by the quantum size and subtract
- // by 1. This code is a shorter but
- // equivalent version.
- sub r3, r3, #1
- lsr r3, r3, #ROSALLOC_BRACKET_QUANTUM_SIZE_SHIFT
+ // from the size. Since the size is
+ // already aligned we can combine the
+ // two shifts together.
+ add r12, r9, r3, lsr #(ROSALLOC_BRACKET_QUANTUM_SIZE_SHIFT - POINTER_SIZE_SHIFT)
+ // Subtract pointer size since there
+ // are no runs for 0 byte allocations
+ // and the size is already aligned.
// Load the rosalloc run (r12)
- add r12, r9, r3, lsl #POINTER_SIZE_SHIFT
- ldr r12, [r12, #THREAD_ROSALLOC_RUNS_OFFSET]
+ ldr r12, [r12, #(THREAD_ROSALLOC_RUNS_OFFSET - __SIZEOF_POINTER__)]
// Load the free list head (r3). This
// will be the return val.
ldr r3, [r12, #(ROSALLOC_RUN_FREE_LIST_OFFSET + ROSALLOC_RUN_FREE_LIST_HEAD_OFFSET)]
@@ -1153,7 +1133,7 @@
// to later accesses to the class
// object. Alternatively we could use
// "ishst" if we use load-acquire for
- // the class status load.)
+ // the object size load.
// Needs to be done before pushing on
// allocation since Heap::VisitObjects
// relies on seeing the class pointer.
@@ -1200,9 +1180,7 @@
ldrd r12, r3, [r9, #THREAD_LOCAL_POS_OFFSET]
sub r12, r3, r12 // Compute the remaining buf size.
ldr r3, [r2, #MIRROR_CLASS_OBJECT_SIZE_ALLOC_FAST_PATH_OFFSET] // Load the object size (r3).
- cmp r3, r12 // Check if it fits. OK to do this
- // before rounding up the object size
- // assuming the buf size alignment.
+ cmp r3, r12 // Check if it fits.
bhi \slowPathLabel
// "Point of no slow path". Won't go to the slow path from here on. OK to clobber r0 and r1.
// Reload old thread_local_pos (r0)
diff --git a/runtime/arch/arm64/quick_entrypoints_arm64.S b/runtime/arch/arm64/quick_entrypoints_arm64.S
index 82dac9c..03768af 100644
--- a/runtime/arch/arm64/quick_entrypoints_arm64.S
+++ b/runtime/arch/arm64/quick_entrypoints_arm64.S
@@ -1860,47 +1860,27 @@
// Load the class (x2)
ldr w2, [x2, x0, lsl #COMPRESSED_REFERENCE_SIZE_SHIFT]
cbz x2, .Lart_quick_alloc_object_rosalloc_slow_path // Check null class
- // Check class status.
- ldr w3, [x2, #MIRROR_CLASS_STATUS_OFFSET]
- cmp x3, #MIRROR_CLASS_STATUS_INITIALIZED
- bne .Lart_quick_alloc_object_rosalloc_slow_path
- // Add a fake dependence from the
- // following access flag and size
- // loads to the status load.
- // This is to prevent those loads
- // from being reordered above the
- // status load and reading wrong
- // values (an alternative is to use
- // a load-acquire for the status).
- eor x3, x3, x3
- add x2, x2, x3
- // Check access flags has
- // kAccClassIsFinalizable
- ldr w3, [x2, #MIRROR_CLASS_ACCESS_FLAGS_OFFSET]
- tst x3, #ACCESS_FLAGS_CLASS_IS_FINALIZABLE
- bne .Lart_quick_alloc_object_rosalloc_slow_path
ldr x3, [xSELF, #THREAD_LOCAL_ALLOC_STACK_TOP_OFFSET] // Check if the thread local
// allocation stack has room.
// ldp won't work due to large offset.
ldr x4, [xSELF, #THREAD_LOCAL_ALLOC_STACK_END_OFFSET]
cmp x3, x4
bhs .Lart_quick_alloc_object_rosalloc_slow_path
- ldr w3, [x2, #MIRROR_CLASS_OBJECT_SIZE_OFFSET] // Load the object size (x3)
+ ldr w3, [x2, #MIRROR_CLASS_OBJECT_SIZE_ALLOC_FAST_PATH_OFFSET] // Load the object size (x3)
cmp x3, #ROSALLOC_MAX_THREAD_LOCAL_BRACKET_SIZE // Check if the size is for a thread
- // local allocation
+ // local allocation. Also does the
+ // finalizable and initialization
+ // checks.
bhs .Lart_quick_alloc_object_rosalloc_slow_path
// Compute the rosalloc bracket index
- // from the size.
- // Align up the size by the rosalloc
- // bracket quantum size and divide
- // by the quantum size and subtract
- // by 1. This code is a shorter but
- // equivalent version.
- sub x3, x3, #1
- lsr x3, x3, #ROSALLOC_BRACKET_QUANTUM_SIZE_SHIFT
- // Load the rosalloc run (x4)
- add x4, xSELF, x3, lsl #POINTER_SIZE_SHIFT
- ldr x4, [x4, #THREAD_ROSALLOC_RUNS_OFFSET]
+ // from the size. Since the size is
+ // already aligned we can combine the
+ // two shifts together.
+ add x4, xSELF, x3, lsr #(ROSALLOC_BRACKET_QUANTUM_SIZE_SHIFT - POINTER_SIZE_SHIFT)
+ // Subtract pointer size since there
+ // are no runs for 0 byte allocations
+ // and the size is already aligned.
+ ldr x4, [x4, #(THREAD_ROSALLOC_RUNS_OFFSET - __SIZEOF_POINTER__)]
// Load the free list head (x3). This
// will be the return val.
ldr x3, [x4, #(ROSALLOC_RUN_FREE_LIST_OFFSET + ROSALLOC_RUN_FREE_LIST_HEAD_OFFSET)]
@@ -1921,11 +1901,11 @@
str w2, [x3, #MIRROR_OBJECT_CLASS_OFFSET]
// Fence. This is "ish" not "ishst" so
// that it also ensures ordering of
- // the class status load with respect
+ // the object size load with respect
// to later accesses to the class
// object. Alternatively we could use
// "ishst" if we use load-acquire for
- // the class status load.)
+ // the object size load.
// Needs to be done before pushing on
// allocation since Heap::VisitObjects
// relies on seeing the class pointer.
diff --git a/runtime/arch/x86/quick_entrypoints_x86.S b/runtime/arch/x86/quick_entrypoints_x86.S
index 9c22245..67ebf50 100644
--- a/runtime/arch/x86/quick_entrypoints_x86.S
+++ b/runtime/arch/x86/quick_entrypoints_x86.S
@@ -897,17 +897,6 @@
movl 0(%edx, %eax, COMPRESSED_REFERENCE_SIZE), %edx
testl %edx, %edx // Check null class
jz .Lart_quick_alloc_object_rosalloc_slow_path
- // Check class status
- cmpl LITERAL(MIRROR_CLASS_STATUS_INITIALIZED), MIRROR_CLASS_STATUS_OFFSET(%edx)
- jne .Lart_quick_alloc_object_rosalloc_slow_path
- // No fake dependence needed on x86
- // between status and flags load,
- // since each load is a load-acquire,
- // no loads reordering.
- // Check access flags has
- // kAccClassIsFinalizable
- testl LITERAL(ACCESS_FLAGS_CLASS_IS_FINALIZABLE), MIRROR_CLASS_ACCESS_FLAGS_OFFSET(%edx)
- jnz .Lart_quick_alloc_object_rosalloc_slow_path
movl %fs:THREAD_SELF_OFFSET, %ebx // ebx = thread
// Check if the thread local allocation
@@ -916,21 +905,19 @@
cmpl THREAD_LOCAL_ALLOC_STACK_END_OFFSET(%ebx), %edi
jae .Lart_quick_alloc_object_rosalloc_slow_path
- movl MIRROR_CLASS_OBJECT_SIZE_OFFSET(%edx), %edi // Load the object size (edi)
+ movl MIRROR_CLASS_OBJECT_SIZE_ALLOC_FAST_PATH_OFFSET(%edx), %edi // Load the object size (edi)
// Check if the size is for a thread
- // local allocation
+ // local allocation. Also does the
+ // finalizable and initialization checks.
cmpl LITERAL(ROSALLOC_MAX_THREAD_LOCAL_BRACKET_SIZE), %edi
ja .Lart_quick_alloc_object_rosalloc_slow_path
- decl %edi
shrl LITERAL(ROSALLOC_BRACKET_QUANTUM_SIZE_SHIFT), %edi // Calculate the rosalloc bracket index
// from object size.
- // Align up the size by the rosalloc
- // bracket quantum size and divide
- // by the quantum size and subtract
- // by 1. This code is a shorter but
- // equivalent version.
// Load thread local rosalloc run (ebx)
- movl THREAD_ROSALLOC_RUNS_OFFSET(%ebx, %edi, __SIZEOF_POINTER__), %ebx
+ // Subtract __SIZEOF_POINTER__ to subtract
+ // one from edi as there is no 0 byte run
+ // and the size is already aligned.
+ movl (THREAD_ROSALLOC_RUNS_OFFSET - __SIZEOF_POINTER__)(%ebx, %edi, __SIZEOF_POINTER__), %ebx
// Load free_list head (edi),
// this will be the return value.
movl (ROSALLOC_RUN_FREE_LIST_OFFSET + ROSALLOC_RUN_FREE_LIST_HEAD_OFFSET)(%ebx), %edi
@@ -990,17 +977,11 @@
MACRO1(ALLOC_OBJECT_TLAB_FAST_PATH, slowPathLabel)
testl %edx, %edx // Check null class
jz VAR(slowPathLabel)
- // No fake dependence needed on x86
- // between status and flags load,
- // since each load is a load-acquire,
- // no loads reordering.
movl %fs:THREAD_SELF_OFFSET, %ebx // ebx = thread
movl THREAD_LOCAL_END_OFFSET(%ebx), %edi // Load thread_local_end.
subl THREAD_LOCAL_POS_OFFSET(%ebx), %edi // Compute the remaining buffer size.
movl MIRROR_CLASS_OBJECT_SIZE_ALLOC_FAST_PATH_OFFSET(%edx), %esi // Load the object size.
- cmpl %edi, %esi // Check if it fits. OK to do this
- // before rounding up the object size
- // assuming the buf size alignment.
+ cmpl %edi, %esi // Check if it fits.
ja VAR(slowPathLabel)
movl THREAD_LOCAL_POS_OFFSET(%ebx), %eax // Load thread_local_pos
// as allocated object.
diff --git a/runtime/arch/x86_64/quick_entrypoints_x86_64.S b/runtime/arch/x86_64/quick_entrypoints_x86_64.S
index c568715..b805703 100644
--- a/runtime/arch/x86_64/quick_entrypoints_x86_64.S
+++ b/runtime/arch/x86_64/quick_entrypoints_x86_64.S
@@ -935,17 +935,6 @@
movl 0(%rdx, %rdi, COMPRESSED_REFERENCE_SIZE), %edx
testl %edx, %edx // Check null class
jz .Lart_quick_alloc_object_rosalloc_slow_path
- // Check class status.
- cmpl LITERAL(MIRROR_CLASS_STATUS_INITIALIZED), MIRROR_CLASS_STATUS_OFFSET(%rdx)
- jne .Lart_quick_alloc_object_rosalloc_slow_path
- // We don't need a fence (between the
- // the status and the access flag
- // loads) here because every load is
- // a load acquire on x86.
- // Check access flags has
- // kAccClassIsFinalizable
- testl LITERAL(ACCESS_FLAGS_CLASS_IS_FINALIZABLE), MIRROR_CLASS_ACCESS_FLAGS_OFFSET(%rdx)
- jnz .Lart_quick_alloc_object_rosalloc_slow_path
// Check if the thread local
// allocation stack has room.
movq %gs:THREAD_SELF_OFFSET, %r8 // r8 = thread
@@ -953,22 +942,21 @@
cmpq THREAD_LOCAL_ALLOC_STACK_END_OFFSET(%r8), %rcx
jae .Lart_quick_alloc_object_rosalloc_slow_path
// Load the object size
- movl MIRROR_CLASS_OBJECT_SIZE_OFFSET(%rdx), %eax
+ movl MIRROR_CLASS_OBJECT_SIZE_ALLOC_FAST_PATH_OFFSET(%rdx), %eax
// Check if the size is for a thread
- // local allocation
+ // local allocation. Also does the
+ // initialized and finalizable checks.
cmpl LITERAL(ROSALLOC_MAX_THREAD_LOCAL_BRACKET_SIZE), %eax
ja .Lart_quick_alloc_object_rosalloc_slow_path
// Compute the rosalloc bracket index
// from the size.
- // Align up the size by the rosalloc
- // bracket quantum size and divide
- // by the quantum size and subtract
- // by 1. This code is a shorter but
- // equivalent version.
- subq LITERAL(1), %rax
shrq LITERAL(ROSALLOC_BRACKET_QUANTUM_SIZE_SHIFT), %rax
// Load the rosalloc run (r9)
- movq THREAD_ROSALLOC_RUNS_OFFSET(%r8, %rax, __SIZEOF_POINTER__), %r9
+ // Subtract __SIZEOF_POINTER__ to
+ // subtract one from rax as there is no
+ // 0 byte run and the size is already
+ // aligned.
+ movq (THREAD_ROSALLOC_RUNS_OFFSET - __SIZEOF_POINTER__)(%r8, %rax, __SIZEOF_POINTER__), %r9
// Load the free list head (rax). This
// will be the return val.
movq (ROSALLOC_RUN_FREE_LIST_OFFSET + ROSALLOC_RUN_FREE_LIST_HEAD_OFFSET)(%r9), %rax
diff --git a/runtime/mirror/class-inl.h b/runtime/mirror/class-inl.h
index 26b8e8a..d1d8caa 100644
--- a/runtime/mirror/class-inl.h
+++ b/runtime/mirror/class-inl.h
@@ -42,11 +42,19 @@
template<VerifyObjectFlags kVerifyFlags, ReadBarrierOption kReadBarrierOption>
inline uint32_t Class::GetObjectSize() {
// Note: Extra parentheses to avoid the comma being interpreted as macro parameter separator.
- DCHECK((!IsVariableSize<kVerifyFlags, kReadBarrierOption>())) << " class=" << PrettyTypeOf(this);
+ DCHECK((!IsVariableSize<kVerifyFlags, kReadBarrierOption>())) << "class=" << PrettyTypeOf(this);
return GetField32(ObjectSizeOffset());
}
template<VerifyObjectFlags kVerifyFlags, ReadBarrierOption kReadBarrierOption>
+inline uint32_t Class::GetObjectSizeAllocFastPath() {
+ // Note: Extra parentheses to avoid the comma being interpreted as macro parameter separator.
+ DCHECK((!IsVariableSize<kVerifyFlags, kReadBarrierOption>())) << "class=" << PrettyTypeOf(this);
+ return GetField32(ObjectSizeAllocFastPathOffset());
+}
+
+template<VerifyObjectFlags kVerifyFlags, ReadBarrierOption kReadBarrierOption>
inline Class* Class::GetSuperClass() {
// Can only get super class for loaded classes (hack for when runtime is
// initializing)
@@ -862,7 +870,7 @@
klass->SetDexClassDefIndex(DexFile::kDexNoIndex16); // Default to no valid class def index.
klass->SetDexTypeIndex(DexFile::kDexNoIndex16); // Default to no valid type index.
// Default to force slow path until initialized.
- klass->SetObjectSizeAllocFastPath(std::numeric_limits<int32_t>::max());
+ klass->SetObjectSizeAllocFastPath(std::numeric_limits<uint32_t>::max());
}
inline void Class::SetAccessFlags(uint32_t new_access_flags) {
diff --git a/runtime/mirror/class.cc b/runtime/mirror/class.cc
index b60c573..c979c28 100644
--- a/runtime/mirror/class.cc
+++ b/runtime/mirror/class.cc
@@ -109,12 +109,11 @@
// alloc path sees a valid object size, we would know that it's initialized as long as it has a
// load-acquire/fake dependency.
if (new_status == kStatusInitialized && !h_this->IsVariableSize()) {
- uint32_t object_size = RoundUp(h_this->GetObjectSize(), kObjectAlignment);
- if (h_this->IsFinalizable()) {
- // Finalizable objects must always go slow path.
- object_size = std::numeric_limits<int32_t>::max();
+ DCHECK_EQ(h_this->GetObjectSizeAllocFastPath(), std::numeric_limits<uint32_t>::max());
+ // Finalizable objects must always go slow path.
+ if (!h_this->IsFinalizable()) {
+ h_this->SetObjectSizeAllocFastPath(RoundUp(h_this->GetObjectSize(), kObjectAlignment));
}
- h_this->SetObjectSizeAllocFastPath(object_size);
}
if (!class_linker_initialized) {
@@ -149,7 +148,7 @@
if (kIsDebugBuild && new_class_size < GetClassSize()) {
DumpClass(LOG(INTERNAL_FATAL), kDumpClassFullDetail);
LOG(INTERNAL_FATAL) << new_class_size << " vs " << GetClassSize();
- LOG(FATAL) << " class=" << PrettyTypeOf(this);
+ LOG(FATAL) << "class=" << PrettyTypeOf(this);
}
// Not called within a transaction.
SetField32<false>(OFFSET_OF_OBJECT_MEMBER(Class, class_size_), new_class_size);
diff --git a/runtime/mirror/class.h b/runtime/mirror/class.h
index f8f414b..99b7769 100644
--- a/runtime/mirror/class.h
+++ b/runtime/mirror/class.h
@@ -598,6 +598,10 @@
void SetObjectSizeAllocFastPath(uint32_t new_object_size) REQUIRES_SHARED(Locks::mutator_lock_);
+ template<VerifyObjectFlags kVerifyFlags = kDefaultVerifyFlags,
+ ReadBarrierOption kReadBarrierOption = kWithReadBarrier>
+ uint32_t GetObjectSizeAllocFastPath() REQUIRES_SHARED(Locks::mutator_lock_);
+
void SetObjectSizeWithoutChecks(uint32_t new_object_size)
REQUIRES_SHARED(Locks::mutator_lock_) {
// Not called within a transaction.
@@ -1462,7 +1466,7 @@
// See also class_size_.
uint32_t object_size_;
- // Aligned object size for allocation fast path. The value is max int if the object is
+ // Aligned object size for allocation fast path. The value is max uint32_t if the class is
// uninitialized or finalizable. Not currently used for variable sized objects.
uint32_t object_size_alloc_fast_path_;