Merge "Optimize non-intrinsic object array copy for Baker."
diff --git a/compiler/optimizing/code_generator_arm.cc b/compiler/optimizing/code_generator_arm.cc
index 7c72d00..046c2d8 100644
--- a/compiler/optimizing/code_generator_arm.cc
+++ b/compiler/optimizing/code_generator_arm.cc
@@ -6470,30 +6470,26 @@
                                         iftable_offset,
                                         maybe_temp2_loc,
                                         kWithoutReadBarrier);
-      Label is_null;
       // Null iftable means it is empty and will always fail the check.
-      // Not cbz since the temp may not be a low register.
-      __ CompareAndBranchIfZero(temp, &is_null);
+      __ CompareAndBranchIfZero(temp, type_check_slow_path->GetEntryLabel());
 
       // Loop through the iftable and check if any class matches.
       __ ldr(maybe_temp2_loc.AsRegister<Register>(), Address(temp, array_length_offset));
 
       Label start_loop;
       __ Bind(&start_loop);
+      __ CompareAndBranchIfZero(maybe_temp2_loc.AsRegister<Register>(),
+                                type_check_slow_path->GetEntryLabel());
       __ ldr(maybe_temp3_loc.AsRegister<Register>(), Address(temp, object_array_data_offset));
       __ MaybeUnpoisonHeapReference(maybe_temp3_loc.AsRegister<Register>());
-      __ cmp(cls, ShifterOperand(maybe_temp3_loc.AsRegister<Register>()));
-      __ b(&done, EQ);  // Return if same class.
       // Go to next interface.
       __ add(temp, temp, ShifterOperand(2 * kHeapReferenceSize));
       __ sub(maybe_temp2_loc.AsRegister<Register>(),
              maybe_temp2_loc.AsRegister<Register>(),
              ShifterOperand(2));
-      // Not cbnz since the temp may not be a low register.
-      __ CompareAndBranchIfNonZero(maybe_temp2_loc.AsRegister<Register>(), &start_loop);
-      __ Bind(&is_null);
-
-      __ b(type_check_slow_path->GetEntryLabel());
+      // Compare the classes and continue the loop if they do not match.
+      __ cmp(cls, ShifterOperand(maybe_temp3_loc.AsRegister<Register>()));
+      __ b(&start_loop, NE);
       break;
     }
   }
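
All four backends get the same restructuring of the interface-table scan: instead of routing the empty and exhausted cases through a separate is_null label that immediately branches into the slow path, the null check and the zero-count check now jump straight to the slow-path entry, and the loop back-edge is taken on a class mismatch so a match simply falls through. A runnable C++ model of the emitted control flow (IfTableContains is illustrative, not ART code):

#include <cstddef>

// Hypothetical model of the rewritten loop. `iftable` is the flat array of
// (interface class, method array) pairs and `count` starts as the raw array
// length, which the assembly decrements by 2 per pair.
static bool IfTableContains(void* const* iftable, std::ptrdiff_t count, const void* cls) {
  if (iftable == nullptr) {
    return false;  // Null iftable means empty: branch to the slow path.
  }
  for (;;) {
    if (count == 0) {
      return false;  // No interface matched: branch to the slow path.
    }
    const void* iface = iftable[0];  // The real code may unpoison this reference.
    iftable += 2;                    // Advance past one (class, method array) pair.
    count -= 2;
    if (iface == cls) {
      return true;  // Match: fall through past the check.
    }
  }
}

The payoff is that the trailing unconditional branch into the slow path disappears: failing exits branch there directly, and a hit falls straight through to the code after the check.
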
diff --git a/compiler/optimizing/code_generator_arm64.cc b/compiler/optimizing/code_generator_arm64.cc
index 35b1605..a085fea 100644
--- a/compiler/optimizing/code_generator_arm64.cc
+++ b/compiler/optimizing/code_generator_arm64.cc
@@ -3782,26 +3782,23 @@
                                         iftable_offset,
                                         maybe_temp2_loc,
                                         kWithoutReadBarrier);
-      vixl::aarch64::Label is_null;
       // Null iftable means it is empty and will always fail the check.
-      __ Cbz(temp, &is_null);
+      __ Cbz(temp, type_check_slow_path->GetEntryLabel());
 
       // Loop through the iftable and check if any class matches.
       __ Ldr(WRegisterFrom(maybe_temp2_loc), HeapOperand(temp.W(), array_length_offset));
 
       vixl::aarch64::Label start_loop;
       __ Bind(&start_loop);
+      __ Cbz(WRegisterFrom(maybe_temp2_loc), type_check_slow_path->GetEntryLabel());
       __ Ldr(WRegisterFrom(maybe_temp3_loc), HeapOperand(temp.W(), object_array_data_offset));
       GetAssembler()->MaybeUnpoisonHeapReference(WRegisterFrom(maybe_temp3_loc));
-      __ Cmp(cls, WRegisterFrom(maybe_temp3_loc));
-      __ B(eq, &done);  // Return if same class.
       // Go to next interface.
       __ Add(temp, temp, 2 * kHeapReferenceSize);
       __ Sub(WRegisterFrom(maybe_temp2_loc), WRegisterFrom(maybe_temp2_loc), 2);
-      __ Cbnz(WRegisterFrom(maybe_temp2_loc), &start_loop);
-      __ Bind(&is_null);
-
-      __ B(type_check_slow_path->GetEntryLabel());
+      // Compare the classes and continue the loop if they do not match.
+      __ Cmp(cls, WRegisterFrom(maybe_temp3_loc));
+      __ B(ne, &start_loop);
       break;
     }
   }
diff --git a/compiler/optimizing/code_generator_x86.cc b/compiler/optimizing/code_generator_x86.cc
index 2a9e21d..45edff8 100644
--- a/compiler/optimizing/code_generator_x86.cc
+++ b/compiler/optimizing/code_generator_x86.cc
@@ -6833,10 +6833,9 @@
                                           temp_loc,
                                           iftable_offset,
                                           kWithoutReadBarrier);
-        NearLabel is_null;
         // Null iftable means it is empty.
         __ testl(temp, temp);
-        __ j(kZero, &is_null);
+        __ j(kZero, type_check_slow_path->GetEntryLabel());
 
         // Loop through the iftable and check if any class matches.
         __ movl(maybe_temp2_loc.AsRegister<Register>(), Address(temp, array_length_offset));
@@ -6849,7 +6848,6 @@
         __ addl(temp, Immediate(2 * kHeapReferenceSize));
         __ subl(maybe_temp2_loc.AsRegister<Register>(), Immediate(2));
         __ j(kNotZero, &start_loop);
-        __ Bind(&is_null);
       }
 
       __ jmp(type_check_slow_path->GetEntryLabel());
diff --git a/compiler/optimizing/code_generator_x86_64.cc b/compiler/optimizing/code_generator_x86_64.cc
index cb89e50..f7a2f40 100644
--- a/compiler/optimizing/code_generator_x86_64.cc
+++ b/compiler/optimizing/code_generator_x86_64.cc
@@ -6243,10 +6243,9 @@
                                           temp_loc,
                                           iftable_offset,
                                           kWithoutReadBarrier);
-        NearLabel is_null;
         // Null iftable means it is empty.
         __ testl(temp, temp);
-        __ j(kZero, &is_null);
+        __ j(kZero, type_check_slow_path->GetEntryLabel());
 
         // Loop through the iftable and check if any class matches.
         __ movl(maybe_temp2_loc.AsRegister<CpuRegister>(), Address(temp, array_length_offset));
@@ -6259,7 +6258,6 @@
         __ addl(temp, Immediate(2 * kHeapReferenceSize));
         __ subl(maybe_temp2_loc.AsRegister<CpuRegister>(), Immediate(2));
         __ j(kNotZero, &start_loop);
-        __ Bind(&is_null);
       }
       __ jmp(type_check_slow_path->GetEntryLabel());
       break;
diff --git a/runtime/gc/space/region_space-inl.h b/runtime/gc/space/region_space-inl.h
index bbc634d..3e79223 100644
--- a/runtime/gc/space/region_space-inl.h
+++ b/runtime/gc/space/region_space-inl.h
@@ -116,18 +116,17 @@
                                                   size_t* bytes_tl_bulk_allocated) {
   DCHECK(IsAllocated() && IsInToSpace());
   DCHECK_ALIGNED(num_bytes, kAlignment);
-  Atomic<uint8_t*>* atomic_top = reinterpret_cast<Atomic<uint8_t*>*>(&top_);
   uint8_t* old_top;
   uint8_t* new_top;
   do {
-    old_top = atomic_top->LoadRelaxed();
+    old_top = top_.LoadRelaxed();
     new_top = old_top + num_bytes;
     if (UNLIKELY(new_top > end_)) {
       return nullptr;
     }
-  } while (!atomic_top->CompareExchangeWeakSequentiallyConsistent(old_top, new_top));
-  reinterpret_cast<Atomic<uint64_t>*>(&objects_allocated_)->FetchAndAddSequentiallyConsistent(1);
-  DCHECK_LE(atomic_top->LoadRelaxed(), end_);
+  } while (!top_.CompareExchangeWeakRelaxed(old_top, new_top));
+  objects_allocated_.FetchAndAddRelaxed(1);
+  DCHECK_LE(Top(), end_);
   DCHECK_LT(old_top, end_);
   DCHECK_LE(new_top, end_);
   *bytes_allocated = num_bytes;
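
The TLAB fast path above is a lock-free bump-pointer loop. Beyond dropping the reinterpret_casts (top_ is declared Atomic in region_space.h below), the CAS and the object counter are weakened from sequentially consistent to relaxed; presumably safe because the loop only has to hand out disjoint [old_top, new_top) ranges, with object publication ordered elsewhere. A self-contained sketch of the same pattern in std::atomic terms, assuming ART's Atomic<T> maps LoadRelaxed / CompareExchangeWeakRelaxed onto memory_order_relaxed operations:

#include <atomic>
#include <cstddef>
#include <cstdint>

// Minimal sketch of the lock-free bump-pointer allocation above, using
// std::atomic directly instead of ART's Atomic<T> wrapper.
struct RegionSketch {
  std::atomic<uint8_t*> top{nullptr};
  uint8_t* end = nullptr;
  std::atomic<size_t> objects_allocated{0};

  uint8_t* Alloc(size_t num_bytes) {
    uint8_t* old_top = top.load(std::memory_order_relaxed);
    uint8_t* new_top;
    do {
      new_top = old_top + num_bytes;
      if (new_top > end) {
        return nullptr;  // Region is full.
      }
      // compare_exchange_weak refreshes old_top on failure, so the loop
      // re-reads the published top without an explicit reload.
    } while (!top.compare_exchange_weak(old_top, new_top,
                                        std::memory_order_relaxed));
    objects_allocated.fetch_add(1, std::memory_order_relaxed);
    return old_top;  // The winner owns [old_top, new_top).
  }
};
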
diff --git a/runtime/gc/space/region_space.cc b/runtime/gc/space/region_space.cc
index 35bc369..8077319 100644
--- a/runtime/gc/space/region_space.cc
+++ b/runtime/gc/space/region_space.cc
@@ -57,7 +57,7 @@
   regions_.reset(new Region[num_regions_]);
   uint8_t* region_addr = mem_map->Begin();
   for (size_t i = 0; i < num_regions_; ++i, region_addr += kRegionSize) {
-    regions_[i] = Region(i, region_addr, region_addr + kRegionSize);
+    regions_[i].Init(i, region_addr, region_addr + kRegionSize);
   }
   mark_bitmap_.reset(
       accounting::ContinuousSpaceBitmap::Create("region space live bitmap", Begin(), Capacity()));
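
Initialization switches from copy-assigning a temporary Region to an in-place Init() call. The reason lands later in this patch: top_ and objects_allocated_ become Atomic<> fields, and an atomic member implicitly deletes the enclosing class's copy operations (the removed comment in region_space.h called this out). A minimal illustration with std::atomic, using a hypothetical RegionLike type:

#include <atomic>
#include <cstdint>

// An atomic member deletes the containing type's implicit copy/move
// assignment, so `regions_[i] = Region(i, begin, end)` would no longer
// compile. In-place Init() sidesteps that.
struct RegionLike {
  std::atomic<uint8_t*> top{nullptr};
  void Init(uint8_t* begin) { top.store(begin, std::memory_order_relaxed); }
};

int main() {
  RegionLike r;
  // r = RegionLike{};  // error: use of deleted assignment operator.
  r.Init(nullptr);      // OK: reinitialize the existing object instead.
}
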
@@ -72,7 +72,6 @@
     }
     CHECK_EQ(regions_[num_regions_ - 1].End(), Limit());
   }
-  full_region_ = Region();
   DCHECK(!full_region_.IsFree());
   DCHECK(full_region_.IsAllocated());
   current_region_ = &full_region_;
@@ -346,7 +345,7 @@
 void RegionSpace::RecordAlloc(mirror::Object* ref) {
   CHECK(ref != nullptr);
   Region* r = RefToRegion(ref);
-  reinterpret_cast<Atomic<uint64_t>*>(&r->objects_allocated_)->FetchAndAddSequentiallyConsistent(1);
+  r->objects_allocated_.FetchAndAddSequentiallyConsistent(1);
 }
 
 bool RegionSpace::AllocNewTlab(Thread* self) {
@@ -424,7 +423,8 @@
 }
 
 void RegionSpace::Region::Dump(std::ostream& os) const {
-  os << "Region[" << idx_ << "]=" << reinterpret_cast<void*>(begin_) << "-" << reinterpret_cast<void*>(top_)
+  os << "Region[" << idx_ << "]=" << reinterpret_cast<void*>(begin_) << "-"
+     << reinterpret_cast<void*>(Top())
      << "-" << reinterpret_cast<void*>(end_)
      << " state=" << static_cast<uint>(state_) << " type=" << static_cast<uint>(type_)
      << " objects_allocated=" << objects_allocated_
diff --git a/runtime/gc/space/region_space.h b/runtime/gc/space/region_space.h
index 381ccfa..f3b9595 100644
--- a/runtime/gc/space/region_space.h
+++ b/runtime/gc/space/region_space.h
@@ -246,11 +246,19 @@
           objects_allocated_(0), alloc_time_(0), live_bytes_(static_cast<size_t>(-1)),
           is_newly_allocated_(false), is_a_tlab_(false), thread_(nullptr) {}
 
-    Region(size_t idx, uint8_t* begin, uint8_t* end)
-        : idx_(idx), begin_(begin), top_(begin), end_(end),
-          state_(RegionState::kRegionStateFree), type_(RegionType::kRegionTypeNone),
-          objects_allocated_(0), alloc_time_(0), live_bytes_(static_cast<size_t>(-1)),
-          is_newly_allocated_(false), is_a_tlab_(false), thread_(nullptr) {
+    void Init(size_t idx, uint8_t* begin, uint8_t* end) {
+      idx_ = idx;
+      begin_ = begin;
+      top_.StoreRelaxed(begin);
+      end_ = end;
+      state_ = RegionState::kRegionStateFree;
+      type_ = RegionType::kRegionTypeNone;
+      objects_allocated_.StoreRelaxed(0);
+      alloc_time_ = 0;
+      live_bytes_ = static_cast<size_t>(-1);
+      is_newly_allocated_ = false;
+      is_a_tlab_ = false;
+      thread_ = nullptr;
       DCHECK_LT(begin, end);
       DCHECK_EQ(static_cast<size_t>(end - begin), kRegionSize);
     }
@@ -264,16 +272,13 @@
     }
 
     void Clear() {
-      top_ = begin_;
+      top_.StoreRelaxed(begin_);
       state_ = RegionState::kRegionStateFree;
       type_ = RegionType::kRegionTypeNone;
-      objects_allocated_ = 0;
+      objects_allocated_.StoreRelaxed(0);
       alloc_time_ = 0;
       live_bytes_ = static_cast<size_t>(-1);
-      if (!kMadviseZeroes) {
-        memset(begin_, 0, end_ - begin_);
-      }
-      madvise(begin_, end_ - begin_, MADV_DONTNEED);
+      ZeroAndReleasePages(begin_, end_ - begin_);
       is_newly_allocated_ = false;
       is_a_tlab_ = false;
       thread_ = nullptr;
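
Clear() hands the zero-then-madvise sequence to a shared helper. Going by the two removed lines, ZeroAndReleasePages presumably factors out exactly this logic; a sketch under that assumption, not the helper's actual definition:

#include <cstddef>
#include <cstdint>
#include <cstring>
#include <sys/mman.h>

// Stand-in for ART's platform-dependent constant: true where the kernel
// guarantees MADV_DONTNEED'd anonymous pages read back as zero.
static constexpr bool kMadviseZeroes = true;

// Reconstructed from the removed inline code: zero explicitly only when
// madvise alone will not, then return the pages to the kernel (the madvise
// return value is ignored here for brevity).
static inline void ZeroAndReleasePagesSketch(uint8_t* begin, size_t length) {
  if (!kMadviseZeroes) {
    memset(begin, 0, length);
  }
  madvise(begin, length, MADV_DONTNEED);
}
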
@@ -287,8 +292,8 @@
       bool is_free = state_ == RegionState::kRegionStateFree;
       if (is_free) {
         DCHECK(IsInNoSpace());
-        DCHECK_EQ(begin_, top_);
-        DCHECK_EQ(objects_allocated_, 0U);
+        DCHECK_EQ(begin_, Top());
+        DCHECK_EQ(objects_allocated_.LoadRelaxed(), 0U);
       }
       return is_free;
     }
@@ -328,7 +333,7 @@
     bool IsLarge() const {
       bool is_large = state_ == RegionState::kRegionStateLarge;
       if (is_large) {
-        DCHECK_LT(begin_ + 1 * MB, top_);
+        DCHECK_LT(begin_ + 1 * MB, Top());
       }
       return is_large;
     }
@@ -337,7 +342,7 @@
     bool IsLargeTail() const {
       bool is_large_tail = state_ == RegionState::kRegionStateLargeTail;
       if (is_large_tail) {
-        DCHECK_EQ(begin_, top_);
+        DCHECK_EQ(begin_, Top());
       }
       return is_large_tail;
     }
@@ -395,15 +400,15 @@
 
     size_t BytesAllocated() const {
       if (IsLarge()) {
-        DCHECK_LT(begin_ + kRegionSize, top_);
-        return static_cast<size_t>(top_ - begin_);
+        DCHECK_LT(begin_ + kRegionSize, Top());
+        return static_cast<size_t>(Top() - begin_);
       } else if (IsLargeTail()) {
-        DCHECK_EQ(begin_, top_);
+        DCHECK_EQ(begin_, Top());
         return 0;
       } else {
         DCHECK(IsAllocated()) << static_cast<uint>(state_);
-        DCHECK_LE(begin_, top_);
-        size_t bytes = static_cast<size_t>(top_ - begin_);
+        DCHECK_LE(begin_, Top());
+        size_t bytes = static_cast<size_t>(Top() - begin_);
         DCHECK_LE(bytes, kRegionSize);
         return bytes;
       }
@@ -411,12 +416,12 @@
 
     size_t ObjectsAllocated() const {
       if (IsLarge()) {
-        DCHECK_LT(begin_ + 1 * MB, top_);
-        DCHECK_EQ(objects_allocated_, 0U);
+        DCHECK_LT(begin_ + 1 * MB, Top());
+        DCHECK_EQ(objects_allocated_.LoadRelaxed(), 0U);
         return 1;
       } else if (IsLargeTail()) {
-        DCHECK_EQ(begin_, top_);
-        DCHECK_EQ(objects_allocated_, 0U);
+        DCHECK_EQ(begin_, Top());
+        DCHECK_EQ(objects_allocated_.LoadRelaxed(), 0U);
         return 0;
       } else {
         DCHECK(IsAllocated()) << static_cast<uint>(state_);
@@ -428,12 +433,12 @@
       return begin_;
     }
 
-    uint8_t* Top() const {
-      return top_;
+    ALWAYS_INLINE uint8_t* Top() const {
+      return top_.LoadRelaxed();
     }
 
     void SetTop(uint8_t* new_top) {
-      top_ = new_top;
+      top_.StoreRelaxed(new_top);
     }
 
     uint8_t* End() const {
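
With top_ now Atomic, its accessors read and write it with relaxed ordering. If ART's Atomic<T> is, as assumed here, a thin wrapper over std::atomic, Top() and SetTop() correspond to:

#include <atomic>
#include <cstdint>

// Hypothetical mapping of the accessors above onto std::atomic; ART's
// LoadRelaxed/StoreRelaxed are assumed to be memory_order_relaxed operations.
class TopHolder {
 public:
  uint8_t* Top() const { return top_.load(std::memory_order_relaxed); }
  void SetTop(uint8_t* new_top) { top_.store(new_top, std::memory_order_relaxed); }
 private:
  std::atomic<uint8_t*> top_{nullptr};
};
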
@@ -448,27 +453,26 @@
 
     void RecordThreadLocalAllocations(size_t num_objects, size_t num_bytes) {
       DCHECK(IsAllocated());
-      DCHECK_EQ(objects_allocated_, 0U);
-      DCHECK_EQ(top_, end_);
-      objects_allocated_ = num_objects;
-      top_ = begin_ + num_bytes;
-      DCHECK_EQ(top_, end_);
+      DCHECK_EQ(objects_allocated_.LoadRelaxed(), 0U);
+      DCHECK_EQ(Top(), end_);
+      objects_allocated_.StoreRelaxed(num_objects);
+      top_.StoreRelaxed(begin_ + num_bytes);
+      DCHECK_EQ(Top(), end_);
     }
 
    private:
-    size_t idx_;                   // The region's index in the region space.
-    uint8_t* begin_;               // The begin address of the region.
-    // Can't use Atomic<uint8_t*> as Atomic's copy operator is implicitly deleted.
-    uint8_t* top_;                 // The current position of the allocation.
-    uint8_t* end_;                 // The end address of the region.
-    RegionState state_;            // The region state (see RegionState).
-    RegionType type_;              // The region type (see RegionType).
-    uint64_t objects_allocated_;   // The number of objects allocated.
-    uint32_t alloc_time_;          // The allocation time of the region.
-    size_t live_bytes_;            // The live bytes. Used to compute the live percent.
-    bool is_newly_allocated_;      // True if it's allocated after the last collection.
-    bool is_a_tlab_;               // True if it's a tlab.
-    Thread* thread_;               // The owning thread if it's a tlab.
+    size_t idx_;                        // The region's index in the region space.
+    uint8_t* begin_;                    // The begin address of the region.
+    Atomic<uint8_t*> top_;              // The current position of the allocation.
+    uint8_t* end_;                      // The end address of the region.
+    RegionState state_;                 // The region state (see RegionState).
+    RegionType type_;                   // The region type (see RegionType).
+    Atomic<size_t> objects_allocated_;  // The number of objects allocated.
+    uint32_t alloc_time_;               // The allocation time of the region.
+    size_t live_bytes_;                 // The live bytes. Used to compute the live percent.
+    bool is_newly_allocated_;           // True if it's allocated after the last collection.
+    bool is_a_tlab_;                    // True if it's a tlab.
+    Thread* thread_;                    // The owning thread if it's a tlab.
 
     friend class RegionSpace;
   };