src/hotspot/os_cpu/solaris_x86/solaris_x86_64.s - platform/libcore - Gitiles

 /
 / Copyright (c) 2004, 2013, Oracle and/or its affiliates. All rights reserved.
 / DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 /
 / This code is free software; you can redistribute it and/or modify it
 / under the terms of the GNU General Public License version 2 only, as
 / published by the Free Software Foundation.
 /
 / This code is distributed in the hope that it will be useful, but WITHOUT
 / ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 / FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 / version 2 for more details (a copy is included in the LICENSE file that
 / accompanied this code).
 /
 / You should have received a copy of the GNU General Public License version
 / 2 along with this work; if not, write to the Free Software Foundation,
 / Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 /
 / Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 / or visit www.oracle.com if you need additional information or have any
 / questions.
 /

         // Symbols exported to the linker.  fs_load and fs_thread are the
         // %fs-relative accessors defined first in the text section below.
         .globl fs_load
         .globl fs_thread

         // NOTE WELL!  The _Copy functions are called directly
         // from server-compiler-generated code via CallLeafNoFP,
         // which means that they *must* either not use floating
         // point or use it in the same manner as does the server
         // compiler.

         // Exported copy routines.  For jshorts, jints and jlongs the
         // arrayof_* and *_atomic names are two labels placed on one and
         // the same entry point in the code below.
         .globl _Copy_arrayof_conjoint_bytes
         .globl _Copy_conjoint_jshorts_atomic
         .globl _Copy_arrayof_conjoint_jshorts
         .globl _Copy_conjoint_jints_atomic
         .globl _Copy_arrayof_conjoint_jints
         .globl _Copy_conjoint_jlongs_atomic
         .globl _Copy_arrayof_conjoint_jlongs

         // Everything that follows goes into the text section; the "ax"
         // flags mark it allocatable and executable.
         .section .text,"ax"

         / Fast thread accessors, used by threadLS_solaris_amd64.cpp
         // fs_load: returns in %rax the 8-byte value read from %fs:(%rdi),
         // i.e. %rdi is used as an offset into the %fs segment.
         // One load and a return: no stack use, only %rax is written.
         // NOTE(review): what the loaded slot means is defined by the caller
         // in threadLS_solaris_amd64.cpp -- confirm there.
         .align   16
 fs_load:
         movq %fs:(%rdi),%rax
         ret

         // fs_thread: returns in %rax the 8-byte value stored at offset 0
         // of the %fs segment.  Takes no arguments; only %rax is written.
         // NOTE(review): presumably that slot holds the current thread
         // pointer -- confirm against threadLS_solaris_amd64.cpp.
         .align   16
 fs_thread:
         movq %fs:0x0,%rax
         ret

         // SpinPause: issue a spin-wait hint to the processor and return 1
         // in %rax.  Takes no arguments, touches no memory and no stack.
         .globl  SpinPause
         .align  16
 SpinPause:
         // "pause" assembles to the same bytes (F3 90) as "rep; nop".
         pause
         // A 32-bit register write zero-extends, so all of %rax becomes 1.
         movl    $1,%eax
         ret


         / Support for void Copy::arrayof_conjoint_bytes(void* from,
         /                                               void* to,
         /                                               size_t count)
         / rdi - from
         / rsi - to
         / rdx - count, treated as ssize_t
         /
         // Implementation notes
         //   %r8  = byte count as passed in; %rdx is turned into the qword
         //          count (count >> 3).  Bits 2, 1 and 0 of %r8 select the
         //          dword, word and byte tail copies.
         //   Direction: when to <= from (unsigned), or to > from + count - 1,
         //   the copy walks upward through memory (acb_CopyRight).  Otherwise
         //   'to' lies inside the source range and the copy walks downward
         //   (acb_CopyLeft), tail first, so data is read before it is
         //   overwritten.
         //   Registers written: %rax, %rcx, %rdx, %rsi, %r8 and the flags.
         //   %rdi is only read.  No stack and no floating-point state is used.
         .align   16
 _Copy_arrayof_conjoint_bytes:
         movq     %rdx,%r8             / byte count
         shrq     $3,%rdx              / qword count
         // leaq leaves the flags alone, so the jbe after it still tests
         // the comparison of 'to' against 'from'.
         cmpq     %rdi,%rsi
         leaq     -1(%rdi,%r8,1),%rax  / from + bcount*1 - 1
         jbe      acb_CopyRight
         cmpq     %rax,%rsi
         jbe      acb_CopyLeft
         // Upward copy.  Both bases are biased by -8 and %rdx runs from
         // -qcount up to 0.
 acb_CopyRight:
         leaq     -8(%rdi,%rdx,8),%rax / from + qcount*8 - 8
         leaq     -8(%rsi,%rdx,8),%rcx / to + qcount*8 - 8
         negq     %rdx
         // Enter the unrolled loop at its test (label 7).
         jmp      7f
         .align   16
         // Remainder loop: 1..3 whole qwords are left, copy one per pass.
 1:      movq     8(%rax,%rdx,8),%rsi
         movq     %rsi,8(%rcx,%rdx,8)
         addq     $1,%rdx
         jnz      1b
         // Tail.  All whole qwords are done; 8(%rax) and 8(%rcx) address the
         // first byte after them in the source and destination.
 2:      testq    $4,%r8               / check for trailing dword
         jz       3f
         movl     8(%rax),%esi         / copy trailing dword
         movl     %esi,8(%rcx)
         addq     $4,%rax
         addq     $4,%rcx              / original %rsi is trashed, so we
                                       /  can't use it as a base register
 3:      testq    $2,%r8               / check for trailing word
         jz       4f
         movw     8(%rax),%si          / copy trailing word
         movw     %si,8(%rcx)
         // Only %rcx is advanced here: the last byte is fetched through
         // %rdi and %r8, and %al is then free to hold it.
         addq     $2,%rcx
 4:      testq    $1,%r8               / check for trailing byte
         jz       5f
         movb     -1(%rdi,%r8,1),%al   / copy trailing byte
         movb     %al,8(%rcx)
 5:      ret
         .align   16
         // Unrolled loop body: four qwords per pass, lowest address first.
 6:      movq     -24(%rax,%rdx,8),%rsi
         movq     %rsi,-24(%rcx,%rdx,8)
         movq     -16(%rax,%rdx,8),%rsi
         movq     %rsi,-16(%rcx,%rdx,8)
         movq     -8(%rax,%rdx,8),%rsi
         movq     %rsi,-8(%rcx,%rdx,8)
         movq     (%rax,%rdx,8),%rsi
         movq     %rsi,(%rcx,%rdx,8)
         // Loop test: advance %rdx by 4; while it is still <= 0 at least
         // four qwords remain.  Otherwise undo the step and copy the 1..3
         // leftover qwords (label 1) or go straight to the tail (label 2).
 7:      addq     $4,%rdx
         jle      6b
         subq     $4,%rdx
         jl       1b
         jmp      2b
         // Downward copy.  The odd tail pieces sit at the highest addresses,
         // so they are moved first: byte, then word, then dword.
 acb_CopyLeft:
         testq    $1,%r8               / check for trailing byte
         jz       1f
         movb     -1(%rdi,%r8,1),%cl   / copy trailing byte
         movb     %cl,-1(%rsi,%r8,1)
         subq     $1,%r8               / adjust for possible trailing word
 1:      testq    $2,%r8               / check for trailing word
         jz       2f
         movw     -2(%rdi,%r8,1),%cx   / copy trailing word
         movw     %cx,-2(%rsi,%r8,1)
         // A trailing dword starts right after the whole qwords, at
         // offset qcount*8 from each base.
 2:      testq    $4,%r8               / check for trailing dword
         jz       5f
         movl     (%rdi,%rdx,8),%ecx   / copy trailing dword
         movl     %ecx,(%rsi,%rdx,8)
         jmp      5f
         .align   16
         // Remainder loop: %rdx is 1..3, copy qword %rdx-1 and count down.
 3:      movq     -8(%rdi,%rdx,8),%rcx
         movq     %rcx,-8(%rsi,%rdx,8)
         subq     $1,%rdx
         jnz      3b
         ret
         .align   16
         // Unrolled loop body: four qwords per pass, highest address first.
 4:      movq     24(%rdi,%rdx,8),%rcx
         movq     %rcx,24(%rsi,%rdx,8)
         movq     16(%rdi,%rdx,8),%rcx
         movq     %rcx,16(%rsi,%rdx,8)
         movq     8(%rdi,%rdx,8),%rcx
         movq     %rcx,8(%rsi,%rdx,8)
         movq     (%rdi,%rdx,8),%rcx
         movq     %rcx,(%rsi,%rdx,8)
         // Loop test: take four qwords off %rdx; a non-negative result means
         // a full group of four is available.  Otherwise restore %rdx and
         // copy the 1..3 leftover qwords, or return when none remain.
 5:      subq     $4,%rdx
         jge      4b
         addq     $4,%rdx
         jg       3b
         ret

         / Support for void Copy::arrayof_conjoint_jshorts(void* from,
         /                                                 void* to,
         /                                                 size_t count)
         / Equivalent to
         /   conjoint_jshorts_atomic
         /
         / If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
         / let the hardware handle it.  The two or four words within dwords
         / or qwords that span cache line boundaries will still be loaded
         / and stored atomically.
         /
         / rdi - from
         / rsi - to
         / rdx - count, treated as ssize_t
         /
         // Implementation notes
         //   %r8  = element (16-bit word) count as passed in; %rdx becomes
         //          the qword count (count >> 2).  Bit 1 of %r8 selects a
         //          trailing dword (two elements), bit 0 a trailing word.
         //   Direction: when to <= from (unsigned), or to > from + count*2 - 2,
         //   the copy walks upward (acs_CopyRight); otherwise it walks
         //   downward (acs_CopyLeft), tail first.
         //   Every memory access is a movq, movl or movw.
         //   Registers written: %rax, %rcx, %rdx, %rsi, %r8 and the flags.
         //   %rdi is only read.
         .align   16
 _Copy_arrayof_conjoint_jshorts:
 _Copy_conjoint_jshorts_atomic:
         movq     %rdx,%r8             / word count
         shrq     $2,%rdx              / qword count
         // leaq leaves the flags alone, so the jbe after it still tests
         // the comparison of 'to' against 'from'.
         cmpq     %rdi,%rsi
         leaq     -2(%rdi,%r8,2),%rax  / from + wcount*2 - 2
         jbe      acs_CopyRight
         cmpq     %rax,%rsi
         jbe      acs_CopyLeft
         // Upward copy.  Both bases are biased by -8 and %rdx runs from
         // -qcount up to 0.
 acs_CopyRight:
         leaq     -8(%rdi,%rdx,8),%rax / from + qcount*8 - 8
         leaq     -8(%rsi,%rdx,8),%rcx / to + qcount*8 - 8
         negq     %rdx
         // Enter the unrolled loop at its test (label 6).
         jmp      6f
         // Remainder loop: 1..3 whole qwords are left, copy one per pass.
 1:      movq     8(%rax,%rdx,8),%rsi
         movq     %rsi,8(%rcx,%rdx,8)
         addq     $1,%rdx
         jnz      1b
         // Tail.  8(%rax) and 8(%rcx) address the first element after the
         // whole qwords.
 2:      testq    $2,%r8               / check for trailing dword
         jz       3f
         movl     8(%rax),%esi         / copy trailing dword
         movl     %esi,8(%rcx)
         addq     $4,%rcx              / original %rsi is trashed, so we
                                       /  can't use it as a base register
         // The last element is read through %rdi and %r8, so only the
         // destination pointer had to be advanced above.
 3:      testq    $1,%r8               / check for trailing word
         jz       4f
         movw     -2(%rdi,%r8,2),%si   / copy trailing word
         movw     %si,8(%rcx)
 4:      ret
         .align   16
         // Unrolled loop body: four qwords per pass, lowest address first.
 5:      movq     -24(%rax,%rdx,8),%rsi
         movq     %rsi,-24(%rcx,%rdx,8)
         movq     -16(%rax,%rdx,8),%rsi
         movq     %rsi,-16(%rcx,%rdx,8)
         movq     -8(%rax,%rdx,8),%rsi
         movq     %rsi,-8(%rcx,%rdx,8)
         movq     (%rax,%rdx,8),%rsi
         movq     %rsi,(%rcx,%rdx,8)
         // Loop test: advance %rdx by 4; while it is still <= 0 at least
         // four qwords remain.  Otherwise undo the step and copy the 1..3
         // leftover qwords (label 1) or go straight to the tail (label 2).
 6:      addq     $4,%rdx
         jle      5b
         subq     $4,%rdx
         jl       1b
         jmp      2b
         // Downward copy.  The tail sits at the highest addresses and is
         // moved first: the odd last element, then the trailing dword,
         // which starts at offset qcount*8 from each base.
 acs_CopyLeft:
         testq    $1,%r8               / check for trailing word
         jz       1f
         movw     -2(%rdi,%r8,2),%cx   / copy trailing word
         movw     %cx,-2(%rsi,%r8,2)
 1:      testq    $2,%r8               / check for trailing dword
         jz       4f
         movl     (%rdi,%rdx,8),%ecx   / copy trailing dword
         movl     %ecx,(%rsi,%rdx,8)
         jmp      4f
         // Remainder loop: %rdx is 1..3, copy qword %rdx-1 and count down.
 2:      movq     -8(%rdi,%rdx,8),%rcx
         movq     %rcx,-8(%rsi,%rdx,8)
         subq     $1,%rdx
         jnz      2b
         ret
         .align   16
         // Unrolled loop body: four qwords per pass, highest address first.
 3:      movq     24(%rdi,%rdx,8),%rcx
         movq     %rcx,24(%rsi,%rdx,8)
         movq     16(%rdi,%rdx,8),%rcx
         movq     %rcx,16(%rsi,%rdx,8)
         movq     8(%rdi,%rdx,8),%rcx
         movq     %rcx,8(%rsi,%rdx,8)
         movq     (%rdi,%rdx,8),%rcx
         movq     %rcx,(%rsi,%rdx,8)
         // Loop test: take four qwords off %rdx; a non-negative result means
         // a full group of four is available.  Otherwise restore %rdx and
         // copy the 1..3 leftover qwords, or return when none remain.
 4:      subq     $4,%rdx
         jge      3b
         addq     $4,%rdx
         jg       2b
         ret

         / Support for void Copy::arrayof_conjoint_jints(jint* from,
         /                                               jint* to,
         /                                               size_t count)
         / Equivalent to
         /   conjoint_jints_atomic
         /
         / If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
         / the hardware handle it.  The two dwords within qwords that span
         / cache line boundaries will still be loaded and stored atomically.
         /
         / rdi - from
         / rsi - to
         / rdx - count, treated as ssize_t
         /
         // Implementation notes
         //   %r8  = element (dword) count as passed in; %rdx becomes the
         //          qword count (count >> 1).  Bit 0 of %r8 selects one
         //          trailing dword.
         //   Direction: when to <= from (unsigned), or to > from + count*4 - 4,
         //   the copy walks upward (aci_CopyRight); otherwise it walks
         //   downward (aci_CopyLeft), tail first.
         //   Every memory access is a movq or a movl.
         //   Registers written: %rax, %rcx, %rdx, %rsi, %r8 and the flags.
         //   %rdi is only read.
         .align   16
 _Copy_arrayof_conjoint_jints:
 _Copy_conjoint_jints_atomic:
         movq     %rdx,%r8             / dword count
         // The one-operand form of shrq shifts right by one bit.
         shrq     %rdx                 / qword count
         // leaq leaves the flags alone, so the jbe after it still tests
         // the comparison of 'to' against 'from'.
         cmpq     %rdi,%rsi
         leaq     -4(%rdi,%r8,4),%rax  / from + dcount*4 - 4
         jbe      aci_CopyRight
         cmpq     %rax,%rsi
         jbe      aci_CopyLeft
         // Upward copy.  Both bases are biased by -8 and %rdx runs from
         // -qcount up to 0.
 aci_CopyRight:
         leaq     -8(%rdi,%rdx,8),%rax / from + qcount*8 - 8
         leaq     -8(%rsi,%rdx,8),%rcx / to + qcount*8 - 8
         negq     %rdx
         // Enter the unrolled loop at its test (label 5).
         jmp      5f
         .align   16
         // Remainder loop: 1..3 whole qwords are left, copy one per pass.
 1:      movq     8(%rax,%rdx,8),%rsi
         movq     %rsi,8(%rcx,%rdx,8)
         addq     $1,%rdx
         jnz       1b
         // Tail: 8(%rax) and 8(%rcx) address the element after the whole
         // qwords.
 2:      testq    $1,%r8               / check for trailing dword
         jz       3f
         movl     8(%rax),%esi         / copy trailing dword
         movl     %esi,8(%rcx)
 3:      ret
         .align   16
         // Unrolled loop body: four qwords per pass, lowest address first.
 4:      movq     -24(%rax,%rdx,8),%rsi
         movq     %rsi,-24(%rcx,%rdx,8)
         movq     -16(%rax,%rdx,8),%rsi
         movq     %rsi,-16(%rcx,%rdx,8)
         movq     -8(%rax,%rdx,8),%rsi
         movq     %rsi,-8(%rcx,%rdx,8)
         movq     (%rax,%rdx,8),%rsi
         movq     %rsi,(%rcx,%rdx,8)
         // Loop test: advance %rdx by 4; while it is still <= 0 at least
         // four qwords remain.  Otherwise undo the step and copy the 1..3
         // leftover qwords (label 1) or go straight to the tail (label 2).
 5:      addq     $4,%rdx
         jle      4b
         subq     $4,%rdx
         jl       1b
         jmp      2b
         // Downward copy.  An odd last element sits at the highest address
         // and is moved first, then the qwords from the top down.
 aci_CopyLeft:
         testq    $1,%r8               / check for trailing dword
         jz       3f
         movl     -4(%rdi,%r8,4),%ecx  / copy trailing dword
         movl     %ecx,-4(%rsi,%r8,4)
         jmp      3f
         // Remainder loop: %rdx is 1..3, copy qword %rdx-1 and count down.
 1:      movq     -8(%rdi,%rdx,8),%rcx
         movq     %rcx,-8(%rsi,%rdx,8)
         subq     $1,%rdx
         jnz      1b
         ret
         .align   16
         // Unrolled loop body: four qwords per pass, highest address first.
 2:      movq     24(%rdi,%rdx,8),%rcx
         movq     %rcx,24(%rsi,%rdx,8)
         movq     16(%rdi,%rdx,8),%rcx
         movq     %rcx,16(%rsi,%rdx,8)
         movq     8(%rdi,%rdx,8),%rcx
         movq     %rcx,8(%rsi,%rdx,8)
         movq     (%rdi,%rdx,8),%rcx
         movq     %rcx,(%rsi,%rdx,8)
         // Loop test: take four qwords off %rdx; a non-negative result means
         // a full group of four is available.  Otherwise restore %rdx and
         // copy the 1..3 leftover qwords, or return when none remain.
 3:      subq     $4,%rdx
         jge      2b
         addq     $4,%rdx
         jg       1b
         ret

         / Support for void Copy::arrayof_conjoint_jlongs(jlong* from,
         /                                                jlong* to,
         /                                                size_t count)
         / Equivalent to
         /   conjoint_jlongs_atomic
         /   arrayof_conjoint_oops
         /   conjoint_oops_atomic
         /
         / rdi - from
         / rsi - to
         / rdx - count, treated as ssize_t
         /
         // Implementation notes
         //   %rdx is already the qword count, so there is no tail handling.
         //   %rax = from + count*8 - 8 serves twice: as the upper bound of
         //   the overlap test and as the biased source base of the upward
         //   copy.
         //   Direction: when to <= from (unsigned), or to > %rax, the copy
         //   walks upward (acl_CopyRight); otherwise it walks downward
         //   (acl_CopyLeft).
         //   Every memory access is a movq.
         //   Registers written: %rax, %rcx, %rdx, %rsi and the flags.
         //   %rdi is only read.
         .align   16
 _Copy_arrayof_conjoint_jlongs:
 _Copy_conjoint_jlongs_atomic:
         // leaq leaves the flags alone, so the jbe after it still tests
         // the comparison of 'to' against 'from'.
         cmpq     %rdi,%rsi
         leaq     -8(%rdi,%rdx,8),%rax / from + count*8 - 8
         jbe      acl_CopyRight
         cmpq     %rax,%rsi
         jbe      acl_CopyLeft
         // Upward copy.  %rcx is biased like %rax and %rdx runs from
         // -count up to 0.
 acl_CopyRight:
         leaq     -8(%rsi,%rdx,8),%rcx / to + count*8 - 8
         negq     %rdx
         // Enter the unrolled loop at its test (label 3).
         jmp      3f
         // Remainder loop: 1..3 qwords are left, copy one per pass.
 1:      movq     8(%rax,%rdx,8),%rsi
         movq     %rsi,8(%rcx,%rdx,8)
         addq     $1,%rdx
         jnz      1b
         ret
         .align   16
         // Unrolled loop body: four qwords per pass, lowest address first.
 2:      movq     -24(%rax,%rdx,8),%rsi
         movq     %rsi,-24(%rcx,%rdx,8)
         movq     -16(%rax,%rdx,8),%rsi
         movq     %rsi,-16(%rcx,%rdx,8)
         movq     -8(%rax,%rdx,8),%rsi
         movq     %rsi,-8(%rcx,%rdx,8)
         movq     (%rax,%rdx,8),%rsi
         movq     %rsi,(%rcx,%rdx,8)
         // Loop test: advance %rdx by 4; while it is still <= 0 at least
         // four qwords remain.  Otherwise undo the step and copy the 1..3
         // leftover qwords, or return when none remain.
 3:      addq     $4,%rdx
         jle      2b
         subq     $4,%rdx
         jl       1b
         ret
         // Downward remainder loop: %rdx is 1..3, copy qword %rdx-1 and
         // count down.
 4:      movq     -8(%rdi,%rdx,8),%rcx
         movq     %rcx,-8(%rsi,%rdx,8)
         subq     $1,%rdx
         jnz      4b
         ret
         .align   16
         // Downward unrolled body: four qwords per pass, highest address
         // first.
 5:      movq     24(%rdi,%rdx,8),%rcx
         movq     %rcx,24(%rsi,%rdx,8)
         movq     16(%rdi,%rdx,8),%rcx
         movq     %rcx,16(%rsi,%rdx,8)
         movq     8(%rdi,%rdx,8),%rcx
         movq     %rcx,8(%rsi,%rdx,8)
         movq     (%rdi,%rdx,8),%rcx
         movq     %rcx,(%rsi,%rdx,8)
         // Entry point of the downward copy, which is also its loop test:
         // take four qwords off %rdx; a non-negative result means a full
         // group of four is available.  Otherwise restore %rdx and copy the
         // 1..3 leftover qwords, or return when none remain.
 acl_CopyLeft:
         subq     $4,%rdx
         jge      5b
         addq     $4,%rdx
         jg       4b
         ret
	/
	/ Copyright (c) 2004, 2013, Oracle and/or its affiliates. All rights reserved.
	/ DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
	/
	/ This code is free software; you can redistribute it and/or modify it
	/ under the terms of the GNU General Public License version 2 only, as
	/ published by the Free Software Foundation.
	/
	/ This code is distributed in the hope that it will be useful, but WITHOUT
	/ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
	/ FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
	/ version 2 for more details (a copy is included in the LICENSE file that
	/ accompanied this code).
	/
	/ You should have received a copy of the GNU General Public License version
	/ 2 along with this work; if not, write to the Free Software Foundation,
	/ Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
	/
	/ Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
	/ or visit www.oracle.com if you need additional information or have any
	/ questions.
	/

	// Symbols exported to the linker.  fs_load and fs_thread are the
	// %fs-relative accessors defined first in the text section below.
	.globl fs_load
	.globl fs_thread

	// NOTE WELL! The _Copy functions are called directly
	// from server-compiler-generated code via CallLeafNoFP,
	// which means that they must either not use floating
	// point or use it in the same manner as does the server
	// compiler.

	// Exported copy routines.  For jshorts, jints and jlongs the
	// arrayof_* and *_atomic names are two labels placed on one and
	// the same entry point in the code below.
	.globl _Copy_arrayof_conjoint_bytes
	.globl _Copy_conjoint_jshorts_atomic
	.globl _Copy_arrayof_conjoint_jshorts
	.globl _Copy_conjoint_jints_atomic
	.globl _Copy_arrayof_conjoint_jints
	.globl _Copy_conjoint_jlongs_atomic
	.globl _Copy_arrayof_conjoint_jlongs

	// Everything that follows goes into the text section; the "ax"
	// flags mark it allocatable and executable.
	.section .text,"ax"

	/ Fast thread accessors, used by threadLS_solaris_amd64.cpp
	// fs_load: returns in %rax the 8-byte value read from %fs:(%rdi),
	// i.e. %rdi is used as an offset into the %fs segment.
	// One load and a return: no stack use, only %rax is written.
	// NOTE(review): what the loaded slot means is defined by the caller
	// in threadLS_solaris_amd64.cpp -- confirm there.
	.align 16
	fs_load:
	movq %fs:(%rdi),%rax
	ret

	// fs_thread: returns in %rax the 8-byte value stored at offset 0
	// of the %fs segment.  Takes no arguments; only %rax is written.
	// NOTE(review): presumably that slot holds the current thread
	// pointer -- confirm against threadLS_solaris_amd64.cpp.
	.align 16
	fs_thread:
	movq %fs:0x0,%rax
	ret

	// SpinPause: issue a spin-wait hint to the processor and return 1
	// in %rax.  Takes no arguments, touches no memory and no stack.
	.globl SpinPause
	.align 16
	SpinPause:
	// "pause" assembles to the same bytes (F3 90) as "rep; nop".
	pause
	// A 32-bit register write zero-extends, so all of %rax becomes 1.
	movl $1,%eax
	ret


	/ Support for void Copy::arrayof_conjoint_bytes(void* from,
	/ void* to,
	/ size_t count)
	/ rdi - from
	/ rsi - to
	/ rdx - count, treated as ssize_t
	/
	// Implementation notes
	//   %r8  = byte count as passed in; %rdx is turned into the qword
	//          count (count >> 3).  Bits 2, 1 and 0 of %r8 select the
	//          dword, word and byte tail copies.
	//   Direction: when to <= from (unsigned), or to > from + count - 1,
	//   the copy walks upward through memory (acb_CopyRight).  Otherwise
	//   'to' lies inside the source range and the copy walks downward
	//   (acb_CopyLeft), tail first, so data is read before it is
	//   overwritten.
	//   Registers written: %rax, %rcx, %rdx, %rsi, %r8 and the flags.
	//   %rdi is only read.  No stack and no floating-point state is used.
	.align 16
	_Copy_arrayof_conjoint_bytes:
	movq %rdx,%r8 / byte count
	shrq $3,%rdx / qword count
	// leaq leaves the flags alone, so the jbe after it still tests
	// the comparison of 'to' against 'from'.
	cmpq %rdi,%rsi
	leaq -1(%rdi,%r8,1),%rax / from + bcount*1 - 1
	jbe acb_CopyRight
	cmpq %rax,%rsi
	jbe acb_CopyLeft
	// Upward copy.  Both bases are biased by -8 and %rdx runs from
	// -qcount up to 0.
	acb_CopyRight:
	leaq -8(%rdi,%rdx,8),%rax / from + qcount*8 - 8
	leaq -8(%rsi,%rdx,8),%rcx / to + qcount*8 - 8
	negq %rdx
	// Enter the unrolled loop at its test (label 7).
	jmp 7f
	.align 16
	// Remainder loop: 1..3 whole qwords are left, copy one per pass.
	1: movq 8(%rax,%rdx,8),%rsi
	movq %rsi,8(%rcx,%rdx,8)
	addq $1,%rdx
	jnz 1b
	// Tail.  All whole qwords are done; 8(%rax) and 8(%rcx) address the
	// first byte after them in the source and destination.
	2: testq $4,%r8 / check for trailing dword
	jz 3f
	movl 8(%rax),%esi / copy trailing dword
	movl %esi,8(%rcx)
	addq $4,%rax
	addq $4,%rcx / original %rsi is trashed, so we
	/ can't use it as a base register
	3: testq $2,%r8 / check for trailing word
	jz 4f
	movw 8(%rax),%si / copy trailing word
	movw %si,8(%rcx)
	// Only %rcx is advanced here: the last byte is fetched through
	// %rdi and %r8, and %al is then free to hold it.
	addq $2,%rcx
	4: testq $1,%r8 / check for trailing byte
	jz 5f
	movb -1(%rdi,%r8,1),%al / copy trailing byte
	movb %al,8(%rcx)
	5: ret
	.align 16
	// Unrolled loop body: four qwords per pass, lowest address first.
	6: movq -24(%rax,%rdx,8),%rsi
	movq %rsi,-24(%rcx,%rdx,8)
	movq -16(%rax,%rdx,8),%rsi
	movq %rsi,-16(%rcx,%rdx,8)
	movq -8(%rax,%rdx,8),%rsi
	movq %rsi,-8(%rcx,%rdx,8)
	movq (%rax,%rdx,8),%rsi
	movq %rsi,(%rcx,%rdx,8)
	// Loop test: advance %rdx by 4; while it is still <= 0 at least
	// four qwords remain.  Otherwise undo the step and copy the 1..3
	// leftover qwords (label 1) or go straight to the tail (label 2).
	7: addq $4,%rdx
	jle 6b
	subq $4,%rdx
	jl 1b
	jmp 2b
	// Downward copy.  The odd tail pieces sit at the highest addresses,
	// so they are moved first: byte, then word, then dword.
	acb_CopyLeft:
	testq $1,%r8 / check for trailing byte
	jz 1f
	movb -1(%rdi,%r8,1),%cl / copy trailing byte
	movb %cl,-1(%rsi,%r8,1)
	subq $1,%r8 / adjust for possible trailing word
	1: testq $2,%r8 / check for trailing word
	jz 2f
	movw -2(%rdi,%r8,1),%cx / copy trailing word
	movw %cx,-2(%rsi,%r8,1)
	// A trailing dword starts right after the whole qwords, at
	// offset qcount*8 from each base.
	2: testq $4,%r8 / check for trailing dword
	jz 5f
	movl (%rdi,%rdx,8),%ecx / copy trailing dword
	movl %ecx,(%rsi,%rdx,8)
	jmp 5f
	.align 16
	// Remainder loop: %rdx is 1..3, copy qword %rdx-1 and count down.
	3: movq -8(%rdi,%rdx,8),%rcx
	movq %rcx,-8(%rsi,%rdx,8)
	subq $1,%rdx
	jnz 3b
	ret
	.align 16
	// Unrolled loop body: four qwords per pass, highest address first.
	4: movq 24(%rdi,%rdx,8),%rcx
	movq %rcx,24(%rsi,%rdx,8)
	movq 16(%rdi,%rdx,8),%rcx
	movq %rcx,16(%rsi,%rdx,8)
	movq 8(%rdi,%rdx,8),%rcx
	movq %rcx,8(%rsi,%rdx,8)
	movq (%rdi,%rdx,8),%rcx
	movq %rcx,(%rsi,%rdx,8)
	// Loop test: take four qwords off %rdx; a non-negative result means
	// a full group of four is available.  Otherwise restore %rdx and
	// copy the 1..3 leftover qwords, or return when none remain.
	5: subq $4,%rdx
	jge 4b
	addq $4,%rdx
	jg 3b
	ret

	/ Support for void Copy::arrayof_conjoint_jshorts(void* from,
	/ void* to,
	/ size_t count)
	/ Equivalent to
	/ conjoint_jshorts_atomic
	/
	/ If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
	/ let the hardware handle it. The two or four words within dwords
	/ or qwords that span cache line boundaries will still be loaded
	/ and stored atomically.
	/
	/ rdi - from
	/ rsi - to
	/ rdx - count, treated as ssize_t
	/
	// Implementation notes
	//   %r8  = element (16-bit word) count as passed in; %rdx becomes
	//          the qword count (count >> 2).  Bit 1 of %r8 selects a
	//          trailing dword (two elements), bit 0 a trailing word.
	//   Direction: when to <= from (unsigned), or to > from + count*2 - 2,
	//   the copy walks upward (acs_CopyRight); otherwise it walks
	//   downward (acs_CopyLeft), tail first.
	//   Every memory access is a movq, movl or movw.
	//   Registers written: %rax, %rcx, %rdx, %rsi, %r8 and the flags.
	//   %rdi is only read.
	.align 16
	_Copy_arrayof_conjoint_jshorts:
	_Copy_conjoint_jshorts_atomic:
	movq %rdx,%r8 / word count
	shrq $2,%rdx / qword count
	// leaq leaves the flags alone, so the jbe after it still tests
	// the comparison of 'to' against 'from'.
	cmpq %rdi,%rsi
	leaq -2(%rdi,%r8,2),%rax / from + wcount*2 - 2
	jbe acs_CopyRight
	cmpq %rax,%rsi
	jbe acs_CopyLeft
	// Upward copy.  Both bases are biased by -8 and %rdx runs from
	// -qcount up to 0.
	acs_CopyRight:
	leaq -8(%rdi,%rdx,8),%rax / from + qcount*8 - 8
	leaq -8(%rsi,%rdx,8),%rcx / to + qcount*8 - 8
	negq %rdx
	// Enter the unrolled loop at its test (label 6).
	jmp 6f
	// Remainder loop: 1..3 whole qwords are left, copy one per pass.
	1: movq 8(%rax,%rdx,8),%rsi
	movq %rsi,8(%rcx,%rdx,8)
	addq $1,%rdx
	jnz 1b
	// Tail.  8(%rax) and 8(%rcx) address the first element after the
	// whole qwords.
	2: testq $2,%r8 / check for trailing dword
	jz 3f
	movl 8(%rax),%esi / copy trailing dword
	movl %esi,8(%rcx)
	addq $4,%rcx / original %rsi is trashed, so we
	/ can't use it as a base register
	// The last element is read through %rdi and %r8, so only the
	// destination pointer had to be advanced above.
	3: testq $1,%r8 / check for trailing word
	jz 4f
	movw -2(%rdi,%r8,2),%si / copy trailing word
	movw %si,8(%rcx)
	4: ret
	.align 16
	// Unrolled loop body: four qwords per pass, lowest address first.
	5: movq -24(%rax,%rdx,8),%rsi
	movq %rsi,-24(%rcx,%rdx,8)
	movq -16(%rax,%rdx,8),%rsi
	movq %rsi,-16(%rcx,%rdx,8)
	movq -8(%rax,%rdx,8),%rsi
	movq %rsi,-8(%rcx,%rdx,8)
	movq (%rax,%rdx,8),%rsi
	movq %rsi,(%rcx,%rdx,8)
	// Loop test: advance %rdx by 4; while it is still <= 0 at least
	// four qwords remain.  Otherwise undo the step and copy the 1..3
	// leftover qwords (label 1) or go straight to the tail (label 2).
	6: addq $4,%rdx
	jle 5b
	subq $4,%rdx
	jl 1b
	jmp 2b
	// Downward copy.  The tail sits at the highest addresses and is
	// moved first: the odd last element, then the trailing dword,
	// which starts at offset qcount*8 from each base.
	acs_CopyLeft:
	testq $1,%r8 / check for trailing word
	jz 1f
	movw -2(%rdi,%r8,2),%cx / copy trailing word
	movw %cx,-2(%rsi,%r8,2)
	1: testq $2,%r8 / check for trailing dword
	jz 4f
	movl (%rdi,%rdx,8),%ecx / copy trailing dword
	movl %ecx,(%rsi,%rdx,8)
	jmp 4f
	// Remainder loop: %rdx is 1..3, copy qword %rdx-1 and count down.
	2: movq -8(%rdi,%rdx,8),%rcx
	movq %rcx,-8(%rsi,%rdx,8)
	subq $1,%rdx
	jnz 2b
	ret
	.align 16
	// Unrolled loop body: four qwords per pass, highest address first.
	3: movq 24(%rdi,%rdx,8),%rcx
	movq %rcx,24(%rsi,%rdx,8)
	movq 16(%rdi,%rdx,8),%rcx
	movq %rcx,16(%rsi,%rdx,8)
	movq 8(%rdi,%rdx,8),%rcx
	movq %rcx,8(%rsi,%rdx,8)
	movq (%rdi,%rdx,8),%rcx
	movq %rcx,(%rsi,%rdx,8)
	// Loop test: take four qwords off %rdx; a non-negative result means
	// a full group of four is available.  Otherwise restore %rdx and
	// copy the 1..3 leftover qwords, or return when none remain.
	4: subq $4,%rdx
	jge 3b
	addq $4,%rdx
	jg 2b
	ret

	/ Support for void Copy::arrayof_conjoint_jints(jint* from,
	/ jint* to,
	/ size_t count)
	/ Equivalent to
	/ conjoint_jints_atomic
	/
	/ If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
	/ the hardware handle it. The two dwords within qwords that span
	/ cache line boundaries will still be loaded and stored atomically.
	/
	/ rdi - from
	/ rsi - to
	/ rdx - count, treated as ssize_t
	/
	// Implementation notes
	//   %r8  = element (dword) count as passed in; %rdx becomes the
	//          qword count (count >> 1).  Bit 0 of %r8 selects one
	//          trailing dword.
	//   Direction: when to <= from (unsigned), or to > from + count*4 - 4,
	//   the copy walks upward (aci_CopyRight); otherwise it walks
	//   downward (aci_CopyLeft), tail first.
	//   Every memory access is a movq or a movl.
	//   Registers written: %rax, %rcx, %rdx, %rsi, %r8 and the flags.
	//   %rdi is only read.
	.align 16
	_Copy_arrayof_conjoint_jints:
	_Copy_conjoint_jints_atomic:
	movq %rdx,%r8 / dword count
	// The one-operand form of shrq shifts right by one bit.
	shrq %rdx / qword count
	// leaq leaves the flags alone, so the jbe after it still tests
	// the comparison of 'to' against 'from'.
	cmpq %rdi,%rsi
	leaq -4(%rdi,%r8,4),%rax / from + dcount*4 - 4
	jbe aci_CopyRight
	cmpq %rax,%rsi
	jbe aci_CopyLeft
	// Upward copy.  Both bases are biased by -8 and %rdx runs from
	// -qcount up to 0.
	aci_CopyRight:
	leaq -8(%rdi,%rdx,8),%rax / from + qcount*8 - 8
	leaq -8(%rsi,%rdx,8),%rcx / to + qcount*8 - 8
	negq %rdx
	// Enter the unrolled loop at its test (label 5).
	jmp 5f
	.align 16
	// Remainder loop: 1..3 whole qwords are left, copy one per pass.
	1: movq 8(%rax,%rdx,8),%rsi
	movq %rsi,8(%rcx,%rdx,8)
	addq $1,%rdx
	jnz 1b
	// Tail: 8(%rax) and 8(%rcx) address the element after the whole
	// qwords.
	2: testq $1,%r8 / check for trailing dword
	jz 3f
	movl 8(%rax),%esi / copy trailing dword
	movl %esi,8(%rcx)
	3: ret
	.align 16
	// Unrolled loop body: four qwords per pass, lowest address first.
	4: movq -24(%rax,%rdx,8),%rsi
	movq %rsi,-24(%rcx,%rdx,8)
	movq -16(%rax,%rdx,8),%rsi
	movq %rsi,-16(%rcx,%rdx,8)
	movq -8(%rax,%rdx,8),%rsi
	movq %rsi,-8(%rcx,%rdx,8)
	movq (%rax,%rdx,8),%rsi
	movq %rsi,(%rcx,%rdx,8)
	// Loop test: advance %rdx by 4; while it is still <= 0 at least
	// four qwords remain.  Otherwise undo the step and copy the 1..3
	// leftover qwords (label 1) or go straight to the tail (label 2).
	5: addq $4,%rdx
	jle 4b
	subq $4,%rdx
	jl 1b
	jmp 2b
	// Downward copy.  An odd last element sits at the highest address
	// and is moved first, then the qwords from the top down.
	aci_CopyLeft:
	testq $1,%r8 / check for trailing dword
	jz 3f
	movl -4(%rdi,%r8,4),%ecx / copy trailing dword
	movl %ecx,-4(%rsi,%r8,4)
	jmp 3f
	// Remainder loop: %rdx is 1..3, copy qword %rdx-1 and count down.
	1: movq -8(%rdi,%rdx,8),%rcx
	movq %rcx,-8(%rsi,%rdx,8)
	subq $1,%rdx
	jnz 1b
	ret
	.align 16
	// Unrolled loop body: four qwords per pass, highest address first.
	2: movq 24(%rdi,%rdx,8),%rcx
	movq %rcx,24(%rsi,%rdx,8)
	movq 16(%rdi,%rdx,8),%rcx
	movq %rcx,16(%rsi,%rdx,8)
	movq 8(%rdi,%rdx,8),%rcx
	movq %rcx,8(%rsi,%rdx,8)
	movq (%rdi,%rdx,8),%rcx
	movq %rcx,(%rsi,%rdx,8)
	// Loop test: take four qwords off %rdx; a non-negative result means
	// a full group of four is available.  Otherwise restore %rdx and
	// copy the 1..3 leftover qwords, or return when none remain.
	3: subq $4,%rdx
	jge 2b
	addq $4,%rdx
	jg 1b
	ret

	/ Support for void Copy::arrayof_conjoint_jlongs(jlong* from,
	/ jlong* to,
	/ size_t count)
	/ Equivalent to
	/ conjoint_jlongs_atomic
	/ arrayof_conjoint_oops
	/ conjoint_oops_atomic
	/
	/ rdi - from
	/ rsi - to
	/ rdx - count, treated as ssize_t
	/
	// Implementation notes
	//   %rdx is already the qword count, so there is no tail handling.
	//   %rax = from + count*8 - 8 serves twice: as the upper bound of
	//   the overlap test and as the biased source base of the upward
	//   copy.
	//   Direction: when to <= from (unsigned), or to > %rax, the copy
	//   walks upward (acl_CopyRight); otherwise it walks downward
	//   (acl_CopyLeft).
	//   Every memory access is a movq.
	//   Registers written: %rax, %rcx, %rdx, %rsi and the flags.
	//   %rdi is only read.
	.align 16
	_Copy_arrayof_conjoint_jlongs:
	_Copy_conjoint_jlongs_atomic:
	// leaq leaves the flags alone, so the jbe after it still tests
	// the comparison of 'to' against 'from'.
	cmpq %rdi,%rsi
	leaq -8(%rdi,%rdx,8),%rax / from + count*8 - 8
	jbe acl_CopyRight
	cmpq %rax,%rsi
	jbe acl_CopyLeft
	// Upward copy.  %rcx is biased like %rax and %rdx runs from
	// -count up to 0.
	acl_CopyRight:
	leaq -8(%rsi,%rdx,8),%rcx / to + count*8 - 8
	negq %rdx
	// Enter the unrolled loop at its test (label 3).
	jmp 3f
	// Remainder loop: 1..3 qwords are left, copy one per pass.
	1: movq 8(%rax,%rdx,8),%rsi
	movq %rsi,8(%rcx,%rdx,8)
	addq $1,%rdx
	jnz 1b
	ret
	.align 16
	// Unrolled loop body: four qwords per pass, lowest address first.
	2: movq -24(%rax,%rdx,8),%rsi
	movq %rsi,-24(%rcx,%rdx,8)
	movq -16(%rax,%rdx,8),%rsi
	movq %rsi,-16(%rcx,%rdx,8)
	movq -8(%rax,%rdx,8),%rsi
	movq %rsi,-8(%rcx,%rdx,8)
	movq (%rax,%rdx,8),%rsi
	movq %rsi,(%rcx,%rdx,8)
	// Loop test: advance %rdx by 4; while it is still <= 0 at least
	// four qwords remain.  Otherwise undo the step and copy the 1..3
	// leftover qwords, or return when none remain.
	3: addq $4,%rdx
	jle 2b
	subq $4,%rdx
	jl 1b
	ret
	// Downward remainder loop: %rdx is 1..3, copy qword %rdx-1 and
	// count down.
	4: movq -8(%rdi,%rdx,8),%rcx
	movq %rcx,-8(%rsi,%rdx,8)
	subq $1,%rdx
	jnz 4b
	ret
	.align 16
	// Downward unrolled body: four qwords per pass, highest address
	// first.
	5: movq 24(%rdi,%rdx,8),%rcx
	movq %rcx,24(%rsi,%rdx,8)
	movq 16(%rdi,%rdx,8),%rcx
	movq %rcx,16(%rsi,%rdx,8)
	movq 8(%rdi,%rdx,8),%rcx
	movq %rcx,8(%rsi,%rdx,8)
	movq (%rdi,%rdx,8),%rcx
	movq %rcx,(%rsi,%rdx,8)
	// Entry point of the downward copy, which is also its loop test:
	// take four qwords off %rdx; a non-negative result means a full
	// group of four is available.  Otherwise restore %rdx and copy the
	// 1..3 leftover qwords, or return when none remain.
	acl_CopyLeft:
	subq $4,%rdx
	jge 5b
	addq $4,%rdx
	jg 4b
	ret