coregrind/vg_helpers.S - fp2-dev/platform/external/valgrind - Gitiles

 ##--------------------------------------------------------------------##
 ##--- Support routines for the JITter output.                      ---##
 ##---                                                 vg_helpers.S ---##
 ##--------------------------------------------------------------------##

 /*
   This file is part of Valgrind, an extensible x86 protected-mode
   emulator for monitoring program execution on x86-Unixes.

   Copyright (C) 2000-2004 Julian Seward
      jseward@acm.org

   This program is free software; you can redistribute it and/or
   modify it under the terms of the GNU General Public License as
   published by the Free Software Foundation; either version 2 of the
   License, or (at your option) any later version.

   This program is distributed in the hope that it will be useful, but
   WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
   02111-1307, USA.

   The GNU General Public License is contained in the file COPYING.
 */

 #include "vg_constants.h"

 /* ------------------ SIMULATED CPU HELPERS ------------------ */
 /* Stub for a return which we want to catch: a signal return.  It
    builds a five-word argument block on the stack (the request code
    VG_USERREQ__SIGNAL_RETURNS followed by four zero args), points
    %eax at it and runs the rotate sequence below, i.e. it uses the
    user request mechanism.  You need to read the definition of
    VALGRIND_MAGIC_SEQUENCE in valgrind.h to make sense of this.

    NOTE(review): earlier wording here also mentioned pthread returns,
    with the thread's return value in %EAX passed as the first
    argument to the request.  No such stub is present below (arg1 is
    always 0) -- confirm whether that text is stale.

    This isn't used in-place.  It is copied into the client address space
    at an arbitrary address.  Therefore, this code must be completely
    position-independent.
 */
 .global VG_(trampoline_code_start)
 .global VG_(trampoline_code_length)
 .global VG_(tramp_sigreturn_offset)
 .global VG_(tramp_syscall_offset)

 VG_(trampoline_code_start):
 sigreturn_start:
 	subl	$20, %esp	# allocate arg block
 	movl	%esp, %edx	# %edx == &_zzq_args[0]
 	movl	$VG_USERREQ__SIGNAL_RETURNS, 0(%edx)	# request
 	movl	$0, 4(%edx)	# arg1
 	movl	$0, 8(%edx)	# arg2
 	movl	$0, 12(%edx)	# arg3
 	movl	$0, 16(%edx)	# arg4
 	movl	%edx, %eax
 	# and now the magic sequence itself:
 	/* Each adjacent pair of rotate counts sums to 32, so the value
 	   in %eax is unchanged once the sequence has executed. */
 	roll $29, %eax
 	roll $3, %eax
 	rorl $27, %eax
 	rorl $5, %eax
 	roll $13, %eax
 	roll $19, %eax
 	# should never get here
 	ud2

 	# We can point our sysinfo stuff here
 	.align 16
 syscall_start:
 	/* second entry point: issue int $0x80, then return to the caller */
 	int	$0x80
 	ret
 tramp_code_end:

 /* Exported 32-bit words: total byte length of the trampoline, and
    the offsets of its two entry points from the trampoline start. */
 .data
 VG_(trampoline_code_length):
 	.long tramp_code_end - VG_(trampoline_code_start)
 VG_(tramp_sigreturn_offset):
 	.long sigreturn_start - VG_(trampoline_code_start)
 VG_(tramp_syscall_offset):
 	.long syscall_start - VG_(trampoline_code_start)
 .text


 /* ------------------ REAL CPU HELPERS ------------------ */
 /* The rest of this lot run on the real CPU. */

 /* Various helper routines, for instructions which are just too
    darn tedious for the JITter to output code in-line:

 	* integer division
 	* integer multiplication
         * setting and getting obscure eflags
 	* double-length shifts
         * eight byte compare and exchange

    All routines use a standard calling convention designed for
    calling from translations, in which the incoming args are
    underneath the return address, the callee saves _all_ registers,
    and the incoming parameters can be modified, to return results.
 */

 /* Read the time-stamp counter.
    Stack on entry (highest address first):
 	dummy, overwritten with the %EAX (low) half
 	dummy, overwritten with the %EDX (high) half
 	RA   <- %esp
    %eax and %edx are saved on entry and restored before returning.
 */
 .global VG_(helper_RDTSC)
 VG_(helper_RDTSC):
 	pushl	%eax
 	pushl	%edx
 	rdtsc
 	/* two saves + RA: the result slots sit at 12(%esp) and 16(%esp) */
 	movl	%eax, 16(%esp)
 	movl	%edx, 12(%esp)
 	popl	%edx
 	popl	%eax
 	ret

 /*
    Read a byte/word/dword from the given I/O port.
    Stack on entry (highest address first):
          size     1, 2 or 4
          port, replaced by result
          RA
    A size other than 4 or 2 falls through to the byte path.
 */
 .global VG_(helper_IN)
 VG_(helper_IN):
 	pushl	%eax
 	pushl   %edx
 	/* two saves + RA: port is at 12(%esp), size at 16(%esp) */
 	movl	12(%esp), %edx
 	movl	16(%esp), %eax
 	cmpl	$4, %eax
 	je	in_dword
 	cmpl	$2, %eax
 	je	in_word
 in_byte:
 	inb	(%dx), %al
 	jmp	in_done
 in_word:
 	in	(%dx), %ax
 	jmp	in_done
 in_dword:
 	inl	(%dx),%eax
 in_done:
 	/* All of %eax is stored.  After a narrow read its upper bits are
 	   what is left of the size value loaded above. */
 	movl	%eax,12(%esp)
 	popl	%edx
 	popl	%eax
 	ret

 /*
    Write a byte/word/dword to the given I/O port.
    Stack on entry (highest address first):
          size     1, 2 or 4
          port
          value
          RA
    A size other than 4 or 2 falls through to the byte path.
    Nothing is written back to the stack.
 */
 .global VG_(helper_OUT)
 VG_(helper_OUT):
 	pushl	%eax
 	pushl   %edx
 	/* two saves + RA: value at 12(%esp), port at 16(%esp), size at 20(%esp) */
 	movl	12(%esp), %eax
 	movl	16(%esp), %edx
 	cmpl	$4, 20(%esp)
 	je	out_dword
 	cmpl	$2, 20(%esp)
 	je	out_word
 out_byte:
 	outb	%al,(%dx)
 	jmp	out_done
 out_word:
 	out	%ax,(%dx)
 	jmp	out_done
 out_dword:
 	outl	%eax,(%dx)
 out_done:
 	popl	%edx
 	popl	%eax
 	ret


 /* Do the CPUID instruction.
    On entry:
 	dummy, replaced by %EAX value
 	dummy, replaced by %EBX value
 	dummy, replaced by %ECX value
 	dummy, replaced by %EDX value
 	RA   <- %esp

    We save registers and package up the args so we can call a C helper
    for all this.  Five words are pushed for the call, so the helper
    is invoked as
 	VG_(helperc_CPUID)(eax_value, &eax, &ebx, &ecx, &edx)
    where the four pointers refer to the slots listed above.
 */
 .global VG_(helper_CPUID)
 VG_(helper_CPUID):
 	pushl	%ebp
 	movl	%esp,%ebp
 	/* frame: 0(%ebp) = saved %ebp, 4(%ebp) = RA, 8(%ebp) = first slot */
 	pushl	%eax
 	pushl	%ebx
 	pushl	%ecx
 	pushl	%edx
 	pushl	%esi
 	pushl	%edi
 	pushf

 	/* arguments are pushed last-to-first */
 	lea	2*4(%ebp),%eax	/* &edx */
 	pushl	%eax
 	addl	$4,%eax		/* &ecx */
 	pushl	%eax
 	addl	$4,%eax		/* &ebx */
 	pushl	%eax
 	addl	$4,%eax		/* &eax */
 	pushl	%eax
 	pushl	(%eax)		/* eax */

 	call	VG_(helperc_CPUID)
 	addl	$20,%esp	/* drop the five argument words */

 	/* restore in exact reverse order of the saves above */
 	popf
 	popl	%edi
 	popl	%esi
 	popl	%edx
 	popl	%ecx
 	popl	%ebx
 	popl	%eax
 	popl	%ebp
 	ret

 /* Fetch the FPU status register.
    On entry:
 	dummy, replaced by result
 	RA   <- %esp

    The FPU state is first reloaded from the save area at
    %ebp + 4*VGOFF_(m_ssestate): with fxrstor when
    VG_(have_ssestate) is non-zero, otherwise with frstor.
    NOTE(review): %ebp is presumably the baseblock pointer set up by
    the caller -- confirm.  Only the low 16 bits of the result slot
    are written.
 */
 .global VG_(helper_fstsw_AX)
 VG_(helper_fstsw_AX):
 	pushl	%eax
 	pushl	%esi
 	movl	VGOFF_(m_ssestate), %esi

 	/* the cmpb below changes %eflags, so they are saved around it */
 	pushfl
 	cmpb	$0, VG_(have_ssestate)
 	jz	aa1nosse
 	fxrstor	(%ebp, %esi, 4)
 	jmp	aa1merge
 aa1nosse:
 	frstor	(%ebp, %esi, 4)
 aa1merge:
 	popfl

 	fstsw	%ax
 	popl	%esi
 	/* one save left on the stack: RA at 4(%esp), result slot at 8(%esp) */
 	movw	%ax, 8(%esp)
 	popl	%eax
 	ret


 /* Copy %ah into %eflags.
    On entry:
 	value of %eax
 	RA   <- %esp

    The result is the flag state left by sahf; the popl and ret that
    follow it do not modify %eflags.  The stack slot is not written.
 */
 .global VG_(helper_SAHF)
 VG_(helper_SAHF):
 	pushl	%eax
 	movl	8(%esp), %eax	/* one save + RA: argument is at 8(%esp) */
 	sahf
 	popl	%eax
 	ret

 /* Copy %eflags into %ah.
    On entry:
 	value of %eax, replaced by the same value with %ah updated
 	RA   <- %esp

    The flags read are whatever %eflags holds on entry; pushl and
    movl do not modify them.
 */
 .global VG_(helper_LAHF)
 VG_(helper_LAHF):
 	pushl	%eax
 	movl	8(%esp), %eax	/* one save + RA: argument is at 8(%esp) */
 	lahf
 	movl	%eax, 8(%esp)
 	popl	%eax
 	ret


 /* Do %al = DAS(%al).  Note that the passed param has %AL as the least
    significant 8 bits, since it was generated with GETB %AL,
    some-temp.  Fortunately %al is the least significant 8 bits of
    %eax anyway, which is why it's safe to work with %eax as a
    whole.

    On entry:
 	value of %eax, replaced by the adjusted value
 	RA   <- %esp

    das takes its input flags from %eflags as they are on entry
    (pushl/movl leave them alone); the flags it produces are still in
    place at the ret.
 */
 .global VG_(helper_DAS)
 VG_(helper_DAS):
 	pushl	%eax
 	movl	8(%esp), %eax	/* one save + RA: argument is at 8(%esp) */
 	das
  	movl	%eax, 8(%esp)
 	popl	%eax
 	ret


 /* Similarly, do %al = DAA(%al).
    On entry:
 	value of %eax, replaced by the adjusted value
 	RA   <- %esp
 */
 .global VG_(helper_DAA)
 VG_(helper_DAA):
        pushl   %eax
        movl    8(%esp), %eax   /* one save + RA: argument is at 8(%esp) */
        daa
        movl    %eax, 8(%esp)
        popl    %eax
        ret


 /* Similarly, do %ax = AAS(%ax).
    On entry:
 	value of %eax, replaced by the adjusted value
 	RA   <- %esp
 */
 .global VG_(helper_AAS)
 VG_(helper_AAS):
        pushl   %eax
        movl    8(%esp), %eax   /* one save + RA: argument is at 8(%esp) */
        aas
        movl    %eax, 8(%esp)
        popl    %eax
        ret


 /* Similarly, do %ax = AAA(%ax).
    On entry:
 	value of %eax, replaced by the adjusted value
 	RA   <- %esp
 */
 .global VG_(helper_AAA)
 VG_(helper_AAA):
        pushl   %eax
        movl    8(%esp), %eax   /* one save + RA: argument is at 8(%esp) */
        aaa
        movl    %eax, 8(%esp)
        popl    %eax
        ret


 /* Similarly, do %ax = AAD(%ax).
    On entry:
 	value of %eax, replaced by the adjusted value
 	RA   <- %esp
 */
 .global VG_(helper_AAD)
 VG_(helper_AAD):
        pushl   %eax
        movl    8(%esp), %eax   /* one save + RA: argument is at 8(%esp) */
        aad
        movl    %eax, 8(%esp)
        popl    %eax
        ret


 /* Similarly, do %ax = AAM(%ax).
    On entry:
 	value of %eax, replaced by the adjusted value
 	RA   <- %esp
 */
 .global VG_(helper_AAM)
 VG_(helper_AAM):
        pushl   %eax
        movl    8(%esp), %eax   /* one save + RA: argument is at 8(%esp) */
        aam
        movl    %eax, 8(%esp)
        popl    %eax
        ret


 /* Bit scan forwards/reverse.  Sets flags (??).
    On entry:
 	value, replaced by result
 	RA   <- %esp
 */
 /* 16-bit bit scan reverse.  After the one register save, the source
    operand is read from 8(%esp) and the word at 12(%esp) is both the
    initial destination value and the place the result is stored.
    NOTE(review): the preload of %ax presumably covers the zero-source
    case, where bsr does not produce a defined result -- confirm. */
 .global VG_(helper_bsrw)
 VG_(helper_bsrw):
 	pushl	%eax
 	movw	12(%esp), %ax
 	bsrw	8(%esp), %ax
 	movw	%ax, 12(%esp)
 	popl	%eax
 	ret

 /* 32-bit bit scan reverse.  After the one register save, the source
    operand is read from 8(%esp) and the dword at 12(%esp) is both the
    initial destination value and the place the result is stored. */
 .global VG_(helper_bsrl)
 VG_(helper_bsrl):
 	pushl	%eax
 	movl	12(%esp), %eax
 	bsrl	8(%esp), %eax
 	movl	%eax, 12(%esp)
 	popl	%eax
 	ret

 /* 16-bit bit scan forward.  After the one register save, the source
    operand is read from 8(%esp) and the word at 12(%esp) is both the
    initial destination value and the place the result is stored. */
 .global VG_(helper_bsfw)
 VG_(helper_bsfw):
 	pushl	%eax
 	movw	12(%esp), %ax
 	bsfw	8(%esp), %ax
 	movw	%ax, 12(%esp)
 	popl	%eax
 	ret

 /* 32-bit bit scan forward.  After the one register save, the source
    operand is read from 8(%esp) and the dword at 12(%esp) is both the
    initial destination value and the place the result is stored. */
 .global VG_(helper_bsfl)
 VG_(helper_bsfl):
 	pushl	%eax
 	movl	12(%esp), %eax
 	bsfl	8(%esp), %eax
 	movl	%eax, 12(%esp)
 	popl	%eax
 	ret


 /* 32-bit double-length shift left/right.
    On entry:
 	amount
 	src
 	dst
 	RA   <- %esp
 */
 /* shldl: shift the 32-bit dst left by the low byte of amount, filling
    from src.  The dst slot is overwritten with the result; the src and
    amount slots are left as they were. */
 .global VG_(helper_shldl)
 VG_(helper_shldl):
 	pushl	%eax
 	pushl	%ebx
 	pushl	%ecx

 	/* three saves + RA: dst 16(%esp), src 20(%esp), amount 24(%esp) */
 	movl	16(%esp), %eax
 	movl	20(%esp), %ebx
 	movb	24(%esp), %cl
 	shldl	%cl, %ebx, %eax
 	movl	%eax, 16(%esp)

 	popl	%ecx
 	popl	%ebx
 	popl	%eax
 	ret

 /* shldw: 16-bit form of the double-length left shift.  Only the low
    word of the dst slot is rewritten; the src and amount slots are
    left as they were. */
 .global VG_(helper_shldw)
 VG_(helper_shldw):
 	pushl	%eax
 	pushl	%ebx
 	pushl	%ecx

 	/* three saves + RA: dst 16(%esp), src 20(%esp), amount 24(%esp) */
 	movw	16(%esp), %ax
 	movw	20(%esp), %bx
 	movb	24(%esp), %cl
 	shldw	%cl, %bx, %ax
 	movw	%ax, 16(%esp)

 	popl	%ecx
 	popl	%ebx
 	popl	%eax
 	ret

 /* shrdl: shift the 32-bit dst right by the low byte of amount,
    filling from src.  The dst slot is overwritten with the result;
    the src and amount slots are left as they were. */
 .global VG_(helper_shrdl)
 VG_(helper_shrdl):
 	pushl	%eax
 	pushl	%ebx
 	pushl	%ecx

 	/* three saves + RA: dst 16(%esp), src 20(%esp), amount 24(%esp) */
 	movl	16(%esp), %eax
 	movl	20(%esp), %ebx
 	movb	24(%esp), %cl
 	shrdl	%cl, %ebx, %eax
 	movl	%eax, 16(%esp)

 	popl	%ecx
 	popl	%ebx
 	popl	%eax
 	ret

 /* shrdw: 16-bit form of the double-length right shift.  Only the low
    word of the dst slot is rewritten; the src and amount slots are
    left as they were. */
 .global VG_(helper_shrdw)
 VG_(helper_shrdw):
 	pushl	%eax
 	pushl	%ebx
 	pushl	%ecx

 	/* three saves + RA: dst 16(%esp), src 20(%esp), amount 24(%esp) */
 	movw	16(%esp), %ax
 	movw	20(%esp), %bx
 	movb	24(%esp), %cl
 	shrdw	%cl, %bx, %ax
 	movw	%ax, 16(%esp)

 	popl	%ecx
 	popl	%ebx
 	popl	%eax
 	ret


 /* Get the direction flag, and return either 1 or -1.
    On entry:
 	dummy, replaced by result
 	RA   <- %esp

    The value is read from the word at %ebp + 4*VGOFF_(m_dflag), the
    same location VG_(helper_CLD) sets to 1 and VG_(helper_STD) sets
    to -1.  NOTE(review): %ebp is presumably the baseblock pointer set
    up by the caller -- confirm.
 */
 .global VG_(helper_get_dirflag)
 VG_(helper_get_dirflag):
 	pushl	%eax

 	movl	VGOFF_(m_dflag), %eax
 	movl	(%ebp, %eax, 4), %eax
 	movl	%eax, 8(%esp)	/* one save + RA: result slot is at 8(%esp) */

 	popl	%eax
 	ret

 /* Clear/set the direction flag. */
 /* Clear the direction flag: store 1 in the word at
    %ebp + 4*VGOFF_(m_dflag).  Takes no stack arguments; the real
    CPU's direction flag is not touched. */
 .global VG_(helper_CLD)
 VG_(helper_CLD):
 	pushl	%eax

 	movl	VGOFF_(m_dflag), %eax
 	movl	$1, (%ebp, %eax, 4)

 	popl	%eax
 	ret

 /* Set the direction flag: store -1 in the word at
    %ebp + 4*VGOFF_(m_dflag).  Takes no stack arguments; the real
    CPU's direction flag is not touched. */
 .global VG_(helper_STD)
 VG_(helper_STD):
 	pushl	%eax

 	movl	VGOFF_(m_dflag), %eax
 	movl	$-1, (%ebp, %eax, 4)

 	popl	%eax
 	ret

 /* Clear/set/complement the carry flag. */
 /* Clear the carry flag in %eflags.  No stack arguments and no
    registers used; ret leaves the flag as set here. */
 .global VG_(helper_CLC)
 VG_(helper_CLC):
         clc
         ret

 /* Set the carry flag in %eflags.  No stack arguments and no
    registers used; ret leaves the flag as set here. */
 .global VG_(helper_STC)
 VG_(helper_STC):
         stc
         ret

 /* Complement the carry flag in %eflags.  No stack arguments and no
    registers used; ret leaves the flag as set here. */
 .global VG_(helper_CMC)
 VG_(helper_CMC):
         cmc
         ret

 /* Signed 32-to-64 multiply.
    Stack on entry (highest address first):
 	multiplicand, replaced by the low 32 bits of the product
 	multiplier,   replaced by the high 32 bits of the product
 	RA   <- %esp
 */
 .globl VG_(helper_imul_32_64)
 VG_(helper_imul_32_64):
 	pushl	%eax
 	pushl	%edx
 	/* two saves + RA: operands are at 12(%esp) and 16(%esp) */
 	movl	16(%esp), %eax
 	imull	12(%esp)
 	movl	%edx, 12(%esp)
 	movl	%eax, 16(%esp)
 	popl	%edx
 	popl	%eax
 	ret

 /* Signed 16-to-32 multiply.
    Stack on entry (highest address first):
 	multiplicand, low word replaced by the low 16 bits of the product
 	multiplier,   low word replaced by the high 16 bits of the product
 	RA   <- %esp
 */
 .globl VG_(helper_imul_16_32)
 VG_(helper_imul_16_32):
 	pushl	%eax
 	pushl	%edx
 	/* two saves + RA: operands are at 12(%esp) and 16(%esp) */
 	movw	16(%esp), %ax
 	imulw	12(%esp)
 	movw	%dx, 12(%esp)
 	movw	%ax, 16(%esp)
 	popl	%edx
 	popl	%eax
 	ret

 /* Signed 8-to-16 multiply.
    Stack on entry (highest address first):
 	multiplicand (low byte), low word replaced by the 16-bit product
 	multiplier (low byte), left unchanged
 	RA   <- %esp

    The byte form of imul puts the whole product in %ax, so only one
    slot is written back.
 */
 .globl VG_(helper_imul_8_16)
 VG_(helper_imul_8_16):
 	pushl	%eax
 	pushl	%edx
 	/* two saves + RA: operands are at 12(%esp) and 16(%esp) */
 	movb	16(%esp), %al
 	imulb	12(%esp)
 	movw	%ax, 16(%esp)
 	popl	%edx
 	popl	%eax
 	ret


 /* Unsigned 32-to-64 multiply.
    Stack on entry (highest address first):
 	multiplicand, replaced by the low 32 bits of the product
 	multiplier,   replaced by the high 32 bits of the product
 	RA   <- %esp
 */
 .globl VG_(helper_mul_32_64)
 VG_(helper_mul_32_64):
 	pushl	%eax
 	pushl	%edx
 	/* two saves + RA: operands are at 12(%esp) and 16(%esp) */
 	movl	16(%esp), %eax
 	mull	12(%esp)
 	movl	%edx, 12(%esp)
 	movl	%eax, 16(%esp)
 	popl	%edx
 	popl	%eax
 	ret

 /* Unsigned 16-to-32 multiply.
    Stack on entry (highest address first):
 	multiplicand, low word replaced by the low 16 bits of the product
 	multiplier,   low word replaced by the high 16 bits of the product
 	RA   <- %esp
 */
 .globl VG_(helper_mul_16_32)
 VG_(helper_mul_16_32):
 	pushl	%eax
 	pushl	%edx
 	/* two saves + RA: operands are at 12(%esp) and 16(%esp) */
 	movw	16(%esp), %ax
 	mulw	12(%esp)
 	movw	%dx, 12(%esp)
 	movw	%ax, 16(%esp)
 	popl	%edx
 	popl	%eax
 	ret

 /* Unsigned 8-to-16 multiply.
    Stack on entry (highest address first):
 	multiplicand (low byte), low word replaced by the 16-bit product
 	multiplier (low byte), left unchanged
 	RA   <- %esp

    The byte form of mul puts the whole product in %ax, so only one
    slot is written back.
 */
 .globl VG_(helper_mul_8_16)
 VG_(helper_mul_8_16):
 	pushl	%eax
 	pushl	%edx
 	/* two saves + RA: operands are at 12(%esp) and 16(%esp) */
 	movb	16(%esp), %al
 	mulb	12(%esp)
 	movw	%ax, 16(%esp)
 	popl	%edx
 	popl	%eax
 	ret


 /* Unsigned 64-into-32 divide.
    Stack on entry (highest address first):
 	divisor, left unchanged
 	low 32 bits of dividend,  replaced by the quotient
 	high 32 bits of dividend, replaced by the remainder
 	RA   <- %esp
 */
 .globl	VG_(helper_div_64_32)
 VG_(helper_div_64_32):
 	pushl	%eax
 	pushl	%edx
 	/* two saves + RA: slots are at 12, 16 and 20(%esp) */
 	movl	12(%esp),%edx
 	movl	16(%esp),%eax
 	divl	20(%esp)
 	movl	%edx,12(%esp)
 	movl	%eax,16(%esp)
 	popl	%edx
 	popl	%eax
 	ret

 /* Signed 64-into-32 divide.
    Stack on entry (highest address first):
 	divisor, left unchanged
 	low 32 bits of dividend,  replaced by the quotient
 	high 32 bits of dividend, replaced by the remainder
 	RA   <- %esp
 */
 .globl	VG_(helper_idiv_64_32)
 VG_(helper_idiv_64_32):
 	pushl	%eax
 	pushl	%edx
 	/* two saves + RA: slots are at 12, 16 and 20(%esp) */
 	movl	12(%esp),%edx
 	movl	16(%esp),%eax
 	idivl	20(%esp)
 	movl	%edx,12(%esp)
 	movl	%eax,16(%esp)
 	popl	%edx
 	popl	%eax
 	ret

 /* Unsigned 32-into-16 divide.  Only the low word of each slot is used.
    Stack on entry (highest address first):
 	divisor, left unchanged
 	low 16 bits of dividend,  replaced by the quotient
 	high 16 bits of dividend, replaced by the remainder
 	RA   <- %esp
 */
 .globl	VG_(helper_div_32_16)
 VG_(helper_div_32_16):
 	pushl	%eax
 	pushl	%edx
 	/* two saves + RA: slots are at 12, 16 and 20(%esp) */
 	movw	12(%esp),%dx
 	movw	16(%esp),%ax
 	divw	20(%esp)
 	movw	%dx,12(%esp)
 	movw	%ax,16(%esp)
 	popl	%edx
 	popl	%eax
 	ret

 /* Signed 32-into-16 divide.  Only the low word of each slot is used.
    Stack on entry (highest address first):
 	divisor, left unchanged
 	low 16 bits of dividend,  replaced by the quotient
 	high 16 bits of dividend, replaced by the remainder
 	RA   <- %esp
 */
 .globl	VG_(helper_idiv_32_16)
 VG_(helper_idiv_32_16):
 	pushl	%eax
 	pushl	%edx
 	/* two saves + RA: slots are at 12, 16 and 20(%esp) */
 	movw	12(%esp),%dx
 	movw	16(%esp),%ax
 	idivw	20(%esp)
 	movw	%dx,12(%esp)
 	movw	%ax,16(%esp)
 	popl	%edx
 	popl	%eax
 	ret

 /* Unsigned 16-into-8 divide.
    Stack on entry (highest address first):
 	divisor (low byte), left unchanged
 	16-bit dividend, low byte replaced by the remainder (%ah)
 	slot whose low byte is replaced by the quotient (%al)
 	RA   <- %esp
    Only %eax is saved: the byte divide works entirely in %ax.
 */
 .globl	VG_(helper_div_16_8)
 VG_(helper_div_16_8):
 	pushl	%eax
 	/* one save + RA: slots are at 8, 12 and 16(%esp) */
 	movw	12(%esp),%ax
 	divb	16(%esp)
 	movb	%al,8(%esp)
 	movb	%ah,12(%esp)
 	popl	%eax
 	ret

 /* Signed 16-into-8 divide.
    Stack on entry (highest address first):
 	divisor (low byte), left unchanged
 	16-bit dividend, low byte replaced by the remainder (%ah)
 	slot whose low byte is replaced by the quotient (%al)
 	RA   <- %esp
    Only %eax is saved: the byte divide works entirely in %ax.
 */
 .globl	VG_(helper_idiv_16_8)
 VG_(helper_idiv_16_8):
 	pushl	%eax
 	/* one save + RA: slots are at 8, 12 and 16(%esp) */
 	movw	12(%esp),%ax
 	idivb	16(%esp)
 	movb	%al,8(%esp)
 	movb	%ah,12(%esp)
 	popl	%eax
 	ret


 /* Eight byte compare and exchange.
    After the four register saves and RA, the stack holds:
 	20(%esp)  value for %eax, written back afterwards
 	24(%esp)  value for %edx, written back afterwards
 	28(%esp)  value for %ebx, written back afterwards
 	32(%esp)  value for %ecx, written back afterwards
 	36(%esp)  the eight bytes cmpxchg8b operates on
    The flags produced by cmpxchg8b are still in place at the ret.
 */
 .globl VG_(helper_cmpxchg8b)
 VG_(helper_cmpxchg8b):
 	pushl	%eax
 	pushl	%ebx
 	pushl	%ecx
 	pushl	%edx
 	movl	32(%esp), %ecx
 	movl	28(%esp), %ebx
 	movl	24(%esp), %edx
 	movl	20(%esp), %eax
 	cmpxchg8b 36(%esp)
 	movl	%ecx, 32(%esp)
 	movl	%ebx, 28(%esp)
 	movl	%edx, 24(%esp)
 	movl	%eax, 20(%esp)
 	popl	%edx
 	popl	%ecx
 	popl	%ebx
 	popl	%eax
 	ret


 /* Undefined instruction (generates SIGILL).
    Never returns: if execution continues past the ud2, the jmp sends
    it straight back to the ud2. */
 .globl VG_(helper_undefined_instruction)
 VG_(helper_undefined_instruction):
 1:	ud2
 	jmp	1b

 ##--------------------------------------------------------------------##
 ##--- end                                             vg_helpers.S ---##
 ##--------------------------------------------------------------------##
	##--------------------------------------------------------------------##
	##--- Support routines for the JITter output. ---##
	##--- vg_helpers.S ---##
	##--------------------------------------------------------------------##

	/*
	This file is part of Valgrind, an extensible x86 protected-mode
	emulator for monitoring program execution on x86-Unixes.

	Copyright (C) 2000-2004 Julian Seward
	jseward@acm.org

	This program is free software; you can redistribute it and/or
	modify it under the terms of the GNU General Public License as
	published by the Free Software Foundation; either version 2 of the
	License, or (at your option) any later version.

	This program is distributed in the hope that it will be useful, but
	WITHOUT ANY WARRANTY; without even the implied warranty of
	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
	General Public License for more details.

	You should have received a copy of the GNU General Public License
	along with this program; if not, write to the Free Software
	Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
	02111-1307, USA.

	The GNU General Public License is contained in the file COPYING.
	*/

	#include "vg_constants.h"

	/* ------------------ SIMULATED CPU HELPERS ------------------ */
	/* Stub for a return which we want to catch: a signal return. It
	builds a five-word argument block on the stack (the request code
	VG_USERREQ__SIGNAL_RETURNS followed by four zero args), points
	%eax at it and runs the rotate sequence below, i.e. it uses the
	user request mechanism. You need to read the definition of
	VALGRIND_MAGIC_SEQUENCE in valgrind.h to make sense of this.

	NOTE(review): earlier wording here also mentioned pthread returns,
	with the thread's return value in %EAX passed as the first
	argument to the request. No such stub is present below (arg1 is
	always 0) -- confirm whether that text is stale.

	This isn't used in-place. It is copied into the client address space
	at an arbitrary address. Therefore, this code must be completely
	position-independent.
	*/
	.global VG_(trampoline_code_start)
	.global VG_(trampoline_code_length)
	.global VG_(tramp_sigreturn_offset)
	.global VG_(tramp_syscall_offset)

	VG_(trampoline_code_start):
	sigreturn_start:
	subl $20, %esp # allocate arg block
	movl %esp, %edx # %edx == &_zzq_args[0]
	movl $VG_USERREQ__SIGNAL_RETURNS, 0(%edx) # request
	movl $0, 4(%edx) # arg1
	movl $0, 8(%edx) # arg2
	movl $0, 12(%edx) # arg3
	movl $0, 16(%edx) # arg4
	movl %edx, %eax
	# and now the magic sequence itself:
	/* Each adjacent pair of rotate counts sums to 32, so the value
	in %eax is unchanged once the sequence has executed. */
	roll $29, %eax
	roll $3, %eax
	rorl $27, %eax
	rorl $5, %eax
	roll $13, %eax
	roll $19, %eax
	# should never get here
	ud2

	# We can point our sysinfo stuff here
	.align 16
	syscall_start:
	/* second entry point: issue int $0x80, then return to the caller */
	int $0x80
	ret
	tramp_code_end:

	/* Exported 32-bit words: total byte length of the trampoline, and
	the offsets of its two entry points from the trampoline start. */
	.data
	VG_(trampoline_code_length):
	.long tramp_code_end - VG_(trampoline_code_start)
	VG_(tramp_sigreturn_offset):
	.long sigreturn_start - VG_(trampoline_code_start)
	VG_(tramp_syscall_offset):
	.long syscall_start - VG_(trampoline_code_start)
	.text


	/* ------------------ REAL CPU HELPERS ------------------ */
	/* The rest of this lot run on the real CPU. */

	/* Various helper routines, for instructions which are just too
	darn tedious for the JITter to output code in-line:

	* integer division
	* integer multiplication
	* setting and getting obscure eflags
	* double-length shifts
	* eight byte compare and exchange

	All routines use a standard calling convention designed for
	calling from translations, in which the incoming args are
	underneath the return address, the callee saves _all_ registers,
	and the incoming parameters can be modified, to return results.
	*/

	/* Read the time-stamp counter.
	Stack on entry (highest address first):
	dummy, overwritten with the %EAX (low) half
	dummy, overwritten with the %EDX (high) half
	RA <- %esp
	%eax and %edx are saved on entry and restored before returning.
	*/
	.global VG_(helper_RDTSC)
	VG_(helper_RDTSC):
	pushl %eax
	pushl %edx
	rdtsc
	/* two saves + RA: the result slots sit at 12(%esp) and 16(%esp) */
	movl %eax, 16(%esp)
	movl %edx, 12(%esp)
	popl %edx
	popl %eax
	ret

	/*
	Read a byte/word/dword from the given I/O port.
	Stack on entry (highest address first):
	size 1, 2 or 4
	port, replaced by result
	RA
	A size other than 4 or 2 falls through to the byte path.
	*/
	.global VG_(helper_IN)
	VG_(helper_IN):
	pushl %eax
	pushl %edx
	/* two saves + RA: port is at 12(%esp), size at 16(%esp) */
	movl 12(%esp), %edx
	movl 16(%esp), %eax
	cmpl $4, %eax
	je in_dword
	cmpl $2, %eax
	je in_word
	in_byte:
	inb (%dx), %al
	jmp in_done
	in_word:
	in (%dx), %ax
	jmp in_done
	in_dword:
	inl (%dx),%eax
	in_done:
	/* All of %eax is stored. After a narrow read its upper bits are
	what is left of the size value loaded above. */
	movl %eax,12(%esp)
	popl %edx
	popl %eax
	ret

	/*
	Write a byte/word/dword to the given I/O port.
	Stack on entry (highest address first):
	size 1, 2 or 4
	port
	value
	RA
	A size other than 4 or 2 falls through to the byte path.
	Nothing is written back to the stack.
	*/
	.global VG_(helper_OUT)
	VG_(helper_OUT):
	pushl %eax
	pushl %edx
	/* two saves + RA: value at 12(%esp), port at 16(%esp), size at 20(%esp) */
	movl 12(%esp), %eax
	movl 16(%esp), %edx
	cmpl $4, 20(%esp)
	je out_dword
	cmpl $2, 20(%esp)
	je out_word
	out_byte:
	outb %al,(%dx)
	jmp out_done
	out_word:
	out %ax,(%dx)
	jmp out_done
	out_dword:
	outl %eax,(%dx)
	out_done:
	popl %edx
	popl %eax
	ret


	/* Do the CPUID instruction.
	On entry:
	dummy, replaced by %EAX value
	dummy, replaced by %EBX value
	dummy, replaced by %ECX value
	dummy, replaced by %EDX value
	RA <- %esp

	We save registers and package up the args so we can call a C helper
	for all this. Five words are pushed for the call, so the helper
	is invoked as
	VG_(helperc_CPUID)(eax_value, &eax, &ebx, &ecx, &edx)
	where the four pointers refer to the slots listed above.
	*/
	.global VG_(helper_CPUID)
	VG_(helper_CPUID):
	pushl %ebp
	movl %esp,%ebp
	/* frame: 0(%ebp) = saved %ebp, 4(%ebp) = RA, 8(%ebp) = first slot */
	pushl %eax
	pushl %ebx
	pushl %ecx
	pushl %edx
	pushl %esi
	pushl %edi
	pushf

	/* Arguments are pushed last-to-first. The %EDX slot is the one
	directly above RA, i.e. 2*4 bytes above %ebp (a displacement of
	24 would point past the %EAX slot). */
	lea 2*4(%ebp),%eax /* &edx */
	pushl %eax
	addl $4,%eax /* &ecx */
	pushl %eax
	addl $4,%eax /* &ebx */
	pushl %eax
	addl $4,%eax /* &eax */
	pushl %eax
	pushl (%eax) /* eax */

	call VG_(helperc_CPUID)
	addl $20,%esp /* drop the five argument words */

	/* restore in exact reverse order of the saves above */
	popf
	popl %edi
	popl %esi
	popl %edx
	popl %ecx
	popl %ebx
	popl %eax
	popl %ebp
	ret

	/* Fetch the FPU status register.
	On entry:
	dummy, replaced by result
	RA <- %esp

	The FPU state is first reloaded from the save area at
	%ebp + 4*VGOFF_(m_ssestate): with fxrstor when
	VG_(have_ssestate) is non-zero, otherwise with frstor.
	NOTE(review): %ebp is presumably the baseblock pointer set up by
	the caller -- confirm. Only the low 16 bits of the result slot
	are written.
	*/
	.global VG_(helper_fstsw_AX)
	VG_(helper_fstsw_AX):
	pushl %eax
	pushl %esi
	movl VGOFF_(m_ssestate), %esi

	/* the cmpb below changes %eflags, so they are saved around it */
	pushfl
	cmpb $0, VG_(have_ssestate)
	jz aa1nosse
	fxrstor (%ebp, %esi, 4)
	jmp aa1merge
	aa1nosse:
	frstor (%ebp, %esi, 4)
	aa1merge:
	popfl

	fstsw %ax
	popl %esi
	/* one save left on the stack: RA at 4(%esp), result slot at 8(%esp) */
	movw %ax, 8(%esp)
	popl %eax
	ret


	/* Copy %ah into %eflags.
	On entry:
	value of %eax
	RA <- %esp

	The result is the flag state left by sahf; the popl and ret that
	follow it do not modify %eflags. The stack slot is not written.
	*/
	.global VG_(helper_SAHF)
	VG_(helper_SAHF):
	pushl %eax
	movl 8(%esp), %eax /* one save + RA: argument is at 8(%esp) */
	sahf
	popl %eax
	ret

	/* Copy %eflags into %ah.
	On entry:
	value of %eax, replaced by the same value with %ah updated
	RA <- %esp

	The flags read are whatever %eflags holds on entry; pushl and
	movl do not modify them.
	*/
	.global VG_(helper_LAHF)
	VG_(helper_LAHF):
	pushl %eax
	movl 8(%esp), %eax /* one save + RA: argument is at 8(%esp) */
	lahf
	movl %eax, 8(%esp)
	popl %eax
	ret


	/* Do %al = DAS(%al). Note that the passed param has %AL as the least
	significant 8 bits, since it was generated with GETB %AL,
	some-temp. Fortunately %al is the least significant 8 bits of
	%eax anyway, which is why it's safe to work with %eax as a
	whole.

	On entry:
	value of %eax, replaced by the adjusted value
	RA <- %esp

	das takes its input flags from %eflags as they are on entry
	(pushl/movl leave them alone); the flags it produces are still in
	place at the ret.
	*/
	.global VG_(helper_DAS)
	VG_(helper_DAS):
	pushl %eax
	movl 8(%esp), %eax /* one save + RA: argument is at 8(%esp) */
	das
	movl %eax, 8(%esp)
	popl %eax
	ret


	/* Similarly, do %al = DAA(%al).
	On entry:
	value of %eax, replaced by the adjusted value
	RA <- %esp
	*/
	.global VG_(helper_DAA)
	VG_(helper_DAA):
	pushl %eax
	movl 8(%esp), %eax /* one save + RA: argument is at 8(%esp) */
	daa
	movl %eax, 8(%esp)
	popl %eax
	ret


	/* Similarly, do %ax = AAS(%ax).
	On entry:
	value of %eax, replaced by the adjusted value
	RA <- %esp
	*/
	.global VG_(helper_AAS)
	VG_(helper_AAS):
	pushl %eax
	movl 8(%esp), %eax /* one save + RA: argument is at 8(%esp) */
	aas
	movl %eax, 8(%esp)
	popl %eax
	ret


	/* Similarly, do %ax = AAA(%ax).
	On entry:
	value of %eax, replaced by the adjusted value
	RA <- %esp
	*/
	.global VG_(helper_AAA)
	VG_(helper_AAA):
	pushl %eax
	movl 8(%esp), %eax /* one save + RA: argument is at 8(%esp) */
	aaa
	movl %eax, 8(%esp)
	popl %eax
	ret


	/* Similarly, do %ax = AAD(%ax).
	On entry:
	value of %eax, replaced by the adjusted value
	RA <- %esp
	*/
	.global VG_(helper_AAD)
	VG_(helper_AAD):
	pushl %eax
	movl 8(%esp), %eax /* one save + RA: argument is at 8(%esp) */
	aad
	movl %eax, 8(%esp)
	popl %eax
	ret


	/* Similarly, do %ax = AAM(%ax).
	On entry:
	value of %eax, replaced by the adjusted value
	RA <- %esp
	*/
	.global VG_(helper_AAM)
	VG_(helper_AAM):
	pushl %eax
	movl 8(%esp), %eax /* one save + RA: argument is at 8(%esp) */
	aam
	movl %eax, 8(%esp)
	popl %eax
	ret


	/* Bit scan forwards/reverse. Sets flags (??).
	On entry:
	value, replaced by result
	RA <- %esp
	*/
	/* 16-bit bit scan reverse. After the one register save, the source
	operand is read from 8(%esp) and the word at 12(%esp) is both the
	initial destination value and the place the result is stored.
	NOTE(review): the preload of %ax presumably covers the zero-source
	case, where bsr does not produce a defined result -- confirm. */
	.global VG_(helper_bsrw)
	VG_(helper_bsrw):
	pushl %eax
	movw 12(%esp), %ax
	bsrw 8(%esp), %ax
	movw %ax, 12(%esp)
	popl %eax
	ret

	/* 32-bit bit scan reverse. After the one register save, the source
	operand is read from 8(%esp) and the dword at 12(%esp) is both the
	initial destination value and the place the result is stored. */
	.global VG_(helper_bsrl)
	VG_(helper_bsrl):
	pushl %eax
	movl 12(%esp), %eax
	bsrl 8(%esp), %eax
	movl %eax, 12(%esp)
	popl %eax
	ret

	/* 16-bit bit scan forward. After the one register save, the source
	operand is read from 8(%esp) and the word at 12(%esp) is both the
	initial destination value and the place the result is stored. */
	.global VG_(helper_bsfw)
	VG_(helper_bsfw):
	pushl %eax
	movw 12(%esp), %ax
	bsfw 8(%esp), %ax
	movw %ax, 12(%esp)
	popl %eax
	ret

	/* 32-bit bit scan forward. After the one register save, the source
	operand is read from 8(%esp) and the dword at 12(%esp) is both the
	initial destination value and the place the result is stored. */
	.global VG_(helper_bsfl)
	VG_(helper_bsfl):
	pushl %eax
	movl 12(%esp), %eax
	bsfl 8(%esp), %eax
	movl %eax, 12(%esp)
	popl %eax
	ret


	/* 32-bit double-length shift left/right.
	On entry:
	amount
	src
	dst
	RA <- %esp
	*/
	/* shldl: shift the 32-bit dst left by the low byte of amount, filling
	from src. The dst slot is overwritten with the result; the src and
	amount slots are left as they were. */
	.global VG_(helper_shldl)
	VG_(helper_shldl):
	pushl %eax
	pushl %ebx
	pushl %ecx

	/* three saves + RA: dst 16(%esp), src 20(%esp), amount 24(%esp) */
	movl 16(%esp), %eax
	movl 20(%esp), %ebx
	movb 24(%esp), %cl
	shldl %cl, %ebx, %eax
	movl %eax, 16(%esp)

	popl %ecx
	popl %ebx
	popl %eax
	ret

	/* shldw: 16-bit form of the double-length left shift. Only the low
	word of the dst slot is rewritten; the src and amount slots are
	left as they were. */
	.global VG_(helper_shldw)
	VG_(helper_shldw):
	pushl %eax
	pushl %ebx
	pushl %ecx

	/* three saves + RA: dst 16(%esp), src 20(%esp), amount 24(%esp) */
	movw 16(%esp), %ax
	movw 20(%esp), %bx
	movb 24(%esp), %cl
	shldw %cl, %bx, %ax
	movw %ax, 16(%esp)

	popl %ecx
	popl %ebx
	popl %eax
	ret

	/* shrdl: shift the 32-bit dst right by the low byte of amount,
	filling from src. The dst slot is overwritten with the result;
	the src and amount slots are left as they were. */
	.global VG_(helper_shrdl)
	VG_(helper_shrdl):
	pushl %eax
	pushl %ebx
	pushl %ecx

	/* three saves + RA: dst 16(%esp), src 20(%esp), amount 24(%esp) */
	movl 16(%esp), %eax
	movl 20(%esp), %ebx
	movb 24(%esp), %cl
	shrdl %cl, %ebx, %eax
	movl %eax, 16(%esp)

	popl %ecx
	popl %ebx
	popl %eax
	ret

	/* shrdw: 16-bit form of the double-length right shift. Only the low
	word of the dst slot is rewritten; the src and amount slots are
	left as they were. */
	.global VG_(helper_shrdw)
	VG_(helper_shrdw):
	pushl %eax
	pushl %ebx
	pushl %ecx

	/* three saves + RA: dst 16(%esp), src 20(%esp), amount 24(%esp) */
	movw 16(%esp), %ax
	movw 20(%esp), %bx
	movb 24(%esp), %cl
	shrdw %cl, %bx, %ax
	movw %ax, 16(%esp)

	popl %ecx
	popl %ebx
	popl %eax
	ret


	/* Get the direction flag, and return either 1 or -1.
	On entry:
	dummy, replaced by result
	RA <- %esp

	The value is read from the word at %ebp + 4*VGOFF_(m_dflag), the
	same location VG_(helper_CLD) sets to 1 and VG_(helper_STD) sets
	to -1. NOTE(review): %ebp is presumably the baseblock pointer set
	up by the caller -- confirm.
	*/
	.global VG_(helper_get_dirflag)
	VG_(helper_get_dirflag):
	pushl %eax

	movl VGOFF_(m_dflag), %eax
	movl (%ebp, %eax, 4), %eax
	movl %eax, 8(%esp) /* one save + RA: result slot is at 8(%esp) */

	popl %eax
	ret

	/* Clear/set the direction flag. */
	/* Clear the direction flag: store 1 in the word at
	%ebp + 4*VGOFF_(m_dflag). Takes no stack arguments; the real
	CPU's direction flag is not touched. */
	.global VG_(helper_CLD)
	VG_(helper_CLD):
	pushl %eax

	movl VGOFF_(m_dflag), %eax
	movl $1, (%ebp, %eax, 4)

	popl %eax
	ret

	/* Set the direction flag: store -1 in the word at
	%ebp + 4*VGOFF_(m_dflag). Takes no stack arguments; the real
	CPU's direction flag is not touched. */
	.global VG_(helper_STD)
	VG_(helper_STD):
	pushl %eax

	movl VGOFF_(m_dflag), %eax
	movl $-1, (%ebp, %eax, 4)

	popl %eax
	ret

	/* Clear/set/complement the carry flag. */
	/* Clear the carry flag in %eflags. No stack arguments and no
	registers used; ret leaves the flag as set here. */
	.global VG_(helper_CLC)
	VG_(helper_CLC):
	clc
	ret

	/* Set the carry flag in %eflags. No stack arguments and no
	registers used; ret leaves the flag as set here. */
	.global VG_(helper_STC)
	VG_(helper_STC):
	stc
	ret

	/* Complement the carry flag in %eflags. No stack arguments and no
	registers used; ret leaves the flag as set here. */
	.global VG_(helper_CMC)
	VG_(helper_CMC):
	cmc
	ret

	/* Signed 32-to-64 multiply.
	Stack on entry (highest address first):
	multiplicand, replaced by the low 32 bits of the product
	multiplier, replaced by the high 32 bits of the product
	RA <- %esp
	*/
	.globl VG_(helper_imul_32_64)
	VG_(helper_imul_32_64):
	pushl %eax
	pushl %edx
	/* two saves + RA: operands are at 12(%esp) and 16(%esp) */
	movl 16(%esp), %eax
	imull 12(%esp)
	movl %edx, 12(%esp)
	movl %eax, 16(%esp)
	popl %edx
	popl %eax
	ret

	/* Signed 16-to-32 multiply.
	Stack on entry (highest address first):
	multiplicand, low word replaced by the low 16 bits of the product
	multiplier, low word replaced by the high 16 bits of the product
	RA <- %esp
	*/
	.globl VG_(helper_imul_16_32)
	VG_(helper_imul_16_32):
	pushl %eax
	pushl %edx
	/* two saves + RA: operands are at 12(%esp) and 16(%esp) */
	movw 16(%esp), %ax
	imulw 12(%esp)
	movw %dx, 12(%esp)
	movw %ax, 16(%esp)
	popl %edx
	popl %eax
	ret

	/* Signed 8-to-16 multiply.
	Stack on entry (highest address first):
	multiplicand (low byte), low word replaced by the 16-bit product
	multiplier (low byte), left unchanged
	RA <- %esp

	The byte form of imul puts the whole product in %ax, so only one
	slot is written back.
	*/
	.globl VG_(helper_imul_8_16)
	VG_(helper_imul_8_16):
	pushl %eax
	pushl %edx
	/* two saves + RA: operands are at 12(%esp) and 16(%esp) */
	movb 16(%esp), %al
	imulb 12(%esp)
	movw %ax, 16(%esp)
	popl %edx
	popl %eax
	ret






	/* Unsigned 32-to-64 multiply.
	Stack on entry (highest address first):
	multiplicand, replaced by the low 32 bits of the product
	multiplier, replaced by the high 32 bits of the product
	RA <- %esp
	*/
	.globl VG_(helper_mul_32_64)
	VG_(helper_mul_32_64):
	pushl %eax
	pushl %edx
	/* two saves + RA: operands are at 12(%esp) and 16(%esp) */
	movl 16(%esp), %eax
	mull 12(%esp)
	movl %edx, 12(%esp)
	movl %eax, 16(%esp)
	popl %edx
	popl %eax
	ret

	/* Unsigned 16-to-32 multiply.
	Stack on entry (highest address first):
	multiplicand, low word replaced by the low 16 bits of the product
	multiplier, low word replaced by the high 16 bits of the product
	RA <- %esp
	*/
	.globl VG_(helper_mul_16_32)
	VG_(helper_mul_16_32):
	pushl %eax
	pushl %edx
	/* two saves + RA: operands are at 12(%esp) and 16(%esp) */
	movw 16(%esp), %ax
	mulw 12(%esp)
	movw %dx, 12(%esp)
	movw %ax, 16(%esp)
	popl %edx
	popl %eax
	ret

	/* Unsigned 8-to-16 multiply.
	Stack on entry (highest address first):
	multiplicand (low byte), low word replaced by the 16-bit product
	multiplier (low byte), left unchanged
	RA <- %esp

	The byte form of mul puts the whole product in %ax, so only one
	slot is written back.
	*/
	.globl VG_(helper_mul_8_16)
	VG_(helper_mul_8_16):
	pushl %eax
	pushl %edx
	/* two saves + RA: operands are at 12(%esp) and 16(%esp) */
	movb 16(%esp), %al
	mulb 12(%esp)
	movw %ax, 16(%esp)
	popl %edx
	popl %eax
	ret




	/* Unsigned 64-into-32 divide.
	Stack on entry (highest address first):
	divisor, left unchanged
	low 32 bits of dividend, replaced by the quotient
	high 32 bits of dividend, replaced by the remainder
	RA <- %esp
	*/
	.globl VG_(helper_div_64_32)
	VG_(helper_div_64_32):
	pushl %eax
	pushl %edx
	/* two saves + RA: slots are at 12, 16 and 20(%esp) */
	movl 12(%esp),%edx
	movl 16(%esp),%eax
	divl 20(%esp)
	movl %edx,12(%esp)
	movl %eax,16(%esp)
	popl %edx
	popl %eax
	ret

	/* Signed 64-into-32 divide.
	Stack on entry (highest address first):
	divisor, left unchanged
	low 32 bits of dividend, replaced by the quotient
	high 32 bits of dividend, replaced by the remainder
	RA <- %esp
	*/
	.globl VG_(helper_idiv_64_32)
	VG_(helper_idiv_64_32):
	pushl %eax
	pushl %edx
	/* two saves + RA: slots are at 12, 16 and 20(%esp) */
	movl 12(%esp),%edx
	movl 16(%esp),%eax
	idivl 20(%esp)
	movl %edx,12(%esp)
	movl %eax,16(%esp)
	popl %edx
	popl %eax
	ret

	/* Unsigned 32-into-16 divide. Only the low word of each slot is used.
	Stack on entry (highest address first):
	divisor, left unchanged
	low 16 bits of dividend, replaced by the quotient
	high 16 bits of dividend, replaced by the remainder
	RA <- %esp
	*/
	.globl VG_(helper_div_32_16)
	VG_(helper_div_32_16):
	pushl %eax
	pushl %edx
	/* two saves + RA: slots are at 12, 16 and 20(%esp) */
	movw 12(%esp),%dx
	movw 16(%esp),%ax
	divw 20(%esp)
	movw %dx,12(%esp)
	movw %ax,16(%esp)
	popl %edx
	popl %eax
	ret

	/* Signed 32-into-16 divide. Only the low word of each slot is used.
	Stack on entry (highest address first):
	divisor, left unchanged
	low 16 bits of dividend, replaced by the quotient
	high 16 bits of dividend, replaced by the remainder
	RA <- %esp
	*/
	.globl VG_(helper_idiv_32_16)
	VG_(helper_idiv_32_16):
	pushl %eax
	pushl %edx
	/* two saves + RA: slots are at 12, 16 and 20(%esp) */
	movw 12(%esp),%dx
	movw 16(%esp),%ax
	idivw 20(%esp)
	movw %dx,12(%esp)
	movw %ax,16(%esp)
	popl %edx
	popl %eax
	ret

	/* Unsigned 16-into-8 divide.
	Stack on entry (highest address first):
	divisor (low byte), left unchanged
	16-bit dividend, low byte replaced by the remainder (%ah)
	slot whose low byte is replaced by the quotient (%al)
	RA <- %esp
	Only %eax is saved: the byte divide works entirely in %ax.
	*/
	.globl VG_(helper_div_16_8)
	VG_(helper_div_16_8):
	pushl %eax
	/* one save + RA: slots are at 8, 12 and 16(%esp) */
	movw 12(%esp),%ax
	divb 16(%esp)
	movb %al,8(%esp)
	movb %ah,12(%esp)
	popl %eax
	ret

	/* Signed 16-into-8 divide.
	Stack on entry (highest address first):
	divisor (low byte), left unchanged
	16-bit dividend, low byte replaced by the remainder (%ah)
	slot whose low byte is replaced by the quotient (%al)
	RA <- %esp
	Only %eax is saved: the byte divide works entirely in %ax.
	*/
	.globl VG_(helper_idiv_16_8)
	VG_(helper_idiv_16_8):
	pushl %eax
	/* one save + RA: slots are at 8, 12 and 16(%esp) */
	movw 12(%esp),%ax
	idivb 16(%esp)
	movb %al,8(%esp)
	movb %ah,12(%esp)
	popl %eax
	ret


	/* Eight byte compare and exchange.
	After the four register saves and RA, the stack holds:
	20(%esp) value for %eax, written back afterwards
	24(%esp) value for %edx, written back afterwards
	28(%esp) value for %ebx, written back afterwards
	32(%esp) value for %ecx, written back afterwards
	36(%esp) the eight bytes cmpxchg8b operates on
	The flags produced by cmpxchg8b are still in place at the ret.
	*/
	.globl VG_(helper_cmpxchg8b)
	VG_(helper_cmpxchg8b):
	pushl %eax
	pushl %ebx
	pushl %ecx
	pushl %edx
	movl 32(%esp), %ecx
	movl 28(%esp), %ebx
	movl 24(%esp), %edx
	movl 20(%esp), %eax
	cmpxchg8b 36(%esp)
	movl %ecx, 32(%esp)
	movl %ebx, 28(%esp)
	movl %edx, 24(%esp)
	movl %eax, 20(%esp)
	popl %edx
	popl %ecx
	popl %ebx
	popl %eax
	ret


	/* Undefined instruction (generates SIGILL).
	Never returns: if execution continues past the ud2, the jmp sends
	it straight back to the ud2. */
	.globl VG_(helper_undefined_instruction)
	VG_(helper_undefined_instruction):
	1: ud2
	jmp 1b

	##--------------------------------------------------------------------##
	##--- end vg_helpers.S ---##
	##--------------------------------------------------------------------##