// arch/i386/crypto/aes-i586-asm.S - kernel/msm-5.4 - Gitiles

 // -------------------------------------------------------------------------
 // Copyright (c) 2001, Dr Brian Gladman <                 >, Worcester, UK.
 // All rights reserved.
 //
 // LICENSE TERMS
 //
 // The free distribution and use of this software in both source and binary
 // form is allowed (with or without changes) provided that:
 //
 //   1. distributions of this source code include the above copyright
 //      notice, this list of conditions and the following disclaimer;
 //
 //   2. distributions in binary form include the above copyright
 //      notice, this list of conditions and the following disclaimer
 //      in the documentation and/or other associated materials;
 //
 //   3. the copyright holder's name is not used to endorse products
 //      built using this software without specific written permission.
 //
 //
 // ALTERNATIVELY, provided that this notice is retained in full, this product
 // may be distributed under the terms of the GNU General Public License (GPL),
 // in which case the provisions of the GPL apply INSTEAD OF those given above.
 //
 // Copyright (c) 2004 Linus Torvalds <torvalds@osdl.org>
 // Copyright (c) 2004 Red Hat, Inc., James Morris <jmorris@redhat.com>

 // DISCLAIMER
 //
 // This software is provided 'as is' with no explicit or implied warranties
 // in respect of its properties including, but not limited to, correctness
 // and fitness for purpose.
 // -------------------------------------------------------------------------
 // Issue Date: 29/07/2002

 .file "aes-i586-asm.S"
 .text

 // C-level prototypes of the two entry points implemented in this file
 // (all three arguments are fetched from the stack via the offsets below):
 //   aes_rval aes_enc_blk(const unsigned char in_blk[], unsigned char out_blk[], const aes_ctx cx[1]);
 //   aes_rval aes_dec_blk(const unsigned char in_blk[], unsigned char out_blk[], const aes_ctx cx[1]);

 // Byte size of one lookup table.  The column macros address the second,
 // third and fourth table of a set as table+tlen, table+2*tlen and
 // table+3*tlen, so the four tables of a set must be laid out back to back.
 #define tlen 1024   // length of each of 4 'xor' arrays (256 32-bit words)

 // Offsets from %esp to the stack parameters, valid while exactly one
 // register has been pushed since entry.  Code that has pushed more
 // registers adds 4 bytes per extra push (in_blk+4, out_blk+12).

 #define in_blk    8  // input byte array address parameter
 #define out_blk  12  // output byte array address parameter
 #define ctx      16  // AES context structure

 // Byte offsets of the fields read from the AES context structure

 #define ekey     0   // encryption key schedule base address
 #define nrnd   256   // number of rounds
 #define dkey   260   // decryption key schedule base address

 // Register mapping for the encrypt and decrypt subroutines.
 // Inside the round macros r0 and r2 take turns holding column 0 and
 // serving as the byte-index register, r1, r4 and r5 hold the other three
 // columns, and r3 is the scratch register for table indices.

 #define r0  eax
 #define r1  ebx
 #define r2  ecx
 #define r3  edx
 #define r4  esi
 #define r5  edi

 // Low- and high-byte names of the four registers that have them, spelled
 // <reg>l / <reg>h so that l() and h() can build them by token pasting.

 #define eaxl  al
 #define eaxh  ah
 #define ebxl  bl
 #define ebxh  bh
 #define ecxl  cl
 #define ecxh  ch
 #define edxl  dl
 #define edxh  dh

 // h(reg) / l(reg) yield the high / low byte register of reg.  The extra
 // level of indirection lets an argument such as r0 expand to eax before
 // the suffix is pasted on (eaxh -> ah).  Only eax, ebx, ecx and edx have
 // entries above, so only r0..r3 may be passed to l() and h().

 #define _h(reg) reg##h
 #define h(reg) _h(reg)

 #define _l(reg) reg##l
 #define l(reg) _l(reg)

 // do_col: fold one 32-bit input column into four output columns.
 // Each of the four bytes of idx selects a 32-bit entry from one of four
 // consecutive 256-entry tables, and that entry is xored into one of the
 // output registers.

 // Parameters:
 //   table  base address of the first of the four tables
 //   a1     register xored with the entry of table selected by byte 0 of idx
 //   a2     register xored with the entry of table+tlen selected by byte 1
 //   a3     register xored with the entry of table+2*tlen selected by byte 2
 //   a4     register xored with the entry of table+3*tlen selected by byte 3
 //   idx    input register for the round (destroyed: it ends up holding its
 //          own top byte); must be one of r0..r3 because l() and h() are
 //          applied to it
 //   tmp    scratch register for the table index (clobbered)

 #define do_col(table, a1,a2,a3,a4, idx, tmp)	\
 	movzx   %l(idx),%tmp;			\
 	xor     table(,%tmp,4),%a1;		\
 	movzx   %h(idx),%tmp;			\
 	shr     $16,%idx;			\
 	xor     table+tlen(,%tmp,4),%a2;	\
 	movzx   %l(idx),%tmp;			\
 	movzx   %h(idx),%idx;			\
 	xor     table+2*tlen(,%tmp,4),%a3;	\
 	xor     table+3*tlen(,%idx,4),%a4;

 // do_fcol: first column step of a forward round.
 // Initialises the output registers from the key schedule and then xors in
 // the table entries selected by the four bytes of idx, using the same
 // byte-to-table assignment as do_col.
 // Round key words loaded: a1 <- sched+0, a2 <- sched+12, a3 <- sched+8,
 // a4 <- sched+4.
 // sched is a memory operand; the byte offset is written directly in front
 // of it ("12 sched" becomes e.g. "12 +16(%ebp)"), which is why the call
 // sites pass either a bare (%ebp) or a displacement with an explicit sign.
 // tmp is a scratch register and is clobbered.
 // NB1: original value of a3 is in idx on exit
 // NB2: original values of a1,a2,a4 aren't used
 #define do_fcol(table, a1,a2,a3,a4, idx, tmp, sched) \
 	mov     0 sched,%a1;			\
 	movzx   %l(idx),%tmp;			\
 	mov     12 sched,%a2;			\
 	xor     table(,%tmp,4),%a1;		\
 	mov     4 sched,%a4;			\
 	movzx   %h(idx),%tmp;			\
 	shr     $16,%idx;			\
 	xor     table+tlen(,%tmp,4),%a2;	\
 	movzx   %l(idx),%tmp;			\
 	movzx   %h(idx),%idx;			\
 	xor     table+3*tlen(,%idx,4),%a4;	\
 	mov     %a3,%idx;			\
 	mov     8 sched,%a3;			\
 	xor     table+2*tlen(,%tmp,4),%a3;

 // do_icol: first column step of an inverse round.
 // Same instruction sequence as do_fcol except for which round key word
 // initialises a2 and a4.
 // Round key words loaded: a1 <- sched+0, a2 <- sched+4, a3 <- sched+8,
 // a4 <- sched+12.
 // sched is a memory operand; the byte offset is written directly in front
 // of it ("12 sched" becomes e.g. "12 -16(%ebp)"), which is why the call
 // sites pass either a bare (%ebp) or a displacement with an explicit sign.
 // tmp is a scratch register and is clobbered.
 // NB1: original value of a3 is in idx on exit
 // NB2: original values of a1,a2,a4 aren't used
 #define do_icol(table, a1,a2,a3,a4, idx, tmp, sched) \
 	mov     0 sched,%a1;			\
 	movzx   %l(idx),%tmp;			\
 	mov     4 sched,%a2;			\
 	xor     table(,%tmp,4),%a1;		\
 	mov     12 sched,%a4;			\
 	movzx   %h(idx),%tmp;			\
 	shr     $16,%idx;			\
 	xor     table+tlen(,%tmp,4),%a2;	\
 	movzx   %l(idx),%tmp;			\
 	movzx   %h(idx),%idx;			\
 	xor     table+3*tlen(,%idx,4),%a4;	\
 	mov     %a3,%idx;			\
 	mov     8 sched,%a3;			\
 	xor     table+2*tlen(,%tmp,4),%a3;


 // save(a1, a2): store register a2 into 32-bit stack slot number a1,
 // i.e. at byte offset 4*a1 from %esp.
 // The argument order is (slot, register), the reverse of restore().
 // original Gladman had conditional saves to MMX regs.
 #define save(a1, a2)		\
 	mov     %a2,4*a1(%esp)

 // restore(a1, a2): load register a1 from 32-bit stack slot number a2,
 // i.e. from byte offset 4*a2 from %esp.
 // The argument order is (register, slot), the reverse of save().
 #define restore(a1, a2)		\
 	mov     4*a2(%esp),%a1

 // Forward round macros.  fwd_rnd1 and fwd_rnd2 are used alternately: a
 // fwd_rnd1/fwd_rnd2 pair is entered with the column values of the
 // previous round in r0,r1,r4,r5 and exits with the new values in the
 // same registers.  Stack slots 0 and 1 (0(%esp) and 4(%esp)) are used
 // for temporary storage.

 // fwd_rnd1(arg, table): one forward round.
 //   arg    memory operand addressing the round key for this round
 //   table  base address of the four-table set to use
 // r1 and r5 are spilled first because they are rewritten as outputs
 // before their old values have been consumed.  Each do_fcol/do_col step
 // then consumes one input column through r0; r3 is the scratch register.
 // The idx=... notes name the input column that r0 holds at that step.
 //
 // round column values
 // on entry: r0,r1,r4,r5
 // on exit:  r2,r1,r4,r5
 #define fwd_rnd1(arg, table)						\
 	save   (0,r1);							\
 	save   (1,r5);							\
 									\
 	/* compute new column values */					\
 	do_fcol(table, r2,r5,r4,r1, r0,r3, arg);	/* idx=r0 */	\
 	do_col (table, r4,r1,r2,r5, r0,r3);		/* idx=r4 */	\
 	restore(r0,0);							\
 	do_col (table, r1,r2,r5,r4, r0,r3);		/* idx=r1 */	\
 	restore(r0,1);							\
 	do_col (table, r5,r4,r1,r2, r0,r3);		/* idx=r5 */

 // fwd_rnd2(arg, table): one forward round, the counterpart of fwd_rnd1
 // with the roles of r0 and r2 exchanged, so that it accepts the register
 // layout fwd_rnd1 leaves behind and hands back the layout fwd_rnd1 expects.
 //   arg    memory operand addressing the round key for this round
 //   table  base address of the four-table set to use
 // Uses stack slots 0 and 1 for the spilled r1 and r5; r3 is scratch.
 // The idx=... notes name the input column that r2 holds at that step.
 //
 // round column values
 // on entry: r2,r1,r4,r5
 // on exit:  r0,r1,r4,r5
 #define fwd_rnd2(arg, table)						\
 	save   (0,r1);							\
 	save   (1,r5);							\
 									\
 	/* compute new column values */					\
 	do_fcol(table, r0,r5,r4,r1, r2,r3, arg);	/* idx=r2 */	\
 	do_col (table, r4,r1,r0,r5, r2,r3);		/* idx=r4 */	\
 	restore(r2,0);							\
 	do_col (table, r1,r0,r5,r4, r2,r3);		/* idx=r1 */	\
 	restore(r2,1);							\
 	do_col (table, r5,r4,r1,r0, r2,r3);		/* idx=r5 */

 // Inverse (decryption) round macros.  inv_rnd1 and inv_rnd2 are used
 // alternately: an inv_rnd1/inv_rnd2 pair is entered with the column
 // values of the previous round in r0,r1,r4,r5 and exits with the new
 // values in the same registers.  Stack slots 0 and 1 (0(%esp) and
 // 4(%esp)) are used for temporary storage.

 // inv_rnd1(arg, table): one inverse round.
 //   arg    memory operand addressing the round key for this round
 //   table  base address of the four-table set to use
 // r1 and r5 are spilled first because they are rewritten as outputs
 // before their old values have been consumed.  Each do_icol/do_col step
 // then consumes one input column through r0; r3 is the scratch register.
 // The idx=... notes name the input column that r0 holds at that step.
 //
 // round column values
 // on entry: r0,r1,r4,r5
 // on exit:  r2,r1,r4,r5
 #define inv_rnd1(arg, table)						\
 	save    (0,r1);							\
 	save    (1,r5);							\
 									\
 	/* compute new column values */					\
 	do_icol(table, r2,r1,r4,r5, r0,r3, arg);	/* idx=r0 */	\
 	do_col (table, r4,r5,r2,r1, r0,r3);		/* idx=r4 */	\
 	restore(r0,0);							\
 	do_col (table, r1,r4,r5,r2, r0,r3);		/* idx=r1 */	\
 	restore(r0,1);							\
 	do_col (table, r5,r2,r1,r4, r0,r3);		/* idx=r5 */

 // inv_rnd2(arg, table): one inverse round, the counterpart of inv_rnd1
 // with the roles of r0 and r2 exchanged, so that it accepts the register
 // layout inv_rnd1 leaves behind and hands back the layout inv_rnd1 expects.
 //   arg    memory operand addressing the round key for this round
 //   table  base address of the four-table set to use
 // Uses stack slots 0 and 1 for the spilled r1 and r5; r3 is scratch.
 // The idx=... notes name the input column that r2 holds at that step.
 //
 // round column values
 // on entry: r2,r1,r4,r5
 // on exit:  r0,r1,r4,r5
 #define inv_rnd2(arg, table)						\
 	save    (0,r1);							\
 	save    (1,r5);							\
 									\
 	/* compute new column values */					\
 	do_icol(table, r0,r1,r4,r5, r2,r3, arg);	/* idx=r2 */	\
 	do_col (table, r4,r5,r0,r1, r2,r3);		/* idx=r4 */	\
 	restore(r2,0);							\
 	do_col (table, r1,r4,r5,r0, r2,r3);		/* idx=r1 */	\
 	restore(r2,1);							\
 	do_col (table, r5,r0,r1,r4, r2,r3);		/* idx=r5 */

 // AES (Rijndael) Encryption Subroutine
 //
 // aes_rval aes_enc_blk(const unsigned char in_blk[], unsigned char out_blk[],
 //                      const aes_ctx cx[1])
 //
 // Encrypts the 16 bytes at in_blk with the key schedule found at offset
 // ekey of the context and writes the 16-byte result to out_blk.
 //
 // In:       in_blk, out_blk and the context pointer, all on the stack
 // Out:      %eax = 1
 // Saved:    %ebp, %ebx, %esi, %edi (pushed on entry, popped on exit)
 // Clobbers: %ecx, %edx, flags
 // Stack:    16 bytes of saved registers plus 8 bytes of spill slots

 .global  aes_enc_blk
 .type    aes_enc_blk,@function

 .extern  ft_tab
 .extern  fl_tab

 .align 4

 aes_enc_blk:
 	push    %ebp
 	mov     ctx(%esp),%ebp      // pointer to context

 // CAUTION: the order and the values used in these assigns
 // rely on the register mappings
 // (label 1 below is not referenced by any jump in this file)

 1:	push    %ebx
 	mov     in_blk+4(%esp),%r2  // +4: a second register is on the stack
 	push    %esi
 	mov     nrnd(%ebp),%r3   // number of rounds
 	push    %edi
 #if ekey != 0
 	lea     ekey(%ebp),%ebp  // key pointer
 #endif

 // input four columns and xor in first round key

 	mov     (%r2),%r0
 	mov     4(%r2),%r1
 	mov     8(%r2),%r4
 	mov     12(%r2),%r5
 	xor     (%ebp),%r0
 	xor     4(%ebp),%r1
 	xor     8(%ebp),%r4
 	xor     12(%ebp),%r5

 // Select the entry point into the unrolled rounds.  %ebp is advanced so
 // that the final round always finds its key at +144(%ebp): by 16 bytes
 // for 10 rounds, 48 for 12 rounds and 80 otherwise.  Any round count
 // other than 10 or 12 takes the 14-round path.

 	sub     $8,%esp           // space for register saves on stack
 	add     $16,%ebp          // increment to next round key
 	sub     $10,%r3
 	je      4f              // 10 rounds for 128-bit key
 	add     $32,%ebp
 	sub     $2,%r3
 	je      3f              // 12 rounds for 192-bit key
 	add     $32,%ebp

 2:	fwd_rnd1( -64(%ebp) ,ft_tab)	// 14 rounds for 256-bit key
 	fwd_rnd2( -48(%ebp) ,ft_tab)
 3:	fwd_rnd1( -32(%ebp) ,ft_tab)	// 12 rounds for 192-bit key
 	fwd_rnd2( -16(%ebp) ,ft_tab)
 4:	fwd_rnd1(    (%ebp) ,ft_tab)	// 10 rounds for 128-bit key
 	fwd_rnd2( +16(%ebp) ,ft_tab)
 	fwd_rnd1( +32(%ebp) ,ft_tab)
 	fwd_rnd2( +48(%ebp) ,ft_tab)
 	fwd_rnd1( +64(%ebp) ,ft_tab)
 	fwd_rnd2( +80(%ebp) ,ft_tab)
 	fwd_rnd1( +96(%ebp) ,ft_tab)
 	fwd_rnd2(+112(%ebp) ,ft_tab)
 	fwd_rnd1(+128(%ebp) ,ft_tab)
 	fwd_rnd2(+144(%ebp) ,fl_tab)	// last round uses a different table

 // move final values to the output array.  CAUTION: the
 // order of these assigns rely on the register mappings
 // (%ebp is reused as the output pointer; out_blk+12 accounts for the
 // three additional registers still pushed at that point)

 	add     $8,%esp
 	mov     out_blk+12(%esp),%ebp
 	mov     %r5,12(%ebp)
 	pop     %edi
 	mov     %r4,8(%ebp)
 	pop     %esi
 	mov     %r1,4(%ebp)
 	pop     %ebx
 	mov     %r0,(%ebp)
 	pop     %ebp
 	mov     $1,%eax             // return value; %r0 has already been stored
 	ret
 .size    aes_enc_blk,.-aes_enc_blk

 // AES (Rijndael) Decryption Subroutine
 //
 // aes_rval aes_dec_blk(const unsigned char in_blk[], unsigned char out_blk[],
 //                      const aes_ctx cx[1])
 //
 // Decrypts the 16 bytes at in_blk with the key schedule found at offset
 // dkey of the context and writes the 16-byte result to out_blk.  The
 // schedule is consumed from its last round key (dkey + 16*nrnd) down to
 // its first.
 //
 // In:       in_blk, out_blk and the context pointer, all on the stack
 // Out:      %eax = 1
 // Saved:    %ebp, %ebx, %esi, %edi (pushed on entry, popped on exit)
 // Clobbers: %ecx, %edx, flags
 // Stack:    16 bytes of saved registers plus 8 bytes of spill slots

 .global  aes_dec_blk
 .type    aes_dec_blk,@function

 .extern  it_tab
 .extern  il_tab

 .align 4

 aes_dec_blk:
 	push    %ebp
 	mov     ctx(%esp),%ebp       // pointer to context

 // CAUTION: the order and the values used in these assigns
 // rely on the register mappings
 // (label 1 below is not referenced by any jump in this file)

 1:	push    %ebx
 	mov     in_blk+4(%esp),%r2   // +4: a second register is on the stack
 	push    %esi
 	mov     nrnd(%ebp),%r3   // number of rounds
 	push    %edi
 #if dkey != 0
 	lea     dkey(%ebp),%ebp  // key pointer
 #endif
 	mov     %r3,%r0
 	shl     $4,%r0
 	add     %r0,%ebp         // %ebp = key pointer + 16*nrnd (last round key)

 // input four columns and xor in first round key

 	mov     (%r2),%r0
 	mov     4(%r2),%r1
 	mov     8(%r2),%r4
 	mov     12(%r2),%r5
 	xor     (%ebp),%r0
 	xor     4(%ebp),%r1
 	xor     8(%ebp),%r4
 	xor     12(%ebp),%r5

 // Select the entry point into the unrolled rounds.  %ebp is moved down so
 // that the final round always finds its key at -144(%ebp): by 16 bytes
 // for 10 rounds, 48 for 12 rounds and 80 otherwise.  Any round count
 // other than 10 or 12 takes the 14-round path.

 	sub     $8,%esp         // space for register saves on stack
 	sub     $16,%ebp        // step back to the preceding round key
 	sub     $10,%r3
 	je      4f              // 10 rounds for 128-bit key
 	sub     $32,%ebp
 	sub     $2,%r3
 	je      3f              // 12 rounds for 192-bit key
 	sub     $32,%ebp

 2:	inv_rnd1( +64(%ebp), it_tab)	// 14 rounds for 256-bit key
 	inv_rnd2( +48(%ebp), it_tab)
 3:	inv_rnd1( +32(%ebp), it_tab)	// 12 rounds for 192-bit key
 	inv_rnd2( +16(%ebp), it_tab)
 4:	inv_rnd1(    (%ebp), it_tab)	// 10 rounds for 128-bit key
 	inv_rnd2( -16(%ebp), it_tab)
 	inv_rnd1( -32(%ebp), it_tab)
 	inv_rnd2( -48(%ebp), it_tab)
 	inv_rnd1( -64(%ebp), it_tab)
 	inv_rnd2( -80(%ebp), it_tab)
 	inv_rnd1( -96(%ebp), it_tab)
 	inv_rnd2(-112(%ebp), it_tab)
 	inv_rnd1(-128(%ebp), it_tab)
 	inv_rnd2(-144(%ebp), il_tab)	// last round uses a different table

 // move final values to the output array.  CAUTION: the
 // order of these assigns rely on the register mappings
 // (%ebp is reused as the output pointer; out_blk+12 accounts for the
 // three additional registers still pushed at that point)

 	add     $8,%esp
 	mov     out_blk+12(%esp),%ebp
 	mov     %r5,12(%ebp)
 	pop     %edi
 	mov     %r4,8(%ebp)
 	pop     %esi
 	mov     %r1,4(%ebp)
 	pop     %ebx
 	mov     %r0,(%ebp)
 	pop     %ebp
 	mov     $1,%eax              // return value; %r0 has already been stored
 	ret
 .size    aes_dec_blk,.-aes_dec_blk
	// -------------------------------------------------------------------------
	// Copyright (c) 2001, Dr Brian Gladman < >, Worcester, UK.
	// All rights reserved.
	//
	// LICENSE TERMS
	//
	// The free distribution and use of this software in both source and binary
	// form is allowed (with or without changes) provided that:
	//
	// 1. distributions of this source code include the above copyright
	// notice, this list of conditions and the following disclaimer;
	//
	// 2. distributions in binary form include the above copyright
	// notice, this list of conditions and the following disclaimer
	// in the documentation and/or other associated materials;
	//
	// 3. the copyright holder's name is not used to endorse products
	// built using this software without specific written permission.
	//
	//
	// ALTERNATIVELY, provided that this notice is retained in full, this product
	// may be distributed under the terms of the GNU General Public License (GPL),
	// in which case the provisions of the GPL apply INSTEAD OF those given above.
	//
	// Copyright (c) 2004 Linus Torvalds <torvalds@osdl.org>
	// Copyright (c) 2004 Red Hat, Inc., James Morris <jmorris@redhat.com>

	// DISCLAIMER
	//
	// This software is provided 'as is' with no explicit or implied warranties
	// in respect of its properties including, but not limited to, correctness
	// and fitness for purpose.
	// -------------------------------------------------------------------------
	// Issue Date: 29/07/2002

	// NOTE(review): from here on the file repeats, token for token, the
	// directives, macros and routines that already appear earlier in it.
	.file "aes-i586-asm.S"
	.text

	// C-level prototypes of the two entry points (all three arguments are
	// fetched from the stack via the offsets below):
	//   aes_rval aes_enc_blk(const unsigned char in_blk[], unsigned char out_blk[], const aes_ctx cx[1]);
	//   aes_rval aes_dec_blk(const unsigned char in_blk[], unsigned char out_blk[], const aes_ctx cx[1]);

	// Byte size of one lookup table. The column macros address the second,
	// third and fourth table of a set as table+tlen, table+2*tlen and
	// table+3*tlen, so the four tables of a set must be laid out back to back.
	#define tlen 1024 // length of each of 4 'xor' arrays (256 32-bit words)

	// Offsets from %esp to the stack parameters, valid while exactly one
	// register has been pushed since entry. Code that has pushed more
	// registers adds 4 bytes per extra push (in_blk+4, out_blk+12).

	#define in_blk 8 // input byte array address parameter
	#define out_blk 12 // output byte array address parameter
	#define ctx 16 // AES context structure

	// Byte offsets of the fields read from the AES context structure

	#define ekey 0 // encryption key schedule base address
	#define nrnd 256 // number of rounds
	#define dkey 260 // decryption key schedule base address

	// Register mapping for the encrypt and decrypt subroutines.
	// Inside the round macros r0 and r2 take turns holding column 0 and
	// serving as the byte-index register, r1, r4 and r5 hold the other three
	// columns, and r3 is the scratch register for table indices.

	#define r0 eax
	#define r1 ebx
	#define r2 ecx
	#define r3 edx
	#define r4 esi
	#define r5 edi

	// Low- and high-byte names of the four registers that have them, spelled
	// <reg>l / <reg>h so that l() and h() can build them by token pasting.

	#define eaxl al
	#define eaxh ah
	#define ebxl bl
	#define ebxh bh
	#define ecxl cl
	#define ecxh ch
	#define edxl dl
	#define edxh dh

	// h(reg) / l(reg) yield the high / low byte register of reg. The extra
	// level of indirection lets an argument such as r0 expand to eax before
	// the suffix is pasted on (eaxh -> ah). Only eax, ebx, ecx and edx have
	// entries above, so only r0..r3 may be passed to l() and h().

	#define _h(reg) reg##h
	#define h(reg) _h(reg)

	#define _l(reg) reg##l
	#define l(reg) _l(reg)

	// do_col: fold one 32-bit input column into four output columns.
	// Each of the four bytes of idx selects a 32-bit entry from one of four
	// consecutive 256-entry tables, and that entry is xored into one of the
	// output registers.

	// Parameters:
	// table base address of the first of the four tables
	// a1 register xored with the entry of table selected by byte 0 of idx
	// a2 register xored with the entry of table+tlen selected by byte 1
	// a3 register xored with the entry of table+2*tlen selected by byte 2
	// a4 register xored with the entry of table+3*tlen selected by byte 3
	// idx input register for the round (destroyed: it ends up holding its
	// own top byte); must be one of r0..r3 because l() and h() are applied
	// tmp scratch register for the table index (clobbered)

	#define do_col(table, a1,a2,a3,a4, idx, tmp) \
	movzx %l(idx),%tmp; \
	xor table(,%tmp,4),%a1; \
	movzx %h(idx),%tmp; \
	shr $16,%idx; \
	xor table+tlen(,%tmp,4),%a2; \
	movzx %l(idx),%tmp; \
	movzx %h(idx),%idx; \
	xor table+2*tlen(,%tmp,4),%a3; \
	xor table+3*tlen(,%idx,4),%a4;

	// do_fcol: first column step of a forward round.
	// Initialises the output registers from the key schedule and then xors in
	// the table entries selected by the four bytes of idx, using the same
	// byte-to-table assignment as do_col.
	// Round key words loaded: a1 <- sched+0, a2 <- sched+12, a3 <- sched+8,
	// a4 <- sched+4.
	// sched is a memory operand; the byte offset is written directly in front
	// of it ("12 sched" becomes e.g. "12 +16(%ebp)"), which is why the call
	// sites pass either a bare (%ebp) or a displacement with an explicit sign.
	// tmp is a scratch register and is clobbered.
	// NB1: original value of a3 is in idx on exit
	// NB2: original values of a1,a2,a4 aren't used
	#define do_fcol(table, a1,a2,a3,a4, idx, tmp, sched) \
	mov 0 sched,%a1; \
	movzx %l(idx),%tmp; \
	mov 12 sched,%a2; \
	xor table(,%tmp,4),%a1; \
	mov 4 sched,%a4; \
	movzx %h(idx),%tmp; \
	shr $16,%idx; \
	xor table+tlen(,%tmp,4),%a2; \
	movzx %l(idx),%tmp; \
	movzx %h(idx),%idx; \
	xor table+3*tlen(,%idx,4),%a4; \
	mov %a3,%idx; \
	mov 8 sched,%a3; \
	xor table+2*tlen(,%tmp,4),%a3;

	// do_icol: first column step of an inverse round.
	// Same instruction sequence as do_fcol except for which round key word
	// initialises a2 and a4.
	// Round key words loaded: a1 <- sched+0, a2 <- sched+4, a3 <- sched+8,
	// a4 <- sched+12.
	// sched is a memory operand; the byte offset is written directly in front
	// of it ("12 sched" becomes e.g. "12 -16(%ebp)"), which is why the call
	// sites pass either a bare (%ebp) or a displacement with an explicit sign.
	// tmp is a scratch register and is clobbered.
	// NB1: original value of a3 is in idx on exit
	// NB2: original values of a1,a2,a4 aren't used
	#define do_icol(table, a1,a2,a3,a4, idx, tmp, sched) \
	mov 0 sched,%a1; \
	movzx %l(idx),%tmp; \
	mov 4 sched,%a2; \
	xor table(,%tmp,4),%a1; \
	mov 12 sched,%a4; \
	movzx %h(idx),%tmp; \
	shr $16,%idx; \
	xor table+tlen(,%tmp,4),%a2; \
	movzx %l(idx),%tmp; \
	movzx %h(idx),%idx; \
	xor table+3*tlen(,%idx,4),%a4; \
	mov %a3,%idx; \
	mov 8 sched,%a3; \
	xor table+2*tlen(,%tmp,4),%a3;


	// save(a1, a2): store register a2 into 32-bit stack slot number a1,
	// i.e. at byte offset 4*a1 from %esp.
	// The argument order is (slot, register), the reverse of restore().
	// original Gladman had conditional saves to MMX regs.
	#define save(a1, a2) \
	mov %a2,4*a1(%esp)

	// restore(a1, a2): load register a1 from 32-bit stack slot number a2,
	// i.e. from byte offset 4*a2 from %esp.
	// The argument order is (register, slot), the reverse of save().
	#define restore(a1, a2) \
	mov 4*a2(%esp),%a1

	// Forward round macros. fwd_rnd1 and fwd_rnd2 are used alternately: a
	// fwd_rnd1/fwd_rnd2 pair is entered with the column values of the
	// previous round in r0,r1,r4,r5 and exits with the new values in the
	// same registers. Stack slots 0 and 1 (0(%esp) and 4(%esp)) are used
	// for temporary storage.

	// fwd_rnd1(arg, table): one forward round.
	// arg: memory operand addressing the round key for this round
	// table: base address of the four-table set to use
	// r1 and r5 are spilled first because they are rewritten as outputs
	// before their old values have been consumed. Each do_fcol/do_col step
	// then consumes one input column through r0; r3 is the scratch register.
	// The idx=... notes name the input column that r0 holds at that step.
	//
	// round column values
	// on entry: r0,r1,r4,r5
	// on exit: r2,r1,r4,r5
	#define fwd_rnd1(arg, table) \
	save (0,r1); \
	save (1,r5); \
	\
	/* compute new column values */ \
	do_fcol(table, r2,r5,r4,r1, r0,r3, arg); /* idx=r0 */ \
	do_col (table, r4,r1,r2,r5, r0,r3); /* idx=r4 */ \
	restore(r0,0); \
	do_col (table, r1,r2,r5,r4, r0,r3); /* idx=r1 */ \
	restore(r0,1); \
	do_col (table, r5,r4,r1,r2, r0,r3); /* idx=r5 */

	// fwd_rnd2(arg, table): one forward round, the counterpart of fwd_rnd1
	// with the roles of r0 and r2 exchanged, so that it accepts the register
	// layout fwd_rnd1 leaves behind and hands back the layout fwd_rnd1 expects.
	// arg: memory operand addressing the round key for this round
	// table: base address of the four-table set to use
	// Uses stack slots 0 and 1 for the spilled r1 and r5; r3 is scratch.
	// The idx=... notes name the input column that r2 holds at that step.
	//
	// round column values
	// on entry: r2,r1,r4,r5
	// on exit: r0,r1,r4,r5
	#define fwd_rnd2(arg, table) \
	save (0,r1); \
	save (1,r5); \
	\
	/* compute new column values */ \
	do_fcol(table, r0,r5,r4,r1, r2,r3, arg); /* idx=r2 */ \
	do_col (table, r4,r1,r0,r5, r2,r3); /* idx=r4 */ \
	restore(r2,0); \
	do_col (table, r1,r0,r5,r4, r2,r3); /* idx=r1 */ \
	restore(r2,1); \
	do_col (table, r5,r4,r1,r0, r2,r3); /* idx=r5 */

	// Inverse (decryption) round macros. inv_rnd1 and inv_rnd2 are used
	// alternately: an inv_rnd1/inv_rnd2 pair is entered with the column
	// values of the previous round in r0,r1,r4,r5 and exits with the new
	// values in the same registers. Stack slots 0 and 1 (0(%esp) and
	// 4(%esp)) are used for temporary storage.

	// inv_rnd1(arg, table): one inverse round.
	// arg: memory operand addressing the round key for this round
	// table: base address of the four-table set to use
	// r1 and r5 are spilled first because they are rewritten as outputs
	// before their old values have been consumed. Each do_icol/do_col step
	// then consumes one input column through r0; r3 is the scratch register.
	// The idx=... notes name the input column that r0 holds at that step.
	//
	// round column values
	// on entry: r0,r1,r4,r5
	// on exit: r2,r1,r4,r5
	#define inv_rnd1(arg, table) \
	save (0,r1); \
	save (1,r5); \
	\
	/* compute new column values */ \
	do_icol(table, r2,r1,r4,r5, r0,r3, arg); /* idx=r0 */ \
	do_col (table, r4,r5,r2,r1, r0,r3); /* idx=r4 */ \
	restore(r0,0); \
	do_col (table, r1,r4,r5,r2, r0,r3); /* idx=r1 */ \
	restore(r0,1); \
	do_col (table, r5,r2,r1,r4, r0,r3); /* idx=r5 */

	// inv_rnd2(arg, table): one inverse round, the counterpart of inv_rnd1
	// with the roles of r0 and r2 exchanged, so that it accepts the register
	// layout inv_rnd1 leaves behind and hands back the layout inv_rnd1 expects.
	// arg: memory operand addressing the round key for this round
	// table: base address of the four-table set to use
	// Uses stack slots 0 and 1 for the spilled r1 and r5; r3 is scratch.
	// The idx=... notes name the input column that r2 holds at that step.
	//
	// round column values
	// on entry: r2,r1,r4,r5
	// on exit: r0,r1,r4,r5
	#define inv_rnd2(arg, table) \
	save (0,r1); \
	save (1,r5); \
	\
	/* compute new column values */ \
	do_icol(table, r0,r1,r4,r5, r2,r3, arg); /* idx=r2 */ \
	do_col (table, r4,r5,r0,r1, r2,r3); /* idx=r4 */ \
	restore(r2,0); \
	do_col (table, r1,r4,r5,r0, r2,r3); /* idx=r1 */ \
	restore(r2,1); \
	do_col (table, r5,r0,r1,r4, r2,r3); /* idx=r5 */

	// AES (Rijndael) Encryption Subroutine
	//
	// aes_rval aes_enc_blk(const unsigned char in_blk[], unsigned char out_blk[],
	// const aes_ctx cx[1])
	//
	// Encrypts the 16 bytes at in_blk with the key schedule found at offset
	// ekey of the context and writes the 16-byte result to out_blk.
	// Returns 1 in %eax. %ebp, %ebx, %esi and %edi are saved and restored;
	// %ecx, %edx and the flags are clobbered.
	//
	// NOTE(review): the global label aes_enc_blk is already defined earlier
	// in this file. A second definition of the same symbol is an assembler
	// error, so this copy looks like a duplicate that should be removed.

	.global aes_enc_blk

	.extern ft_tab
	.extern fl_tab

	.align 4

	aes_enc_blk:
	push %ebp
	mov ctx(%esp),%ebp // pointer to context

	// CAUTION: the order and the values used in these assigns
	// rely on the register mappings
	// (label 1 below is not referenced by any jump in this file)

	1: push %ebx
	mov in_blk+4(%esp),%r2
	push %esi
	mov nrnd(%ebp),%r3 // number of rounds
	push %edi
	#if ekey != 0
	lea ekey(%ebp),%ebp // key pointer
	#endif

	// input four columns and xor in first round key

	mov (%r2),%r0
	mov 4(%r2),%r1
	mov 8(%r2),%r4
	mov 12(%r2),%r5
	xor (%ebp),%r0
	xor 4(%ebp),%r1
	xor 8(%ebp),%r4
	xor 12(%ebp),%r5

	// Select the entry point into the unrolled rounds. %ebp is advanced so
	// that the final round always finds its key at +144(%ebp). Any round
	// count other than 10 or 12 takes the 14-round path.

	sub $8,%esp // space for register saves on stack
	add $16,%ebp // increment to next round key
	sub $10,%r3
	je 4f // 10 rounds for 128-bit key
	add $32,%ebp
	sub $2,%r3
	je 3f // 12 rounds for 192-bit key
	add $32,%ebp

	2: fwd_rnd1( -64(%ebp) ,ft_tab) // 14 rounds for 256-bit key
	fwd_rnd2( -48(%ebp) ,ft_tab)
	3: fwd_rnd1( -32(%ebp) ,ft_tab) // 12 rounds for 192-bit key
	fwd_rnd2( -16(%ebp) ,ft_tab)
	4: fwd_rnd1( (%ebp) ,ft_tab) // 10 rounds for 128-bit key
	fwd_rnd2( +16(%ebp) ,ft_tab)
	fwd_rnd1( +32(%ebp) ,ft_tab)
	fwd_rnd2( +48(%ebp) ,ft_tab)
	fwd_rnd1( +64(%ebp) ,ft_tab)
	fwd_rnd2( +80(%ebp) ,ft_tab)
	fwd_rnd1( +96(%ebp) ,ft_tab)
	fwd_rnd2(+112(%ebp) ,ft_tab)
	fwd_rnd1(+128(%ebp) ,ft_tab)
	fwd_rnd2(+144(%ebp) ,fl_tab) // last round uses a different table

	// move final values to the output array. CAUTION: the
	// order of these assigns rely on the register mappings
	// (%ebp is reused as the output pointer; out_blk+12 accounts for the
	// three additional registers still pushed at that point)

	add $8,%esp
	mov out_blk+12(%esp),%ebp
	mov %r5,12(%ebp)
	pop %edi
	mov %r4,8(%ebp)
	pop %esi
	mov %r1,4(%ebp)
	pop %ebx
	mov %r0,(%ebp)
	pop %ebp
	mov $1,%eax
	ret

	// AES (Rijndael) Decryption Subroutine
	//
	// aes_rval aes_dec_blk(const unsigned char in_blk[], unsigned char out_blk[],
	// const aes_ctx cx[1])
	//
	// Decrypts the 16 bytes at in_blk with the key schedule found at offset
	// dkey of the context and writes the 16-byte result to out_blk. The
	// schedule is consumed from its last round key (dkey + 16*nrnd) down to
	// its first.
	// Returns 1 in %eax. %ebp, %ebx, %esi and %edi are saved and restored;
	// %ecx, %edx and the flags are clobbered.
	//
	// NOTE(review): the global label aes_dec_blk is already defined earlier
	// in this file. A second definition of the same symbol is an assembler
	// error, so this copy looks like a duplicate that should be removed.

	.global aes_dec_blk

	.extern it_tab
	.extern il_tab

	.align 4

	aes_dec_blk:
	push %ebp
	mov ctx(%esp),%ebp // pointer to context

	// CAUTION: the order and the values used in these assigns
	// rely on the register mappings
	// (label 1 below is not referenced by any jump in this file)

	1: push %ebx
	mov in_blk+4(%esp),%r2
	push %esi
	mov nrnd(%ebp),%r3 // number of rounds
	push %edi
	#if dkey != 0
	lea dkey(%ebp),%ebp // key pointer
	#endif
	mov %r3,%r0
	shl $4,%r0
	add %r0,%ebp

	// input four columns and xor in first round key
	// (%ebp now points at the last round key: key pointer + 16*nrnd)

	mov (%r2),%r0
	mov 4(%r2),%r1
	mov 8(%r2),%r4
	mov 12(%r2),%r5
	xor (%ebp),%r0
	xor 4(%ebp),%r1
	xor 8(%ebp),%r4
	xor 12(%ebp),%r5

	// Select the entry point into the unrolled rounds. %ebp is moved down so
	// that the final round always finds its key at -144(%ebp). Any round
	// count other than 10 or 12 takes the 14-round path.

	sub $8,%esp // space for register saves on stack
	sub $16,%ebp // step back to the preceding round key
	sub $10,%r3
	je 4f // 10 rounds for 128-bit key
	sub $32,%ebp
	sub $2,%r3
	je 3f // 12 rounds for 192-bit key
	sub $32,%ebp

	2: inv_rnd1( +64(%ebp), it_tab) // 14 rounds for 256-bit key
	inv_rnd2( +48(%ebp), it_tab)
	3: inv_rnd1( +32(%ebp), it_tab) // 12 rounds for 192-bit key
	inv_rnd2( +16(%ebp), it_tab)
	4: inv_rnd1( (%ebp), it_tab) // 10 rounds for 128-bit key
	inv_rnd2( -16(%ebp), it_tab)
	inv_rnd1( -32(%ebp), it_tab)
	inv_rnd2( -48(%ebp), it_tab)
	inv_rnd1( -64(%ebp), it_tab)
	inv_rnd2( -80(%ebp), it_tab)
	inv_rnd1( -96(%ebp), it_tab)
	inv_rnd2(-112(%ebp), it_tab)
	inv_rnd1(-128(%ebp), it_tab)
	inv_rnd2(-144(%ebp), il_tab) // last round uses a different table

	// move final values to the output array. CAUTION: the
	// order of these assigns rely on the register mappings
	// (%ebp is reused as the output pointer; out_blk+12 accounts for the
	// three additional registers still pushed at that point)

	add $8,%esp
	mov out_blk+12(%esp),%ebp
	mov %r5,12(%ebp)
	pop %edi
	mov %r4,8(%ebp)
	pop %esi
	mov %r1,4(%ebp)
	pop %ebx
	mov %r0,(%ebp)
	pop %ebp
	mov $1,%eax
	ret