@ Source blob: 0488455f0e1d848bb715dfe59e7ed4505658e5a0
@ (code-viewer blame residue on the first line read:
@  "Kinson Chik", a8fa74c, 2011-07-29 11:33:41 -0700)
@
@ Syntax: GNU as, ARM (A32) instruction set, '@' starts a comment.
@
@-----------------------------------------------------------------------
@ bn_mul_mont -- word-by-word Montgomery multiplication on 32-bit limbs.
@
@ In:    r0       = rp, result buffer (num words)
@        r1       = ap, first operand (num words)
@        r2       = bp, second operand (num words)
@        r3       = np, modulus (num words)
@        [sp,#0]  = pointer to the n0 word (dereferenced once below)
@        [sp,#4]  = num, number of 32-bit words
@        NOTE(review): this matches a C signature of the form
@        int bn_mul_mont(rp, ap, bp, np, &n0, num) -- confirm the exact
@        types against the C caller.
@ Out:   r0 = 0 if num < 2 (nothing is read or written through the
@             pointers in that case), otherwise r0 = 1 and rp[0..num-1]
@             holds the reduced product.
@ Saved: r4-r12 and lr are pushed on entry and restored before return.
@ Stack: allocates (num+1) words of scratch, called tp[] below, plus
@        12 words for saved registers.
@
@ Frame layout once r0 has been set to &tp[num-1] (offsets from r0):
@        [r0,#0]          tp[num-1]
@        [r0,#4]          tp[num]      (top carry word)
@        [r0,#2*4..#11*4] saved r4-r12, lr
@        [r0,#12*4]       saved r0 = rp
@        [r0,#13*4]       saved r2 = bp; reused as the running bp pointer
@        [r0,#14*4]       incoming &n0 slot; overwritten with the value *n0
@        [r0,#15*4]       incoming num slot; overwritten with &bp[num-1]
@ Note that the last two slots are the caller-provided stack arguments,
@ which this routine overwrites in place.
@-----------------------------------------------------------------------
.text

.global	bn_mul_mont
.type	bn_mul_mont,%function

.align	2
bn_mul_mont:
	stmdb	sp!,{r0,r2}		@ sp points at argument block
	ldr	r0,[sp,#3*4]		@ load num
	cmp	r0,#2
	movlt	r0,#0			@ num < 2: return 0 ...
	addlt	sp,sp,#2*4		@ ... after dropping the {r0,r2} push
	blt	.Labrt

	stmdb	sp!,{r4-r12,lr}		@ save 10 registers

	mov	r0,r0,lsl#2		@ rescale r0 for byte count
	sub	sp,sp,r0		@ alloca(4*num)
	sub	sp,sp,#4		@ +extra dword
	sub	r0,r0,#4		@ "num=num-1"
	add	r4,r2,r0		@ &bp[num-1]

	add	r0,sp,r0		@ r0 to point at &tp[num-1]
	ldr	r8,[r0,#14*4]		@ &n0
	ldr	r2,[r2]			@ bp[0]
	ldr	r5,[r1],#4		@ ap[0],ap++
	ldr	r6,[r3],#4		@ np[0],np++
	ldr	r8,[r8]			@ *n0
	str	r4,[r0,#15*4]		@ save &bp[num-1] (outer-loop end marker)

	@ First pass (i=0): tp = (ap*bp[0] + m*np) >> 32, with
	@ m = low32(ap[0]*bp[0]) * n0.  r11:r10 carries the ap*bp chain,
	@ r14:r12 carries the np*m chain.
	umull	r10,r11,r5,r2		@ ap[0]*bp[0]
	str	r8,[r0,#14*4]		@ save n0 value
	mul	r8,r10,r8		@ "tp[0]"*n0
	mov	r12,#0
	umlal	r10,r12,r6,r8		@ np[0]*n0+"t[0]"
	mov	r4,sp			@ r4 = &tp[0]

.L1st:
	ldr	r5,[r1],#4		@ ap[j],ap++
	mov	r10,r11			@ previous high word becomes the carry-in
	mov	r11,#0
	umlal	r10,r11,r5,r2		@ ap[j]*bp[0]
	ldr	r6,[r3],#4		@ np[j],np++
	mov	r14,#0
	umlal	r12,r14,r6,r8		@ np[j]*n0
	adds	r12,r12,r10
	str	r12,[r4],#4		@ tp[j-1]=,tp++
	adc	r12,r14,#0
	cmp	r4,r0			@ stop once r4 reaches &tp[num-1]
	bne	.L1st

	adds	r12,r12,r11		@ fold the two carry chains together
	mov	r14,#0
	adc	r14,r14,#0
	ldr	r4,[r0,#13*4]		@ restore bp
	str	r12,[r0]		@ tp[num-1]=
	ldr	r8,[r0,#14*4]		@ restore n0
	str	r14,[r0,#4]		@ tp[num]=

	@ Remaining passes (i=1..num-1): tp = (tp + ap*bp[i] + m*np) >> 32.
.Louter:
	sub	r7,r0,sp		@ 4*(num-1), the "original" r0 byte count
	sub	r1,r1,r7		@ "rewind" ap to &ap[1]
	sub	r3,r3,r7		@ "rewind" np to &np[1]
	ldr	r2,[r4,#4]!		@ *(++bp)
	ldr	r5,[r1,#-4]		@ ap[0]
	ldr	r6,[r3,#-4]		@ np[0]
	ldr	r10,[sp]		@ tp[0]
	ldr	r7,[sp,#4]		@ tp[1]

	mov	r11,#0
	umlal	r10,r11,r5,r2		@ ap[0]*bp[i]+tp[0]
	str	r4,[r0,#13*4]		@ save bp
	mul	r8,r10,r8		@ m = low word * n0
	mov	r12,#0
	umlal	r10,r12,r6,r8		@ np[0]*n0+"tp[0]"
	mov	r4,sp			@ r4 = &tp[0]

.Linner:
	ldr	r5,[r1],#4		@ ap[j],ap++
	adds	r10,r11,r7		@ +=tp[j]
	mov	r11,#0
	umlal	r10,r11,r5,r2		@ ap[j]*bp[i]
	ldr	r6,[r3],#4		@ np[j],np++
	mov	r14,#0
	umlal	r12,r14,r6,r8		@ np[j]*n0
	ldr	r7,[r4,#8]		@ tp[j+1]
	adc	r11,r11,#0		@ carry from the "+=tp[j]" above (flags untouched since)
	adds	r12,r12,r10
	str	r12,[r4],#4		@ tp[j-1]=,tp++
	adc	r12,r14,#0
	cmp	r4,r0			@ stop once r4 reaches &tp[num-1]
	bne	.Linner

	adds	r12,r12,r11		@ fold the two carry chains together
	mov	r14,#0
	adc	r14,r14,#0
	adds	r12,r12,r7		@ + previous tp[num]
	adc	r14,r14,#0
	ldr	r4,[r0,#13*4]		@ restore bp
	ldr	r7,[r0,#15*4]		@ restore &bp[num-1]
	str	r12,[r0]		@ tp[num-1]=
	ldr	r8,[r0,#14*4]		@ restore n0
	str	r14,[r0,#4]		@ tp[num]=

	cmp	r4,r7			@ was that the last word of bp?
	bne	.Louter

	@ Final reduction: rp = tp - np, then keep either that difference
	@ or the unreduced tp depending on whether the subtraction borrowed.
	ldr	r2,[r0,#12*4]		@ pull rp
	add	r0,r0,#4		@ r0 to point at &tp[num]
	sub	r5,r0,sp		@ "original" num value (in bytes)
	mov	r4,sp			@ "rewind" r4
	mov	r1,r4			@ "borrow" r1
	sub	r3,r3,r5		@ "rewind" r3 to &np[0]

	subs	r7,r7,r7		@ r7-r7 sets C, i.e. "no borrow" for the sbcs chain
.Lsub:	ldr	r7,[r4],#4
	ldr	r6,[r3],#4
	sbcs	r7,r7,r6		@ tp[j]-np[j]
	str	r7,[r2],#4		@ rp[j]=
	teq	r4,r0			@ preserve carry
	bne	.Lsub
	sbcs	r14,r14,#0		@ upmost carry: r14 (= tp[num]) minus the borrow
	mov	r4,sp			@ "rewind" r4
	sub	r2,r2,r5		@ "rewind" r2

	@ r14 is used as a select mask (expected to be 0 or all-ones):
	@ r1 = (tp & r14) | (rp & ~r14)
	and	r1,r4,r14
	bic	r3,r2,r14
	orr	r1,r1,r3		@ ap=borrow?tp:rp

.Lcopy:	ldr	r7,[r1],#4		@ copy or in-place refresh
	str	sp,[r4],#4		@ zap tp (overwrite scratch word with the sp value)
	str	r7,[r2],#4
	cmp	r4,r0			@ num words: stop at &tp[num]
	bne	.Lcopy

	add	sp,r0,#4		@ skip over tp[num+1]
	ldmia	sp!,{r4-r12,lr}		@ restore registers
	add	sp,sp,#2*4		@ skip over {r0,r2}
	mov	r0,#1			@ success
.Labrt:	tst	lr,#1			@ bit 0 of lr set => Thumb caller
	moveq	pc,lr			@ be binary compatible with V4, yet
	.word	0xe12fff1e		@ interoperable with Thumb ISA:-)  (encoding of "bx lr")
.size	bn_mul_mont,.-bn_mul_mont
.asciz	"Montgomery multiplication for ARMv4, CRYPTOGAMS by <appro@openssl.org>"
.align	2