blob: 234cbc70474feb34dc8a389feaeb0bf7c38f4ed7 [file] [log] [blame]
Mike Klein894d5612017-03-07 07:59:52 -05001/*
2 * Copyright 2017 Google Inc.
3 *
4 * Use of this source code is governed by a BSD-style license that can be
5 * found in the LICENSE file.
6 */
7
8// This file is generated semi-automatically with this command:
9// $ src/jumper/build_stages.py
10
11#include <stdint.h>
12
// CODE is the declaration prefix used for every array below: it gives the array
// C linkage and asks the toolchain to place it in a code section, so the
// 32-bit words can be executed as instructions.
//   - MSVC:   a section named "code" declared read+execute via #pragma section.
//   - Mach-O: the __TEXT,__text section.
//   - other:  the .text section.
13#if defined(_MSC_VER)
14 #pragma section("code", read,execute)
15 #define CODE extern "C" __declspec(allocate("code"))
16#elif defined(__MACH__)
17 #define CODE extern "C" __attribute__((section("__TEXT,__text")))
18#else
19 #define CODE extern "C" __attribute__((section(".text")))
20#endif
21
22#if defined(__aarch64__)
23
// Pipeline driver (AArch64).
// Saves x19-x23 and x30, keeps x0 (running index), x1+8 (program pointer past the
// first entry), x2 and x3 (limit) in callee-saved registers, and loads the first
// stage pointer from [x1] into x23.
// While index + 4 <= limit (unsigned compare, b.ls) it zeroes v0-v7 and calls the
// stage with x0 = index, x1 = advanced program pointer, x2 = original x2, then
// advances the index by 4.
// Returns in x0 the first index that was NOT processed (equal to the incoming x0
// if not even one group of 4 fits).
24CODE const uint32_t sk_start_pipeline_aarch64[] = {
25 0xa9bd5bf7, //stp x23, x22, [sp, #-48]!
26 0xa90153f5, //stp x21, x20, [sp, #16]
27 0xa9027bf3, //stp x19, x30, [sp, #32]
28 0xaa0103f5, //mov x21, x1
29 0xf84086b7, //ldr x23, [x21], #8
30 0xaa0003f6, //mov x22, x0
31 0xaa0303f3, //mov x19, x3
32 0xaa0203f4, //mov x20, x2
33 0x910012c8, //add x8, x22, #0x4
34 0xeb13011f, //cmp x8, x19
35 0x54000069, //b.ls 34 <sk_start_pipeline_aarch64+0x34> // b.plast
36 0xaa1603e0, //mov x0, x22
37 0x14000012, //b 78 <sk_start_pipeline_aarch64+0x78>
38 0x6f00e400, //movi v0.2d, #0x0
39 0x6f00e401, //movi v1.2d, #0x0
40 0x6f00e402, //movi v2.2d, #0x0
41 0x6f00e403, //movi v3.2d, #0x0
42 0x6f00e404, //movi v4.2d, #0x0
43 0x6f00e405, //movi v5.2d, #0x0
44 0x6f00e406, //movi v6.2d, #0x0
45 0x6f00e407, //movi v7.2d, #0x0
46 0xaa1603e0, //mov x0, x22
47 0xaa1503e1, //mov x1, x21
48 0xaa1403e2, //mov x2, x20
49 0xd63f02e0, //blr x23
50 0x910022c8, //add x8, x22, #0x8
51 0x910012c0, //add x0, x22, #0x4
52 0xeb13011f, //cmp x8, x19
53 0xaa0003f6, //mov x22, x0
54 0x54fffe09, //b.ls 34 <sk_start_pipeline_aarch64+0x34> // b.plast
55 0xa9427bf3, //ldp x19, x30, [sp, #32]
56 0xa94153f5, //ldp x21, x20, [sp, #16]
57 0xa8c35bf7, //ldp x23, x22, [sp], #48
58 0xd65f03c0, //ret
59};
60
// A single `ret`: unlike the other stages it does not load and jump to a next
// stage pointer, so it returns to whoever called into the chain (x30).
61CODE const uint32_t sk_just_return_aarch64[] = {
62 0xd65f03c0, //ret
63};
64
// Seeds the vector registers from the index and a context value.
// Reads ctx = [x1] and the next stage pointer = [x1+8]; x1 is advanced by 16.
//   v0 = float(x0 splat) + splat([x2+4]) + the four floats at [x2+20]
//   v1 = float(splat of the 32-bit int at [ctx]) + splat([x2+4])
//   v2 = splat([x2])
//   v3..v7 = 0
// NOTE(review): [x2], [x2+4] and [x2+20..35] look like 1.0f, 0.5f and per-lane
// offsets from a shared constants block -- confirm against its definition.
65CODE const uint32_t sk_seed_shader_aarch64[] = {
66 0xaa0203e9, //mov x9, x2
67 0xa9400c28, //ldp x8, x3, [x1]
68 0x4ddfc922, //ld1r {v2.4s}, [x9], #4
69 0x3cc14047, //ldur q7, [x2, #20]
70 0x4e040c00, //dup v0.4s, w0
71 0x4d40c901, //ld1r {v1.4s}, [x8]
72 0x4d40c926, //ld1r {v6.4s}, [x9]
73 0x4e21d800, //scvtf v0.4s, v0.4s
74 0x91004028, //add x8, x1, #0x10
75 0x4e21d821, //scvtf v1.4s, v1.4s
76 0x4e26d400, //fadd v0.4s, v0.4s, v6.4s
77 0x6f00e403, //movi v3.2d, #0x0
78 0x6f00e404, //movi v4.2d, #0x0
79 0x6f00e405, //movi v5.2d, #0x0
80 0x4e26d421, //fadd v1.4s, v1.4s, v6.4s
81 0x6f00e406, //movi v6.2d, #0x0
82 0x4e20d4e0, //fadd v0.4s, v7.4s, v0.4s
83 0x6f00e407, //movi v7.2d, #0x0
84 0xaa0803e1, //mov x1, x8
85 0xd61f0060, //br x3
86};
87
// Loads four 32-bit lanes from ctx (= [x1]) and splats lane 0..3 across
// v0..v3 respectively, then tail-jumps to the next stage ([x1+8]; x1 += 16).
88CODE const uint32_t sk_constant_color_aarch64[] = {
89 0xa8c10c28, //ldp x8, x3, [x1], #16
90 0x3dc00103, //ldr q3, [x8]
91 0x4e040460, //dup v0.4s, v3.s[0]
92 0x4e0c0461, //dup v1.4s, v3.s[1]
93 0x4e140462, //dup v2.4s, v3.s[2]
94 0x4e1c0463, //dup v3.4s, v3.s[3]
95 0xd61f0060, //br x3
96};
97
// Zeroes v0..v3, then tail-jumps to the next stage pointer ([x1]; x1 += 8).
98CODE const uint32_t sk_clear_aarch64[] = {
99 0xf8408423, //ldr x3, [x1], #8
100 0x6f00e400, //movi v0.2d, #0x0
101 0x6f00e401, //movi v1.2d, #0x0
102 0x6f00e402, //movi v2.2d, #0x0
103 0x6f00e403, //movi v3.2d, #0x0
104 0xd61f0060, //br x3
105};
106
// Lane-wise float add: v0..v3 += v4..v7. Tail-jumps to the next stage
// ([x1]; x1 += 8).
107CODE const uint32_t sk_plus__aarch64[] = {
108 0xf8408423, //ldr x3, [x1], #8
109 0x4e24d400, //fadd v0.4s, v0.4s, v4.4s
110 0x4e25d421, //fadd v1.4s, v1.4s, v5.4s
111 0x4e26d442, //fadd v2.4s, v2.4s, v6.4s
112 0x4e27d463, //fadd v3.4s, v3.4s, v7.4s
113 0xd61f0060, //br x3
114};
115
// k = splat([x2]) - v3; then v0..v3 += k * v4..v7 (fused multiply-add).
// NOTE(review): [x2] is presumably 1.0f, making this src + (1 - src_alpha) * dst
// with v0-v3 as source and v4-v7 as destination -- confirm.
116CODE const uint32_t sk_srcover_aarch64[] = {
117 0x4d40c850, //ld1r {v16.4s}, [x2]
118 0xf8408423, //ldr x3, [x1], #8
119 0x4ea3d610, //fsub v16.4s, v16.4s, v3.4s
120 0x4e24ce00, //fmla v0.4s, v16.4s, v4.4s
121 0x4e25ce01, //fmla v1.4s, v16.4s, v5.4s
122 0x4e26ce02, //fmla v2.4s, v16.4s, v6.4s
123 0x4e27ce03, //fmla v3.4s, v16.4s, v7.4s
124 0xd61f0060, //br x3
125};
126
// k = splat([x2]) - v7; then v0..v3 = v4..v7 + k * v0..v3.
// The sums are accumulated in copies (v16..v19) so v4..v7 are left unchanged.
// NOTE(review): [x2] is presumably 1.0f, making this dst + (1 - dst_alpha) * src
// -- confirm.
127CODE const uint32_t sk_dstover_aarch64[] = {
128 0x4d40c851, //ld1r {v17.4s}, [x2]
129 0xf8408423, //ldr x3, [x1], #8
130 0x4ea41c90, //mov v16.16b, v4.16b
131 0x4ea61cd2, //mov v18.16b, v6.16b
132 0x4ea7d634, //fsub v20.4s, v17.4s, v7.4s
133 0x4ea51cb1, //mov v17.16b, v5.16b
134 0x4ea71cf3, //mov v19.16b, v7.16b
135 0x4e20ce90, //fmla v16.4s, v20.4s, v0.4s
136 0x4e21ce91, //fmla v17.4s, v20.4s, v1.4s
137 0x4e22ce92, //fmla v18.4s, v20.4s, v2.4s
138 0x4e23ce93, //fmla v19.4s, v20.4s, v3.4s
139 0x4eb01e00, //mov v0.16b, v16.16b
140 0x4eb11e21, //mov v1.16b, v17.16b
141 0x4eb21e42, //mov v2.16b, v18.16b
142 0x4eb31e63, //mov v3.16b, v19.16b
143 0xd61f0060, //br x3
144};
145
// Lower clamp: v0..v3 = fmax(v0..v3, 0.0). Tail-jumps to the next stage.
146CODE const uint32_t sk_clamp_0_aarch64[] = {
147 0xf8408423, //ldr x3, [x1], #8
148 0x6f00e410, //movi v16.2d, #0x0
149 0x4e30f400, //fmax v0.4s, v0.4s, v16.4s
150 0x4e30f421, //fmax v1.4s, v1.4s, v16.4s
151 0x4e30f442, //fmax v2.4s, v2.4s, v16.4s
152 0x4e30f463, //fmax v3.4s, v3.4s, v16.4s
153 0xd61f0060, //br x3
154};
155
// Upper clamp: v0..v3 = fmin(v0..v3, splat([x2])).
// NOTE(review): [x2] is presumably the constant 1.0f -- confirm.
156CODE const uint32_t sk_clamp_1_aarch64[] = {
157 0x4d40c850, //ld1r {v16.4s}, [x2]
158 0xf8408423, //ldr x3, [x1], #8
159 0x4eb0f400, //fmin v0.4s, v0.4s, v16.4s
160 0x4eb0f421, //fmin v1.4s, v1.4s, v16.4s
161 0x4eb0f442, //fmin v2.4s, v2.4s, v16.4s
162 0x4eb0f463, //fmin v3.4s, v3.4s, v16.4s
163 0xd61f0060, //br x3
164};
165
// First v3 = fmin(v3, splat([x2])), then v0..v2 = fmin(v0..v2, v3): the first
// three registers are limited to the (already limited) fourth one.
// NOTE(review): [x2] is presumably 1.0f and v3 the alpha channel -- confirm.
166CODE const uint32_t sk_clamp_a_aarch64[] = {
167 0x4d40c850, //ld1r {v16.4s}, [x2]
168 0xf8408423, //ldr x3, [x1], #8
169 0x4eb0f463, //fmin v3.4s, v3.4s, v16.4s
170 0x4ea3f400, //fmin v0.4s, v0.4s, v3.4s
171 0x4ea3f421, //fmin v1.4s, v1.4s, v3.4s
172 0x4ea3f442, //fmin v2.4s, v2.4s, v3.4s
173 0xd61f0060, //br x3
174};
175
// Splat three consecutive 32-bit values from ctx (= [x1]) into v0, v1, v2:
//   v0 = splat(ctx[0]), v1 = splat(ctx[1]), v2 = splat(ctx[2]).
// v3 is not modified.
176CODE const uint32_t sk_set_rgb_aarch64[] = {
177 0xa8c10c28, //ldp x8, x3, [x1], #16
178 0xaa0803e9, //mov x9, x8
179 0x4ddfc920, //ld1r {v0.4s}, [x9], #4
180 0x91002108, //add x8, x8, #0x8
181 0x4d40c902, //ld1r {v2.4s}, [x8]
182 0x4d40c921, //ld1r {v1.4s}, [x9]
183 0xd61f0060, //br x3
184};
185
// Exchanges v0 and v2 (using v16 as the temporary); v1 and v3 are untouched.
186CODE const uint32_t sk_swap_rb_aarch64[] = {
187 0xf8408423, //ldr x3, [x1], #8
188 0x4ea01c10, //mov v16.16b, v0.16b
189 0x4ea21c40, //mov v0.16b, v2.16b
190 0x4eb01e02, //mov v2.16b, v16.16b
191 0xd61f0060, //br x3
192};
193
// Exchanges the two register groups: v0..v3 <-> v4..v7 (pairwise, v0<->v4,
// v1<->v5, v2<->v6, v3<->v7), using v16..v19 as temporaries.
194CODE const uint32_t sk_swap_aarch64[] = {
195 0xf8408423, //ldr x3, [x1], #8
196 0x4ea31c70, //mov v16.16b, v3.16b
197 0x4ea21c51, //mov v17.16b, v2.16b
198 0x4ea11c32, //mov v18.16b, v1.16b
199 0x4ea01c13, //mov v19.16b, v0.16b
200 0x4ea41c80, //mov v0.16b, v4.16b
201 0x4ea51ca1, //mov v1.16b, v5.16b
202 0x4ea61cc2, //mov v2.16b, v6.16b
203 0x4ea71ce3, //mov v3.16b, v7.16b
204 0x4eb31e64, //mov v4.16b, v19.16b
205 0x4eb21e45, //mov v5.16b, v18.16b
206 0x4eb11e26, //mov v6.16b, v17.16b
207 0x4eb01e07, //mov v7.16b, v16.16b
208 0xd61f0060, //br x3
209};
210
// Copies v0..v3 into v4..v7; v0..v3 keep their values.
211CODE const uint32_t sk_move_src_dst_aarch64[] = {
212 0xf8408423, //ldr x3, [x1], #8
213 0x4ea01c04, //mov v4.16b, v0.16b
214 0x4ea11c25, //mov v5.16b, v1.16b
215 0x4ea21c46, //mov v6.16b, v2.16b
216 0x4ea31c67, //mov v7.16b, v3.16b
217 0xd61f0060, //br x3
218};
219
// Copies v4..v7 into v0..v3; v4..v7 keep their values.
220CODE const uint32_t sk_move_dst_src_aarch64[] = {
221 0xf8408423, //ldr x3, [x1], #8
222 0x4ea41c80, //mov v0.16b, v4.16b
223 0x4ea51ca1, //mov v1.16b, v5.16b
224 0x4ea61cc2, //mov v2.16b, v6.16b
225 0x4ea71ce3, //mov v3.16b, v7.16b
226 0xd61f0060, //br x3
227};
228
// Multiplies v0, v1 and v2 by v3 lane-wise; v3 itself is unchanged.
229CODE const uint32_t sk_premul_aarch64[] = {
230 0xf8408423, //ldr x3, [x1], #8
231 0x6e23dc00, //fmul v0.4s, v0.4s, v3.4s
232 0x6e23dc21, //fmul v1.4s, v1.4s, v3.4s
233 0x6e23dc42, //fmul v2.4s, v2.4s, v3.4s
234 0xd61f0060, //br x3
235};
236
// scale = splat([x2]) / v3, forced to 0 in lanes where v3 == 0.0 (the fcmeq mask
// is cleared out of the quotient with bic), then v0..v2 *= scale.
// v3 itself is unchanged.
// NOTE(review): [x2] is presumably 1.0f, i.e. scale = 1/alpha -- confirm.
237CODE const uint32_t sk_unpremul_aarch64[] = {
238 0x4d40c850, //ld1r {v16.4s}, [x2]
239 0xf8408423, //ldr x3, [x1], #8
240 0x4ea0d871, //fcmeq v17.4s, v3.4s, #0.0
241 0x6e23fe10, //fdiv v16.4s, v16.4s, v3.4s
242 0x4e711e10, //bic v16.16b, v16.16b, v17.16b
243 0x6e20de00, //fmul v0.4s, v16.4s, v0.4s
244 0x6e21de01, //fmul v1.4s, v16.4s, v1.4s
245 0x6e22de02, //fmul v2.4s, v16.4s, v2.4s
246 0xd61f0060, //br x3
247};
248
// Piecewise transfer function applied independently to v0, v1 and v2 (v3 is not
// touched). With constants read from the block at x2:
//   A = [x2+0x34], B = [x2+0x38], C = [x2+60], K = [x2+64], T = [x2+0x44]
// each lane c becomes
//   c < T ?  c * K  :  A + (B + c*C) * c*c
// The comparison mask is built with fcmgt (T > c) and bsl picks the linear
// result where the mask is set, the polynomial otherwise.
// NOTE(review): the name suggests an sRGB-to-linear approximation; the actual
// constant values are not visible in this file.
249CODE const uint32_t sk_from_srgb_aarch64[] = {
250 0x9100e048, //add x8, x2, #0x38
251 0x4d40c910, //ld1r {v16.4s}, [x8]
252 0x9100d048, //add x8, x2, #0x34
253 0x2d47cc52, //ldp s18, s19, [x2, #60]
254 0x4d40c911, //ld1r {v17.4s}, [x8]
255 0x6e22dc54, //fmul v20.4s, v2.4s, v2.4s
256 0x4eb01e15, //mov v21.16b, v16.16b
257 0x4eb01e17, //mov v23.16b, v16.16b
258 0x4f921050, //fmla v16.4s, v2.4s, v18.s[0]
259 0x4eb11e36, //mov v22.16b, v17.16b
260 0x4eb11e38, //mov v24.16b, v17.16b
261 0x4e34ce11, //fmla v17.4s, v16.4s, v20.4s
262 0x6e20dc10, //fmul v16.4s, v0.4s, v0.4s
263 0x91011048, //add x8, x2, #0x44
264 0x4f921015, //fmla v21.4s, v0.4s, v18.s[0]
265 0x4e30ceb6, //fmla v22.4s, v21.4s, v16.4s
266 0x4d40c910, //ld1r {v16.4s}, [x8]
267 0xf8408423, //ldr x3, [x1], #8
268 0x6e21dc34, //fmul v20.4s, v1.4s, v1.4s
269 0x4f921037, //fmla v23.4s, v1.4s, v18.s[0]
270 0x4f939015, //fmul v21.4s, v0.4s, v19.s[0]
271 0x4f939032, //fmul v18.4s, v1.4s, v19.s[0]
272 0x4f939053, //fmul v19.4s, v2.4s, v19.s[0]
273 0x6ea0e600, //fcmgt v0.4s, v16.4s, v0.4s
274 0x6ea1e601, //fcmgt v1.4s, v16.4s, v1.4s
275 0x6ea2e602, //fcmgt v2.4s, v16.4s, v2.4s
276 0x4e34cef8, //fmla v24.4s, v23.4s, v20.4s
277 0x6e761ea0, //bsl v0.16b, v21.16b, v22.16b
278 0x6e781e41, //bsl v1.16b, v18.16b, v24.16b
279 0x6e711e62, //bsl v2.16b, v19.16b, v17.16b
280 0xd61f0060, //br x3
281};
282
// Piecewise transfer function applied independently to v0, v1 and v2 (v3 is not
// touched). For each lane c the code builds, from hardware estimates refined by
// one Newton-Raphson step each:
//   rs   = rsqrt(c)            (frsqrte + frsqrts)
//   sq   = 1 / rs   ~ sqrt(c)  (frecpe  + frecps)
//   ft   = rsqrt(rs) ~ c^(1/4) (frsqrte + frsqrts)
// and then evaluates, with constants from the block at x2,
//   hi = fmin(splat([x2]),  [x2+0x54] + sq*[x2+80] + ft*[x2+76])
//   lo = c * [x2+72]
//   result = (c < [x2+0x58]) ? lo : hi
// (mask via fcmgt, select via bsl).
// NOTE(review): the name suggests a linear-to-sRGB approximation and [x2] is
// presumably 1.0f; the constant values are not visible in this file.
283CODE const uint32_t sk_to_srgb_aarch64[] = {
284 0x6ea1d811, //frsqrte v17.4s, v0.4s
285 0x6ea1d835, //frsqrte v21.4s, v1.4s
286 0x6e31de37, //fmul v23.4s, v17.4s, v17.4s
287 0x6ea1d856, //frsqrte v22.4s, v2.4s
288 0x6e35deb9, //fmul v25.4s, v21.4s, v21.4s
289 0x4eb7fc17, //frsqrts v23.4s, v0.4s, v23.4s
290 0x91015048, //add x8, x2, #0x54
291 0x6e36deda, //fmul v26.4s, v22.4s, v22.4s
292 0x4eb9fc39, //frsqrts v25.4s, v1.4s, v25.4s
293 0x6e37de31, //fmul v17.4s, v17.4s, v23.4s
294 0x4d40c914, //ld1r {v20.4s}, [x8]
295 0x4ebafc5a, //frsqrts v26.4s, v2.4s, v26.4s
296 0x6e39deb5, //fmul v21.4s, v21.4s, v25.4s
297 0x4ea1da37, //frecpe v23.4s, v17.4s
298 0xbd405053, //ldr s19, [x2, #80]
299 0x91016048, //add x8, x2, #0x58
300 0x6e3aded6, //fmul v22.4s, v22.4s, v26.4s
301 0x4ea1dabb, //frecpe v27.4s, v21.4s
302 0x4e37fe3d, //frecps v29.4s, v17.4s, v23.4s
303 0x2d494052, //ldp s18, s16, [x2, #72]
304 0x4d40c918, //ld1r {v24.4s}, [x8]
305 0x4ea1dadc, //frecpe v28.4s, v22.4s
306 0x6e3ddef7, //fmul v23.4s, v23.4s, v29.4s
307 0x4e3bfebd, //frecps v29.4s, v21.4s, v27.4s
308 0x6e3ddf7b, //fmul v27.4s, v27.4s, v29.4s
309 0x4e3cfedd, //frecps v29.4s, v22.4s, v28.4s
310 0x6e3ddf9c, //fmul v28.4s, v28.4s, v29.4s
311 0x4eb41e9d, //mov v29.16b, v20.16b
312 0x6ea1da39, //frsqrte v25.4s, v17.4s
313 0x4f9312fd, //fmla v29.4s, v23.4s, v19.s[0]
314 0x4eb41e97, //mov v23.16b, v20.16b
315 0x4f92901a, //fmul v26.4s, v0.4s, v18.s[0]
316 0x4f931377, //fmla v23.4s, v27.4s, v19.s[0]
317 0x4f931394, //fmla v20.4s, v28.4s, v19.s[0]
318 0x4f929033, //fmul v19.4s, v1.4s, v18.s[0]
319 0x4f929052, //fmul v18.4s, v2.4s, v18.s[0]
320 0x6ea0e700, //fcmgt v0.4s, v24.4s, v0.4s
321 0x6ea1e701, //fcmgt v1.4s, v24.4s, v1.4s
322 0x6ea2e702, //fcmgt v2.4s, v24.4s, v2.4s
323 0x6e39df38, //fmul v24.4s, v25.4s, v25.4s
324 0x6ea1dabb, //frsqrte v27.4s, v21.4s
325 0x4eb8fe31, //frsqrts v17.4s, v17.4s, v24.4s
326 0x6ea1dadc, //frsqrte v28.4s, v22.4s
327 0x6e3bdf78, //fmul v24.4s, v27.4s, v27.4s
328 0x6e31df31, //fmul v17.4s, v25.4s, v17.4s
329 0x4eb8feb5, //frsqrts v21.4s, v21.4s, v24.4s
330 0x6e3cdf98, //fmul v24.4s, v28.4s, v28.4s
331 0x4f90123d, //fmla v29.4s, v17.4s, v16.s[0]
332 0x4d40c851, //ld1r {v17.4s}, [x2]
333 0x4eb8fed6, //frsqrts v22.4s, v22.4s, v24.4s
334 0x6e35df75, //fmul v21.4s, v27.4s, v21.4s
335 0x6e36df96, //fmul v22.4s, v28.4s, v22.4s
336 0xf8408423, //ldr x3, [x1], #8
337 0x4f9012b7, //fmla v23.4s, v21.4s, v16.s[0]
338 0x4f9012d4, //fmla v20.4s, v22.4s, v16.s[0]
339 0x4ebdf630, //fmin v16.4s, v17.4s, v29.4s
340 0x4eb7f635, //fmin v21.4s, v17.4s, v23.4s
341 0x4eb4f631, //fmin v17.4s, v17.4s, v20.4s
342 0x6e701f40, //bsl v0.16b, v26.16b, v16.16b
343 0x6e751e61, //bsl v1.16b, v19.16b, v21.16b
344 0x6e711e42, //bsl v2.16b, v18.16b, v17.16b
345 0xd61f0060, //br x3
346};
347
// Multiplies v0..v3 by the single float stored at ctx (= [x1]).
348CODE const uint32_t sk_scale_1_float_aarch64[] = {
349 0xa8c10c28, //ldp x8, x3, [x1], #16
350 0xbd400110, //ldr s16, [x8]
351 0x4f909000, //fmul v0.4s, v0.4s, v16.s[0]
352 0x4f909021, //fmul v1.4s, v1.4s, v16.s[0]
353 0x4f909042, //fmul v2.4s, v2.4s, v16.s[0]
354 0x4f909063, //fmul v3.4s, v3.4s, v16.s[0]
355 0xd61f0060, //br x3
356};
357
// Per-lane scale from a byte buffer. ctx (= [x1]) holds a pointer; four bytes are
// read from (*ctx + x0), one per lane, widened to 32 bits, converted to float
// (ucvtf) and multiplied by the float at [x2+12]. v0..v3 are then multiplied by
// that per-lane factor.
// NOTE(review): [x2+12] is presumably 1/255 -- confirm.
358CODE const uint32_t sk_scale_u8_aarch64[] = {
359 0xa8c10c28, //ldp x8, x3, [x1], #16
360 0xbd400c51, //ldr s17, [x2, #12]
361 0xf9400108, //ldr x8, [x8]
362 0x8b000108, //add x8, x8, x0
363 0x39400109, //ldrb w9, [x8]
364 0x3940050a, //ldrb w10, [x8, #1]
365 0x3940090b, //ldrb w11, [x8, #2]
366 0x39400d08, //ldrb w8, [x8, #3]
367 0x4e021d30, //mov v16.h[0], w9
368 0x4e061d50, //mov v16.h[1], w10
369 0x4e0a1d70, //mov v16.h[2], w11
370 0x4e0e1d10, //mov v16.h[3], w8
371 0x2f07b7f0, //bic v16.4h, #0xff, lsl #8
372 0x2f10a610, //uxtl v16.4s, v16.4h
373 0x6e21da10, //ucvtf v16.4s, v16.4s
374 0x4f919210, //fmul v16.4s, v16.4s, v17.s[0]
375 0x6e20de00, //fmul v0.4s, v16.4s, v0.4s
376 0x6e21de01, //fmul v1.4s, v16.4s, v1.4s
377 0x6e22de02, //fmul v2.4s, v16.4s, v2.4s
378 0x6e23de03, //fmul v3.4s, v16.4s, v3.4s
379 0xd61f0060, //br x3
380};
381
// Linear interpolation with one scalar weight t = float at ctx (= [x1]):
//   v0..v3 = v4..v7 + (v0..v3 - v4..v7) * t
// v4..v7 are left unchanged.
382CODE const uint32_t sk_lerp_1_float_aarch64[] = {
383 0xa8c10c28, //ldp x8, x3, [x1], #16
384 0x4ea4d411, //fsub v17.4s, v0.4s, v4.4s
385 0x4ea41c80, //mov v0.16b, v4.16b
386 0x4ea5d432, //fsub v18.4s, v1.4s, v5.4s
387 0xbd400110, //ldr s16, [x8]
388 0x4ea51ca1, //mov v1.16b, v5.16b
389 0x4f901220, //fmla v0.4s, v17.4s, v16.s[0]
390 0x4ea6d451, //fsub v17.4s, v2.4s, v6.4s
391 0x4f901241, //fmla v1.4s, v18.4s, v16.s[0]
392 0x4ea61cc2, //mov v2.16b, v6.16b
393 0x4ea7d472, //fsub v18.4s, v3.4s, v7.4s
394 0x4ea71ce3, //mov v3.16b, v7.16b
395 0x4f901222, //fmla v2.4s, v17.4s, v16.s[0]
396 0x4f901243, //fmla v3.4s, v18.4s, v16.s[0]
397 0xd61f0060, //br x3
398};
399
// Linear interpolation with a per-lane weight read from a byte buffer.
// ctx (= [x1]) holds a pointer; four bytes at (*ctx + x0) are widened, converted
// to float (ucvtf) and multiplied by the float at [x2+12] to give t per lane.
//   v0..v3 = v4..v7 + t * (v0..v3 - v4..v7)
// v0 is used as scratch for the byte conversion after (v0 - v4) has been saved.
// NOTE(review): [x2+12] is presumably 1/255 -- confirm.
400CODE const uint32_t sk_lerp_u8_aarch64[] = {
401 0xa8c10c28, //ldp x8, x3, [x1], #16
402 0xbd400c51, //ldr s17, [x2, #12]
403 0x4ea4d412, //fsub v18.4s, v0.4s, v4.4s
404 0xf9400108, //ldr x8, [x8]
405 0x8b000108, //add x8, x8, x0
406 0x39400109, //ldrb w9, [x8]
407 0x3940050a, //ldrb w10, [x8, #1]
408 0x3940090b, //ldrb w11, [x8, #2]
409 0x39400d08, //ldrb w8, [x8, #3]
410 0x4e021d30, //mov v16.h[0], w9
411 0x4e061d50, //mov v16.h[1], w10
412 0x4e0a1d70, //mov v16.h[2], w11
413 0x4e0e1d10, //mov v16.h[3], w8
414 0x2f07b7f0, //bic v16.4h, #0xff, lsl #8
415 0x2f10a600, //uxtl v0.4s, v16.4h
416 0x6e21d800, //ucvtf v0.4s, v0.4s
417 0x4f919010, //fmul v16.4s, v0.4s, v17.s[0]
418 0x4ea41c80, //mov v0.16b, v4.16b
419 0x4ea5d431, //fsub v17.4s, v1.4s, v5.4s
420 0x4ea51ca1, //mov v1.16b, v5.16b
421 0x4e32ce00, //fmla v0.4s, v16.4s, v18.4s
422 0x4ea6d452, //fsub v18.4s, v2.4s, v6.4s
423 0x4e31ce01, //fmla v1.4s, v16.4s, v17.4s
424 0x4ea61cc2, //mov v2.16b, v6.16b
425 0x4ea7d471, //fsub v17.4s, v3.4s, v7.4s
426 0x4ea71ce3, //mov v3.16b, v7.16b
427 0x4e32ce02, //fmla v2.4s, v16.4s, v18.4s
428 0x4e31ce03, //fmla v3.4s, v16.4s, v17.4s
429 0xd61f0060, //br x3
430};
431
// Linear interpolation with three per-lane weights unpacked from 16-bit values.
// ctx (= [x1]) holds a pointer; four 16-bit values are read from (*ctx + 2*x0)
// and widened to 32 bits. Three bit fields are extracted with the masks at
// [x2+0x68], [x2+0x6c], [x2+0x70], converted to float and scaled by the floats
// at [x2+116], [x2+120], [x2+124] to give weights t0, t1, t2.
//   v0 = v4 + t0*(v0 - v4);  v1 = v5 + t1*(v1 - v5);  v2 = v6 + t2*(v2 - v6)
//   v3 = splat([x2])   (the incoming v3 is discarded; v3 is also used as scratch)
// NOTE(review): [x2] is presumably 1.0f and the masks/scales the 5-6-5 channel
// fields -- confirm against the constants definition.
432CODE const uint32_t sk_lerp_565_aarch64[] = {
433 0xa8c10c28, //ldp x8, x3, [x1], #16
434 0xd37ff809, //lsl x9, x0, #1
435 0x2d4ec851, //ldp s17, s18, [x2, #116]
436 0x4ea4d413, //fsub v19.4s, v0.4s, v4.4s
437 0xf9400108, //ldr x8, [x8]
438 0x4ea41c80, //mov v0.16b, v4.16b
439 0xfc696903, //ldr d3, [x8, x9]
440 0x9101a048, //add x8, x2, #0x68
441 0x4d40c910, //ld1r {v16.4s}, [x8]
442 0x9101b048, //add x8, x2, #0x6c
443 0x2f10a463, //uxtl v3.4s, v3.4h
444 0x4e231e10, //and v16.16b, v16.16b, v3.16b
445 0x4e21da10, //scvtf v16.4s, v16.4s
446 0x4f919210, //fmul v16.4s, v16.4s, v17.s[0]
447 0x4d40c911, //ld1r {v17.4s}, [x8]
448 0x9101c048, //add x8, x2, #0x70
449 0x4e33ce00, //fmla v0.4s, v16.4s, v19.4s
450 0x4ea5d430, //fsub v16.4s, v1.4s, v5.4s
451 0x4e231e31, //and v17.16b, v17.16b, v3.16b
452 0x4e21da31, //scvtf v17.4s, v17.4s
453 0x4f929231, //fmul v17.4s, v17.4s, v18.s[0]
454 0x4d40c912, //ld1r {v18.4s}, [x8]
455 0x4ea51ca1, //mov v1.16b, v5.16b
456 0x4e30ce21, //fmla v1.4s, v17.4s, v16.4s
457 0xbd407c50, //ldr s16, [x2, #124]
458 0x4e231e52, //and v18.16b, v18.16b, v3.16b
459 0x4d40c843, //ld1r {v3.4s}, [x2]
460 0x4e21da52, //scvtf v18.4s, v18.4s
461 0x4ea6d451, //fsub v17.4s, v2.4s, v6.4s
462 0x4ea61cc2, //mov v2.16b, v6.16b
463 0x4f909250, //fmul v16.4s, v18.4s, v16.s[0]
464 0x4e31ce02, //fmla v2.4s, v16.4s, v17.4s
465 0xd61f0060, //br x3
466};
467
// Table-lookup load. ctx (= [x1]) holds four pointers:
//   ctx[0] = packed 32-bit source values, ctx[1], ctx[2], ctx[3] = lookup tables
//   of 32-bit entries (indices are scaled by 4 via `uxtw #2`).
// Four 32-bit values px are read from (ctx[0] + 4*x0); with m = splat([x2+0x10]):
//   v0[i] = ctx[1][ px[i]        & m ]
//   v1[i] = ctx[2][(px[i] >> 8 ) & m ]
//   v2[i] = ctx[3][(px[i] >> 16) & m ]
//   v3    = float(px >> 24) * [x2+12]
// The gathers are done lane by lane through general-purpose registers.
// NOTE(review): m is presumably 0xff and [x2+12] presumably 1/255 -- confirm.
468CODE const uint32_t sk_load_tables_aarch64[] = {
469 0xa8c10c28, //ldp x8, x3, [x1], #16
470 0x9100404b, //add x11, x2, #0x10
471 0x4d40c960, //ld1r {v0.4s}, [x11]
472 0xd37ef409, //lsl x9, x0, #2
473 0xa9402d0a, //ldp x10, x11, [x8]
474 0x3ce96942, //ldr q2, [x10, x9]
475 0xa9412109, //ldp x9, x8, [x8, #16]
476 0x4e221c01, //and v1.16b, v0.16b, v2.16b
477 0x0e143c2c, //mov w12, v1.s[2]
478 0xbc6c5971, //ldr s17, [x11, w12, uxtw #2]
479 0x1e26002c, //fmov w12, s1
480 0x6f380443, //ushr v3.4s, v2.4s, #8
481 0x6f300450, //ushr v16.4s, v2.4s, #16
482 0x8b2c496c, //add x12, x11, w12, uxtw #2
483 0x0e0c3c2a, //mov w10, v1.s[1]
484 0x0e1c3c2d, //mov w13, v1.s[3]
485 0x4e231c01, //and v1.16b, v0.16b, v3.16b
486 0x4e301c03, //and v3.16b, v0.16b, v16.16b
487 0x0d408180, //ld1 {v0.s}[0], [x12]
488 0x0e143c2c, //mov w12, v1.s[2]
489 0xbc6c5932, //ldr s18, [x9, w12, uxtw #2]
490 0x1e26002c, //fmov w12, s1
491 0x8b2a496a, //add x10, x11, w10, uxtw #2
492 0xbc6d5970, //ldr s16, [x11, w13, uxtw #2]
493 0x0e0c3c2b, //mov w11, v1.s[1]
494 0x0e1c3c2d, //mov w13, v1.s[3]
495 0x8b2c492c, //add x12, x9, w12, uxtw #2
496 0xbc6d5933, //ldr s19, [x9, w13, uxtw #2]
497 0x0e0c3c6d, //mov w13, v3.s[1]
498 0x8b2b4929, //add x9, x9, w11, uxtw #2
499 0x0e143c6b, //mov w11, v3.s[2]
500 0x0d408181, //ld1 {v1.s}[0], [x12]
501 0x0e1c3c6c, //mov w12, v3.s[3]
502 0x0d409140, //ld1 {v0.s}[1], [x10]
503 0x1e26006a, //fmov w10, s3
504 0xbd400c43, //ldr s3, [x2, #12]
505 0x6f280442, //ushr v2.4s, v2.4s, #24
506 0x4e21d842, //scvtf v2.4s, v2.4s
507 0x8b2a490a, //add x10, x8, w10, uxtw #2
508 0x4f839043, //fmul v3.4s, v2.4s, v3.s[0]
509 0x0d408142, //ld1 {v2.s}[0], [x10]
510 0x8b2d490a, //add x10, x8, w13, uxtw #2
511 0x6e140620, //mov v0.s[2], v17.s[0]
512 0xbc6b5911, //ldr s17, [x8, w11, uxtw #2]
513 0x0d409121, //ld1 {v1.s}[1], [x9]
514 0x0d409142, //ld1 {v2.s}[1], [x10]
515 0x6e1c0600, //mov v0.s[3], v16.s[0]
516 0xbc6c5910, //ldr s16, [x8, w12, uxtw #2]
517 0x6e140641, //mov v1.s[2], v18.s[0]
518 0x6e140622, //mov v2.s[2], v17.s[0]
519 0x6e1c0661, //mov v1.s[3], v19.s[0]
520 0x6e1c0602, //mov v2.s[3], v16.s[0]
521 0xd61f0060, //br x3
522};
523
// Loads four bytes from (*ctx + x0), where ctx = [x1] holds a pointer, converts
// them to float (ucvtf) and scales by the float at [x2+12] into v3.
// v0, v1 and v2 are set to zero (v2 is used as scratch first).
// NOTE(review): [x2+12] is presumably 1/255 -- confirm.
524CODE const uint32_t sk_load_a8_aarch64[] = {
525 0xa8c10c28, //ldp x8, x3, [x1], #16
526 0xbd400c43, //ldr s3, [x2, #12]
527 0x6f00e400, //movi v0.2d, #0x0
528 0x6f00e401, //movi v1.2d, #0x0
529 0xf9400108, //ldr x8, [x8]
530 0x8b000108, //add x8, x8, x0
531 0x39400109, //ldrb w9, [x8]
532 0x3940050a, //ldrb w10, [x8, #1]
533 0x3940090b, //ldrb w11, [x8, #2]
534 0x39400d08, //ldrb w8, [x8, #3]
535 0x4e021d22, //mov v2.h[0], w9
536 0x4e061d42, //mov v2.h[1], w10
537 0x4e0a1d62, //mov v2.h[2], w11
538 0x4e0e1d02, //mov v2.h[3], w8
539 0x2f07b7e2, //bic v2.4h, #0xff, lsl #8
540 0x2f10a442, //uxtl v2.4s, v2.4h
541 0x6e21d842, //ucvtf v2.4s, v2.4s
542 0x4f839043, //fmul v3.4s, v2.4s, v3.s[0]
543 0x6f00e402, //movi v2.2d, #0x0
544 0xd61f0060, //br x3
545};
546
// Stores v3 as four bytes at (*ctx + x0), where ctx = [x1] holds a pointer.
// Each lane is multiplied by the float at [x2+8], converted with fcvtnu
// (round-to-nearest, unsigned), narrowed, and its low byte written with strb.
// v0..v3 are not modified. The next stage pointer is read from [x1+8] only after
// the stores, then x1 += 16.
// NOTE(review): [x2+8] is presumably 255.0f -- confirm.
547CODE const uint32_t sk_store_a8_aarch64[] = {
548 0xf9400028, //ldr x8, [x1]
549 0xbd400850, //ldr s16, [x2, #8]
550 0xf9400108, //ldr x8, [x8]
551 0x4f909070, //fmul v16.4s, v3.4s, v16.s[0]
552 0x6e21aa10, //fcvtnu v16.4s, v16.4s
553 0x0e612a10, //xtn v16.4h, v16.4s
554 0x0e0e3e09, //umov w9, v16.h[3]
555 0x8b000108, //add x8, x8, x0
556 0x39000d09, //strb w9, [x8, #3]
557 0x0e0a3e09, //umov w9, v16.h[2]
558 0x39000909, //strb w9, [x8, #2]
559 0x0e063e09, //umov w9, v16.h[1]
560 0x39000509, //strb w9, [x8, #1]
561 0x0e023e09, //umov w9, v16.h[0]
562 0x39000109, //strb w9, [x8]
563 0xf9400423, //ldr x3, [x1, #8]
564 0x91004021, //add x1, x1, #0x10
565 0xd61f0060, //br x3
566};
567
// Loads four 16-bit values from (*ctx + 2*x0), where ctx = [x1] holds a pointer,
// widens them to 32 bits and unpacks three bit fields:
//   v0 = float(px & [x2+0x68]) * [x2+116]
//   v1 = float(px & [x2+0x6c]) * [x2+120]
//   v2 = float(px & [x2+0x70]) * [x2+124]
//   v3 = splat([x2])
// NOTE(review): [x2] is presumably 1.0f and the masks/scales the 5-6-5 channel
// fields -- confirm against the constants definition.
568CODE const uint32_t sk_load_565_aarch64[] = {
569 0xa8c10c28, //ldp x8, x3, [x1], #16
570 0xd37ff809, //lsl x9, x0, #1
571 0xf9400108, //ldr x8, [x8]
572 0xfc696900, //ldr d0, [x8, x9]
573 0x9101a048, //add x8, x2, #0x68
574 0x4d40c901, //ld1r {v1.4s}, [x8]
575 0x9101b048, //add x8, x2, #0x6c
576 0x4d40c902, //ld1r {v2.4s}, [x8]
577 0x9101c048, //add x8, x2, #0x70
578 0x4d40c903, //ld1r {v3.4s}, [x8]
579 0x2f10a400, //uxtl v0.4s, v0.4h
580 0x4e201c21, //and v1.16b, v1.16b, v0.16b
581 0x4e201c42, //and v2.16b, v2.16b, v0.16b
582 0x4e201c71, //and v17.16b, v3.16b, v0.16b
583 0x2d4e8c50, //ldp s16, s3, [x2, #116]
584 0x4e21d820, //scvtf v0.4s, v1.4s
585 0x4e21d841, //scvtf v1.4s, v2.4s
586 0x4e21da22, //scvtf v2.4s, v17.4s
587 0x4f909000, //fmul v0.4s, v0.4s, v16.s[0]
588 0xbd407c50, //ldr s16, [x2, #124]
589 0x4f839021, //fmul v1.4s, v1.4s, v3.s[0]
590 0x4d40c843, //ld1r {v3.4s}, [x2]
591 0x4f909042, //fmul v2.4s, v2.4s, v16.s[0]
592 0xd61f0060, //br x3
593};
594
// Packs v0, v1, v2 into four 16-bit values and stores them at (*ctx + 2*x0),
// where ctx = [x1] holds a pointer. With s = [x2+128] and t = [x2+132]:
//   packed = (fcvtnu(v0*s) << 11) | (fcvtnu(v1*t) << 5) | fcvtnu(v2*s)
// The 32-bit lanes are narrowed to 16 bits (xtn) before the 8-byte store.
// v0..v3 are not modified; v3 is not stored.
// NOTE(review): s and t are presumably 31.0f and 63.0f -- confirm.
595CODE const uint32_t sk_store_565_aarch64[] = {
596 0x2d504450, //ldp s16, s17, [x2, #128]
597 0xf9400028, //ldr x8, [x1]
598 0xd37ff809, //lsl x9, x0, #1
599 0x4f909012, //fmul v18.4s, v0.4s, v16.s[0]
600 0x4f919031, //fmul v17.4s, v1.4s, v17.s[0]
601 0x6e21aa52, //fcvtnu v18.4s, v18.4s
602 0x6e21aa31, //fcvtnu v17.4s, v17.4s
603 0xf9400108, //ldr x8, [x8]
604 0x4f909050, //fmul v16.4s, v2.4s, v16.s[0]
605 0x4f2b5652, //shl v18.4s, v18.4s, #11
606 0x4f255631, //shl v17.4s, v17.4s, #5
607 0x4eb21e31, //orr v17.16b, v17.16b, v18.16b
608 0x6e21aa10, //fcvtnu v16.4s, v16.4s
609 0x4eb01e30, //orr v16.16b, v17.16b, v16.16b
610 0x0e612a10, //xtn v16.4h, v16.4s
611 0xfc296910, //str d16, [x8, x9]
612 0xf9400423, //ldr x3, [x1, #8]
613 0x91004021, //add x1, x1, #0x10
614 0xd61f0060, //br x3
615};
616
// Loads four packed 32-bit values px from (*ctx + 4*x0), where ctx = [x1] holds
// a pointer, and unpacks four 8-bit fields. With m = splat([x2+0x10]) and
// s = [x2+12]:
//   v0 = float( px        & m) * s
//   v1 = float((px >> 8 ) & m) * s
//   v2 = float((px >> 16) & m) * s
//   v3 = float( px >> 24     ) * s
// NOTE(review): m is presumably 0xff and s presumably 1/255 -- confirm.
617CODE const uint32_t sk_load_8888_aarch64[] = {
618 0xa8c10c28, //ldp x8, x3, [x1], #16
619 0xd37ef409, //lsl x9, x0, #2
620 0xbd400c42, //ldr s2, [x2, #12]
621 0xf9400108, //ldr x8, [x8]
622 0x3ce96900, //ldr q0, [x8, x9]
623 0x91004048, //add x8, x2, #0x10
624 0x4d40c901, //ld1r {v1.4s}, [x8]
625 0x6f380410, //ushr v16.4s, v0.4s, #8
626 0x6f300411, //ushr v17.4s, v0.4s, #16
627 0x4e201c23, //and v3.16b, v1.16b, v0.16b
628 0x6f280400, //ushr v0.4s, v0.4s, #24
629 0x4e301c30, //and v16.16b, v1.16b, v16.16b
630 0x4e311c21, //and v1.16b, v1.16b, v17.16b
631 0x4e21d863, //scvtf v3.4s, v3.4s
632 0x4e21d811, //scvtf v17.4s, v0.4s
633 0x4e21da10, //scvtf v16.4s, v16.4s
634 0x4e21d832, //scvtf v18.4s, v1.4s
635 0x4f829060, //fmul v0.4s, v3.4s, v2.s[0]
636 0x4f829223, //fmul v3.4s, v17.4s, v2.s[0]
637 0x4f829201, //fmul v1.4s, v16.4s, v2.s[0]
638 0x4f829242, //fmul v2.4s, v18.4s, v2.s[0]
639 0xd61f0060, //br x3
640};
641
// Packs v0..v3 into four 32-bit values and stores them at (*ctx + 4*x0), where
// ctx = [x1] holds a pointer. With s = [x2+8] and q(x) = fcvtnu(x * s)
// (round-to-nearest, unsigned):
//   packed = q(v0) | q(v1) << 8 | q(v2) << 16 | q(v3) << 24
// v0..v3 are not modified. The next stage pointer is read from [x1+8] after the
// store, then x1 += 16.
// NOTE(review): s is presumably 255.0f; no clamping is done here, so inputs are
// presumably expected to be in range already -- confirm.
642CODE const uint32_t sk_store_8888_aarch64[] = {
643 0xbd400850, //ldr s16, [x2, #8]
644 0xf9400028, //ldr x8, [x1]
645 0xd37ef409, //lsl x9, x0, #2
646 0x4f909032, //fmul v18.4s, v1.4s, v16.s[0]
647 0x4f909011, //fmul v17.4s, v0.4s, v16.s[0]
648 0x6e21aa52, //fcvtnu v18.4s, v18.4s
649 0x6e21aa31, //fcvtnu v17.4s, v17.4s
650 0x4f285652, //shl v18.4s, v18.4s, #8
651 0x4eb11e51, //orr v17.16b, v18.16b, v17.16b
652 0x4f909052, //fmul v18.4s, v2.4s, v16.s[0]
653 0xf9400108, //ldr x8, [x8]
654 0x4f909070, //fmul v16.4s, v3.4s, v16.s[0]
655 0x6e21aa52, //fcvtnu v18.4s, v18.4s
656 0x6e21aa10, //fcvtnu v16.4s, v16.4s
657 0x4f305652, //shl v18.4s, v18.4s, #16
658 0x4eb21e31, //orr v17.16b, v17.16b, v18.16b
659 0x4f385610, //shl v16.4s, v16.4s, #24
660 0x4eb01e30, //orr v16.16b, v17.16b, v16.16b
661 0x3ca96910, //str q16, [x8, x9]
662 0xf9400423, //ldr x3, [x1, #8]
663 0x91004021, //add x1, x1, #0x10
664 0xd61f0060, //br x3
665};
666
// Loads four groups of four interleaved 16-bit values from (*ctx + 8*x0), where
// ctx = [x1] holds a pointer. ld4 de-interleaves them into v16..v19, and fcvtl
// widens each half-precision lane to single precision in v0..v3.
667CODE const uint32_t sk_load_f16_aarch64[] = {
668 0xa8c10c28, //ldp x8, x3, [x1], #16
669 0xf9400108, //ldr x8, [x8]
670 0x8b000d08, //add x8, x8, x0, lsl #3
671 0x0c400510, //ld4 {v16.4h-v19.4h}, [x8]
672 0x0e217a00, //fcvtl v0.4s, v16.4h
673 0x0e217a21, //fcvtl v1.4s, v17.4h
674 0x0e217a42, //fcvtl v2.4s, v18.4h
675 0x0e217a63, //fcvtl v3.4s, v19.4h
676 0xd61f0060, //br x3
677};
678
// Narrows v0..v3 from single to half precision (fcvtn into v16..v19) and stores
// them interleaved with st4 at (*ctx + 8*x0), where ctx = [x1] holds a pointer.
// v0..v3 are not modified. The next stage pointer is read from [x1+8] after the
// store, then x1 += 16.
679CODE const uint32_t sk_store_f16_aarch64[] = {
680 0xf9400028, //ldr x8, [x1]
681 0x0e216810, //fcvtn v16.4h, v0.4s
682 0x0e216831, //fcvtn v17.4h, v1.4s
683 0x0e216852, //fcvtn v18.4h, v2.4s
684 0xf9400108, //ldr x8, [x8]
685 0x0e216873, //fcvtn v19.4h, v3.4s
686 0x8b000d08, //add x8, x8, x0, lsl #3
687 0x0c000510, //st4 {v16.4h-v19.4h}, [x8]
688 0xf9400423, //ldr x3, [x1, #8]
689 0x91004021, //add x1, x1, #0x10
690 0xd61f0060, //br x3
691};
692
// Stores v0..v3 as interleaved 32-bit lanes with st4 at (*ctx + 16*x0), where
// ctx = [x1] holds a pointer. No conversion is applied. The next stage pointer
// is read from [x1+8] after the store, then x1 += 16.
693CODE const uint32_t sk_store_f32_aarch64[] = {
694 0xf9400028, //ldr x8, [x1]
695 0xf9400108, //ldr x8, [x8]
696 0x8b001108, //add x8, x8, x0, lsl #4
697 0x4c000900, //st4 {v0.4s-v3.4s}, [x8]
698 0xf9400423, //ldr x3, [x1, #8]
699 0x91004021, //add x1, x1, #0x10
700 0xd61f0060, //br x3
701};
702
// Clamps v0 to [0, limit'] where limit is the 32-bit value at ctx (= [x1]).
// limit' is formed by an INTEGER add of 0xffffffff to the splatted bit pattern,
// i.e. the bit pattern minus one; for a positive float that is the next
// representable value below limit, so the upper bound is exclusive of limit.
//   v0 = fmin(fmax(0, v0), limit')
703CODE const uint32_t sk_clamp_x_aarch64[] = {
704 0xa8c10c28, //ldp x8, x3, [x1], #16
705 0x6f00e411, //movi v17.2d, #0x0
706 0x4e20f620, //fmax v0.4s, v17.4s, v0.4s
707 0x6f07e7f1, //movi v17.2d, #0xffffffffffffffff
708 0x4d40c910, //ld1r {v16.4s}, [x8]
709 0x4eb18610, //add v16.4s, v16.4s, v17.4s
710 0x4eb0f400, //fmin v0.4s, v0.4s, v16.4s
711 0xd61f0060, //br x3
712};
713
// Same operation as the x variant, applied to v1: clamps v1 to [0, limit'] where
// limit is the 32-bit value at ctx (= [x1]) and limit' is its bit pattern minus
// one (integer add of 0xffffffff), i.e. the next float below a positive limit.
//   v1 = fmin(fmax(0, v1), limit')
714CODE const uint32_t sk_clamp_y_aarch64[] = {
715 0xa8c10c28, //ldp x8, x3, [x1], #16
716 0x6f00e411, //movi v17.2d, #0x0
717 0x4e21f621, //fmax v1.4s, v17.4s, v1.4s
718 0x6f07e7f1, //movi v17.2d, #0xffffffffffffffff
719 0x4d40c910, //ld1r {v16.4s}, [x8]
720 0x4eb18610, //add v16.4s, v16.4s, v17.4s
721 0x4eb0f421, //fmin v1.4s, v1.4s, v16.4s
722 0xd61f0060, //br x3
723};
724
// Wraps v0 into one period L, where L is the float at ctx (= [x1]):
//   v0 = v0 - floor(v0 / L) * L      (fdiv, frintm, fmls)
//   v0 = fmin(v0, L')
// L' is L's bit pattern minus one (integer add of 0xffffffff), i.e. the next
// float below a positive L, so the result never equals L itself.
725CODE const uint32_t sk_repeat_x_aarch64[] = {
726 0xa8c10c28, //ldp x8, x3, [x1], #16
727 0x6f07e7f1, //movi v17.2d, #0xffffffffffffffff
728 0xbd400110, //ldr s16, [x8]
729 0x4e040612, //dup v18.4s, v16.s[0]
730 0x4eb18651, //add v17.4s, v18.4s, v17.4s
731 0x6e32fc12, //fdiv v18.4s, v0.4s, v18.4s
732 0x4e219a52, //frintm v18.4s, v18.4s
733 0x4f905240, //fmls v0.4s, v18.4s, v16.s[0]
734 0x4eb1f400, //fmin v0.4s, v0.4s, v17.4s
735 0xd61f0060, //br x3
736};
737
// Same operation as the x variant, applied to v1. With L = float at ctx (= [x1]):
//   v1 = v1 - floor(v1 / L) * L      (fdiv, frintm, fmls)
//   v1 = fmin(v1, L')
// L' is L's bit pattern minus one (integer add of 0xffffffff), i.e. the next
// float below a positive L.
738CODE const uint32_t sk_repeat_y_aarch64[] = {
739 0xa8c10c28, //ldp x8, x3, [x1], #16
740 0x6f07e7f1, //movi v17.2d, #0xffffffffffffffff
741 0xbd400110, //ldr s16, [x8]
742 0x4e040612, //dup v18.4s, v16.s[0]
743 0x4eb18651, //add v17.4s, v18.4s, v17.4s
744 0x6e32fc32, //fdiv v18.4s, v1.4s, v18.4s
745 0x4e219a52, //frintm v18.4s, v18.4s
746 0x4f905241, //fmls v1.4s, v18.4s, v16.s[0]
747 0x4eb1f421, //fmin v1.4s, v1.4s, v17.4s
748 0xd61f0060, //br x3
749};
750
// Reflects v0 back and forth across a period of 2L, where L is the float at
// ctx (= [x1]):
//   t  = v0 - L
//   t  = t - floor(t / 2L) * 2L      (fdiv, frintm, fmls)
//   v0 = fmin(|t - L|, L')
// L' is L's bit pattern minus one (integer add of 0xffffffff), i.e. the next
// float below a positive L.
751CODE const uint32_t sk_mirror_x_aarch64[] = {
752 0xa8c10c28, //ldp x8, x3, [x1], #16
753 0xbd400110, //ldr s16, [x8]
754 0x4e040611, //dup v17.4s, v16.s[0]
755 0x1e302a10, //fadd s16, s16, s16
756 0x4eb1d400, //fsub v0.4s, v0.4s, v17.4s
757 0x4e040612, //dup v18.4s, v16.s[0]
758 0x6e32fc12, //fdiv v18.4s, v0.4s, v18.4s
759 0x4e219a52, //frintm v18.4s, v18.4s
760 0x4f905240, //fmls v0.4s, v18.4s, v16.s[0]
761 0x6f07e7f0, //movi v16.2d, #0xffffffffffffffff
762 0x4eb1d400, //fsub v0.4s, v0.4s, v17.4s
763 0x4eb08630, //add v16.4s, v17.4s, v16.4s
764 0x4ea0f800, //fabs v0.4s, v0.4s
765 0x4eb0f400, //fmin v0.4s, v0.4s, v16.4s
766 0xd61f0060, //br x3
767};
768
// Same operation as the x variant, applied to v1. With L = float at ctx (= [x1]):
//   t  = v1 - L
//   t  = t - floor(t / 2L) * 2L      (fdiv, frintm, fmls)
//   v1 = fmin(|t - L|, L')
// L' is L's bit pattern minus one (integer add of 0xffffffff), i.e. the next
// float below a positive L.
769CODE const uint32_t sk_mirror_y_aarch64[] = {
770 0xa8c10c28, //ldp x8, x3, [x1], #16
771 0xbd400110, //ldr s16, [x8]
772 0x4e040611, //dup v17.4s, v16.s[0]
773 0x1e302a10, //fadd s16, s16, s16
774 0x4eb1d421, //fsub v1.4s, v1.4s, v17.4s
775 0x4e040612, //dup v18.4s, v16.s[0]
776 0x6e32fc32, //fdiv v18.4s, v1.4s, v18.4s
777 0x4e219a52, //frintm v18.4s, v18.4s
778 0x4f905241, //fmls v1.4s, v18.4s, v16.s[0]
779 0x6f07e7f0, //movi v16.2d, #0xffffffffffffffff
780 0x4eb1d421, //fsub v1.4s, v1.4s, v17.4s
781 0x4eb08630, //add v16.4s, v17.4s, v16.4s
782 0x4ea0f821, //fabs v1.4s, v1.4s
783 0x4eb0f421, //fmin v1.4s, v1.4s, v16.4s
784 0xd61f0060, //br x3
785};
786
// Affine transform of (v0, v1) by six floats m[0..5] stored at ctx (= [x1]),
// laid out column by column:
//   v0' = m[0]*v0 + m[2]*v1 + m[4]
//   v1' = m[1]*v0 + m[3]*v1 + m[5]
// Both results are accumulated in v16/v17 from the ORIGINAL v0, v1 and copied
// back at the end.
787CODE const uint32_t sk_matrix_2x3_aarch64[] = {
788 0xa8c10c28, //ldp x8, x3, [x1], #16
789 0xaa0803e9, //mov x9, x8
790 0x9100410a, //add x10, x8, #0x10
791 0x4ddfc932, //ld1r {v18.4s}, [x9], #4
792 0x4d40c950, //ld1r {v16.4s}, [x10]
793 0x2d415113, //ldp s19, s20, [x8, #8]
794 0x9100510a, //add x10, x8, #0x14
795 0x4d40c951, //ld1r {v17.4s}, [x10]
796 0x4f931030, //fmla v16.4s, v1.4s, v19.s[0]
797 0xbd400133, //ldr s19, [x9]
798 0x4f941031, //fmla v17.4s, v1.4s, v20.s[0]
799 0x4e20ce50, //fmla v16.4s, v18.4s, v0.4s
800 0x4f931011, //fmla v17.4s, v0.4s, v19.s[0]
801 0x4eb01e00, //mov v0.16b, v16.16b
802 0x4eb11e21, //mov v1.16b, v17.16b
803 0xd61f0060, //br x3
804};
805
// Affine transform of (v0, v1, v2) by twelve floats m[0..11] stored at
// ctx (= [x1]), laid out column by column:
//   v0' = m[0]*v0 + m[3]*v1 + m[6]*v2 + m[9]
//   v1' = m[1]*v0 + m[4]*v1 + m[7]*v2 + m[10]
//   v2' = m[2]*v0 + m[5]*v1 + m[8]*v2 + m[11]
// Results are accumulated in v16..v18; v2 is reused as a scalar scratch register
// only after its last use as an input. v3 is not modified.
806CODE const uint32_t sk_matrix_3x4_aarch64[] = {
807 0xa8c10c28, //ldp x8, x3, [x1], #16
808 0xaa0803e9, //mov x9, x8
809 0x9100910a, //add x10, x8, #0x24
810 0x4ddfc933, //ld1r {v19.4s}, [x9], #4
811 0x4d40c950, //ld1r {v16.4s}, [x10]
812 0x9100a10a, //add x10, x8, #0x28
813 0x4d40c951, //ld1r {v17.4s}, [x10]
814 0x9100b10a, //add x10, x8, #0x2c
815 0x2d435514, //ldp s20, s21, [x8, #24]
816 0xbd402116, //ldr s22, [x8, #32]
817 0x4d40c952, //ld1r {v18.4s}, [x10]
818 0x4f941050, //fmla v16.4s, v2.4s, v20.s[0]
819 0x4f951051, //fmla v17.4s, v2.4s, v21.s[0]
820 0x4f961052, //fmla v18.4s, v2.4s, v22.s[0]
821 0x2d425502, //ldp s2, s21, [x8, #16]
822 0x2d415d14, //ldp s20, s23, [x8, #8]
823 0x4f821031, //fmla v17.4s, v1.4s, v2.s[0]
824 0xbd400122, //ldr s2, [x9]
825 0x4f971030, //fmla v16.4s, v1.4s, v23.s[0]
826 0x4f951032, //fmla v18.4s, v1.4s, v21.s[0]
827 0x4e20ce70, //fmla v16.4s, v19.4s, v0.4s
828 0x4f941012, //fmla v18.4s, v0.4s, v20.s[0]
829 0x4f821011, //fmla v17.4s, v0.4s, v2.s[0]
830 0x4eb01e00, //mov v0.16b, v16.16b
831 0x4eb11e21, //mov v1.16b, v17.16b
832 0x4eb21e42, //mov v2.16b, v18.16b
833 0xd61f0060, //br x3
834};
835
// Projective transform of (v0, v1) by nine floats m[0..8] stored at
// ctx (= [x1]), laid out row by row:
//   w   = m[6]*v0 + m[7]*v1 + m[8]
//   v0' = (m[0]*v0 + m[1]*v1 + m[2]) / w
//   v1' = (m[3]*v0 + m[4]*v1 + m[5]) / w
// The division is done by multiplying with an approximate reciprocal: frecpe
// followed by one frecps refinement step, not an exact fdiv.
836CODE const uint32_t sk_matrix_perspective_aarch64[] = {
837 0xa8c10c28, //ldp x8, x3, [x1], #16
838 0xaa0803e9, //mov x9, x8
839 0x9100510a, //add x10, x8, #0x14
840 0x4ddfc930, //ld1r {v16.4s}, [x9], #4
841 0x4d40c951, //ld1r {v17.4s}, [x10]
842 0x9100810a, //add x10, x8, #0x20
843 0x4d40c952, //ld1r {v18.4s}, [x10]
844 0x2d41d113, //ldp s19, s20, [x8, #12]
845 0x2d435915, //ldp s21, s22, [x8, #24]
846 0x91002108, //add x8, x8, #0x8
847 0x4f941031, //fmla v17.4s, v1.4s, v20.s[0]
848 0x4d40c914, //ld1r {v20.4s}, [x8]
849 0x4f961032, //fmla v18.4s, v1.4s, v22.s[0]
850 0xbd400136, //ldr s22, [x9]
851 0x4f951012, //fmla v18.4s, v0.4s, v21.s[0]
852 0x4f931011, //fmla v17.4s, v0.4s, v19.s[0]
853 0x4f961034, //fmla v20.4s, v1.4s, v22.s[0]
854 0x4ea1da41, //frecpe v1.4s, v18.4s
855 0x4e21fe52, //frecps v18.4s, v18.4s, v1.4s
856 0x6e32dc32, //fmul v18.4s, v1.4s, v18.4s
857 0x4e20ce14, //fmla v20.4s, v16.4s, v0.4s
858 0x6e32de21, //fmul v1.4s, v17.4s, v18.4s
859 0x6e32de80, //fmul v0.4s, v20.4s, v18.4s
860 0xd61f0060, //br x3
861};
862
// Evaluates a two-point linear ramp. ctx (= [x1]) holds eight floats: a base
// vector c[0..3] followed by a slope vector d[0..3]. With t = incoming v0:
//   v0 = c[0] + t*d[0];  v1 = c[1] + t*d[1]
//   v2 = c[2] + t*d[2];  v3 = c[3] + t*d[3]
// The first result is accumulated in v16 so t stays intact until all four
// multiply-adds have consumed it.
863CODE const uint32_t sk_linear_gradient_2stops_aarch64[] = {
864 0xa8c10c28, //ldp x8, x3, [x1], #16
865 0xad404503, //ldp q3, q17, [x8]
866 0x4e040470, //dup v16.4s, v3.s[0]
867 0x4e0c0461, //dup v1.4s, v3.s[1]
868 0x4e140462, //dup v2.4s, v3.s[2]
869 0x4e1c0463, //dup v3.4s, v3.s[3]
870 0x4f911010, //fmla v16.4s, v0.4s, v17.s[0]
871 0x4fb11001, //fmla v1.4s, v0.4s, v17.s[1]
872 0x4f911802, //fmla v2.4s, v0.4s, v17.s[2]
873 0x4fb11803, //fmla v3.4s, v0.4s, v17.s[3]
874 0x4eb01e00, //mov v0.16b, v16.16b
875 0xd61f0060, //br x3
876};
877#elif defined(__arm__)
878
// Pipeline driver (32-bit ARM).
// Saves r4-r8 and lr, keeps r0 (running index), r1+4 (program pointer past the
// first entry), r2 and r3 (limit) in callee-saved registers, and loads the first
// stage pointer from [r1] into r6.
// While index + 2 <= limit (unsigned compare, bls) it zeroes d0-d7 and calls the
// stage with r0 = index, r1 = advanced program pointer, r2 = original r2, then
// advances the index by 2 (this variant steps 2 at a time, not 4).
// Returns in r0 the first index that was NOT processed.
879CODE const uint32_t sk_start_pipeline_vfp4[] = {
880 0xe92d41f0, //push {r4, r5, r6, r7, r8, lr}
881 0xe1a07001, //mov r7, r1
882 0xe1a04000, //mov r4, r0
883 0xe1a05003, //mov r5, r3
884 0xe1a08002, //mov r8, r2
885 0xe4976004, //ldr r6, [r7], #4
886 0xe2840002, //add r0, r4, #2
887 0xea00000d, //b 58 <sk_start_pipeline_vfp4+0x58>
888 0xf2800010, //vmov.i32 d0, #0
889 0xe1a00004, //mov r0, r4
890 0xf2801010, //vmov.i32 d1, #0
891 0xe1a01007, //mov r1, r7
892 0xf2802010, //vmov.i32 d2, #0
893 0xe1a02008, //mov r2, r8
894 0xf2803010, //vmov.i32 d3, #0
895 0xf2804010, //vmov.i32 d4, #0
896 0xf2805010, //vmov.i32 d5, #0
897 0xf2806010, //vmov.i32 d6, #0
898 0xf2807010, //vmov.i32 d7, #0
899 0xe12fff36, //blx r6
900 0xe2840004, //add r0, r4, #4
901 0xe2844002, //add r4, r4, #2
902 0xe1500005, //cmp r0, r5
903 0x9affffef, //bls 20 <sk_start_pipeline_vfp4+0x20>
904 0xe1a00004, //mov r0, r4
905 0xe8bd81f0, //pop {r4, r5, r6, r7, r8, pc}
906};
907
908CODE const uint32_t sk_just_return_vfp4[] = {
909 0xe12fff1e, //bx lr
910};
911
912CODE const uint32_t sk_seed_shader_vfp4[] = {
913 0xe8911008, //ldm r1, {r3, ip}
914 0xee800b90, //vdup.32 d16, r0
915 0xf3fb0620, //vcvt.f32.s32 d16, d16
916 0xedd23b05, //vldr d19, [r2, #20]
917 0xf2803010, //vmov.i32 d3, #0
918 0xf4e31c9f, //vld1.32 {d17[]}, [r3 :32]
919 0xe2823004, //add r3, r2, #4
920 0xf3fb1621, //vcvt.f32.s32 d17, d17
921 0xe2811008, //add r1, r1, #8
922 0xf4e32c9f, //vld1.32 {d18[]}, [r3 :32]
923 0xf2804010, //vmov.i32 d4, #0
924 0xf2400da2, //vadd.f32 d16, d16, d18
925 0xf2805010, //vmov.i32 d5, #0
926 0xf4a22c9f, //vld1.32 {d2[]}, [r2 :32]
927 0xf2011da2, //vadd.f32 d1, d17, d18
928 0xf2806010, //vmov.i32 d6, #0
929 0xf2030da0, //vadd.f32 d0, d19, d16
930 0xf2807010, //vmov.i32 d7, #0
931 0xe12fff1c, //bx ip
932};
933
934CODE const uint32_t sk_constant_color_vfp4[] = {
935 0xe8911008, //ldm r1, {r3, ip}
936 0xe2811008, //add r1, r1, #8
937 0xf4630a0f, //vld1.8 {d16-d17}, [r3]
938 0xf3b40c20, //vdup.32 d0, d16[0]
939 0xf3bc1c20, //vdup.32 d1, d16[1]
940 0xf3b42c21, //vdup.32 d2, d17[0]
941 0xf3bc3c21, //vdup.32 d3, d17[1]
942 0xe12fff1c, //bx ip
943};
944
945CODE const uint32_t sk_clear_vfp4[] = {
946 0xe4913004, //ldr r3, [r1], #4
947 0xf2800010, //vmov.i32 d0, #0
948 0xf2801010, //vmov.i32 d1, #0
949 0xf2802010, //vmov.i32 d2, #0
950 0xf2803010, //vmov.i32 d3, #0
951 0xe12fff13, //bx r3
952};
953
954CODE const uint32_t sk_plus__vfp4[] = {
955 0xf2000d04, //vadd.f32 d0, d0, d4
956 0xe4913004, //ldr r3, [r1], #4
957 0xf2011d05, //vadd.f32 d1, d1, d5
958 0xf2022d06, //vadd.f32 d2, d2, d6
959 0xf2033d07, //vadd.f32 d3, d3, d7
960 0xe12fff13, //bx r3
961};
962
963CODE const uint32_t sk_srcover_vfp4[] = {
964 0xf4e20c9f, //vld1.32 {d16[]}, [r2 :32]
965 0xe4913004, //ldr r3, [r1], #4
966 0xf2600d83, //vsub.f32 d16, d16, d3
967 0xf2040c30, //vfma.f32 d0, d4, d16
968 0xf2051c30, //vfma.f32 d1, d5, d16
969 0xf2062c30, //vfma.f32 d2, d6, d16
970 0xf2073c30, //vfma.f32 d3, d7, d16
971 0xe12fff13, //bx r3
972};
973
974CODE const uint32_t sk_dstover_vfp4[] = {
975 0xf4e20c9f, //vld1.32 {d16[]}, [r2 :32]
976 0xf2651115, //vorr d17, d5, d5
977 0xf2604d87, //vsub.f32 d20, d16, d7
978 0xf2640114, //vorr d16, d4, d4
979 0xf2662116, //vorr d18, d6, d6
980 0xe4913004, //ldr r3, [r1], #4
981 0xf2673117, //vorr d19, d7, d7
982 0xf2400c34, //vfma.f32 d16, d0, d20
983 0xf2411c34, //vfma.f32 d17, d1, d20
984 0xf2422c34, //vfma.f32 d18, d2, d20
985 0xf2433c34, //vfma.f32 d19, d3, d20
986 0xf22001b0, //vorr d0, d16, d16
987 0xf22111b1, //vorr d1, d17, d17
988 0xf22221b2, //vorr d2, d18, d18
989 0xf22331b3, //vorr d3, d19, d19
990 0xe12fff13, //bx r3
991};
992
993CODE const uint32_t sk_clamp_0_vfp4[] = {
994 0xf2c00010, //vmov.i32 d16, #0
995 0xe4913004, //ldr r3, [r1], #4
996 0xf2000f20, //vmax.f32 d0, d0, d16
997 0xf2011f20, //vmax.f32 d1, d1, d16
998 0xf2022f20, //vmax.f32 d2, d2, d16
999 0xf2033f20, //vmax.f32 d3, d3, d16
1000 0xe12fff13, //bx r3
1001};
1002
1003CODE const uint32_t sk_clamp_1_vfp4[] = {
1004 0xf4e20c9f, //vld1.32 {d16[]}, [r2 :32]
1005 0xe4913004, //ldr r3, [r1], #4
1006 0xf2200f20, //vmin.f32 d0, d0, d16
1007 0xf2211f20, //vmin.f32 d1, d1, d16
1008 0xf2222f20, //vmin.f32 d2, d2, d16
1009 0xf2233f20, //vmin.f32 d3, d3, d16
1010 0xe12fff13, //bx r3
1011};
1012
1013CODE const uint32_t sk_clamp_a_vfp4[] = {
1014 0xf4e20c9f, //vld1.32 {d16[]}, [r2 :32]
1015 0xe4913004, //ldr r3, [r1], #4
1016 0xf2233f20, //vmin.f32 d3, d3, d16
1017 0xf2200f03, //vmin.f32 d0, d0, d3
1018 0xf2211f03, //vmin.f32 d1, d1, d3
1019 0xf2222f03, //vmin.f32 d2, d2, d3
1020 0xe12fff13, //bx r3
1021};
1022
1023CODE const uint32_t sk_set_rgb_vfp4[] = {
1024 0xe92d4800, //push {fp, lr}
1025 0xe591e000, //ldr lr, [r1]
1026 0xe591c004, //ldr ip, [r1, #4]
1027 0xe2811008, //add r1, r1, #8
1028 0xe28e3008, //add r3, lr, #8
1029 0xf4ae0c9f, //vld1.32 {d0[]}, [lr :32]
1030 0xf4a32c9f, //vld1.32 {d2[]}, [r3 :32]
1031 0xe28e3004, //add r3, lr, #4
1032 0xf4a31c9f, //vld1.32 {d1[]}, [r3 :32]
1033 0xe8bd4800, //pop {fp, lr}
1034 0xe12fff1c, //bx ip
1035};
1036
1037CODE const uint32_t sk_swap_rb_vfp4[] = {
1038 0xeef00b40, //vmov.f64 d16, d0
1039 0xe4913004, //ldr r3, [r1], #4
1040 0xeeb00b42, //vmov.f64 d0, d2
1041 0xeeb02b60, //vmov.f64 d2, d16
1042 0xe12fff13, //bx r3
1043};
1044
1045CODE const uint32_t sk_swap_vfp4[] = {
1046 0xeef00b43, //vmov.f64 d16, d3
1047 0xe4913004, //ldr r3, [r1], #4
1048 0xeef01b42, //vmov.f64 d17, d2
1049 0xeef02b41, //vmov.f64 d18, d1
1050 0xeef03b40, //vmov.f64 d19, d0
1051 0xeeb00b44, //vmov.f64 d0, d4
1052 0xeeb01b45, //vmov.f64 d1, d5
1053 0xeeb02b46, //vmov.f64 d2, d6
1054 0xeeb03b47, //vmov.f64 d3, d7
1055 0xeeb04b63, //vmov.f64 d4, d19
1056 0xeeb05b62, //vmov.f64 d5, d18
1057 0xeeb06b61, //vmov.f64 d6, d17
1058 0xeeb07b60, //vmov.f64 d7, d16
1059 0xe12fff13, //bx r3
1060};
1061
1062CODE const uint32_t sk_move_src_dst_vfp4[] = {
1063 0xeeb04b40, //vmov.f64 d4, d0
1064 0xe4913004, //ldr r3, [r1], #4
1065 0xeeb05b41, //vmov.f64 d5, d1
1066 0xeeb06b42, //vmov.f64 d6, d2
1067 0xeeb07b43, //vmov.f64 d7, d3
1068 0xe12fff13, //bx r3
1069};
1070
1071CODE const uint32_t sk_move_dst_src_vfp4[] = {
1072 0xeeb00b44, //vmov.f64 d0, d4
1073 0xe4913004, //ldr r3, [r1], #4
1074 0xeeb01b45, //vmov.f64 d1, d5
1075 0xeeb02b46, //vmov.f64 d2, d6
1076 0xeeb03b47, //vmov.f64 d3, d7
1077 0xe12fff13, //bx r3
1078};
1079
1080CODE const uint32_t sk_premul_vfp4[] = {
1081 0xf3000d13, //vmul.f32 d0, d0, d3
1082 0xe4913004, //ldr r3, [r1], #4
1083 0xf3011d13, //vmul.f32 d1, d1, d3
1084 0xf3022d13, //vmul.f32 d2, d2, d3
1085 0xe12fff13, //bx r3
1086};
1087
1088CODE const uint32_t sk_unpremul_vfp4[] = {
1089 0xed2d8b04, //vpush {d8-d9}
1090 0xed928a00, //vldr s16, [r2]
1091 0xf2c00010, //vmov.i32 d16, #0
1092 0xf3f91503, //vceq.f32 d17, d3, #0
1093 0xe4913004, //ldr r3, [r1], #4
1094 0xeec89a23, //vdiv.f32 s19, s16, s7
1095 0xee889a03, //vdiv.f32 s18, s16, s6
1096 0xf3501199, //vbsl d17, d16, d9
1097 0xf3010d90, //vmul.f32 d0, d17, d0
1098 0xf3011d91, //vmul.f32 d1, d17, d1
1099 0xf3012d92, //vmul.f32 d2, d17, d2
1100 0xecbd8b04, //vpop {d8-d9}
1101 0xe12fff13, //bx r3
1102};
1103
1104CODE const uint32_t sk_from_srgb_vfp4[] = {
1105 0xed2d8b02, //vpush {d8}
1106 0xe282303c, //add r3, r2, #60
1107 0xed928a10, //vldr s16, [r2, #64]
1108 0xf3402d10, //vmul.f32 d18, d0, d0
1109 0xf4e30c9f, //vld1.32 {d16[]}, [r3 :32]
1110 0xe2823038, //add r3, r2, #56
1111 0xf3413d11, //vmul.f32 d19, d1, d1
1112 0xf4e31c9f, //vld1.32 {d17[]}, [r3 :32]
1113 0xe2823044, //add r3, r2, #68
1114 0xf26141b1, //vorr d20, d17, d17
1115 0xf26171b1, //vorr d23, d17, d17
1116 0xf4e38c9f, //vld1.32 {d24[]}, [r3 :32]
1117 0xf2404c30, //vfma.f32 d20, d0, d16
1118 0xe2823034, //add r3, r2, #52
1119 0xf2417c30, //vfma.f32 d23, d1, d16
1120 0xf2421c30, //vfma.f32 d17, d2, d16
1121 0xf3425d12, //vmul.f32 d21, d2, d2
1122 0xf2e16948, //vmul.f32 d22, d1, d8[0]
1123 0xf2e00948, //vmul.f32 d16, d0, d8[0]
1124 0xf2e29948, //vmul.f32 d25, d2, d8[0]
1125 0xf3282e82, //vcgt.f32 d2, d24, d2
1126 0xf3281e81, //vcgt.f32 d1, d24, d1
1127 0xf3280e80, //vcgt.f32 d0, d24, d0
1128 0xf4e38c9f, //vld1.32 {d24[]}, [r3 :32]
1129 0xf268a1b8, //vorr d26, d24, d24
1130 0xf242acb4, //vfma.f32 d26, d18, d20
1131 0xf26821b8, //vorr d18, d24, d24
1132 0xe4913004, //ldr r3, [r1], #4
1133 0xf2432cb7, //vfma.f32 d18, d19, d23
1134 0xf2458cb1, //vfma.f32 d24, d21, d17
1135 0xf31001ba, //vbsl d0, d16, d26
1136 0xf31611b2, //vbsl d1, d22, d18
1137 0xf31921b8, //vbsl d2, d25, d24
1138 0xecbd8b02, //vpop {d8}
1139 0xe12fff13, //bx r3
1140};
1141
1142CODE const uint32_t sk_to_srgb_vfp4[] = {
1143 0xed2d8b02, //vpush {d8}
1144 0xf3fb0580, //vrsqrte.f32 d16, d0
1145 0xe2823050, //add r3, r2, #80
1146 0xf3fb1581, //vrsqrte.f32 d17, d1
1147 0xed928a12, //vldr s16, [r2, #72]
1148 0xf3fb2582, //vrsqrte.f32 d18, d2
1149 0xf3403db0, //vmul.f32 d19, d16, d16
1150 0xf3414db1, //vmul.f32 d20, d17, d17
1151 0xf3425db2, //vmul.f32 d21, d18, d18
1152 0xf2603f33, //vrsqrts.f32 d19, d0, d19
1153 0xf2614f34, //vrsqrts.f32 d20, d1, d20
1154 0xf2625f35, //vrsqrts.f32 d21, d2, d21
1155 0xf3400db3, //vmul.f32 d16, d16, d19
1156 0xf3411db4, //vmul.f32 d17, d17, d20
1157 0xf3422db5, //vmul.f32 d18, d18, d21
1158 0xf3fb3520, //vrecpe.f32 d19, d16
1159 0xf3fb4521, //vrecpe.f32 d20, d17
1160 0xf3fb6522, //vrecpe.f32 d22, d18
1161 0xf3fb55a2, //vrsqrte.f32 d21, d18
1162 0xf3fb75a0, //vrsqrte.f32 d23, d16
1163 0xf3fb85a1, //vrsqrte.f32 d24, d17
1164 0xf2409fb3, //vrecps.f32 d25, d16, d19
1165 0xf241afb4, //vrecps.f32 d26, d17, d20
1166 0xf242bfb6, //vrecps.f32 d27, d18, d22
1167 0xf345cdb5, //vmul.f32 d28, d21, d21
1168 0xf347ddb7, //vmul.f32 d29, d23, d23
1169 0xf348edb8, //vmul.f32 d30, d24, d24
1170 0xf2622fbc, //vrsqrts.f32 d18, d18, d28
1171 0xf2600fbd, //vrsqrts.f32 d16, d16, d29
1172 0xf2611fbe, //vrsqrts.f32 d17, d17, d30
1173 0xf3433db9, //vmul.f32 d19, d19, d25
1174 0xf4e39c9f, //vld1.32 {d25[]}, [r3 :32]
1175 0xe2823054, //add r3, r2, #84
1176 0xf3444dba, //vmul.f32 d20, d20, d26
1177 0xf3466dbb, //vmul.f32 d22, d22, d27
1178 0xf4e3ac9f, //vld1.32 {d26[]}, [r3 :32]
1179 0xe282304c, //add r3, r2, #76
1180 0xf26ab1ba, //vorr d27, d26, d26
1181 0xf249bcb3, //vfma.f32 d27, d25, d19
1182 0xf26a31ba, //vorr d19, d26, d26
1183 0xf2493cb4, //vfma.f32 d19, d25, d20
1184 0xf4e34c9f, //vld1.32 {d20[]}, [r3 :32]
1185 0xf249acb6, //vfma.f32 d26, d25, d22
1186 0xe2823058, //add r3, r2, #88
1187 0xf3452db2, //vmul.f32 d18, d21, d18
1188 0xf3470db0, //vmul.f32 d16, d23, d16
1189 0xf3481db1, //vmul.f32 d17, d24, d17
1190 0xf2e05948, //vmul.f32 d21, d0, d8[0]
1191 0xf244bcb0, //vfma.f32 d27, d20, d16
1192 0xf4e30c9f, //vld1.32 {d16[]}, [r3 :32]
1193 0xf2443cb1, //vfma.f32 d19, d20, d17
1194 0xf244acb2, //vfma.f32 d26, d20, d18
1195 0xf4e24c9f, //vld1.32 {d20[]}, [r2 :32]
1196 0xf2e11948, //vmul.f32 d17, d1, d8[0]
1197 0xf2e22948, //vmul.f32 d18, d2, d8[0]
1198 0xf3201e81, //vcgt.f32 d1, d16, d1
1199 0xe4913004, //ldr r3, [r1], #4
1200 0xf3200e80, //vcgt.f32 d0, d16, d0
1201 0xf3202e82, //vcgt.f32 d2, d16, d2
1202 0xf2640fab, //vmin.f32 d16, d20, d27
1203 0xf2643fa3, //vmin.f32 d19, d20, d19
1204 0xf2644faa, //vmin.f32 d20, d20, d26
1205 0xf31501b0, //vbsl d0, d21, d16
1206 0xf31111b3, //vbsl d1, d17, d19
1207 0xf31221b4, //vbsl d2, d18, d20
1208 0xecbd8b02, //vpop {d8}
1209 0xe12fff13, //bx r3
1210};
1211
1212CODE const uint32_t sk_scale_1_float_vfp4[] = {
1213 0xed2d8b02, //vpush {d8}
1214 0xe8911008, //ldm r1, {r3, ip}
1215 0xe2811008, //add r1, r1, #8
1216 0xed938a00, //vldr s16, [r3]
1217 0xf2a00948, //vmul.f32 d0, d0, d8[0]
1218 0xf2a11948, //vmul.f32 d1, d1, d8[0]
1219 0xf2a22948, //vmul.f32 d2, d2, d8[0]
1220 0xf2a33948, //vmul.f32 d3, d3, d8[0]
1221 0xecbd8b02, //vpop {d8}
1222 0xe12fff1c, //bx ip
1223};
1224
1225CODE const uint32_t sk_scale_u8_vfp4[] = {
1226 0xed2d8b02, //vpush {d8}
1227 0xe24dd008, //sub sp, sp, #8
1228 0xe8911008, //ldm r1, {r3, ip}
1229 0xe2811008, //add r1, r1, #8
1230 0xe5933000, //ldr r3, [r3]
1231 0xe0833000, //add r3, r3, r0
1232 0xe1d330b0, //ldrh r3, [r3]
1233 0xe1cd30b4, //strh r3, [sp, #4]
1234 0xe28d3004, //add r3, sp, #4
1235 0xed928a03, //vldr s16, [r2, #12]
1236 0xf4e3041f, //vld1.16 {d16[0]}, [r3 :16]
1237 0xf3c80a30, //vmovl.u8 q8, d16
1238 0xf3d00a30, //vmovl.u16 q8, d16
1239 0xf3fb06a0, //vcvt.f32.u32 d16, d16
1240 0xf2e009c8, //vmul.f32 d16, d16, d8[0]
1241 0xf3000d90, //vmul.f32 d0, d16, d0
1242 0xf3001d91, //vmul.f32 d1, d16, d1
1243 0xf3002d92, //vmul.f32 d2, d16, d2
1244 0xf3003d93, //vmul.f32 d3, d16, d3
1245 0xe28dd008, //add sp, sp, #8
1246 0xecbd8b02, //vpop {d8}
1247 0xe12fff1c, //bx ip
1248};
1249
1250CODE const uint32_t sk_lerp_1_float_vfp4[] = {
1251 0xe8911008, //ldm r1, {r3, ip}
1252 0xf2600d04, //vsub.f32 d16, d0, d4
1253 0xf2611d05, //vsub.f32 d17, d1, d5
1254 0xf2622d06, //vsub.f32 d18, d2, d6
1255 0xe2811008, //add r1, r1, #8
1256 0xf2633d07, //vsub.f32 d19, d3, d7
1257 0xf4e34c9f, //vld1.32 {d20[]}, [r3 :32]
1258 0xf2240114, //vorr d0, d4, d4
1259 0xf2251115, //vorr d1, d5, d5
1260 0xf2262116, //vorr d2, d6, d6
1261 0xf2273117, //vorr d3, d7, d7
1262 0xf2000cb4, //vfma.f32 d0, d16, d20
1263 0xf2011cb4, //vfma.f32 d1, d17, d20
1264 0xf2022cb4, //vfma.f32 d2, d18, d20
1265 0xf2033cb4, //vfma.f32 d3, d19, d20
1266 0xe12fff1c, //bx ip
1267};
1268
1269CODE const uint32_t sk_lerp_u8_vfp4[] = {
1270 0xed2d8b02, //vpush {d8}
1271 0xe24dd008, //sub sp, sp, #8
1272 0xe8911008, //ldm r1, {r3, ip}
1273 0xf2612d05, //vsub.f32 d18, d1, d5
1274 0xf2623d06, //vsub.f32 d19, d2, d6
1275 0xf2634d07, //vsub.f32 d20, d3, d7
1276 0xe2811008, //add r1, r1, #8
1277 0xe5933000, //ldr r3, [r3]
1278 0xf2251115, //vorr d1, d5, d5
1279 0xf2262116, //vorr d2, d6, d6
1280 0xe0833000, //add r3, r3, r0
1281 0xf2273117, //vorr d3, d7, d7
1282 0xe1d330b0, //ldrh r3, [r3]
1283 0xe1cd30b4, //strh r3, [sp, #4]
1284 0xe28d3004, //add r3, sp, #4
1285 0xed928a03, //vldr s16, [r2, #12]
1286 0xf4e3041f, //vld1.16 {d16[0]}, [r3 :16]
1287 0xf3c80a30, //vmovl.u8 q8, d16
1288 0xf3d00a30, //vmovl.u16 q8, d16
1289 0xf3fb06a0, //vcvt.f32.u32 d16, d16
1290 0xf2601d04, //vsub.f32 d17, d0, d4
1291 0xf2240114, //vorr d0, d4, d4
1292 0xf2e009c8, //vmul.f32 d16, d16, d8[0]
1293 0xf2010cb0, //vfma.f32 d0, d17, d16
1294 0xf2021cb0, //vfma.f32 d1, d18, d16
1295 0xf2032cb0, //vfma.f32 d2, d19, d16
1296 0xf2043cb0, //vfma.f32 d3, d20, d16
1297 0xe28dd008, //add sp, sp, #8
1298 0xecbd8b02, //vpop {d8}
1299 0xe12fff1c, //bx ip
1300};
1301
1302CODE const uint32_t sk_lerp_565_vfp4[] = {
1303 0xed2d8b04, //vpush {d8-d9}
1304 0xe24dd008, //sub sp, sp, #8
1305 0xe8911008, //ldm r1, {r3, ip}
1306 0xf2603d04, //vsub.f32 d19, d0, d4
1307 0xf2240114, //vorr d0, d4, d4
1308 0xe2811008, //add r1, r1, #8
1309 0xe5933000, //ldr r3, [r3]
1310 0xe7933080, //ldr r3, [r3, r0, lsl #1]
1311 0xe58d3004, //str r3, [sp, #4]
1312 0xe28d3004, //add r3, sp, #4
1313 0xed923a1d, //vldr s6, [r2, #116]
1314 0xf4e3083f, //vld1.32 {d16[0]}, [r3 :32]
1315 0xe282306c, //add r3, r2, #108
1316 0xf4e31c9f, //vld1.32 {d17[]}, [r3 :32]
1317 0xe2823068, //add r3, r2, #104
1318 0xf3d04a30, //vmovl.u16 q10, d16
1319 0xf4e32c9f, //vld1.32 {d18[]}, [r3 :32]
1320 0xe2823070, //add r3, r2, #112
1321 0xf24201b4, //vand d16, d18, d20
1322 0xf4e32c9f, //vld1.32 {d18[]}, [r3 :32]
1323 0xf24221b4, //vand d18, d18, d20
1324 0xf24111b4, //vand d17, d17, d20
1325 0xf3fb0620, //vcvt.f32.s32 d16, d16
1326 0xed928a1e, //vldr s16, [r2, #120]
1327 0xf3fb1621, //vcvt.f32.s32 d17, d17
1328 0xed929a1f, //vldr s18, [r2, #124]
1329 0xf3fb2622, //vcvt.f32.s32 d18, d18
1330 0xf2614d05, //vsub.f32 d20, d1, d5
1331 0xf2e009c3, //vmul.f32 d16, d16, d3[0]
1332 0xf4a23c9f, //vld1.32 {d3[]}, [r2 :32]
1333 0xf2625d06, //vsub.f32 d21, d2, d6
1334 0xf2e119c8, //vmul.f32 d17, d17, d8[0]
1335 0xf2e229c9, //vmul.f32 d18, d18, d9[0]
1336 0xf2251115, //vorr d1, d5, d5
1337 0xf2262116, //vorr d2, d6, d6
1338 0xf2030cb0, //vfma.f32 d0, d19, d16
1339 0xf2041cb1, //vfma.f32 d1, d20, d17
1340 0xf2052cb2, //vfma.f32 d2, d21, d18
1341 0xe28dd008, //add sp, sp, #8
1342 0xecbd8b04, //vpop {d8-d9}
1343 0xe12fff1c, //bx ip
1344};
1345
1346CODE const uint32_t sk_load_tables_vfp4[] = {
1347 0xe92d48f0, //push {r4, r5, r6, r7, fp, lr}
1348 0xe8911008, //ldm r1, {r3, ip}
1349 0xe2826010, //add r6, r2, #16
1350 0xe2811008, //add r1, r1, #8
1351 0xe593e000, //ldr lr, [r3]
1352 0xe99300b0, //ldmib r3, {r4, r5, r7}
1353 0xf4e60c9f, //vld1.32 {d16[]}, [r6 :32]
1354 0xe08e6100, //add r6, lr, r0, lsl #2
1355 0xedd61b00, //vldr d17, [r6]
1356 0xf24021b1, //vand d18, d16, d17
1357 0xed922a03, //vldr s4, [r2, #12]
1358 0xf3f03031, //vshr.u32 d19, d17, #16
1359 0xee326b90, //vmov.32 r6, d18[1]
1360 0xe0846106, //add r6, r4, r6, lsl #2
1361 0xedd60a00, //vldr s1, [r6]
1362 0xee126b90, //vmov.32 r6, d18[0]
1363 0xf3f82031, //vshr.u32 d18, d17, #8
1364 0xf24021b2, //vand d18, d16, d18
1365 0xf24001b3, //vand d16, d16, d19
1366 0xee103b90, //vmov.32 r3, d16[0]
1367 0xe0846106, //add r6, r4, r6, lsl #2
1368 0xee304b90, //vmov.32 r4, d16[1]
1369 0xf3e80031, //vshr.u32 d16, d17, #24
1370 0xed960a00, //vldr s0, [r6]
1371 0xee326b90, //vmov.32 r6, d18[1]
1372 0xf3fb0620, //vcvt.f32.s32 d16, d16
1373 0xe0873103, //add r3, r7, r3, lsl #2
1374 0xf2a039c2, //vmul.f32 d3, d16, d2[0]
1375 0xe0874104, //add r4, r7, r4, lsl #2
1376 0xedd42a00, //vldr s5, [r4]
1377 0xe0856106, //add r6, r5, r6, lsl #2
1378 0xed932a00, //vldr s4, [r3]
1379 0xedd61a00, //vldr s3, [r6]
1380 0xee126b90, //vmov.32 r6, d18[0]
1381 0xe0856106, //add r6, r5, r6, lsl #2
1382 0xed961a00, //vldr s2, [r6]
1383 0xe8bd48f0, //pop {r4, r5, r6, r7, fp, lr}
1384 0xe12fff1c, //bx ip
1385};
1386
1387CODE const uint32_t sk_load_a8_vfp4[] = {
1388 0xe24dd004, //sub sp, sp, #4
1389 0xe8911008, //ldm r1, {r3, ip}
1390 0xe2811008, //add r1, r1, #8
1391 0xf2801010, //vmov.i32 d1, #0
1392 0xf2802010, //vmov.i32 d2, #0
1393 0xe5933000, //ldr r3, [r3]
1394 0xe0833000, //add r3, r3, r0
1395 0xe1d330b0, //ldrh r3, [r3]
1396 0xe1cd30b0, //strh r3, [sp]
1397 0xe1a0300d, //mov r3, sp
1398 0xf4e3041f, //vld1.16 {d16[0]}, [r3 :16]
1399 0xed920a03, //vldr s0, [r2, #12]
1400 0xf3c80a30, //vmovl.u8 q8, d16
1401 0xf3d00a30, //vmovl.u16 q8, d16
1402 0xf3fb06a0, //vcvt.f32.u32 d16, d16
1403 0xf2a039c0, //vmul.f32 d3, d16, d0[0]
1404 0xf2800010, //vmov.i32 d0, #0
1405 0xe28dd004, //add sp, sp, #4
1406 0xe12fff1c, //bx ip
1407};
1408
1409CODE const uint32_t sk_store_a8_vfp4[] = {
1410 0xe92d4800, //push {fp, lr}
1411 0xe2823008, //add r3, r2, #8
1412 0xf2c3061f, //vmov.i32 d16, #1056964608
1413 0xf4e31c9f, //vld1.32 {d17[]}, [r3 :32]
1414 0xe5913000, //ldr r3, [r1]
1415 0xf2430c31, //vfma.f32 d16, d3, d17
1416 0xe5933000, //ldr r3, [r3]
1417 0xf3fb07a0, //vcvt.u32.f32 d16, d16
1418 0xee10eb90, //vmov.32 lr, d16[0]
1419 0xee30cb90, //vmov.32 ip, d16[1]
1420 0xe7e3e000, //strb lr, [r3, r0]!
1421 0xe5c3c001, //strb ip, [r3, #1]
1422 0xe5913004, //ldr r3, [r1, #4]
1423 0xe2811008, //add r1, r1, #8
1424 0xe8bd4800, //pop {fp, lr}
1425 0xe12fff13, //bx r3
1426};
1427
1428CODE const uint32_t sk_load_565_vfp4[] = {
1429 0xe24dd004, //sub sp, sp, #4
1430 0xe8911008, //ldm r1, {r3, ip}
1431 0xe2811008, //add r1, r1, #8
1432 0xe5933000, //ldr r3, [r3]
1433 0xe7933080, //ldr r3, [r3, r0, lsl #1]
1434 0xe58d3000, //str r3, [sp]
1435 0xe1a0300d, //mov r3, sp
1436 0xf4e3083f, //vld1.32 {d16[0]}, [r3 :32]
1437 0xe282306c, //add r3, r2, #108
1438 0xf4e31c9f, //vld1.32 {d17[]}, [r3 :32]
1439 0xe2823068, //add r3, r2, #104
1440 0xf3d04a30, //vmovl.u16 q10, d16
1441 0xf4e32c9f, //vld1.32 {d18[]}, [r3 :32]
1442 0xe2823070, //add r3, r2, #112
1443 0xf24201b4, //vand d16, d18, d20
1444 0xf4e32c9f, //vld1.32 {d18[]}, [r3 :32]
1445 0xf24111b4, //vand d17, d17, d20
1446 0xf24221b4, //vand d18, d18, d20
1447 0xf4a23c9f, //vld1.32 {d3[]}, [r2 :32]
1448 0xf3fb0620, //vcvt.f32.s32 d16, d16
1449 0xf3fb1621, //vcvt.f32.s32 d17, d17
1450 0xf3fb2622, //vcvt.f32.s32 d18, d18
1451 0xed920a1d, //vldr s0, [r2, #116]
1452 0xed921a1e, //vldr s2, [r2, #120]
1453 0xed922a1f, //vldr s4, [r2, #124]
1454 0xf2a009c0, //vmul.f32 d0, d16, d0[0]
1455 0xf2a119c1, //vmul.f32 d1, d17, d1[0]
1456 0xf2a229c2, //vmul.f32 d2, d18, d2[0]
1457 0xe28dd004, //add sp, sp, #4
1458 0xe12fff1c, //bx ip
1459};
1460
1461CODE const uint32_t sk_store_565_vfp4[] = {
1462 0xe2823080, //add r3, r2, #128
1463 0xf2c3361f, //vmov.i32 d19, #1056964608
1464 0xf2c3461f, //vmov.i32 d20, #1056964608
1465 0xf4e31c9f, //vld1.32 {d17[]}, [r3 :32]
1466 0xe2823084, //add r3, r2, #132
1467 0xf2403c31, //vfma.f32 d19, d0, d17
1468 0xf4e32c9f, //vld1.32 {d18[]}, [r3 :32]
1469 0xf2c3061f, //vmov.i32 d16, #1056964608
1470 0xf2414c32, //vfma.f32 d20, d1, d18
1471 0xf2420c31, //vfma.f32 d16, d2, d17
1472 0xe5913000, //ldr r3, [r1]
1473 0xe5933000, //ldr r3, [r3]
1474 0xf3fb17a3, //vcvt.u32.f32 d17, d19
1475 0xe0833080, //add r3, r3, r0, lsl #1
1476 0xf3fb27a4, //vcvt.u32.f32 d18, d20
1477 0xf3fb07a0, //vcvt.u32.f32 d16, d16
1478 0xf2eb1531, //vshl.s32 d17, d17, #11
1479 0xf2e52532, //vshl.s32 d18, d18, #5
1480 0xf26101b0, //vorr d16, d17, d16
1481 0xf26001b2, //vorr d16, d16, d18
1482 0xf3f60121, //vuzp.16 d16, d17
1483 0xf4c3080f, //vst1.32 {d16[0]}, [r3]
1484 0xe5913004, //ldr r3, [r1, #4]
1485 0xe2811008, //add r1, r1, #8
1486 0xe12fff13, //bx r3
1487};
1488
1489CODE const uint32_t sk_load_8888_vfp4[] = {
1490 0xe92d4800, //push {fp, lr}
1491 0xe8911008, //ldm r1, {r3, ip}
1492 0xe2811008, //add r1, r1, #8
1493 0xed922a03, //vldr s4, [r2, #12]
1494 0xe593e000, //ldr lr, [r3]
1495 0xe2823010, //add r3, r2, #16
1496 0xf4e30c9f, //vld1.32 {d16[]}, [r3 :32]
1497 0xe08e3100, //add r3, lr, r0, lsl #2
1498 0xedd31b00, //vldr d17, [r3]
1499 0xf24021b1, //vand d18, d16, d17
1500 0xf3f83031, //vshr.u32 d19, d17, #8
1501 0xf3e84031, //vshr.u32 d20, d17, #24
1502 0xf3f01031, //vshr.u32 d17, d17, #16
1503 0xf24031b3, //vand d19, d16, d19
1504 0xf24001b1, //vand d16, d16, d17
1505 0xf3fb2622, //vcvt.f32.s32 d18, d18
1506 0xf3fb4624, //vcvt.f32.s32 d20, d20
1507 0xf3fb1623, //vcvt.f32.s32 d17, d19
1508 0xf3fb0620, //vcvt.f32.s32 d16, d16
1509 0xf2a209c2, //vmul.f32 d0, d18, d2[0]
1510 0xf2a439c2, //vmul.f32 d3, d20, d2[0]
1511 0xf2a119c2, //vmul.f32 d1, d17, d2[0]
1512 0xf2a029c2, //vmul.f32 d2, d16, d2[0]
1513 0xe8bd4800, //pop {fp, lr}
1514 0xe12fff1c, //bx ip
1515};
1516
1517CODE const uint32_t sk_store_8888_vfp4[] = {
1518 0xe2823008, //add r3, r2, #8
1519 0xf2c3261f, //vmov.i32 d18, #1056964608
1520 0xf2c3361f, //vmov.i32 d19, #1056964608
1521 0xf4e31c9f, //vld1.32 {d17[]}, [r3 :32]
1522 0xf2c3061f, //vmov.i32 d16, #1056964608
1523 0xf2412c31, //vfma.f32 d18, d1, d17
1524 0xf2423c31, //vfma.f32 d19, d2, d17
1525 0xf2c3461f, //vmov.i32 d20, #1056964608
1526 0xe5913000, //ldr r3, [r1]
1527 0xf2400c31, //vfma.f32 d16, d0, d17
1528 0xf2434c31, //vfma.f32 d20, d3, d17
1529 0xe5933000, //ldr r3, [r3]
1530 0xe0833100, //add r3, r3, r0, lsl #2
1531 0xf3fb17a2, //vcvt.u32.f32 d17, d18
1532 0xf3fb27a3, //vcvt.u32.f32 d18, d19
1533 0xf3fb07a0, //vcvt.u32.f32 d16, d16
1534 0xf3fb37a4, //vcvt.u32.f32 d19, d20
1535 0xf2e81531, //vshl.s32 d17, d17, #8
1536 0xf2f02532, //vshl.s32 d18, d18, #16
1537 0xf26101b0, //vorr d16, d17, d16
1538 0xf2f81533, //vshl.s32 d17, d19, #24
1539 0xf26001b2, //vorr d16, d16, d18
1540 0xf26001b1, //vorr d16, d16, d17
1541 0xedc30b00, //vstr d16, [r3]
1542 0xe5913004, //ldr r3, [r1, #4]
1543 0xe2811008, //add r1, r1, #8
1544 0xe12fff13, //bx r3
1545};
1546
1547CODE const uint32_t sk_load_f16_vfp4[] = {
1548 0xed2d8b04, //vpush {d8-d9}
1549 0xe8911008, //ldm r1, {r3, ip}
1550 0xe2811008, //add r1, r1, #8
1551 0xe5933000, //ldr r3, [r3]
1552 0xe0833180, //add r3, r3, r0, lsl #3
1553 0xf463084f, //vld2.16 {d16-d17}, [r3]
1554 0xf3b62720, //vcvt.f32.f16 q1, d16
1555 0xf3b68721, //vcvt.f32.f16 q4, d17
1556 0xf2220112, //vorr d0, d2, d2
1557 0xeef00a43, //vmov.f32 s1, s6
1558 0xf2281118, //vorr d1, d8, d8
1559 0xeeb03a62, //vmov.f32 s6, s5
1560 0xeef01a49, //vmov.f32 s3, s18
1561 0xeeb09a68, //vmov.f32 s18, s17
1562 0xeeb02b43, //vmov.f64 d2, d3
1563 0xeeb03b49, //vmov.f64 d3, d9
1564 0xecbd8b04, //vpop {d8-d9}
1565 0xe12fff1c, //bx ip
1566};
1567
1568CODE const uint32_t sk_store_f16_vfp4[] = {
1569 0xeef00b41, //vmov.f64 d16, d1
1570 0xeef03b42, //vmov.f64 d19, d2
1571 0xf2631113, //vorr d17, d3, d3
1572 0xf2602110, //vorr d18, d0, d0
1573 0xf3fa00a1, //vtrn.32 d16, d17
1574 0xf3f61620, //vcvt.f16.f32 d17, q8
1575 0xf3fa20a3, //vtrn.32 d18, d19
1576 0xe5913000, //ldr r3, [r1]
1577 0xf3f60622, //vcvt.f16.f32 d16, q9
1578 0xe5933000, //ldr r3, [r3]
1579 0xe0833180, //add r3, r3, r0, lsl #3
1580 0xf443084f, //vst2.16 {d16-d17}, [r3]
1581 0xe2813008, //add r3, r1, #8
1582 0xe591c004, //ldr ip, [r1, #4]
1583 0xe1a01003, //mov r1, r3
1584 0xe12fff1c, //bx ip
1585};
1586
1587CODE const uint32_t sk_store_f32_vfp4[] = {
1588 0xe5913000, //ldr r3, [r1]
1589 0xe5933000, //ldr r3, [r3]
1590 0xe0833200, //add r3, r3, r0, lsl #4
1591 0xf403008f, //vst4.32 {d0-d3}, [r3]
1592 0xe2813008, //add r3, r1, #8
1593 0xe591c004, //ldr ip, [r1, #4]
1594 0xe1a01003, //mov r1, r3
1595 0xe12fff1c, //bx ip
1596};
1597
1598CODE const uint32_t sk_clamp_x_vfp4[] = {
1599 0xe8911008, //ldm r1, {r3, ip}
1600 0xf2c00010, //vmov.i32 d16, #0
1601 0xf3c71e1f, //vmov.i8 d17, #255
1602 0xf2400f80, //vmax.f32 d16, d16, d0
1603 0xe2811008, //add r1, r1, #8
1604 0xf4e32c9f, //vld1.32 {d18[]}, [r3 :32]
1605 0xf26218a1, //vadd.i32 d17, d18, d17
1606 0xf2200fa1, //vmin.f32 d0, d16, d17
1607 0xe12fff1c, //bx ip
1608};
1609
1610CODE const uint32_t sk_clamp_y_vfp4[] = {
1611 0xe8911008, //ldm r1, {r3, ip}
1612 0xf2c00010, //vmov.i32 d16, #0
1613 0xf3c71e1f, //vmov.i8 d17, #255
1614 0xf2400f81, //vmax.f32 d16, d16, d1
1615 0xe2811008, //add r1, r1, #8
1616 0xf4e32c9f, //vld1.32 {d18[]}, [r3 :32]
1617 0xf26218a1, //vadd.i32 d17, d18, d17
1618 0xf2201fa1, //vmin.f32 d1, d16, d17
1619 0xe12fff1c, //bx ip
1620};
1621
1622CODE const uint32_t sk_repeat_x_vfp4[] = {
1623 0xed2d8b04, //vpush {d8-d9}
1624 0xe8911008, //ldm r1, {r3, ip}
1625 0xf2c02010, //vmov.i32 d18, #0
1626 0xf4e23c9f, //vld1.32 {d19[]}, [r2 :32]
1627 0xe2811008, //add r1, r1, #8
1628 0xed938a00, //vldr s16, [r3]
1629 0xeec09a88, //vdiv.f32 s19, s1, s16
1630 0xee809a08, //vdiv.f32 s18, s0, s16
1631 0xf3fb0709, //vcvt.s32.f32 d16, d9
1632 0xf3fb0620, //vcvt.f32.s32 d16, d16
1633 0xf3601e89, //vcgt.f32 d17, d16, d9
1634 0xf35311b2, //vbsl d17, d19, d18
1635 0xf3f42c08, //vdup.32 d18, d8[0]
1636 0xf2600da1, //vsub.f32 d16, d16, d17
1637 0xf3c71e1f, //vmov.i8 d17, #255
1638 0xf26218a1, //vadd.i32 d17, d18, d17
1639 0xf2e009c8, //vmul.f32 d16, d16, d8[0]
1640 0xf2600d20, //vsub.f32 d16, d0, d16
1641 0xf2200fa1, //vmin.f32 d0, d16, d17
1642 0xecbd8b04, //vpop {d8-d9}
1643 0xe12fff1c, //bx ip
1644};
1645
1646CODE const uint32_t sk_repeat_y_vfp4[] = {
1647 0xed2d8b04, //vpush {d8-d9}
1648 0xe8911008, //ldm r1, {r3, ip}
1649 0xf2c02010, //vmov.i32 d18, #0
1650 0xf4e23c9f, //vld1.32 {d19[]}, [r2 :32]
1651 0xe2811008, //add r1, r1, #8
1652 0xed938a00, //vldr s16, [r3]
1653 0xeec19a88, //vdiv.f32 s19, s3, s16
1654 0xee819a08, //vdiv.f32 s18, s2, s16
1655 0xf3fb0709, //vcvt.s32.f32 d16, d9
1656 0xf3fb0620, //vcvt.f32.s32 d16, d16
1657 0xf3601e89, //vcgt.f32 d17, d16, d9
1658 0xf35311b2, //vbsl d17, d19, d18
1659 0xf3f42c08, //vdup.32 d18, d8[0]
1660 0xf2600da1, //vsub.f32 d16, d16, d17
1661 0xf3c71e1f, //vmov.i8 d17, #255
1662 0xf26218a1, //vadd.i32 d17, d18, d17
1663 0xf2e009c8, //vmul.f32 d16, d16, d8[0]
1664 0xf2610d20, //vsub.f32 d16, d1, d16
1665 0xf2201fa1, //vmin.f32 d1, d16, d17
1666 0xecbd8b04, //vpop {d8-d9}
1667 0xe12fff1c, //bx ip
1668};
1669
1670CODE const uint32_t sk_mirror_x_vfp4[] = {
1671 0xed2d8b04, //vpush {d8-d9}
1672 0xe8911008, //ldm r1, {r3, ip}
1673 0xf2c03010, //vmov.i32 d19, #0
1674 0xf4e24c9f, //vld1.32 {d20[]}, [r2 :32]
1675 0xe2811008, //add r1, r1, #8
1676 0xed938a00, //vldr s16, [r3]
1677 0xee389a08, //vadd.f32 s18, s16, s16
1678 0xf3f40c08, //vdup.32 d16, d8[0]
1679 0xf2200d20, //vsub.f32 d0, d0, d16
1680 0xeec08a89, //vdiv.f32 s17, s1, s18
1681 0xee808a09, //vdiv.f32 s16, s0, s18
1682 0xf3fb1708, //vcvt.s32.f32 d17, d8
1683 0xf3fb1621, //vcvt.f32.s32 d17, d17
1684 0xf3612e88, //vcgt.f32 d18, d17, d8
1685 0xf35421b3, //vbsl d18, d20, d19
1686 0xf2611da2, //vsub.f32 d17, d17, d18
1687 0xf3c72e1f, //vmov.i8 d18, #255
1688 0xf2e119c9, //vmul.f32 d17, d17, d9[0]
1689 0xf2601d21, //vsub.f32 d17, d0, d17
1690 0xf2611da0, //vsub.f32 d17, d17, d16
1691 0xf26008a2, //vadd.i32 d16, d16, d18
1692 0xf3f91721, //vabs.f32 d17, d17
1693 0xf2210fa0, //vmin.f32 d0, d17, d16
1694 0xecbd8b04, //vpop {d8-d9}
1695 0xe12fff1c, //bx ip
1696};
1697
1698CODE const uint32_t sk_mirror_y_vfp4[] = {
1699 0xed2d8b04, //vpush {d8-d9}
1700 0xe8911008, //ldm r1, {r3, ip}
1701 0xf2c03010, //vmov.i32 d19, #0
1702 0xf4e24c9f, //vld1.32 {d20[]}, [r2 :32]
1703 0xe2811008, //add r1, r1, #8
1704 0xed938a00, //vldr s16, [r3]
1705 0xee389a08, //vadd.f32 s18, s16, s16
1706 0xf3f40c08, //vdup.32 d16, d8[0]
1707 0xf2211d20, //vsub.f32 d1, d1, d16
1708 0xeec18a89, //vdiv.f32 s17, s3, s18
1709 0xee818a09, //vdiv.f32 s16, s2, s18
1710 0xf3fb1708, //vcvt.s32.f32 d17, d8
1711 0xf3fb1621, //vcvt.f32.s32 d17, d17
1712 0xf3612e88, //vcgt.f32 d18, d17, d8
1713 0xf35421b3, //vbsl d18, d20, d19
1714 0xf2611da2, //vsub.f32 d17, d17, d18
1715 0xf3c72e1f, //vmov.i8 d18, #255
1716 0xf2e119c9, //vmul.f32 d17, d17, d9[0]
1717 0xf2611d21, //vsub.f32 d17, d1, d17
1718 0xf2611da0, //vsub.f32 d17, d17, d16
1719 0xf26008a2, //vadd.i32 d16, d16, d18
1720 0xf3f91721, //vabs.f32 d17, d17
1721 0xf2211fa0, //vmin.f32 d1, d17, d16
1722 0xecbd8b04, //vpop {d8-d9}
1723 0xe12fff1c, //bx ip
1724};
1725
1726CODE const uint32_t sk_matrix_2x3_vfp4[] = {
1727 0xe92d4800, //push {fp, lr}
1728 0xe591e000, //ldr lr, [r1]
1729 0xe591c004, //ldr ip, [r1, #4]
1730 0xe2811008, //add r1, r1, #8
1731 0xe28e300c, //add r3, lr, #12
1732 0xf4e32c9f, //vld1.32 {d18[]}, [r3 :32]
1733 0xe28e3008, //add r3, lr, #8
1734 0xf4e31c9f, //vld1.32 {d17[]}, [r3 :32]
1735 0xe28e3010, //add r3, lr, #16
1736 0xf4e30c9f, //vld1.32 {d16[]}, [r3 :32]
1737 0xe28e3014, //add r3, lr, #20
1738 0xf2410c31, //vfma.f32 d16, d1, d17
1739 0xf4e31c9f, //vld1.32 {d17[]}, [r3 :32]
1740 0xe28e3004, //add r3, lr, #4
1741 0xf2411c32, //vfma.f32 d17, d1, d18
1742 0xf4ee2c9f, //vld1.32 {d18[]}, [lr :32]
1743 0xf4e33c9f, //vld1.32 {d19[]}, [r3 :32]
1744 0xf2400c32, //vfma.f32 d16, d0, d18
1745 0xf2401c33, //vfma.f32 d17, d0, d19
1746 0xf22001b0, //vorr d0, d16, d16
1747 0xf22111b1, //vorr d1, d17, d17
1748 0xe8bd4800, //pop {fp, lr}
1749 0xe12fff1c, //bx ip
1750};
1751
1752CODE const uint32_t sk_matrix_3x4_vfp4[] = {
1753 0xe92d4800, //push {fp, lr}
1754 0xe591e000, //ldr lr, [r1]
1755 0xe591c004, //ldr ip, [r1, #4]
1756 0xe2811008, //add r1, r1, #8
1757 0xe28e3020, //add r3, lr, #32
1758 0xf4e33c9f, //vld1.32 {d19[]}, [r3 :32]
1759 0xe28e302c, //add r3, lr, #44
1760 0xf4e30c9f, //vld1.32 {d16[]}, [r3 :32]
1761 0xe28e301c, //add r3, lr, #28
1762 0xf2420c33, //vfma.f32 d16, d2, d19
1763 0xf4e34c9f, //vld1.32 {d20[]}, [r3 :32]
1764 0xe28e3018, //add r3, lr, #24
1765 0xf4e32c9f, //vld1.32 {d18[]}, [r3 :32]
1766 0xe28e3024, //add r3, lr, #36
1767 0xf4e31c9f, //vld1.32 {d17[]}, [r3 :32]
1768 0xe28e3028, //add r3, lr, #40
1769 0xf2421c32, //vfma.f32 d17, d2, d18
1770 0xf4e32c9f, //vld1.32 {d18[]}, [r3 :32]
1771 0xe28e3010, //add r3, lr, #16
1772 0xf2422c34, //vfma.f32 d18, d2, d20
1773 0xf4e33c9f, //vld1.32 {d19[]}, [r3 :32]
1774 0xe28e300c, //add r3, lr, #12
1775 0xf4e34c9f, //vld1.32 {d20[]}, [r3 :32]
1776 0xe28e3014, //add r3, lr, #20
1777 0xf2411c34, //vfma.f32 d17, d1, d20
1778 0xf4e34c9f, //vld1.32 {d20[]}, [r3 :32]
1779 0xf2410c34, //vfma.f32 d16, d1, d20
1780 0xe28e3004, //add r3, lr, #4
1781 0xf2412c33, //vfma.f32 d18, d1, d19
1782 0xf4ee3c9f, //vld1.32 {d19[]}, [lr :32]
1783 0xf4e34c9f, //vld1.32 {d20[]}, [r3 :32]
1784 0xe28e3008, //add r3, lr, #8
1785 0xf2401c33, //vfma.f32 d17, d0, d19
1786 0xf4e33c9f, //vld1.32 {d19[]}, [r3 :32]
1787 0xf2400c33, //vfma.f32 d16, d0, d19
1788 0xf2402c34, //vfma.f32 d18, d0, d20
1789 0xf22101b1, //vorr d0, d17, d17
1790 0xf22021b0, //vorr d2, d16, d16
1791 0xf22211b2, //vorr d1, d18, d18
1792 0xe8bd4800, //pop {fp, lr}
1793 0xe12fff1c, //bx ip
1794};
1795
// sk_matrix_perspective_vfp4: 32-bit ARM stage stored as raw instruction words.
// Loads a context pointer from [r1] and the next stage's address from
// [r1, #4], advances r1 by 8, and ends with `bx ip` into the next stage.
// With m[i] = the float at byte offset 4*i from the context pointer:
//   w   = m[6]*d0 + m[7]*d1 + m[8]
//   d0' = (m[0]*d0 + m[1]*d1 + m[2]) * (1/w)
//   d1' = (m[3]*d0 + m[4]*d1 + m[5]) * (1/w)
// 1/w is the vrecpe estimate refined by a single vrecps step, so it is an
// approximation rather than an exact division.
// lr is borrowed to hold the context pointer, hence the push/pop around it.
1796CODE const uint32_t sk_matrix_perspective_vfp4[] = {
1797 0xe92d4800, //push {fp, lr}
1798 0xe591e000, //ldr lr, [r1]
1799 0xe591c004, //ldr ip, [r1, #4]
1800 0xe2811008, //add r1, r1, #8
1801 0xe28e301c, //add r3, lr, #28
1802 0xf4e30c9f, //vld1.32 {d16[]}, [r3 :32]
1803 0xe28e3020, //add r3, lr, #32
1804 0xf4e31c9f, //vld1.32 {d17[]}, [r3 :32]
1805 0xe28e3018, //add r3, lr, #24
1806 0xf2411c30, //vfma.f32 d17, d1, d16
1807 0xf4e30c9f, //vld1.32 {d16[]}, [r3 :32]
1808 0xe28e3010, //add r3, lr, #16
1809 0xf2401c30, //vfma.f32 d17, d0, d16
1810 0xf4e30c9f, //vld1.32 {d16[]}, [r3 :32]
1811 0xe28e3004, //add r3, lr, #4
1812 0xf4e32c9f, //vld1.32 {d18[]}, [r3 :32]
1813 0xe28e3008, //add r3, lr, #8
1814 0xf4e34c9f, //vld1.32 {d20[]}, [r3 :32]
1815 0xe28e3014, //add r3, lr, #20
1816 0xf2414c32, //vfma.f32 d20, d1, d18
1817 0xf4e32c9f, //vld1.32 {d18[]}, [r3 :32]
1818 0xe28e300c, //add r3, lr, #12
// d17 now holds w; start the reciprocal estimate.
1819 0xf3fb3521, //vrecpe.f32 d19, d17
1820 0xf2412c30, //vfma.f32 d18, d1, d16
1821 0xf4e35c9f, //vld1.32 {d21[]}, [r3 :32]
1822 0xf2410fb3, //vrecps.f32 d16, d17, d19
1823 0xf4ee1c9f, //vld1.32 {d17[]}, [lr :32]
1824 0xf2404c31, //vfma.f32 d20, d0, d17
1825 0xf2402c35, //vfma.f32 d18, d0, d21
// d16 = estimate * correction factor = refined 1/w.
1826 0xf3430db0, //vmul.f32 d16, d19, d16
1827 0xf3040db0, //vmul.f32 d0, d20, d16
1828 0xf3021db0, //vmul.f32 d1, d18, d16
1829 0xe8bd4800, //pop {fp, lr}
1830 0xe12fff1c, //bx ip
1831};
1832
// sk_linear_gradient_2stops_vfp4: 32-bit ARM stage stored as raw instruction words.
// `ldm r1, {r3, ip}` fetches the context pointer (r3) and the next stage's
// address (ip); r1 is advanced by 8 and the stage ends with `bx ip`.
// With f[0..7] = the eight floats at the context pointer and t = incoming d0,
// every lane becomes:
//   d0 = f[0] + t*f[4]    d1 = f[1] + t*f[5]
//   d2 = f[2] + t*f[6]    d3 = f[3] + t*f[7]
// d0 is written last (via d16) because it is the multiplier for all four sums.
1833CODE const uint32_t sk_linear_gradient_2stops_vfp4[] = {
1834 0xe8911008, //ldm r1, {r3, ip}
1835 0xe2811008, //add r1, r1, #8
1836 0xf4632a0d, //vld1.8 {d18-d19}, [r3]!
1837 0xf4634a0f, //vld1.8 {d20-d21}, [r3]
1838 0xf3f40c22, //vdup.32 d16, d18[0]
1839 0xf3f41c24, //vdup.32 d17, d20[0]
1840 0xf2400c31, //vfma.f32 d16, d0, d17
1841 0xf3fc6c24, //vdup.32 d22, d20[1]
1842 0xf3bc1c22, //vdup.32 d1, d18[1]
1843 0xf3b42c23, //vdup.32 d2, d19[0]
1844 0xf2001c36, //vfma.f32 d1, d0, d22
1845 0xf3f41c25, //vdup.32 d17, d21[0]
1846 0xf3fc4c25, //vdup.32 d20, d21[1]
1847 0xf2002c31, //vfma.f32 d2, d0, d17
1848 0xf3bc3c23, //vdup.32 d3, d19[1]
1849 0xf2003c34, //vfma.f32 d3, d0, d20
1850 0xf22001b0, //vorr d0, d16, d16
1851 0xe12fff1c, //bx ip
1852};
1853#elif defined(__x86_64__)
1854
// sk_start_pipeline_hsw: x86-64 (AVX2) driver loop, stored as raw bytes.
// On entry: %rdi = starting index, %rsi = pointer to the stage program (its
// first 8-byte entry is the address of the first stage), %rdx = a pointer that
// is passed through unchanged to every stage, %rcx = limit index.
// While index+8 <= limit (unsigned) it zeroes %ymm0-%ymm7, calls the first
// stage with %rcx = 0, and advances the index by 8. If 1..7 elements remain it
// makes one more call with %rcx = limit - index. Returns the limit in %rax.
// %rbx and %r12-%r15 hold the loop state across the calls and are restored
// before returning; vzeroupper is issued before the ret.
1855CODE const uint8_t sk_start_pipeline_hsw[] = {
1856 65,87, //push %r15
1857 65,86, //push %r14
1858 65,85, //push %r13
1859 65,84, //push %r12
1860 83, //push %rbx
1861 73,137,205, //mov %rcx,%r13
1862 73,137,214, //mov %rdx,%r14
1863 72,137,251, //mov %rdi,%rbx
1864 72,173, //lods %ds:(%rsi),%rax
1865 73,137,199, //mov %rax,%r15
1866 73,137,244, //mov %rsi,%r12
1867 72,141,67,8, //lea 0x8(%rbx),%rax
1868 76,57,232, //cmp %r13,%rax
1869 118,5, //jbe 28 <_sk_start_pipeline_hsw+0x28>
// Fewer than 8 elements in total: go straight to the remainder handling.
1870 72,137,223, //mov %rbx,%rdi
1871 235,65, //jmp 69 <_sk_start_pipeline_hsw+0x69>
// +0x28: full-width iteration, called with %rcx = 0.
1872 185,0,0,0,0, //mov $0x0,%ecx
1873 197,252,87,192, //vxorps %ymm0,%ymm0,%ymm0
1874 197,244,87,201, //vxorps %ymm1,%ymm1,%ymm1
1875 197,236,87,210, //vxorps %ymm2,%ymm2,%ymm2
1876 197,228,87,219, //vxorps %ymm3,%ymm3,%ymm3
1877 197,220,87,228, //vxorps %ymm4,%ymm4,%ymm4
1878 197,212,87,237, //vxorps %ymm5,%ymm5,%ymm5
1879 197,204,87,246, //vxorps %ymm6,%ymm6,%ymm6
1880 197,196,87,255, //vxorps %ymm7,%ymm7,%ymm7
1881 72,137,223, //mov %rbx,%rdi
1882 76,137,230, //mov %r12,%rsi
1883 76,137,242, //mov %r14,%rdx
1884 65,255,215, //callq *%r15
// %rdi = next index; loop again only if another full group of 8 still fits.
1885 72,141,123,8, //lea 0x8(%rbx),%rdi
1886 72,131,195,16, //add $0x10,%rbx
1887 76,57,235, //cmp %r13,%rbx
1888 72,137,251, //mov %rdi,%rbx
1889 118,191, //jbe 28 <_sk_start_pipeline_hsw+0x28>
// +0x69: remainder = limit - index; skip the extra call when it is zero.
1890 76,137,233, //mov %r13,%rcx
1891 72,41,249, //sub %rdi,%rcx
1892 116,41, //je 9a <_sk_start_pipeline_hsw+0x9a>
1893 197,252,87,192, //vxorps %ymm0,%ymm0,%ymm0
1894 197,244,87,201, //vxorps %ymm1,%ymm1,%ymm1
1895 197,236,87,210, //vxorps %ymm2,%ymm2,%ymm2
1896 197,228,87,219, //vxorps %ymm3,%ymm3,%ymm3
1897 197,220,87,228, //vxorps %ymm4,%ymm4,%ymm4
1898 197,212,87,237, //vxorps %ymm5,%ymm5,%ymm5
1899 197,204,87,246, //vxorps %ymm6,%ymm6,%ymm6
1900 197,196,87,255, //vxorps %ymm7,%ymm7,%ymm7
1901 76,137,230, //mov %r12,%rsi
1902 76,137,242, //mov %r14,%rdx
1903 65,255,215, //callq *%r15
// +0x9a: return the limit and restore the saved registers.
1904 76,137,232, //mov %r13,%rax
1905 91, //pop %rbx
1906 65,92, //pop %r12
1907 65,93, //pop %r13
1908 65,94, //pop %r14
1909 65,95, //pop %r15
1910 197,248,119, //vzeroupper
1911 195, //retq
1912};
1913
// sk_just_return_hsw: a stage consisting of a single `ret`. Unlike the other
// stages it does not jump to a following stage; it returns to whoever issued
// the `call`.
1914CODE const uint8_t sk_just_return_hsw[] = {
1915 195, //retq
1916};
1917
// sk_seed_shader_hsw: initializes the vector registers from the index and context.
// The first lods fetches this stage's context pointer; the second fetches the
// next stage, which is tail-called via `jmpq *%rax`.
//   ymm0 = float(%edi) + K[0x04] + the 8 floats stored at K+0x14
//   ymm1 = float(int32 at *ctx) + K[0x04]
//   ymm2 = broadcast of K[0x00]
//   ymm3..ymm7 = 0
// where K[off] is the float at byte offset `off` from %rdx.
// NOTE(review): K+0x14 is presumably a 0..7 lane ramp and K[0x04] a half-pixel
// offset; the values are not visible here, confirm against the constants table.
1918CODE const uint8_t sk_seed_shader_hsw[] = {
1919 72,173, //lods %ds:(%rsi),%rax
1920 197,249,110,199, //vmovd %edi,%xmm0
1921 196,226,125,24,192, //vbroadcastss %xmm0,%ymm0
1922 197,252,91,192, //vcvtdq2ps %ymm0,%ymm0
1923 196,226,125,24,74,4, //vbroadcastss 0x4(%rdx),%ymm1
1924 197,252,88,193, //vaddps %ymm1,%ymm0,%ymm0
1925 197,252,88,66,20, //vaddps 0x14(%rdx),%ymm0,%ymm0
1926 196,226,125,24,16, //vbroadcastss (%rax),%ymm2
1927 197,252,91,210, //vcvtdq2ps %ymm2,%ymm2
1928 197,236,88,201, //vaddps %ymm1,%ymm2,%ymm1
1929 196,226,125,24,18, //vbroadcastss (%rdx),%ymm2
1930 72,173, //lods %ds:(%rsi),%rax
1931 197,228,87,219, //vxorps %ymm3,%ymm3,%ymm3
1932 197,220,87,228, //vxorps %ymm4,%ymm4,%ymm4
1933 197,212,87,237, //vxorps %ymm5,%ymm5,%ymm5
1934 197,204,87,246, //vxorps %ymm6,%ymm6,%ymm6
1935 197,196,87,255, //vxorps %ymm7,%ymm7,%ymm7
1936 255,224, //jmpq *%rax
1937};
1938
// sk_constant_color_hsw: broadcasts four consecutive floats from the context
// (byte offsets 0, 4, 8, 12) into ymm0, ymm1, ymm2 and ymm3 respectively.
// The first lods fetches the context pointer, the second the next stage,
// which is tail-called.
1939CODE const uint8_t sk_constant_color_hsw[] = {
1940 72,173, //lods %ds:(%rsi),%rax
1941 196,226,125,24,0, //vbroadcastss (%rax),%ymm0
1942 196,226,125,24,72,4, //vbroadcastss 0x4(%rax),%ymm1
1943 196,226,125,24,80,8, //vbroadcastss 0x8(%rax),%ymm2
1944 196,226,125,24,88,12, //vbroadcastss 0xc(%rax),%ymm3
1945 72,173, //lods %ds:(%rsi),%rax
1946 255,224, //jmpq *%rax
1947};
1948
// sk_clear_hsw: zeroes ymm0-ymm3 and tail-calls the next stage (fetched by
// lods from the program pointer in %rsi). ymm4-ymm7 are left untouched.
1949CODE const uint8_t sk_clear_hsw[] = {
1950 72,173, //lods %ds:(%rsi),%rax
1951 197,252,87,192, //vxorps %ymm0,%ymm0,%ymm0
1952 197,244,87,201, //vxorps %ymm1,%ymm1,%ymm1
1953 197,236,87,210, //vxorps %ymm2,%ymm2,%ymm2
1954 197,228,87,219, //vxorps %ymm3,%ymm3,%ymm3
1955 255,224, //jmpq *%rax
1956};
1957
// sk_plus__hsw: lane-wise addition, ymm0..ymm3 += ymm4..ymm7 (pairwise), then
// tail-calls the next stage fetched from the program pointer in %rsi.
// The sums are not clamped here.
1958CODE const uint8_t sk_plus__hsw[] = {
1959 197,252,88,196, //vaddps %ymm4,%ymm0,%ymm0
1960 197,244,88,205, //vaddps %ymm5,%ymm1,%ymm1
1961 197,236,88,214, //vaddps %ymm6,%ymm2,%ymm2
1962 197,228,88,223, //vaddps %ymm7,%ymm3,%ymm3
1963 72,173, //lods %ds:(%rsi),%rax
1964 255,224, //jmpq *%rax
1965};
1966
// sk_srcover_hsw: with s = K[0] - ymm3 (K[0] = float at (%rdx), computed from
// the incoming ymm3), updates
//   ymm0 += ymm4*s, ymm1 += ymm5*s, ymm2 += ymm6*s, ymm3 += ymm7*s
// using fused multiply-adds, then tail-calls the next stage.
// NOTE(review): presumably ymm0-3 are source r,g,b,a, ymm4-7 destination
// r,g,b,a and K[0] is 1.0f; none of that is visible in this chunk.
1967CODE const uint8_t sk_srcover_hsw[] = {
1968 196,98,125,24,2, //vbroadcastss (%rdx),%ymm8
1969 197,60,92,195, //vsubps %ymm3,%ymm8,%ymm8
1970 196,194,93,184,192, //vfmadd231ps %ymm8,%ymm4,%ymm0
1971 196,194,85,184,200, //vfmadd231ps %ymm8,%ymm5,%ymm1
1972 196,194,77,184,208, //vfmadd231ps %ymm8,%ymm6,%ymm2
1973 196,194,69,184,216, //vfmadd231ps %ymm8,%ymm7,%ymm3
1974 72,173, //lods %ds:(%rsi),%rax
1975 255,224, //jmpq *%rax
1976};
1977
// sk_dstover_hsw: with s = K[0] - ymm7 (K[0] = float at (%rdx)), updates
//   ymm0 = ymm0*s + ymm4, ymm1 = ymm1*s + ymm5,
//   ymm2 = ymm2*s + ymm6, ymm3 = ymm3*s + ymm7
// using fused multiply-adds, then tail-calls the next stage.
// NOTE(review): presumably ymm0-3 are source r,g,b,a, ymm4-7 destination
// r,g,b,a and K[0] is 1.0f; none of that is visible in this chunk.
1978CODE const uint8_t sk_dstover_hsw[] = {
1979 196,98,125,24,2, //vbroadcastss (%rdx),%ymm8
1980 197,60,92,199, //vsubps %ymm7,%ymm8,%ymm8
1981 196,226,61,168,196, //vfmadd213ps %ymm4,%ymm8,%ymm0
1982 196,226,61,168,205, //vfmadd213ps %ymm5,%ymm8,%ymm1
1983 196,226,61,168,214, //vfmadd213ps %ymm6,%ymm8,%ymm2
1984 196,226,61,168,223, //vfmadd213ps %ymm7,%ymm8,%ymm3
1985 72,173, //lods %ds:(%rsi),%rax
1986 255,224, //jmpq *%rax
1987};
1988
// sk_clamp_0_hsw: clamps from below, ymmN = max(ymmN, 0) for N = 0..3, using
// ymm8 as the zero vector, then tail-calls the next stage.
1989CODE const uint8_t sk_clamp_0_hsw[] = {
1990 196,65,60,87,192, //vxorps %ymm8,%ymm8,%ymm8
1991 196,193,124,95,192, //vmaxps %ymm8,%ymm0,%ymm0
1992 196,193,116,95,200, //vmaxps %ymm8,%ymm1,%ymm1
1993 196,193,108,95,208, //vmaxps %ymm8,%ymm2,%ymm2
1994 196,193,100,95,216, //vmaxps %ymm8,%ymm3,%ymm3
1995 72,173, //lods %ds:(%rsi),%rax
1996 255,224, //jmpq *%rax
1997};
1998
// sk_clamp_1_hsw: clamps from above, ymmN = min(ymmN, K[0]) for N = 0..3,
// where K[0] is the float at (%rdx), then tail-calls the next stage.
// NOTE(review): K[0] is presumably 1.0f (per the stage name) - confirm.
1999CODE const uint8_t sk_clamp_1_hsw[] = {
2000 196,98,125,24,2, //vbroadcastss (%rdx),%ymm8
2001 196,193,124,93,192, //vminps %ymm8,%ymm0,%ymm0
2002 196,193,116,93,200, //vminps %ymm8,%ymm1,%ymm1
2003 196,193,108,93,208, //vminps %ymm8,%ymm2,%ymm2
2004 196,193,100,93,216, //vminps %ymm8,%ymm3,%ymm3
2005 72,173, //lods %ds:(%rsi),%rax
2006 255,224, //jmpq *%rax
2007};
2008
// sk_clamp_a_hsw: first ymm3 = min(ymm3, K[0]) (K[0] = float at (%rdx)), then
// ymm0, ymm1 and ymm2 are each clamped to be no greater than the new ymm3.
// Tail-calls the next stage afterwards.
// NOTE(review): presumably ymm3 is alpha and K[0] is 1.0f - confirm.
2009CODE const uint8_t sk_clamp_a_hsw[] = {
2010 196,98,125,24,2, //vbroadcastss (%rdx),%ymm8
2011 196,193,100,93,216, //vminps %ymm8,%ymm3,%ymm3
2012 197,252,93,195, //vminps %ymm3,%ymm0,%ymm0
2013 197,244,93,203, //vminps %ymm3,%ymm1,%ymm1
2014 197,236,93,211, //vminps %ymm3,%ymm2,%ymm2
2015 72,173, //lods %ds:(%rsi),%rax
2016 255,224, //jmpq *%rax
2017};
2018
// sk_set_rgb_hsw: broadcasts three consecutive floats from the context (byte
// offsets 0, 4, 8) into ymm0, ymm1 and ymm2. ymm3 is left unchanged.
// The first lods fetches the context pointer, the second the next stage.
2019CODE const uint8_t sk_set_rgb_hsw[] = {
2020 72,173, //lods %ds:(%rsi),%rax
2021 196,226,125,24,0, //vbroadcastss (%rax),%ymm0
2022 196,226,125,24,72,4, //vbroadcastss 0x4(%rax),%ymm1
2023 196,226,125,24,80,8, //vbroadcastss 0x8(%rax),%ymm2
2024 72,173, //lods %ds:(%rsi),%rax
2025 255,224, //jmpq *%rax
2026};
2027
// sk_swap_rb_hsw: exchanges ymm0 and ymm2 (ymm8 is the temporary) and
// tail-calls the next stage. ymm1 and ymm3 are not touched.
2028CODE const uint8_t sk_swap_rb_hsw[] = {
2029 197,124,40,192, //vmovaps %ymm0,%ymm8
2030 72,173, //lods %ds:(%rsi),%rax
2031 197,252,40,194, //vmovaps %ymm2,%ymm0
2032 197,124,41,194, //vmovaps %ymm8,%ymm2
2033 255,224, //jmpq *%rax
2034};
2035
// sk_swap_hsw: exchanges the register groups ymm0-ymm3 and ymm4-ymm7
// (ymm0<->ymm4, ymm1<->ymm5, ymm2<->ymm6, ymm3<->ymm7), using ymm8-ymm11 as
// temporaries, then tail-calls the next stage.
2036CODE const uint8_t sk_swap_hsw[] = {
2037 197,124,40,195, //vmovaps %ymm3,%ymm8
2038 197,124,40,202, //vmovaps %ymm2,%ymm9
2039 197,124,40,209, //vmovaps %ymm1,%ymm10
2040 197,124,40,216, //vmovaps %ymm0,%ymm11
2041 72,173, //lods %ds:(%rsi),%rax
2042 197,252,40,196, //vmovaps %ymm4,%ymm0
2043 197,252,40,205, //vmovaps %ymm5,%ymm1
2044 197,252,40,214, //vmovaps %ymm6,%ymm2
2045 197,252,40,223, //vmovaps %ymm7,%ymm3
2046 197,124,41,220, //vmovaps %ymm11,%ymm4
2047 197,124,41,213, //vmovaps %ymm10,%ymm5
2048 197,124,41,206, //vmovaps %ymm9,%ymm6
2049 197,124,41,199, //vmovaps %ymm8,%ymm7
2050 255,224, //jmpq *%rax
2051};
2052
// sk_move_src_dst_hsw: copies ymm0-ymm3 into ymm4-ymm7 (ymm0-ymm3 keep their
// values) and tail-calls the next stage.
2053CODE const uint8_t sk_move_src_dst_hsw[] = {
2054 72,173, //lods %ds:(%rsi),%rax
2055 197,252,40,224, //vmovaps %ymm0,%ymm4
2056 197,252,40,233, //vmovaps %ymm1,%ymm5
2057 197,252,40,242, //vmovaps %ymm2,%ymm6
2058 197,252,40,251, //vmovaps %ymm3,%ymm7
2059 255,224, //jmpq *%rax
2060};
2061
// sk_move_dst_src_hsw: copies ymm4-ymm7 into ymm0-ymm3 (ymm4-ymm7 keep their
// values) and tail-calls the next stage.
2062CODE const uint8_t sk_move_dst_src_hsw[] = {
2063 72,173, //lods %ds:(%rsi),%rax
2064 197,252,40,196, //vmovaps %ymm4,%ymm0
2065 197,252,40,205, //vmovaps %ymm5,%ymm1
2066 197,252,40,214, //vmovaps %ymm6,%ymm2
2067 197,252,40,223, //vmovaps %ymm7,%ymm3
2068 255,224, //jmpq *%rax
2069};
2070
// sk_premul_hsw: multiplies ymm0, ymm1 and ymm2 by ymm3 lane-wise; ymm3 itself
// is unchanged. Tail-calls the next stage.
2071CODE const uint8_t sk_premul_hsw[] = {
2072 197,252,89,195, //vmulps %ymm3,%ymm0,%ymm0
2073 197,244,89,203, //vmulps %ymm3,%ymm1,%ymm1
2074 197,236,89,211, //vmulps %ymm3,%ymm2,%ymm2
2075 72,173, //lods %ds:(%rsi),%rax
2076 255,224, //jmpq *%rax
2077};
2078
// sk_unpremul_hsw: computes a per-lane scale
//   s = (ymm3 == 0) ? 0 : K[0] / ymm3        (K[0] = float at (%rdx))
// and multiplies ymm0, ymm1 and ymm2 by it; ymm3 is unchanged.
// The compare + blend keeps lanes with ymm3 == 0 from picking up the result
// of the division by zero. Tail-calls the next stage.
2079CODE const uint8_t sk_unpremul_hsw[] = {
2080 196,65,60,87,192, //vxorps %ymm8,%ymm8,%ymm8
2081 196,65,100,194,200,0, //vcmpeqps %ymm8,%ymm3,%ymm9
2082 196,98,125,24,18, //vbroadcastss (%rdx),%ymm10
2083 197,44,94,211, //vdivps %ymm3,%ymm10,%ymm10
2084 196,67,45,74,192,144, //vblendvps %ymm9,%ymm8,%ymm10,%ymm8
2085 197,188,89,192, //vmulps %ymm0,%ymm8,%ymm0
2086 197,188,89,201, //vmulps %ymm1,%ymm8,%ymm1
2087 197,188,89,210, //vmulps %ymm2,%ymm8,%ymm2
2088 72,173, //lods %ds:(%rsi),%rax
2089 255,224, //jmpq *%rax
2090};
2091
// sk_from_srgb_hsw: applies the same piecewise function to ymm0, ymm1 and ymm2
// independently (ymm3 is untouched). With K[off] = float at byte offset `off`
// from %rdx and x = the incoming lane value:
//   x <  K[0x44]:  K[0x40] * x
//   otherwise:     x*x * (K[0x3c]*x + K[0x38]) + K[0x34]
// Both branches are evaluated and vblendvps selects per lane on the compare
// mask. Tail-calls the next stage.
2092CODE const uint8_t sk_from_srgb_hsw[] = {
2093 196,98,125,24,66,64, //vbroadcastss 0x40(%rdx),%ymm8
2094 197,60,89,200, //vmulps %ymm0,%ymm8,%ymm9
2095 197,124,89,208, //vmulps %ymm0,%ymm0,%ymm10
2096 196,98,125,24,90,60, //vbroadcastss 0x3c(%rdx),%ymm11
2097 196,98,125,24,98,56, //vbroadcastss 0x38(%rdx),%ymm12
2098 196,65,124,40,235, //vmovaps %ymm11,%ymm13
2099 196,66,125,168,236, //vfmadd213ps %ymm12,%ymm0,%ymm13
2100 196,98,125,24,114,52, //vbroadcastss 0x34(%rdx),%ymm14
2101 196,66,45,168,238, //vfmadd213ps %ymm14,%ymm10,%ymm13
2102 196,98,125,24,82,68, //vbroadcastss 0x44(%rdx),%ymm10
2103 196,193,124,194,194,1, //vcmpltps %ymm10,%ymm0,%ymm0
2104 196,195,21,74,193,0, //vblendvps %ymm0,%ymm9,%ymm13,%ymm0
// Same computation for ymm1.
2105 197,60,89,201, //vmulps %ymm1,%ymm8,%ymm9
2106 197,116,89,233, //vmulps %ymm1,%ymm1,%ymm13
2107 196,65,124,40,251, //vmovaps %ymm11,%ymm15
2108 196,66,117,168,252, //vfmadd213ps %ymm12,%ymm1,%ymm15
2109 196,66,21,168,254, //vfmadd213ps %ymm14,%ymm13,%ymm15
2110 196,193,116,194,202,1, //vcmpltps %ymm10,%ymm1,%ymm1
2111 196,195,5,74,201,16, //vblendvps %ymm1,%ymm9,%ymm15,%ymm1
// Same computation for ymm2; ymm11 is consumed in place as the accumulator.
2112 197,60,89,194, //vmulps %ymm2,%ymm8,%ymm8
2113 197,108,89,202, //vmulps %ymm2,%ymm2,%ymm9
2114 196,66,109,168,220, //vfmadd213ps %ymm12,%ymm2,%ymm11
2115 196,66,53,168,222, //vfmadd213ps %ymm14,%ymm9,%ymm11
2116 196,193,108,194,210,1, //vcmpltps %ymm10,%ymm2,%ymm2
2117 196,195,37,74,208,32, //vblendvps %ymm2,%ymm8,%ymm11,%ymm2
2118 72,173, //lods %ds:(%rsi),%rax
2119 255,224, //jmpq *%rax
2120};
2121
// sk_to_srgb_hsw: applies the same piecewise function to ymm0, ymm1 and ymm2
// independently (ymm3 is untouched). With K[off] = float at byte offset `off`
// from %rdx and x = the incoming lane value:
//   r  = rsqrt(x)                 (hardware approximation)
//   s  = rcp(r)   ~ x^(1/2)       (hardware approximation)
//   q  = rsqrt(r) ~ x^(1/4)       (hardware approximation)
//   x <  K[0x58]:  K[0x48] * x
//   otherwise:     min(K[0x00], K[0x50]*s + K[0x54] + K[0x4c]*q)
// Both branches are evaluated and vblendvps selects per lane on the compare
// mask. Tail-calls the next stage.
2122CODE const uint8_t sk_to_srgb_hsw[] = {
2123 197,124,82,192, //vrsqrtps %ymm0,%ymm8
2124 196,65,124,83,200, //vrcpps %ymm8,%ymm9
2125 196,65,124,82,208, //vrsqrtps %ymm8,%ymm10
2126 196,98,125,24,66,72, //vbroadcastss 0x48(%rdx),%ymm8
2127 197,60,89,216, //vmulps %ymm0,%ymm8,%ymm11
2128 196,98,125,24,34, //vbroadcastss (%rdx),%ymm12
2129 196,98,125,24,106,76, //vbroadcastss 0x4c(%rdx),%ymm13
2130 196,98,125,24,114,80, //vbroadcastss 0x50(%rdx),%ymm14
2131 196,98,125,24,122,84, //vbroadcastss 0x54(%rdx),%ymm15
2132 196,66,13,168,207, //vfmadd213ps %ymm15,%ymm14,%ymm9
2133 196,66,21,184,202, //vfmadd231ps %ymm10,%ymm13,%ymm9
2134 196,65,28,93,201, //vminps %ymm9,%ymm12,%ymm9
2135 196,98,125,24,82,88, //vbroadcastss 0x58(%rdx),%ymm10
2136 196,193,124,194,194,1, //vcmpltps %ymm10,%ymm0,%ymm0
2137 196,195,53,74,195,0, //vblendvps %ymm0,%ymm11,%ymm9,%ymm0
// Same computation for ymm1.
2138 197,124,82,201, //vrsqrtps %ymm1,%ymm9
2139 196,65,124,83,217, //vrcpps %ymm9,%ymm11
2140 196,65,124,82,201, //vrsqrtps %ymm9,%ymm9
2141 196,66,13,168,223, //vfmadd213ps %ymm15,%ymm14,%ymm11
2142 196,66,21,184,217, //vfmadd231ps %ymm9,%ymm13,%ymm11
2143 197,60,89,201, //vmulps %ymm1,%ymm8,%ymm9
2144 196,65,28,93,219, //vminps %ymm11,%ymm12,%ymm11
2145 196,193,116,194,202,1, //vcmpltps %ymm10,%ymm1,%ymm1
2146 196,195,37,74,201,16, //vblendvps %ymm1,%ymm9,%ymm11,%ymm1
// Same computation for ymm2.
2147 197,124,82,202, //vrsqrtps %ymm2,%ymm9
2148 196,65,124,83,217, //vrcpps %ymm9,%ymm11
2149 196,66,13,168,223, //vfmadd213ps %ymm15,%ymm14,%ymm11
2150 196,65,124,82,201, //vrsqrtps %ymm9,%ymm9
2151 196,66,21,184,217, //vfmadd231ps %ymm9,%ymm13,%ymm11
2152 196,65,28,93,203, //vminps %ymm11,%ymm12,%ymm9
2153 197,60,89,194, //vmulps %ymm2,%ymm8,%ymm8
2154 196,193,108,194,210,1, //vcmpltps %ymm10,%ymm2,%ymm2
2155 196,195,53,74,208,32, //vblendvps %ymm2,%ymm8,%ymm9,%ymm2
2156 72,173, //lods %ds:(%rsi),%rax
2157 255,224, //jmpq *%rax
2158};
2159
// sk_scale_1_float_hsw: broadcasts the single float at the context pointer and
// multiplies ymm0, ymm1, ymm2 and ymm3 by it. The first lods fetches the
// context pointer, the second the next stage, which is tail-called.
2160CODE const uint8_t sk_scale_1_float_hsw[] = {
2161 72,173, //lods %ds:(%rsi),%rax
2162 196,98,125,24,0, //vbroadcastss (%rax),%ymm8
2163 197,188,89,192, //vmulps %ymm0,%ymm8,%ymm0
2164 197,188,89,201, //vmulps %ymm1,%ymm8,%ymm1
2165 197,188,89,210, //vmulps %ymm2,%ymm8,%ymm2
2166 197,188,89,219, //vmulps %ymm3,%ymm8,%ymm3
2167 72,173, //lods %ds:(%rsi),%rax
2168 255,224, //jmpq *%rax
2169};
2170
// sk_scale_u8_hsw: multiplies ymm0-ymm3 by a per-lane factor read from bytes.
// The byte pointer is (*ctx) + %rdi. Eight bytes are zero-extended to dwords,
// converted to float and multiplied by K[0x0c] (float at 0xc(%rdx)) to form
// the factor. %rcx is saved in %r8 because the partial path uses %cl as a
// shift count; it is restored before jumping to the next stage.
// When %rcx != 0 the partial path reads exactly %rcx bytes, one at a time,
// packs them into the low bytes of a 64-bit value (the rest stay zero), and
// rejoins the main path at the zero-extension.
// NOTE(review): K[0x0c] is presumably 1/255 - confirm against the constants.
2171CODE const uint8_t sk_scale_u8_hsw[] = {
2172 73,137,200, //mov %rcx,%r8
2173 72,173, //lods %ds:(%rsi),%rax
2174 72,139,0, //mov (%rax),%rax
2175 72,1,248, //add %rdi,%rax
2176 77,133,192, //test %r8,%r8
2177 117,48, //jne 41a <_sk_scale_u8_hsw+0x40>
2178 197,123,16,0, //vmovsd (%rax),%xmm8
// +0x14: re-entry point for the partial path (xmm8 already holds the bytes).
2179 196,66,125,49,192, //vpmovzxbd %xmm8,%ymm8
2180 196,65,124,91,192, //vcvtdq2ps %ymm8,%ymm8
2181 196,98,125,24,74,12, //vbroadcastss 0xc(%rdx),%ymm9
2182 196,65,60,89,193, //vmulps %ymm9,%ymm8,%ymm8
2183 197,188,89,192, //vmulps %ymm0,%ymm8,%ymm0
2184 197,188,89,201, //vmulps %ymm1,%ymm8,%ymm1
2185 197,188,89,210, //vmulps %ymm2,%ymm8,%ymm2
2186 197,188,89,219, //vmulps %ymm3,%ymm8,%ymm3
2187 72,173, //lods %ds:(%rsi),%rax
2188 76,137,193, //mov %r8,%rcx
2189 255,224, //jmpq *%rax
// +0x40: partial path. %rcx = bit shift, %r10 = bytes left, %r9 = accumulator.
2190 49,201, //xor %ecx,%ecx
2191 77,137,194, //mov %r8,%r10
2192 69,49,201, //xor %r9d,%r9d
2193 68,15,182,24, //movzbl (%rax),%r11d
2194 72,255,192, //inc %rax
2195 73,211,227, //shl %cl,%r11
2196 77,9,217, //or %r11,%r9
2197 72,131,193,8, //add $0x8,%rcx
2198 73,255,202, //dec %r10
2199 117,234, //jne 422 <_sk_scale_u8_hsw+0x48>
2200 196,65,249,110,193, //vmovq %r9,%xmm8
2201 235,175, //jmp 3ee <_sk_scale_u8_hsw+0x14>
2202};
2203
// sk_lerp_1_float_hsw: with t = the float at the context pointer (broadcast to
// all lanes), computes for each of the four register pairs
//   ymm0 = ymm4 + (ymm0 - ymm4)*t   (likewise ymm1/ymm5, ymm2/ymm6, ymm3/ymm7)
// as a subtract followed by a fused multiply-add. The first lods fetches the
// context pointer, the second the next stage, which is tail-called.
2204CODE const uint8_t sk_lerp_1_float_hsw[] = {
2205 72,173, //lods %ds:(%rsi),%rax
2206 196,98,125,24,0, //vbroadcastss (%rax),%ymm8
2207 197,252,92,196, //vsubps %ymm4,%ymm0,%ymm0
2208 196,226,61,168,196, //vfmadd213ps %ymm4,%ymm8,%ymm0
2209 197,244,92,205, //vsubps %ymm5,%ymm1,%ymm1
2210 196,226,61,168,205, //vfmadd213ps %ymm5,%ymm8,%ymm1
2211 197,236,92,214, //vsubps %ymm6,%ymm2,%ymm2
2212 196,226,61,168,214, //vfmadd213ps %ymm6,%ymm8,%ymm2
2213 197,228,92,223, //vsubps %ymm7,%ymm3,%ymm3
2214 196,226,61,168,223, //vfmadd213ps %ymm7,%ymm8,%ymm3
2215 72,173, //lods %ds:(%rsi),%rax
2216 255,224, //jmpq *%rax
2217};
2218
// sk_lerp_u8_hsw: per-lane interpolation with a factor read from bytes.
// The byte pointer is (*ctx) + %rdi. Eight bytes are zero-extended, converted
// to float and multiplied by K[0x0c] (float at 0xc(%rdx)) to form t, then
//   ymm0 = ymm4 + (ymm0 - ymm4)*t   (likewise ymm1/ymm5, ymm2/ymm6, ymm3/ymm7).
// %rcx is saved in %r8 because the partial path uses %cl as a shift count; it
// is restored before jumping to the next stage.
// When %rcx != 0 the partial path reads exactly %rcx bytes, one at a time,
// packs them into the low bytes of a 64-bit value (the rest stay zero), and
// rejoins the main path at the zero-extension.
// NOTE(review): K[0x0c] is presumably 1/255 - confirm against the constants.
2219CODE const uint8_t sk_lerp_u8_hsw[] = {
2220 73,137,200, //mov %rcx,%r8
2221 72,173, //lods %ds:(%rsi),%rax
2222 72,139,0, //mov (%rax),%rax
2223 72,1,248, //add %rdi,%rax
2224 77,133,192, //test %r8,%r8
2225 117,68, //jne 4c2 <_sk_lerp_u8_hsw+0x54>
2226 197,123,16,0, //vmovsd (%rax),%xmm8
// +0x14: re-entry point for the partial path (xmm8 already holds the bytes).
2227 196,66,125,49,192, //vpmovzxbd %xmm8,%ymm8
2228 196,65,124,91,192, //vcvtdq2ps %ymm8,%ymm8
2229 196,98,125,24,74,12, //vbroadcastss 0xc(%rdx),%ymm9
2230 196,65,60,89,193, //vmulps %ymm9,%ymm8,%ymm8
2231 197,252,92,196, //vsubps %ymm4,%ymm0,%ymm0
2232 196,226,61,168,196, //vfmadd213ps %ymm4,%ymm8,%ymm0
2233 197,244,92,205, //vsubps %ymm5,%ymm1,%ymm1
2234 196,226,61,168,205, //vfmadd213ps %ymm5,%ymm8,%ymm1
2235 197,236,92,214, //vsubps %ymm6,%ymm2,%ymm2
2236 196,226,61,168,214, //vfmadd213ps %ymm6,%ymm8,%ymm2
2237 197,228,92,223, //vsubps %ymm7,%ymm3,%ymm3
2238 196,226,61,168,223, //vfmadd213ps %ymm7,%ymm8,%ymm3
2239 72,173, //lods %ds:(%rsi),%rax
2240 76,137,193, //mov %r8,%rcx
2241 255,224, //jmpq *%rax
// +0x54: partial path. %rcx = bit shift, %r10 = bytes left, %r9 = accumulator.
2242 49,201, //xor %ecx,%ecx
2243 77,137,194, //mov %r8,%r10
2244 69,49,201, //xor %r9d,%r9d
2245 68,15,182,24, //movzbl (%rax),%r11d
2246 72,255,192, //inc %rax
2247 73,211,227, //shl %cl,%r11
2248 77,9,217, //or %r11,%r9
2249 72,131,193,8, //add $0x8,%rcx
2250 73,255,202, //dec %r10
2251 117,234, //jne 4ca <_sk_lerp_u8_hsw+0x5c>
2252 196,65,249,110,193, //vmovq %r9,%xmm8
2253 235,155, //jmp 482 <_sk_lerp_u8_hsw+0x14>
2254};
2255
// sk_lerp_565_hsw: per-channel interpolation with factors unpacked from
// 16-bit values at (*ctx) + 2*%rdi. With K[off] = the 32-bit value at byte
// offset `off` from %rdx and p = a zero-extended 16-bit element:
//   t0 = float(p & K[0x68]) * K[0x74]
//   t1 = float(p & K[0x6c]) * K[0x78]
//   t2 = float(p & K[0x70]) * K[0x7c]
//   ymm0 = ymm4 + (ymm0 - ymm4)*t0, ymm1 = ymm5 + (ymm1 - ymm5)*t1,
//   ymm2 = ymm6 + (ymm2 - ymm6)*t2, ymm3 = broadcast of K[0x00]
// ymm3 doubles as scratch for the loaded elements, so its incoming value is
// discarded rather than interpolated.
// When %rcx != 0 the partial path builds xmm3 from only (%rcx & 7) elements:
// it dispatches through a table into a fall-through chain of vpinsrw so that
// exactly lanes (n-1)..0 are read, then rejoins the main path at +0x10.
2256CODE const uint8_t sk_lerp_565_hsw[] = {
2257 72,173, //lods %ds:(%rsi),%rax
2258 76,139,16, //mov (%rax),%r10
2259 72,133,201, //test %rcx,%rcx
2260 117,123, //jne 56c <_sk_lerp_565_hsw+0x85>
2261 196,193,122,111,28,122, //vmovdqu (%r10,%rdi,2),%xmm3
// +0x10: re-entry point for the partial path (xmm3 already holds the elements).
2262 196,226,125,51,219, //vpmovzxwd %xmm3,%ymm3
2263 196,98,125,88,66,104, //vpbroadcastd 0x68(%rdx),%ymm8
2264 197,61,219,195, //vpand %ymm3,%ymm8,%ymm8
2265 196,65,124,91,192, //vcvtdq2ps %ymm8,%ymm8
2266 196,98,125,24,74,116, //vbroadcastss 0x74(%rdx),%ymm9
2267 196,65,52,89,192, //vmulps %ymm8,%ymm9,%ymm8
2268 196,98,125,88,74,108, //vpbroadcastd 0x6c(%rdx),%ymm9
2269 197,53,219,203, //vpand %ymm3,%ymm9,%ymm9
2270 196,65,124,91,201, //vcvtdq2ps %ymm9,%ymm9
2271 196,98,125,24,82,120, //vbroadcastss 0x78(%rdx),%ymm10
2272 196,65,44,89,201, //vmulps %ymm9,%ymm10,%ymm9
2273 196,98,125,88,82,112, //vpbroadcastd 0x70(%rdx),%ymm10
2274 197,173,219,219, //vpand %ymm3,%ymm10,%ymm3
2275 197,252,91,219, //vcvtdq2ps %ymm3,%ymm3
2276 196,98,125,24,82,124, //vbroadcastss 0x7c(%rdx),%ymm10
2277 197,172,89,219, //vmulps %ymm3,%ymm10,%ymm3
2278 197,252,92,196, //vsubps %ymm4,%ymm0,%ymm0
2279 196,226,61,168,196, //vfmadd213ps %ymm4,%ymm8,%ymm0
2280 197,244,92,205, //vsubps %ymm5,%ymm1,%ymm1
2281 196,226,53,168,205, //vfmadd213ps %ymm5,%ymm9,%ymm1
2282 197,236,92,214, //vsubps %ymm6,%ymm2,%ymm2
2283 196,226,101,168,214, //vfmadd213ps %ymm6,%ymm3,%ymm2
2284 196,226,125,24,26, //vbroadcastss (%rdx),%ymm3
2285 72,173, //lods %ds:(%rsi),%rax
2286 255,224, //jmpq *%rax
// +0x85: partial path. %r8 = (%rcx & 7) - 1; out of range (count & 7 == 0)
// rejoins the main path with xmm3 all zero.
2287 65,137,200, //mov %ecx,%r8d
2288 65,128,224,7, //and $0x7,%r8b
2289 197,225,239,219, //vpxor %xmm3,%xmm3,%xmm3
2290 65,254,200, //dec %r8b
2291 69,15,182,192, //movzbl %r8b,%r8d
2292 65,128,248,6, //cmp $0x6,%r8b
2293 15,135,111,255,255,255, //ja 4f7 <_sk_lerp_565_hsw+0x10>
2294 76,141,13,73,0,0,0, //lea 0x49(%rip),%r9 # 5d8 <_sk_lerp_565_hsw+0xf1>
2295 75,99,4,129, //movslq (%r9,%r8,4),%rax
2296 76,1,200, //add %r9,%rax
2297 255,224, //jmpq *%rax
// Fall-through chain: entry k of the table lands on the insert for lane k.
2298 197,225,239,219, //vpxor %xmm3,%xmm3,%xmm3
2299 196,193,97,196,92,122,12,6, //vpinsrw $0x6,0xc(%r10,%rdi,2),%xmm3,%xmm3
2300 196,193,97,196,92,122,10,5, //vpinsrw $0x5,0xa(%r10,%rdi,2),%xmm3,%xmm3
2301 196,193,97,196,92,122,8,4, //vpinsrw $0x4,0x8(%r10,%rdi,2),%xmm3,%xmm3
2302 196,193,97,196,92,122,6,3, //vpinsrw $0x3,0x6(%r10,%rdi,2),%xmm3,%xmm3
2303 196,193,97,196,92,122,4,2, //vpinsrw $0x2,0x4(%r10,%rdi,2),%xmm3,%xmm3
2304 196,193,97,196,92,122,2,1, //vpinsrw $0x1,0x2(%r10,%rdi,2),%xmm3,%xmm3
2305 196,193,97,196,28,122,0, //vpinsrw $0x0,(%r10,%rdi,2),%xmm3,%xmm3
2306 233,31,255,255,255, //jmpq 4f7 <_sk_lerp_565_hsw+0x10>
// +0xf1: jump table (data, not code): seven little-endian signed 32-bit
// offsets relative to the table's own address. The disassembly text on the
// following lines is an artifact of decoding data as instructions.
2307 244, //hlt
2308 255, //(bad)
2309 255, //(bad)
2310 255, //(bad)
2311 236, //in (%dx),%al
2312 255, //(bad)
2313 255, //(bad)
2314 255,228, //jmpq *%rsp
2315 255, //(bad)
2316 255, //(bad)
2317 255, //(bad)
2318 220,255, //fdivr %st,%st(7)
2319 255, //(bad)
2320 255,212, //callq *%rsp
2321 255, //(bad)
2322 255, //(bad)
2323 255,204, //dec %esp
2324 255, //(bad)
2325 255, //(bad)
2326 255,192, //inc %eax
2327 255, //(bad)
2328 255, //(bad)
2329 255, //.byte 0xff
2330};
2331
// sk_load_tables_hsw: loads eight 32-bit elements from (*ctx) + 4*%rdi and
// uses three of their bytes as indices into float tables:
//   m    = K[0x10] (32-bit mask broadcast from 0x10(%rdx))
//   ymm0 = table at ctx+0x08, indexed by (p      ) & m
//   ymm1 = table at ctx+0x10, indexed by (p >>  8) & m
//   ymm2 = table at ctx+0x18, indexed by (p >> 16) & m
//   ymm3 = float(p >> 24) * K[0x0c]
// Each vgatherdps gets its own freshly generated all-ones mask register
// (ymm9, ymm10, ymm8). %rcx is saved in %r8 because it is reused as a table
// pointer and shift count; it is restored before jumping to the next stage.
// When %rcx != 0 the partial path loads only the first %rcx elements with
// vpmaskmovd and rejoins the main path at +0x1a.
2332CODE const uint8_t sk_load_tables_hsw[] = {
2333 73,137,200, //mov %rcx,%r8
2334 72,173, //lods %ds:(%rsi),%rax
2335 76,141,12,189,0,0,0,0, //lea 0x0(,%rdi,4),%r9
2336 76,3,8, //add (%rax),%r9
2337 77,133,192, //test %r8,%r8
2338 117,106, //jne 673 <_sk_load_tables_hsw+0x7f>
2339 196,193,126,111,25, //vmovdqu (%r9),%ymm3
// +0x1a: re-entry point for the partial path (ymm3 already holds the elements).
2340 196,226,125,88,82,16, //vpbroadcastd 0x10(%rdx),%ymm2
2341 197,237,219,203, //vpand %ymm3,%ymm2,%ymm1
2342 196,65,61,118,192, //vpcmpeqd %ymm8,%ymm8,%ymm8
2343 72,139,72,8, //mov 0x8(%rax),%rcx
2344 76,139,72,16, //mov 0x10(%rax),%r9
2345 196,65,53,118,201, //vpcmpeqd %ymm9,%ymm9,%ymm9
2346 196,226,53,146,4,137, //vgatherdps %ymm9,(%rcx,%ymm1,4),%ymm0
2347 197,245,114,211,8, //vpsrld $0x8,%ymm3,%ymm1
2348 197,109,219,201, //vpand %ymm1,%ymm2,%ymm9
2349 196,65,45,118,210, //vpcmpeqd %ymm10,%ymm10,%ymm10
2350 196,130,45,146,12,137, //vgatherdps %ymm10,(%r9,%ymm9,4),%ymm1
2351 72,139,64,24, //mov 0x18(%rax),%rax
2352 197,181,114,211,16, //vpsrld $0x10,%ymm3,%ymm9
2353 196,65,109,219,201, //vpand %ymm9,%ymm2,%ymm9
2354 196,162,61,146,20,136, //vgatherdps %ymm8,(%rax,%ymm9,4),%ymm2
2355 197,229,114,211,24, //vpsrld $0x18,%ymm3,%ymm3
2356 197,252,91,219, //vcvtdq2ps %ymm3,%ymm3
2357 196,98,125,24,66,12, //vbroadcastss 0xc(%rdx),%ymm8
2358 196,193,100,89,216, //vmulps %ymm8,%ymm3,%ymm3
2359 72,173, //lods %ds:(%rsi),%rax
2360 76,137,193, //mov %r8,%rcx
2361 255,224, //jmpq *%rax
// +0x7f: partial path. (~0 >> 8*(8 - n)) leaves n low bytes of 0xff; sign-
// extending each byte to a dword yields a mask covering the first n lanes.
2362 185,8,0,0,0, //mov $0x8,%ecx
2363 68,41,193, //sub %r8d,%ecx
2364 192,225,3, //shl $0x3,%cl
2365 73,199,194,255,255,255,255, //mov $0xffffffffffffffff,%r10
2366 73,211,234, //shr %cl,%r10
2367 196,193,249,110,194, //vmovq %r10,%xmm0
2368 196,226,125,33,192, //vpmovsxbd %xmm0,%ymm0
2369 196,194,125,140,25, //vpmaskmovd (%r9),%ymm0,%ymm3
2370 233,114,255,255,255, //jmpq 60e <_sk_load_tables_hsw+0x1a>
2371};
2372
// sk_load_a8_hsw: reads eight bytes at (*ctx) + %rdi, zero-extends them to
// dwords, converts to float and multiplies by K[0x0c] (float at 0xc(%rdx));
// the result goes to ymm3, while ymm0, ymm1 and ymm2 are set to zero.
// %rcx is saved in %r8 because the partial path uses %cl as a shift count; it
// is restored before jumping to the next stage.
// When %rcx != 0 the partial path reads exactly %rcx bytes, one at a time,
// packs them into the low bytes of a 64-bit value (the rest stay zero), and
// rejoins the main path at the zero-extension.
// NOTE(review): K[0x0c] is presumably 1/255 - confirm against the constants.
2373CODE const uint8_t sk_load_a8_hsw[] = {
2374 73,137,200, //mov %rcx,%r8
2375 72,173, //lods %ds:(%rsi),%rax
2376 72,139,0, //mov (%rax),%rax
2377 72,1,248, //add %rdi,%rax
2378 77,133,192, //test %r8,%r8
2379 117,42, //jne 6d6 <_sk_load_a8_hsw+0x3a>
2380 197,251,16,0, //vmovsd (%rax),%xmm0
// +0x14: re-entry point for the partial path (xmm0 already holds the bytes).
2381 196,226,125,49,192, //vpmovzxbd %xmm0,%ymm0
2382 197,252,91,192, //vcvtdq2ps %ymm0,%ymm0
2383 196,226,125,24,74,12, //vbroadcastss 0xc(%rdx),%ymm1
2384 197,252,89,217, //vmulps %ymm1,%ymm0,%ymm3
2385 72,173, //lods %ds:(%rsi),%rax
2386 197,252,87,192, //vxorps %ymm0,%ymm0,%ymm0
2387 197,244,87,201, //vxorps %ymm1,%ymm1,%ymm1
2388 197,236,87,210, //vxorps %ymm2,%ymm2,%ymm2
2389 76,137,193, //mov %r8,%rcx
2390 255,224, //jmpq *%rax
// +0x3a: partial path. %rcx = bit shift, %r10 = bytes left, %r9 = accumulator.
2391 49,201, //xor %ecx,%ecx
2392 77,137,194, //mov %r8,%r10
2393 69,49,201, //xor %r9d,%r9d
2394 68,15,182,24, //movzbl (%rax),%r11d
2395 72,255,192, //inc %rax
2396 73,211,227, //shl %cl,%r11
2397 77,9,217, //or %r11,%r9
2398 72,131,193,8, //add $0x8,%rcx
2399 73,255,202, //dec %r10
2400 117,234, //jne 6de <_sk_load_a8_hsw+0x42>
2401 196,193,249,110,193, //vmovq %r9,%xmm0
2402 235,181, //jmp 6b0 <_sk_load_a8_hsw+0x14>
2403};
2404
// sk_store_a8_hsw: converts ymm3 to bytes and stores them at (*ctx) + %rdi.
// Each lane is multiplied by K[0x08] (float at 0x8(%rdx)), converted to
// int32 with vcvtps2dq, then narrowed to 16 and to 8 bits with the
// unsigned-saturating packs. ymm0-ymm3 are not modified.
// %rcx == 0: all eight bytes are written with one 8-byte store.
// %rcx != 0: only (%rcx & 7) bytes are written. The bytes are widened back to
// words so byte k sits at byte position 2k, and a table dispatches into a
// fall-through chain of vpextrb that stores bytes (n-1)..0.
// NOTE(review): K[0x08] is presumably 255.0f - confirm against the constants.
2405CODE const uint8_t sk_store_a8_hsw[] = {
2406 72,173, //lods %ds:(%rsi),%rax
2407 76,139,8, //mov (%rax),%r9
2408 196,98,125,24,66,8, //vbroadcastss 0x8(%rdx),%ymm8
2409 197,60,89,195, //vmulps %ymm3,%ymm8,%ymm8
2410 196,65,125,91,192, //vcvtps2dq %ymm8,%ymm8
2411 196,67,125,25,193,1, //vextractf128 $0x1,%ymm8,%xmm9
2412 196,66,57,43,193, //vpackusdw %xmm9,%xmm8,%xmm8
2413 196,65,57,103,192, //vpackuswb %xmm8,%xmm8,%xmm8
2414 72,133,201, //test %rcx,%rcx
2415 117,10, //jne 72e <_sk_store_a8_hsw+0x33>
2416 196,65,123,17,4,57, //vmovsd %xmm8,(%r9,%rdi,1)
// +0x2f: common exit, fetch and jump to the next stage.
2417 72,173, //lods %ds:(%rsi),%rax
2418 255,224, //jmpq *%rax
// +0x33: partial path. %r8 = (%rcx & 7) - 1; out of range stores nothing.
2419 137,200, //mov %ecx,%eax
2420 36,7, //and $0x7,%al
2421 254,200, //dec %al
2422 68,15,182,192, //movzbl %al,%r8d
2423 65,128,248,6, //cmp $0x6,%r8b
2424 119,236, //ja 72a <_sk_store_a8_hsw+0x2f>
2425 196,66,121,48,192, //vpmovzxbw %xmm8,%xmm8
2426 76,141,21,66,0,0,0, //lea 0x42(%rip),%r10 # 78c <_sk_store_a8_hsw+0x91>
2427 75,99,4,130, //movslq (%r10,%r8,4),%rax
2428 76,1,208, //add %r10,%rax
2429 255,224, //jmpq *%rax
// Fall-through chain: entry k of the table lands on the store for byte k.
2430 196,67,121,20,68,57,6,12, //vpextrb $0xc,%xmm8,0x6(%r9,%rdi,1)
2431 196,67,121,20,68,57,5,10, //vpextrb $0xa,%xmm8,0x5(%r9,%rdi,1)
2432 196,67,121,20,68,57,4,8, //vpextrb $0x8,%xmm8,0x4(%r9,%rdi,1)
2433 196,67,121,20,68,57,3,6, //vpextrb $0x6,%xmm8,0x3(%r9,%rdi,1)
2434 196,67,121,20,68,57,2,4, //vpextrb $0x4,%xmm8,0x2(%r9,%rdi,1)
2435 196,67,121,20,68,57,1,2, //vpextrb $0x2,%xmm8,0x1(%r9,%rdi,1)
2436 196,67,121,20,4,57,0, //vpextrb $0x0,%xmm8,(%r9,%rdi,1)
2437 235,158, //jmp 72a <_sk_store_a8_hsw+0x2f>
// +0x91: jump table (data, not code): seven little-endian signed 32-bit
// offsets relative to the table's own address. The disassembly text on the
// following lines is an artifact of decoding data as instructions.
2438 247,255, //idiv %edi
2439 255, //(bad)
2440 255, //(bad)
2441 239, //out %eax,(%dx)
2442 255, //(bad)
2443 255, //(bad)
2444 255,231, //jmpq *%rdi
2445 255, //(bad)
2446 255, //(bad)
2447 255, //(bad)
2448 223,255, //(bad)
2449 255, //(bad)
2450 255,215, //callq *%rdi
2451 255, //(bad)
2452 255, //(bad)
2453 255,207, //dec %edi
2454 255, //(bad)
2455 255, //(bad)
2456 255,199, //inc %edi
2457 255, //(bad)
2458 255, //(bad)
2459 255, //.byte 0xff
2460};
2461
// sk_load_565_hsw: loads eight 16-bit elements from (*ctx) + 2*%rdi and
// unpacks them into three float channels. With K[off] = the 32-bit value at
// byte offset `off` from %rdx and p = a zero-extended element:
//   ymm0 = float(p & K[0x68]) * K[0x74]
//   ymm1 = float(p & K[0x6c]) * K[0x78]
//   ymm2 = float(p & K[0x70]) * K[0x7c]
//   ymm3 = broadcast of K[0x00]
// When %rcx != 0 the partial path builds xmm0 from only (%rcx & 7) elements:
// it dispatches through a table into a fall-through chain of vpinsrw so that
// exactly lanes (n-1)..0 are read, then rejoins the main path at +0x10.
2462CODE const uint8_t sk_load_565_hsw[] = {
2463 72,173, //lods %ds:(%rsi),%rax
2464 76,139,16, //mov (%rax),%r10
2465 72,133,201, //test %rcx,%rcx
2466 117,92, //jne 80e <_sk_load_565_hsw+0x66>
2467 196,193,122,111,4,122, //vmovdqu (%r10,%rdi,2),%xmm0
// +0x10: re-entry point for the partial path (xmm0 already holds the elements).
2468 196,226,125,51,208, //vpmovzxwd %xmm0,%ymm2
2469 196,226,125,88,66,104, //vpbroadcastd 0x68(%rdx),%ymm0
2470 197,253,219,194, //vpand %ymm2,%ymm0,%ymm0
2471 197,252,91,192, //vcvtdq2ps %ymm0,%ymm0
2472 196,226,125,24,74,116, //vbroadcastss 0x74(%rdx),%ymm1
2473 197,244,89,192, //vmulps %ymm0,%ymm1,%ymm0
2474 196,226,125,88,74,108, //vpbroadcastd 0x6c(%rdx),%ymm1
2475 197,245,219,202, //vpand %ymm2,%ymm1,%ymm1
2476 197,252,91,201, //vcvtdq2ps %ymm1,%ymm1
2477 196,226,125,24,90,120, //vbroadcastss 0x78(%rdx),%ymm3
2478 197,228,89,201, //vmulps %ymm1,%ymm3,%ymm1
2479 196,226,125,88,90,112, //vpbroadcastd 0x70(%rdx),%ymm3
2480 197,229,219,210, //vpand %ymm2,%ymm3,%ymm2
2481 197,252,91,210, //vcvtdq2ps %ymm2,%ymm2
2482 196,226,125,24,90,124, //vbroadcastss 0x7c(%rdx),%ymm3
2483 197,228,89,210, //vmulps %ymm2,%ymm3,%ymm2
2484 196,226,125,24,26, //vbroadcastss (%rdx),%ymm3
2485 72,173, //lods %ds:(%rsi),%rax
2486 255,224, //jmpq *%rax
// +0x66: partial path. %r8 = (%rcx & 7) - 1; out of range (count & 7 == 0)
// rejoins the main path with xmm0 all zero.
2487 65,137,200, //mov %ecx,%r8d
2488 65,128,224,7, //and $0x7,%r8b
2489 197,249,239,192, //vpxor %xmm0,%xmm0,%xmm0
2490 65,254,200, //dec %r8b
2491 69,15,182,192, //movzbl %r8b,%r8d
2492 65,128,248,6, //cmp $0x6,%r8b
2493 119,146, //ja 7b8 <_sk_load_565_hsw+0x10>
2494 76,141,13,75,0,0,0, //lea 0x4b(%rip),%r9 # 878 <_sk_load_565_hsw+0xd0>
2495 75,99,4,129, //movslq (%r9,%r8,4),%rax
2496 76,1,200, //add %r9,%rax
2497 255,224, //jmpq *%rax
// Fall-through chain: entry k of the table lands on the insert for lane k.
2498 197,249,239,192, //vpxor %xmm0,%xmm0,%xmm0
2499 196,193,121,196,68,122,12,6, //vpinsrw $0x6,0xc(%r10,%rdi,2),%xmm0,%xmm0
2500 196,193,121,196,68,122,10,5, //vpinsrw $0x5,0xa(%r10,%rdi,2),%xmm0,%xmm0
2501 196,193,121,196,68,122,8,4, //vpinsrw $0x4,0x8(%r10,%rdi,2),%xmm0,%xmm0
2502 196,193,121,196,68,122,6,3, //vpinsrw $0x3,0x6(%r10,%rdi,2),%xmm0,%xmm0
2503 196,193,121,196,68,122,4,2, //vpinsrw $0x2,0x4(%r10,%rdi,2),%xmm0,%xmm0
2504 196,193,121,196,68,122,2,1, //vpinsrw $0x1,0x2(%r10,%rdi,2),%xmm0,%xmm0
2505 196,193,121,196,4,122,0, //vpinsrw $0x0,(%r10,%rdi,2),%xmm0,%xmm0
2506 233,66,255,255,255, //jmpq 7b8 <_sk_load_565_hsw+0x10>
// Two-byte no-op padding ahead of the table.
2507 102,144, //xchg %ax,%ax
// +0xd0: jump table (data, not code): seven little-endian signed 32-bit
// offsets relative to the table's own address. The disassembly text on the
// following lines is an artifact of decoding data as instructions.
2508 242,255, //repnz (bad)
2509 255, //(bad)
2510 255, //(bad)
2511 234, //(bad)
2512 255, //(bad)
2513 255, //(bad)
2514 255,226, //jmpq *%rdx
2515 255, //(bad)
2516 255, //(bad)
2517 255, //(bad)
2518 218,255, //(bad)
2519 255, //(bad)
2520 255,210, //callq *%rdx
2521 255, //(bad)
2522 255, //(bad)
2523 255,202, //dec %edx
2524 255, //(bad)
2525 255, //(bad)
2526 255, //(bad)
2527 190, //.byte 0xbe
2528 255, //(bad)
2529 255, //(bad)
2530 255, //.byte 0xff
2531};
2532
// sk_store_565_hsw: packs ymm0, ymm1, ymm2 into 16-bit elements and stores
// them at (*ctx) + 2*%rdi. With K[off] = float at byte offset `off` from %rdx:
//   p = (int(ymm0 * K[0x80]) << 11)
//     | (int(ymm1 * K[0x84]) <<  5)
//     |  int(ymm2 * K[0x80])
// The eight dwords are narrowed to words with vpackusdw. ymm0-ymm3 are not
// modified.
// %rcx == 0: all eight words are written with one 16-byte store.
// %rcx != 0: only (%rcx & 7) words are written; a table dispatches into a
// fall-through chain of vpextrw (lane 0 goes through %eax) that stores words
// (n-1)..0.
2533CODE const uint8_t sk_store_565_hsw[] = {
2534 72,173, //lods %ds:(%rsi),%rax
2535 76,139,8, //mov (%rax),%r9
2536 196,98,125,24,130,128,0,0,0, //vbroadcastss 0x80(%rdx),%ymm8
2537 197,60,89,200, //vmulps %ymm0,%ymm8,%ymm9
2538 196,65,125,91,201, //vcvtps2dq %ymm9,%ymm9
2539 196,193,53,114,241,11, //vpslld $0xb,%ymm9,%ymm9
2540 196,98,125,24,146,132,0,0,0, //vbroadcastss 0x84(%rdx),%ymm10
2541 197,44,89,209, //vmulps %ymm1,%ymm10,%ymm10
2542 196,65,125,91,210, //vcvtps2dq %ymm10,%ymm10
2543 196,193,45,114,242,5, //vpslld $0x5,%ymm10,%ymm10
2544 196,65,45,235,201, //vpor %ymm9,%ymm10,%ymm9
2545 197,60,89,194, //vmulps %ymm2,%ymm8,%ymm8
2546 196,65,125,91,192, //vcvtps2dq %ymm8,%ymm8
2547 196,65,53,235,192, //vpor %ymm8,%ymm9,%ymm8
2548 196,67,125,57,193,1, //vextracti128 $0x1,%ymm8,%xmm9
2549 196,66,57,43,193, //vpackusdw %xmm9,%xmm8,%xmm8
2550 72,133,201, //test %rcx,%rcx
2551 117,10, //jne 8f6 <_sk_store_565_hsw+0x62>
2552 196,65,122,127,4,121, //vmovdqu %xmm8,(%r9,%rdi,2)
// +0x5e: common exit, fetch and jump to the next stage.
2553 72,173, //lods %ds:(%rsi),%rax
2554 255,224, //jmpq *%rax
// +0x62: partial path. %r8 = (%rcx & 7) - 1; out of range stores nothing.
2555 137,200, //mov %ecx,%eax
2556 36,7, //and $0x7,%al
2557 254,200, //dec %al
2558 68,15,182,192, //movzbl %al,%r8d
2559 65,128,248,6, //cmp $0x6,%r8b
2560 119,236, //ja 8f2 <_sk_store_565_hsw+0x5e>
2561 76,141,21,71,0,0,0, //lea 0x47(%rip),%r10 # 954 <_sk_store_565_hsw+0xc0>
2562 75,99,4,130, //movslq (%r10,%r8,4),%rax
2563 76,1,208, //add %r10,%rax
2564 255,224, //jmpq *%rax
// Fall-through chain: entry k of the table lands on the store for word k.
2565 196,67,121,21,68,121,12,6, //vpextrw $0x6,%xmm8,0xc(%r9,%rdi,2)
2566 196,67,121,21,68,121,10,5, //vpextrw $0x5,%xmm8,0xa(%r9,%rdi,2)
2567 196,67,121,21,68,121,8,4, //vpextrw $0x4,%xmm8,0x8(%r9,%rdi,2)
2568 196,67,121,21,68,121,6,3, //vpextrw $0x3,%xmm8,0x6(%r9,%rdi,2)
2569 196,67,121,21,68,121,4,2, //vpextrw $0x2,%xmm8,0x4(%r9,%rdi,2)
2570 196,67,121,21,68,121,2,1, //vpextrw $0x1,%xmm8,0x2(%r9,%rdi,2)
2571 197,121,126,192, //vmovd %xmm8,%eax
2572 102,65,137,4,121, //mov %ax,(%r9,%rdi,2)
2573 235,161, //jmp 8f2 <_sk_store_565_hsw+0x5e>
// Three-byte no-op padding ahead of the table.
2574 15,31,0, //nopl (%rax)
// +0xc0: jump table (data, not code): seven little-endian signed 32-bit
// offsets relative to the table's own address. The disassembly text on the
// following lines is an artifact of decoding data as instructions.
2575 242,255, //repnz (bad)
2576 255, //(bad)
2577 255, //(bad)
2578 234, //(bad)
2579 255, //(bad)
2580 255, //(bad)
2581 255,226, //jmpq *%rdx
2582 255, //(bad)
2583 255, //(bad)
2584 255, //(bad)
2585 218,255, //(bad)
2586 255, //(bad)
2587 255,210, //callq *%rdx
2588 255, //(bad)
2589 255, //(bad)
2590 255,202, //dec %edx
2591 255, //(bad)
2592 255, //(bad)
2593 255,194, //inc %edx
2594 255, //(bad)
2595 255, //(bad)
2596 255, //.byte 0xff
2597};
2598
// sk_load_8888_hsw: loads eight 32-bit elements from (*ctx) + 4*%rdi and
// splits each into four float channels. With m = the 32-bit mask at
// 0x10(%rdx), s = the float at 0xc(%rdx) and p = one element:
//   ymm0 = float( p        & m) * s
//   ymm1 = float((p >>  8) & m) * s
//   ymm2 = float((p >> 16) & m) * s
//   ymm3 = float( p >> 24     ) * s
// %rcx is saved in %r8 because the partial path uses %cl as a shift count; it
// is restored before jumping to the next stage.
// When %rcx != 0 the partial path loads only the first %rcx elements with
// vpmaskmovd and rejoins the main path at +0x1a.
// NOTE(review): m is presumably 0xff and s presumably 1/255 - confirm.
2599CODE const uint8_t sk_load_8888_hsw[] = {
2600 73,137,200, //mov %rcx,%r8
2601 72,173, //lods %ds:(%rsi),%rax
2602 76,141,12,189,0,0,0,0, //lea 0x0(,%rdi,4),%r9
2603 76,3,8, //add (%rax),%r9
2604 77,133,192, //test %r8,%r8
2605 117,85, //jne 9da <_sk_load_8888_hsw+0x6a>
2606 196,193,126,111,25, //vmovdqu (%r9),%ymm3
// +0x1a: re-entry point for the partial path (ymm3 already holds the elements).
2607 196,226,125,88,82,16, //vpbroadcastd 0x10(%rdx),%ymm2
2608 197,237,219,195, //vpand %ymm3,%ymm2,%ymm0
2609 197,252,91,192, //vcvtdq2ps %ymm0,%ymm0
2610 196,98,125,24,66,12, //vbroadcastss 0xc(%rdx),%ymm8
2611 197,188,89,192, //vmulps %ymm0,%ymm8,%ymm0
2612 197,245,114,211,8, //vpsrld $0x8,%ymm3,%ymm1
2613 197,237,219,201, //vpand %ymm1,%ymm2,%ymm1
2614 197,252,91,201, //vcvtdq2ps %ymm1,%ymm1
2615 197,188,89,201, //vmulps %ymm1,%ymm8,%ymm1
2616 197,181,114,211,16, //vpsrld $0x10,%ymm3,%ymm9
2617 196,193,109,219,209, //vpand %ymm9,%ymm2,%ymm2
2618 197,252,91,210, //vcvtdq2ps %ymm2,%ymm2
2619 197,188,89,210, //vmulps %ymm2,%ymm8,%ymm2
2620 197,229,114,211,24, //vpsrld $0x18,%ymm3,%ymm3
2621 197,252,91,219, //vcvtdq2ps %ymm3,%ymm3
2622 196,193,100,89,216, //vmulps %ymm8,%ymm3,%ymm3
2623 72,173, //lods %ds:(%rsi),%rax
2624 76,137,193, //mov %r8,%rcx
2625 255,224, //jmpq *%rax
// +0x6a: partial path. (~0 >> 8*(8 - n)) leaves n low bytes of 0xff; sign-
// extending each byte to a dword yields a mask covering the first n lanes.
2626 185,8,0,0,0, //mov $0x8,%ecx
2627 68,41,193, //sub %r8d,%ecx
2628 192,225,3, //shl $0x3,%cl
2629 72,199,192,255,255,255,255, //mov $0xffffffffffffffff,%rax
2630 72,211,232, //shr %cl,%rax
2631 196,225,249,110,192, //vmovq %rax,%xmm0
2632 196,226,125,33,192, //vpmovsxbd %xmm0,%ymm0
2633 196,194,125,140,25, //vpmaskmovd (%r9),%ymm0,%ymm3
2634 235,138, //jmp 98a <_sk_load_8888_hsw+0x1a>
2635};
2636
// sk_store_8888_hsw: generated x86-64 machine code (AVX2), one instruction per row.
// Multiplies ymm0..ymm3 by the scalar broadcast from 0x8(%rdx), converts each to
// 32-bit integers (vcvtps2dq), shifts them left by 0/8/16/24 bits and ORs the four
// together into ymm8. The packed dwords are written to (base + %rdi*4), where base is
// read through the pointer fetched by lods from (%rsi).
// %rcx is the tail count: 0 stores all eight lanes, non-zero takes the masked path.
// NOTE(review): no clamping happens here, so inputs are presumably already in range
// and the 0x8(%rdx) constant is presumably 255.0f — confirm against the generator.
2637CODE const uint8_t sk_store_8888_hsw[] = {
2638 73,137,200, //mov %rcx,%r8
2639 72,173, //lods %ds:(%rsi),%rax
2640 76,141,12,189,0,0,0,0, //lea 0x0(,%rdi,4),%r9
2641 76,3,8, //add (%rax),%r9
2642 196,98,125,24,66,8, //vbroadcastss 0x8(%rdx),%ymm8
2643 197,60,89,200, //vmulps %ymm0,%ymm8,%ymm9
2644 196,65,125,91,201, //vcvtps2dq %ymm9,%ymm9
2645 197,60,89,209, //vmulps %ymm1,%ymm8,%ymm10
2646 196,65,125,91,210, //vcvtps2dq %ymm10,%ymm10
2647 196,193,45,114,242,8, //vpslld $0x8,%ymm10,%ymm10
2648 196,65,45,235,201, //vpor %ymm9,%ymm10,%ymm9
2649 197,60,89,210, //vmulps %ymm2,%ymm8,%ymm10
2650 196,65,125,91,210, //vcvtps2dq %ymm10,%ymm10
2651 196,193,45,114,242,16, //vpslld $0x10,%ymm10,%ymm10
2652 197,60,89,195, //vmulps %ymm3,%ymm8,%ymm8
2653 196,65,125,91,192, //vcvtps2dq %ymm8,%ymm8
2654 196,193,61,114,240,24, //vpslld $0x18,%ymm8,%ymm8
2655 196,65,45,235,192, //vpor %ymm8,%ymm10,%ymm8
2656 196,65,53,235,192, //vpor %ymm8,%ymm9,%ymm8
2657 77,133,192, //test %r8,%r8
2658 117,12, //jne a6c <_sk_store_8888_hsw+0x6c>
2659 196,65,126,127,1, //vmovdqu %ymm8,(%r9)
2660 72,173, //lods %ds:(%rsi),%rax
2661 76,137,193, //mov %r8,%rcx
2662 255,224, //jmpq *%rax
// Tail path (%r8 != 0): build a lane mask with only the low `tail` lanes set (same
// shift/sign-extend construction as the load stage), masked-store, then jump back to
// the common exit above.
2663 185,8,0,0,0, //mov $0x8,%ecx
2664 68,41,193, //sub %r8d,%ecx
2665 192,225,3, //shl $0x3,%cl
2666 72,199,192,255,255,255,255, //mov $0xffffffffffffffff,%rax
2667 72,211,232, //shr %cl,%rax
2668 196,97,249,110,200, //vmovq %rax,%xmm9
2669 196,66,125,33,201, //vpmovsxbd %xmm9,%ymm9
2670 196,66,53,142,1, //vpmaskmovd %ymm8,%ymm9,(%r9)
2671 235,211, //jmp a65 <_sk_store_8888_hsw+0x65>
2672};
2673
// sk_load_f16_hsw: generated x86-64 machine code (AVX2 + F16C), one instruction per row.
// Reads a base address through the pointer fetched by lods from (%rsi) and loads
// 8-byte records starting at base + %rdi*8. With %rcx == 0 it loads 64 bytes (eight
// records) into xmm1, xmm2, xmm3, xmm8. The 16-bit unpack sequence then de-interleaves
// the four 16-bit fields of each record, and vcvtph2ps widens each field group to
// eight floats: field 0 -> ymm0, field 1 -> ymm1, field 2 -> ymm2, field 3 -> ymm3.
// With %rcx != 0 the tail code loads one 8-byte record at a time, zero-fills the
// registers for the missing records and rejoins the unpack sequence.
// NOTE(review): the fields are presumably half-float r,g,b,a — confirm.
2674CODE const uint8_t sk_load_f16_hsw[] = {
2675 72,173, //lods %ds:(%rsi),%rax
2676 72,139,0, //mov (%rax),%rax
2677 72,133,201, //test %rcx,%rcx
2678 117,97, //jne afd <_sk_load_f16_hsw+0x6b>
2679 197,249,16,12,248, //vmovupd (%rax,%rdi,8),%xmm1
2680 197,249,16,84,248,16, //vmovupd 0x10(%rax,%rdi,8),%xmm2
2681 197,249,16,92,248,32, //vmovupd 0x20(%rax,%rdi,8),%xmm3
2682 197,121,16,68,248,48, //vmovupd 0x30(%rax,%rdi,8),%xmm8
// Common path (+0x21): the tail code below jumps back here once xmm1/xmm2/xmm3/xmm8 are filled.
2683 197,241,97,194, //vpunpcklwd %xmm2,%xmm1,%xmm0
2684 197,241,105,202, //vpunpckhwd %xmm2,%xmm1,%xmm1
2685 196,193,97,97,208, //vpunpcklwd %xmm8,%xmm3,%xmm2
2686 196,193,97,105,216, //vpunpckhwd %xmm8,%xmm3,%xmm3
2687 197,121,97,193, //vpunpcklwd %xmm1,%xmm0,%xmm8
2688 197,121,105,201, //vpunpckhwd %xmm1,%xmm0,%xmm9
2689 197,233,97,203, //vpunpcklwd %xmm3,%xmm2,%xmm1
2690 197,233,105,219, //vpunpckhwd %xmm3,%xmm2,%xmm3
2691 197,185,108,193, //vpunpcklqdq %xmm1,%xmm8,%xmm0
2692 196,226,125,19,192, //vcvtph2ps %xmm0,%ymm0
2693 197,185,109,201, //vpunpckhqdq %xmm1,%xmm8,%xmm1
2694 196,226,125,19,201, //vcvtph2ps %xmm1,%ymm1
2695 197,177,108,211, //vpunpcklqdq %xmm3,%xmm9,%xmm2
2696 196,226,125,19,210, //vcvtph2ps %xmm2,%ymm2
2697 197,177,109,219, //vpunpckhqdq %xmm3,%xmm9,%xmm3
2698 196,226,125,19,219, //vcvtph2ps %xmm3,%ymm3
2699 72,173, //lods %ds:(%rsi),%rax
2700 255,224, //jmpq *%rax
// Tail path (%rcx != 0): record 0 is always loaded and xmm8 is pre-zeroed; each further
// record is loaded only if the count reaches it.
2701 197,251,16,12,248, //vmovsd (%rax,%rdi,8),%xmm1
2702 196,65,57,87,192, //vxorpd %xmm8,%xmm8,%xmm8
2703 72,131,249,1, //cmp $0x1,%rcx
2704 117,6, //jne b13 <_sk_load_f16_hsw+0x81>
2705 197,250,126,201, //vmovq %xmm1,%xmm1
2706 235,30, //jmp b31 <_sk_load_f16_hsw+0x9f>
2707 197,241,22,76,248,8, //vmovhpd 0x8(%rax,%rdi,8),%xmm1,%xmm1
2708 72,131,249,3, //cmp $0x3,%rcx
2709 114,18, //jb b31 <_sk_load_f16_hsw+0x9f>
2710 197,251,16,84,248,16, //vmovsd 0x10(%rax,%rdi,8),%xmm2
2711 72,131,249,3, //cmp $0x3,%rcx
2712 117,19, //jne b3e <_sk_load_f16_hsw+0xac>
2713 197,250,126,210, //vmovq %xmm2,%xmm2
2714 235,46, //jmp b5f <_sk_load_f16_hsw+0xcd>
// One or two records loaded: zero xmm3 and xmm2 before rejoining.
2715 197,225,87,219, //vxorpd %xmm3,%xmm3,%xmm3
2716 197,233,87,210, //vxorpd %xmm2,%xmm2,%xmm2
2717 233,117,255,255,255, //jmpq ab3 <_sk_load_f16_hsw+0x21>
2718 197,233,22,84,248,24, //vmovhpd 0x18(%rax,%rdi,8),%xmm2,%xmm2
2719 72,131,249,5, //cmp $0x5,%rcx
2720 114,21, //jb b5f <_sk_load_f16_hsw+0xcd>
2721 197,251,16,92,248,32, //vmovsd 0x20(%rax,%rdi,8),%xmm3
2722 72,131,249,5, //cmp $0x5,%rcx
2723 117,18, //jne b68 <_sk_load_f16_hsw+0xd6>
2724 197,250,126,219, //vmovq %xmm3,%xmm3
2725 233,84,255,255,255, //jmpq ab3 <_sk_load_f16_hsw+0x21>
// Three or four records loaded: zero xmm3 before rejoining.
2726 197,225,87,219, //vxorpd %xmm3,%xmm3,%xmm3
2727 233,75,255,255,255, //jmpq ab3 <_sk_load_f16_hsw+0x21>
2728 197,225,22,92,248,40, //vmovhpd 0x28(%rax,%rdi,8),%xmm3,%xmm3
2729 72,131,249,7, //cmp $0x7,%rcx
2730 15,130,59,255,255,255, //jb ab3 <_sk_load_f16_hsw+0x21>
2731 197,123,16,68,248,48, //vmovsd 0x30(%rax,%rdi,8),%xmm8
2732 233,48,255,255,255, //jmpq ab3 <_sk_load_f16_hsw+0x21>
2733};
2734
// sk_store_f16_hsw: generated x86-64 machine code (AVX2 + F16C), one instruction per row.
// Converts ymm0..ymm3 to 16-bit halves with vcvtps2ph (immediate 0x4), interleaves the
// four results so that each 8-byte record holds one 16-bit value from each source
// register, and writes the records to base + %rdi*8, where base is read through the
// pointer fetched by lods from (%rsi).
// %rcx is the tail count: 0 stores all eight records (64 bytes); otherwise records are
// stored one at a time (vmovq / vmovhpd) until the count is exhausted.
// The `je` branches after vmovq reuse the flags of the preceding cmp; the intervening
// vector store does not modify flags.
2735CODE const uint8_t sk_store_f16_hsw[] = {
2736 72,173, //lods %ds:(%rsi),%rax
2737 72,139,0, //mov (%rax),%rax
2738 196,195,125,29,192,4, //vcvtps2ph $0x4,%ymm0,%xmm8
2739 196,195,125,29,201,4, //vcvtps2ph $0x4,%ymm1,%xmm9
2740 196,195,125,29,210,4, //vcvtps2ph $0x4,%ymm2,%xmm10
2741 196,195,125,29,219,4, //vcvtps2ph $0x4,%ymm3,%xmm11
2742 196,65,57,97,225, //vpunpcklwd %xmm9,%xmm8,%xmm12
2743 196,65,57,105,193, //vpunpckhwd %xmm9,%xmm8,%xmm8
2744 196,65,41,97,203, //vpunpcklwd %xmm11,%xmm10,%xmm9
2745 196,65,41,105,235, //vpunpckhwd %xmm11,%xmm10,%xmm13
2746 196,65,25,98,217, //vpunpckldq %xmm9,%xmm12,%xmm11
2747 196,65,25,106,209, //vpunpckhdq %xmm9,%xmm12,%xmm10
2748 196,65,57,98,205, //vpunpckldq %xmm13,%xmm8,%xmm9
2749 196,65,57,106,197, //vpunpckhdq %xmm13,%xmm8,%xmm8
2750 72,133,201, //test %rcx,%rcx
2751 117,27, //jne be8 <_sk_store_f16_hsw+0x65>
2752 197,120,17,28,248, //vmovups %xmm11,(%rax,%rdi,8)
2753 197,120,17,84,248,16, //vmovups %xmm10,0x10(%rax,%rdi,8)
2754 197,120,17,76,248,32, //vmovups %xmm9,0x20(%rax,%rdi,8)
2755 197,122,127,68,248,48, //vmovdqu %xmm8,0x30(%rax,%rdi,8)
2756 72,173, //lods %ds:(%rsi),%rax
2757 255,224, //jmpq *%rax
// Tail path (%rcx != 0): record 0 is always written; every early-out branches back to
// the common exit at +0x61.
2758 197,121,214,28,248, //vmovq %xmm11,(%rax,%rdi,8)
2759 72,131,249,1, //cmp $0x1,%rcx
2760 116,241, //je be4 <_sk_store_f16_hsw+0x61>
2761 197,121,23,92,248,8, //vmovhpd %xmm11,0x8(%rax,%rdi,8)
2762 72,131,249,3, //cmp $0x3,%rcx
2763 114,229, //jb be4 <_sk_store_f16_hsw+0x61>
2764 197,121,214,84,248,16, //vmovq %xmm10,0x10(%rax,%rdi,8)
2765 116,221, //je be4 <_sk_store_f16_hsw+0x61>
2766 197,121,23,84,248,24, //vmovhpd %xmm10,0x18(%rax,%rdi,8)
2767 72,131,249,5, //cmp $0x5,%rcx
2768 114,209, //jb be4 <_sk_store_f16_hsw+0x61>
2769 197,121,214,76,248,32, //vmovq %xmm9,0x20(%rax,%rdi,8)
2770 116,201, //je be4 <_sk_store_f16_hsw+0x61>
2771 197,121,23,76,248,40, //vmovhpd %xmm9,0x28(%rax,%rdi,8)
2772 72,131,249,7, //cmp $0x7,%rcx
2773 114,189, //jb be4 <_sk_store_f16_hsw+0x61>
2774 197,121,214,68,248,48, //vmovq %xmm8,0x30(%rax,%rdi,8)
2775 235,181, //jmp be4 <_sk_store_f16_hsw+0x61>
2776};
2777
// sk_store_f32_hsw: generated x86-64 machine code (AVX), one instruction per row.
// Interleaves ymm0..ymm3 (unpcklps/unpckhps, then unpcklpd/unpckhpd) so that each
// 16-byte record holds one float from each source register, and writes the records to
// base + %rdi*16. The address is formed as %r8 + %rax*4 with %rax = %rdi*4; the base in
// %r8 is read through the pointer fetched by lods from (%rsi).
// %rcx is the tail count: 0 stores eight records (128 bytes) after recombining 128-bit
// halves; otherwise records are stored one 16-byte piece at a time.
// The `je` branches after a store reuse the flags of the preceding cmp.
2778CODE const uint8_t sk_store_f32_hsw[] = {
2779 72,173, //lods %ds:(%rsi),%rax
2780 76,139,0, //mov (%rax),%r8
2781 72,141,4,189,0,0,0,0, //lea 0x0(,%rdi,4),%rax
2782 197,124,20,193, //vunpcklps %ymm1,%ymm0,%ymm8
2783 197,124,21,217, //vunpckhps %ymm1,%ymm0,%ymm11
2784 197,108,20,203, //vunpcklps %ymm3,%ymm2,%ymm9
2785 197,108,21,227, //vunpckhps %ymm3,%ymm2,%ymm12
2786 196,65,61,20,209, //vunpcklpd %ymm9,%ymm8,%ymm10
2787 196,65,61,21,201, //vunpckhpd %ymm9,%ymm8,%ymm9
2788 196,65,37,20,196, //vunpcklpd %ymm12,%ymm11,%ymm8
2789 196,65,37,21,220, //vunpckhpd %ymm12,%ymm11,%ymm11
2790 72,133,201, //test %rcx,%rcx
2791 117,55, //jne c9c <_sk_store_f32_hsw+0x6d>
2792 196,67,45,24,225,1, //vinsertf128 $0x1,%xmm9,%ymm10,%ymm12
2793 196,67,61,24,235,1, //vinsertf128 $0x1,%xmm11,%ymm8,%ymm13
2794 196,67,45,6,201,49, //vperm2f128 $0x31,%ymm9,%ymm10,%ymm9
2795 196,67,61,6,195,49, //vperm2f128 $0x31,%ymm11,%ymm8,%ymm8
2796 196,65,125,17,36,128, //vmovupd %ymm12,(%r8,%rax,4)
2797 196,65,125,17,108,128,32, //vmovupd %ymm13,0x20(%r8,%rax,4)
2798 196,65,125,17,76,128,64, //vmovupd %ymm9,0x40(%r8,%rax,4)
2799 196,65,125,17,68,128,96, //vmovupd %ymm8,0x60(%r8,%rax,4)
2800 72,173, //lods %ds:(%rsi),%rax
2801 255,224, //jmpq *%rax
// Tail path (%rcx != 0): records 0-3 come from the low 128-bit halves, records 4-6 from
// the high halves via vextractf128; early-outs branch back to the exit at +0x69.
2802 196,65,121,17,20,128, //vmovupd %xmm10,(%r8,%rax,4)
2803 72,131,249,1, //cmp $0x1,%rcx
2804 116,240, //je c98 <_sk_store_f32_hsw+0x69>
2805 196,65,121,17,76,128,16, //vmovupd %xmm9,0x10(%r8,%rax,4)
2806 72,131,249,3, //cmp $0x3,%rcx
2807 114,227, //jb c98 <_sk_store_f32_hsw+0x69>
2808 196,65,121,17,68,128,32, //vmovupd %xmm8,0x20(%r8,%rax,4)
2809 116,218, //je c98 <_sk_store_f32_hsw+0x69>
2810 196,65,121,17,92,128,48, //vmovupd %xmm11,0x30(%r8,%rax,4)
2811 72,131,249,5, //cmp $0x5,%rcx
2812 114,205, //jb c98 <_sk_store_f32_hsw+0x69>
2813 196,67,125,25,84,128,64,1, //vextractf128 $0x1,%ymm10,0x40(%r8,%rax,4)
2814 116,195, //je c98 <_sk_store_f32_hsw+0x69>
2815 196,67,125,25,76,128,80,1, //vextractf128 $0x1,%ymm9,0x50(%r8,%rax,4)
2816 72,131,249,7, //cmp $0x7,%rcx
2817 114,181, //jb c98 <_sk_store_f32_hsw+0x69>
2818 196,67,125,25,68,128,96,1, //vextractf128 $0x1,%ymm8,0x60(%r8,%rax,4)
2819 235,171, //jmp c98 <_sk_store_f32_hsw+0x69>
2820};
2821
// sk_clamp_x_hsw: generated x86-64 machine code (AVX2), one instruction per row.
// ymm0 = min(max(0, ymm0), L), where L is the 32-bit value broadcast from the pointer
// fetched by lods, with 1 subtracted from its bit pattern (vpaddd with all-ones).
// NOTE(review): for a positive float limit that integer decrement yields the next
// smaller float, presumably to keep the result strictly below the limit — confirm.
2822CODE const uint8_t sk_clamp_x_hsw[] = {
2823 72,173, //lods %ds:(%rsi),%rax
2824 196,65,60,87,192, //vxorps %ymm8,%ymm8,%ymm8
2825 197,188,95,192, //vmaxps %ymm0,%ymm8,%ymm0
2826 196,98,125,88,0, //vpbroadcastd (%rax),%ymm8
2827 196,65,53,118,201, //vpcmpeqd %ymm9,%ymm9,%ymm9
2828 196,65,61,254,193, //vpaddd %ymm9,%ymm8,%ymm8
2829 196,193,124,93,192, //vminps %ymm8,%ymm0,%ymm0
2830 72,173, //lods %ds:(%rsi),%rax
2831 255,224, //jmpq *%rax
2832};
2833
// sk_clamp_y_hsw: generated x86-64 machine code (AVX2), one instruction per row.
// Same sequence as sk_clamp_x_hsw but applied to ymm1: ymm1 = min(max(0, ymm1), L),
// where L is the 32-bit value broadcast from the pointer fetched by lods, with 1
// subtracted from its bit pattern (vpaddd with all-ones).
2834CODE const uint8_t sk_clamp_y_hsw[] = {
2835 72,173, //lods %ds:(%rsi),%rax
2836 196,65,60,87,192, //vxorps %ymm8,%ymm8,%ymm8
2837 197,188,95,201, //vmaxps %ymm1,%ymm8,%ymm1
2838 196,98,125,88,0, //vpbroadcastd (%rax),%ymm8
2839 196,65,53,118,201, //vpcmpeqd %ymm9,%ymm9,%ymm9
2840 196,65,61,254,193, //vpaddd %ymm9,%ymm8,%ymm8
2841 196,193,116,93,200, //vminps %ymm8,%ymm1,%ymm1
2842 72,173, //lods %ds:(%rsi),%rax
2843 255,224, //jmpq *%rax
2844};
2845
// sk_repeat_x_hsw: generated x86-64 machine code (AVX2 + FMA), one instruction per row.
// With v = the float broadcast from the pointer fetched by lods:
//   ymm9 = ymm0 - v * floor(ymm0 / v)   (vroundps $1 rounds toward -inf; vfnmadd213ps)
//   ymm0 = min(ymm9, L)                 (L = v with 1 subtracted from its bit pattern)
// The original ymm0 is consumed before ymm0 is reused to build the all-ones constant.
2846CODE const uint8_t sk_repeat_x_hsw[] = {
2847 72,173, //lods %ds:(%rsi),%rax
2848 196,98,125,24,0, //vbroadcastss (%rax),%ymm8
2849 196,65,124,94,200, //vdivps %ymm8,%ymm0,%ymm9
2850 196,67,125,8,201,1, //vroundps $0x1,%ymm9,%ymm9
2851 196,98,61,172,200, //vfnmadd213ps %ymm0,%ymm8,%ymm9
2852 197,253,118,192, //vpcmpeqd %ymm0,%ymm0,%ymm0
2853 197,189,254,192, //vpaddd %ymm0,%ymm8,%ymm0
2854 197,180,93,192, //vminps %ymm0,%ymm9,%ymm0
2855 72,173, //lods %ds:(%rsi),%rax
2856 255,224, //jmpq *%rax
2857};
2858
// sk_repeat_y_hsw: generated x86-64 machine code (AVX2 + FMA), one instruction per row.
// Same sequence as sk_repeat_x_hsw but applied to ymm1. With v = the float broadcast
// from the pointer fetched by lods:
//   ymm9 = ymm1 - v * floor(ymm1 / v)
//   ymm1 = min(ymm9, L)                 (L = v with 1 subtracted from its bit pattern)
2859CODE const uint8_t sk_repeat_y_hsw[] = {
2860 72,173, //lods %ds:(%rsi),%rax
2861 196,98,125,24,0, //vbroadcastss (%rax),%ymm8
2862 196,65,116,94,200, //vdivps %ymm8,%ymm1,%ymm9
2863 196,67,125,8,201,1, //vroundps $0x1,%ymm9,%ymm9
2864 196,98,61,172,201, //vfnmadd213ps %ymm1,%ymm8,%ymm9
2865 197,245,118,201, //vpcmpeqd %ymm1,%ymm1,%ymm1
2866 197,189,254,201, //vpaddd %ymm1,%ymm8,%ymm1
2867 197,180,93,201, //vminps %ymm1,%ymm9,%ymm1
2868 72,173, //lods %ds:(%rsi),%rax
2869 255,224, //jmpq *%rax
2870};
2871
// sk_mirror_x_hsw: generated x86-64 machine code (AVX2 + FMA), one instruction per row.
// With v = the float read from the pointer fetched by lods and x = incoming ymm0:
//   t    = x - v
//   t    = t - 2v * floor(t / 2v)       (vroundps $1 rounds toward -inf; vfnmadd213ps)
//   ymm0 = |t - v|                      (computed as (0 - u) AND u, which clears the sign bit)
//   ymm0 = min(ymm0, L)                 (L = v with 1 subtracted from its bit pattern)
2872CODE const uint8_t sk_mirror_x_hsw[] = {
2873 72,173, //lods %ds:(%rsi),%rax
2874 197,122,16,0, //vmovss (%rax),%xmm8
2875 196,66,125,24,200, //vbroadcastss %xmm8,%ymm9
2876 196,65,124,92,209, //vsubps %ymm9,%ymm0,%ymm10
2877 196,193,58,88,192, //vaddss %xmm8,%xmm8,%xmm0
2878 196,226,125,24,192, //vbroadcastss %xmm0,%ymm0
2879 197,44,94,192, //vdivps %ymm0,%ymm10,%ymm8
2880 196,67,125,8,192,1, //vroundps $0x1,%ymm8,%ymm8
2881 196,66,125,172,194, //vfnmadd213ps %ymm10,%ymm0,%ymm8
2882 196,193,60,92,193, //vsubps %ymm9,%ymm8,%ymm0
2883 196,65,60,87,192, //vxorps %ymm8,%ymm8,%ymm8
2884 197,60,92,192, //vsubps %ymm0,%ymm8,%ymm8
2885 197,188,84,192, //vandps %ymm0,%ymm8,%ymm0
2886 196,65,61,118,192, //vpcmpeqd %ymm8,%ymm8,%ymm8
2887 196,65,53,254,192, //vpaddd %ymm8,%ymm9,%ymm8
2888 196,193,124,93,192, //vminps %ymm8,%ymm0,%ymm0
2889 72,173, //lods %ds:(%rsi),%rax
2890 255,224, //jmpq *%rax
2891};
2892
// sk_mirror_y_hsw: generated x86-64 machine code (AVX2 + FMA), one instruction per row.
// Same sequence as sk_mirror_x_hsw but applied to ymm1. With v = the float read from
// the pointer fetched by lods and y = incoming ymm1:
//   t    = y - v
//   t    = t - 2v * floor(t / 2v)
//   ymm1 = |t - v|                      (computed as (0 - u) AND u, which clears the sign bit)
//   ymm1 = min(ymm1, L)                 (L = v with 1 subtracted from its bit pattern)
2893CODE const uint8_t sk_mirror_y_hsw[] = {
2894 72,173, //lods %ds:(%rsi),%rax
2895 197,122,16,0, //vmovss (%rax),%xmm8
2896 196,66,125,24,200, //vbroadcastss %xmm8,%ymm9
2897 196,65,116,92,209, //vsubps %ymm9,%ymm1,%ymm10
2898 196,193,58,88,200, //vaddss %xmm8,%xmm8,%xmm1
2899 196,226,125,24,201, //vbroadcastss %xmm1,%ymm1
2900 197,44,94,193, //vdivps %ymm1,%ymm10,%ymm8
2901 196,67,125,8,192,1, //vroundps $0x1,%ymm8,%ymm8
2902 196,66,117,172,194, //vfnmadd213ps %ymm10,%ymm1,%ymm8
2903 196,193,60,92,201, //vsubps %ymm9,%ymm8,%ymm1
2904 196,65,60,87,192, //vxorps %ymm8,%ymm8,%ymm8
2905 197,60,92,193, //vsubps %ymm1,%ymm8,%ymm8
2906 197,188,84,201, //vandps %ymm1,%ymm8,%ymm1
2907 196,65,61,118,192, //vpcmpeqd %ymm8,%ymm8,%ymm8
2908 196,65,53,254,192, //vpaddd %ymm8,%ymm9,%ymm8
2909 196,193,116,93,200, //vminps %ymm8,%ymm1,%ymm1
2910 72,173, //lods %ds:(%rsi),%rax
2911 255,224, //jmpq *%rax
2912};
2913
// sk_matrix_2x3_hsw: generated x86-64 machine code (AVX2 + FMA), one instruction per row.
// With m = the float array at the pointer fetched by lods, x = ymm0, y = ymm1:
//   ymm0 = m[0]*x + m[2]*y + m[4]
//   ymm1 = m[1]*x + m[3]*y + m[5]
// Both results are accumulated in ymm8/ymm9 first so the inputs stay intact until the
// final moves.
2914CODE const uint8_t sk_matrix_2x3_hsw[] = {
2915 72,173, //lods %ds:(%rsi),%rax
2916 196,98,125,24,8, //vbroadcastss (%rax),%ymm9
2917 196,98,125,24,80,8, //vbroadcastss 0x8(%rax),%ymm10
2918 196,98,125,24,64,16, //vbroadcastss 0x10(%rax),%ymm8
2919 196,66,117,184,194, //vfmadd231ps %ymm10,%ymm1,%ymm8
2920 196,66,125,184,193, //vfmadd231ps %ymm9,%ymm0,%ymm8
2921 196,98,125,24,80,4, //vbroadcastss 0x4(%rax),%ymm10
2922 196,98,125,24,88,12, //vbroadcastss 0xc(%rax),%ymm11
2923 196,98,125,24,72,20, //vbroadcastss 0x14(%rax),%ymm9
2924 196,66,117,184,203, //vfmadd231ps %ymm11,%ymm1,%ymm9
2925 196,66,125,184,202, //vfmadd231ps %ymm10,%ymm0,%ymm9
2926 72,173, //lods %ds:(%rsi),%rax
2927 197,124,41,192, //vmovaps %ymm8,%ymm0
2928 197,124,41,201, //vmovaps %ymm9,%ymm1
2929 255,224, //jmpq *%rax
2930};
2931
// sk_matrix_3x4_hsw: generated x86-64 machine code (AVX2 + FMA), one instruction per row.
// With m = the float array at the pointer fetched by lods and a,b,c = ymm0,ymm1,ymm2:
//   ymm0 = m[0]*a + m[3]*b + m[6]*c + m[9]
//   ymm1 = m[1]*a + m[4]*b + m[7]*c + m[10]
//   ymm2 = m[2]*a + m[5]*b + m[8]*c + m[11]
// Results are accumulated in ymm8/ymm9/ymm10 and moved into place at the end; ymm3 is
// not touched.
2932CODE const uint8_t sk_matrix_3x4_hsw[] = {
2933 72,173, //lods %ds:(%rsi),%rax
2934 196,98,125,24,8, //vbroadcastss (%rax),%ymm9
2935 196,98,125,24,80,12, //vbroadcastss 0xc(%rax),%ymm10
2936 196,98,125,24,88,24, //vbroadcastss 0x18(%rax),%ymm11
2937 196,98,125,24,64,36, //vbroadcastss 0x24(%rax),%ymm8
2938 196,66,109,184,195, //vfmadd231ps %ymm11,%ymm2,%ymm8
2939 196,66,117,184,194, //vfmadd231ps %ymm10,%ymm1,%ymm8
2940 196,66,125,184,193, //vfmadd231ps %ymm9,%ymm0,%ymm8
2941 196,98,125,24,80,4, //vbroadcastss 0x4(%rax),%ymm10
2942 196,98,125,24,88,16, //vbroadcastss 0x10(%rax),%ymm11
2943 196,98,125,24,96,28, //vbroadcastss 0x1c(%rax),%ymm12
2944 196,98,125,24,72,40, //vbroadcastss 0x28(%rax),%ymm9
2945 196,66,109,184,204, //vfmadd231ps %ymm12,%ymm2,%ymm9
2946 196,66,117,184,203, //vfmadd231ps %ymm11,%ymm1,%ymm9
2947 196,66,125,184,202, //vfmadd231ps %ymm10,%ymm0,%ymm9
2948 196,98,125,24,88,8, //vbroadcastss 0x8(%rax),%ymm11
2949 196,98,125,24,96,20, //vbroadcastss 0x14(%rax),%ymm12
2950 196,98,125,24,104,32, //vbroadcastss 0x20(%rax),%ymm13
2951 196,98,125,24,80,44, //vbroadcastss 0x2c(%rax),%ymm10
2952 196,66,109,184,213, //vfmadd231ps %ymm13,%ymm2,%ymm10
2953 196,66,117,184,212, //vfmadd231ps %ymm12,%ymm1,%ymm10
2954 196,66,125,184,211, //vfmadd231ps %ymm11,%ymm0,%ymm10
2955 72,173, //lods %ds:(%rsi),%rax
2956 197,124,41,192, //vmovaps %ymm8,%ymm0
2957 197,124,41,201, //vmovaps %ymm9,%ymm1
2958 197,124,41,210, //vmovaps %ymm10,%ymm2
2959 255,224, //jmpq *%rax
2960};
2961
// sk_matrix_perspective_hsw: generated x86-64 machine code (AVX2 + FMA), one instruction per row.
// With m = the float array at the pointer fetched by lods, x = ymm0, y = ymm1:
//   X = m[0]*x + m[1]*y + m[2]
//   Y = m[3]*x + m[4]*y + m[5]
//   W = m[6]*x + m[7]*y + m[8]
//   ymm0 = X * rcp(W),  ymm1 = Y * rcp(W)
// rcp is vrcpps, an approximate reciprocal, so the divide is not exact.
2962CODE const uint8_t sk_matrix_perspective_hsw[] = {
2963 72,173, //lods %ds:(%rsi),%rax
2964 196,98,125,24,0, //vbroadcastss (%rax),%ymm8
2965 196,98,125,24,72,4, //vbroadcastss 0x4(%rax),%ymm9
2966 196,98,125,24,80,8, //vbroadcastss 0x8(%rax),%ymm10
2967 196,66,117,184,209, //vfmadd231ps %ymm9,%ymm1,%ymm10
2968 196,66,125,184,208, //vfmadd231ps %ymm8,%ymm0,%ymm10
2969 196,98,125,24,64,12, //vbroadcastss 0xc(%rax),%ymm8
2970 196,98,125,24,72,16, //vbroadcastss 0x10(%rax),%ymm9
2971 196,98,125,24,88,20, //vbroadcastss 0x14(%rax),%ymm11
2972 196,66,117,184,217, //vfmadd231ps %ymm9,%ymm1,%ymm11
2973 196,66,125,184,216, //vfmadd231ps %ymm8,%ymm0,%ymm11
2974 196,98,125,24,64,24, //vbroadcastss 0x18(%rax),%ymm8
2975 196,98,125,24,72,28, //vbroadcastss 0x1c(%rax),%ymm9
2976 196,98,125,24,96,32, //vbroadcastss 0x20(%rax),%ymm12
2977 196,66,117,184,225, //vfmadd231ps %ymm9,%ymm1,%ymm12
2978 196,66,125,184,224, //vfmadd231ps %ymm8,%ymm0,%ymm12
2979 196,193,124,83,204, //vrcpps %ymm12,%ymm1
2980 197,172,89,193, //vmulps %ymm1,%ymm10,%ymm0
2981 197,164,89,201, //vmulps %ymm1,%ymm11,%ymm1
2982 72,173, //lods %ds:(%rsi),%rax
2983 255,224, //jmpq *%rax
2984};
2985
// sk_linear_gradient_2stops_hsw: generated x86-64 machine code (AVX2 + FMA), one instruction per row.
// With c = the float array at the pointer fetched by lods and t = incoming ymm0:
//   ymm0 = c[0] + t*c[4]
//   ymm1 = c[1] + t*c[5]
//   ymm2 = c[2] + t*c[6]
//   ymm3 = c[3] + t*c[7]
// The first result is held in ymm8 until the end because t (ymm0) feeds all four FMAs.
// NOTE(review): presumably c[0..3] is the start colour and c[4..7] the per-channel
// delta — confirm against the code that fills this context.
2986CODE const uint8_t sk_linear_gradient_2stops_hsw[] = {
2987 72,173, //lods %ds:(%rsi),%rax
2988 196,226,125,24,72,16, //vbroadcastss 0x10(%rax),%ymm1
2989 196,98,125,24,0, //vbroadcastss (%rax),%ymm8
2990 196,98,125,184,193, //vfmadd231ps %ymm1,%ymm0,%ymm8
2991 196,226,125,24,80,20, //vbroadcastss 0x14(%rax),%ymm2
2992 196,226,125,24,72,4, //vbroadcastss 0x4(%rax),%ymm1
2993 196,226,125,184,202, //vfmadd231ps %ymm2,%ymm0,%ymm1
2994 196,226,125,24,88,24, //vbroadcastss 0x18(%rax),%ymm3
2995 196,226,125,24,80,8, //vbroadcastss 0x8(%rax),%ymm2
2996 196,226,125,184,211, //vfmadd231ps %ymm3,%ymm0,%ymm2
2997 196,98,125,24,72,28, //vbroadcastss 0x1c(%rax),%ymm9
2998 196,226,125,24,88,12, //vbroadcastss 0xc(%rax),%ymm3
2999 196,194,125,184,217, //vfmadd231ps %ymm9,%ymm0,%ymm3
3000 72,173, //lods %ds:(%rsi),%rax
3001 197,124,41,192, //vmovaps %ymm8,%ymm0
3002 255,224, //jmpq *%rax
3003};
3004
// sk_start_pipeline_avx: generated x86-64 machine code (AVX), one instruction per row.
// Driver loop for the stage arrays. On entry it saves r15,r14,r13,r12,rbx and keeps:
//   rbx = %rdi (current index), r13 = %rcx (limit), r14 = %rdx,
//   r15 = first 8-byte word read from (%rsi) (the call target), r12 = %rsi advanced by 8.
// While index + 8 <= limit (unsigned compare): zero ymm0-ymm7, call *r15 with
// rdi = index, rsi = r12, rdx = r14, rcx = 0, then advance the index by 8.
// Afterwards, if limit - index is non-zero, it makes one more call with that remainder
// in rcx. Returns the limit in rax, restores the saved registers and executes
// vzeroupper before ret.
// NOTE(review): the argument registers match the System V AMD64 order
// (rdi, rsi, rdx, rcx); the Windows x64 convention is not handled in this array.
3005CODE const uint8_t sk_start_pipeline_avx[] = {
3006 65,87, //push %r15
3007 65,86, //push %r14
3008 65,85, //push %r13
3009 65,84, //push %r12
3010 83, //push %rbx
3011 73,137,205, //mov %rcx,%r13
3012 73,137,214, //mov %rdx,%r14
3013 72,137,251, //mov %rdi,%rbx
3014 72,173, //lods %ds:(%rsi),%rax
3015 73,137,199, //mov %rax,%r15
3016 73,137,244, //mov %rsi,%r12
3017 72,141,67,8, //lea 0x8(%rbx),%rax
3018 76,57,232, //cmp %r13,%rax
3019 118,5, //jbe 28 <_sk_start_pipeline_avx+0x28>
3020 72,137,223, //mov %rbx,%rdi
3021 235,65, //jmp 69 <_sk_start_pipeline_avx+0x69>
// Full-width loop body (+0x28): rcx = 0 marks a full 8-lane call.
3022 185,0,0,0,0, //mov $0x0,%ecx
3023 197,252,87,192, //vxorps %ymm0,%ymm0,%ymm0
3024 197,244,87,201, //vxorps %ymm1,%ymm1,%ymm1
3025 197,236,87,210, //vxorps %ymm2,%ymm2,%ymm2
3026 197,228,87,219, //vxorps %ymm3,%ymm3,%ymm3
3027 197,220,87,228, //vxorps %ymm4,%ymm4,%ymm4
3028 197,212,87,237, //vxorps %ymm5,%ymm5,%ymm5
3029 197,204,87,246, //vxorps %ymm6,%ymm6,%ymm6
3030 197,196,87,255, //vxorps %ymm7,%ymm7,%ymm7
3031 72,137,223, //mov %rbx,%rdi
3032 76,137,230, //mov %r12,%rsi
3033 76,137,242, //mov %r14,%rdx
3034 65,255,215, //callq *%r15
3035 72,141,123,8, //lea 0x8(%rbx),%rdi
3036 72,131,195,16, //add $0x10,%rbx
3037 76,57,235, //cmp %r13,%rbx
3038 72,137,251, //mov %rdi,%rbx
3039 118,191, //jbe 28 <_sk_start_pipeline_avx+0x28>
// Remainder (+0x69): rcx = limit - index; skipped entirely when it is zero.
3040 76,137,233, //mov %r13,%rcx
3041 72,41,249, //sub %rdi,%rcx
3042 116,41, //je 9a <_sk_start_pipeline_avx+0x9a>
3043 197,252,87,192, //vxorps %ymm0,%ymm0,%ymm0
3044 197,244,87,201, //vxorps %ymm1,%ymm1,%ymm1
3045 197,236,87,210, //vxorps %ymm2,%ymm2,%ymm2
3046 197,228,87,219, //vxorps %ymm3,%ymm3,%ymm3
3047 197,220,87,228, //vxorps %ymm4,%ymm4,%ymm4
3048 197,212,87,237, //vxorps %ymm5,%ymm5,%ymm5
3049 197,204,87,246, //vxorps %ymm6,%ymm6,%ymm6
3050 197,196,87,255, //vxorps %ymm7,%ymm7,%ymm7
3051 76,137,230, //mov %r12,%rsi
3052 76,137,242, //mov %r14,%rdx
3053 65,255,215, //callq *%r15
// Epilogue (+0x9a).
3054 76,137,232, //mov %r13,%rax
3055 91, //pop %rbx
3056 65,92, //pop %r12
3057 65,93, //pop %r13
3058 65,94, //pop %r14
3059 65,95, //pop %r15
3060 197,248,119, //vzeroupper
3061 195, //retq
3062};
3063
// sk_just_return_avx: a single `ret` instruction. Unlike the other stage arrays it does
// not fetch a next pointer and jump; it returns to whoever made the call.
3064CODE const uint8_t sk_just_return_avx[] = {
3065 195, //retq
3066};
3067
// sk_seed_shader_avx: generated x86-64 machine code (AVX), one instruction per row.
//   ymm0 = float(%edi) in every lane + broadcast 0x4(%rdx) + the eight floats at 0x14(%rdx)
//   ymm1 = float(32-bit integer at the pointer fetched by lods) + broadcast 0x4(%rdx)
//   ymm2 = broadcast of the float at (%rdx)
//   ymm3..ymm7 = 0
// NOTE(review): presumably 0x4(%rdx) is 0.5f, 0x14(%rdx) is the ramp {0,1,...,7} and
// (%rdx) is 1.0f, making ymm0/ymm1 pixel-centre coordinates — the constant values are
// not visible in this array; confirm against the constants struct.
3068CODE const uint8_t sk_seed_shader_avx[] = {
3069 72,173, //lods %ds:(%rsi),%rax
3070 197,249,110,199, //vmovd %edi,%xmm0
3071 197,249,112,192,0, //vpshufd $0x0,%xmm0,%xmm0
3072 196,227,125,24,192,1, //vinsertf128 $0x1,%xmm0,%ymm0,%ymm0
3073 197,252,91,192, //vcvtdq2ps %ymm0,%ymm0
3074 196,226,125,24,74,4, //vbroadcastss 0x4(%rdx),%ymm1
3075 197,252,88,193, //vaddps %ymm1,%ymm0,%ymm0
3076 197,252,88,66,20, //vaddps 0x14(%rdx),%ymm0,%ymm0
3077 196,226,125,24,16, //vbroadcastss (%rax),%ymm2
3078 197,252,91,210, //vcvtdq2ps %ymm2,%ymm2
3079 197,236,88,201, //vaddps %ymm1,%ymm2,%ymm1
3080 196,226,125,24,18, //vbroadcastss (%rdx),%ymm2
3081 72,173, //lods %ds:(%rsi),%rax
3082 197,228,87,219, //vxorps %ymm3,%ymm3,%ymm3
3083 197,220,87,228, //vxorps %ymm4,%ymm4,%ymm4
3084 197,212,87,237, //vxorps %ymm5,%ymm5,%ymm5
3085 197,204,87,246, //vxorps %ymm6,%ymm6,%ymm6
3086 197,196,87,255, //vxorps %ymm7,%ymm7,%ymm7
3087 255,224, //jmpq *%rax
3088};
3089
// sk_constant_color_avx: generated x86-64 machine code (AVX), one instruction per row.
// Broadcasts the four consecutive floats at the pointer fetched by lods into
// ymm0, ymm1, ymm2, ymm3 (offsets 0, 4, 8, 12), then jumps to the next pointer.
3090CODE const uint8_t sk_constant_color_avx[] = {
3091 72,173, //lods %ds:(%rsi),%rax
3092 196,226,125,24,0, //vbroadcastss (%rax),%ymm0
3093 196,226,125,24,72,4, //vbroadcastss 0x4(%rax),%ymm1
3094 196,226,125,24,80,8, //vbroadcastss 0x8(%rax),%ymm2
3095 196,226,125,24,88,12, //vbroadcastss 0xc(%rax),%ymm3
3096 72,173, //lods %ds:(%rsi),%rax
3097 255,224, //jmpq *%rax
3098};
3099
// sk_clear_avx: generated x86-64 machine code (AVX), one instruction per row.
// Zeroes ymm0..ymm3. The single lods at the top fetches the jump target used at the
// end; this array reads nothing else from (%rsi).
3100CODE const uint8_t sk_clear_avx[] = {
3101 72,173, //lods %ds:(%rsi),%rax
3102 197,252,87,192, //vxorps %ymm0,%ymm0,%ymm0
3103 197,244,87,201, //vxorps %ymm1,%ymm1,%ymm1
3104 197,236,87,210, //vxorps %ymm2,%ymm2,%ymm2
3105 197,228,87,219, //vxorps %ymm3,%ymm3,%ymm3
3106 255,224, //jmpq *%rax
3107};
3108
// sk_plus__avx: generated x86-64 machine code (AVX), one instruction per row.
// Lane-wise add: ymm0 += ymm4, ymm1 += ymm5, ymm2 += ymm6, ymm3 += ymm7.
// No clamping is applied to the sums here.
3109CODE const uint8_t sk_plus__avx[] = {
3110 197,252,88,196, //vaddps %ymm4,%ymm0,%ymm0
3111 197,244,88,205, //vaddps %ymm5,%ymm1,%ymm1
3112 197,236,88,214, //vaddps %ymm6,%ymm2,%ymm2
3113 197,228,88,223, //vaddps %ymm7,%ymm3,%ymm3
3114 72,173, //lods %ds:(%rsi),%rax
3115 255,224, //jmpq *%rax
3116};
3117
// sk_srcover_avx: generated x86-64 machine code (AVX), one instruction per row.
// With k = (float broadcast from (%rdx)) - ymm3:
//   ymm0 = ymm4*k + ymm0,  ymm1 = ymm5*k + ymm1,
//   ymm2 = ymm6*k + ymm2,  ymm3 = ymm7*k + ymm3
// k is computed from the incoming ymm3 before ymm3 itself is updated.
// NOTE(review): (%rdx) is presumably 1.0f, ymm0-3 the source r,g,b,a and ymm4-7 the
// destination — confirm.
3118CODE const uint8_t sk_srcover_avx[] = {
3119 196,98,125,24,2, //vbroadcastss (%rdx),%ymm8
3120 197,60,92,195, //vsubps %ymm3,%ymm8,%ymm8
3121 197,60,89,204, //vmulps %ymm4,%ymm8,%ymm9
3122 197,180,88,192, //vaddps %ymm0,%ymm9,%ymm0
3123 197,60,89,205, //vmulps %ymm5,%ymm8,%ymm9
3124 197,180,88,201, //vaddps %ymm1,%ymm9,%ymm1
3125 197,60,89,206, //vmulps %ymm6,%ymm8,%ymm9
3126 197,180,88,210, //vaddps %ymm2,%ymm9,%ymm2
3127 197,60,89,199, //vmulps %ymm7,%ymm8,%ymm8
3128 197,188,88,219, //vaddps %ymm3,%ymm8,%ymm3
3129 72,173, //lods %ds:(%rsi),%rax
3130 255,224, //jmpq *%rax
3131};
3132
// sk_dstover_avx: generated x86-64 machine code (AVX), one instruction per row.
// With k = (float broadcast from (%rdx)) - ymm7:
//   ymm0 = ymm0*k + ymm4,  ymm1 = ymm1*k + ymm5,
//   ymm2 = ymm2*k + ymm6,  ymm3 = ymm3*k + ymm7
// ymm4..ymm7 are read but not modified.
// NOTE(review): (%rdx) is presumably 1.0f — confirm.
3133CODE const uint8_t sk_dstover_avx[] = {
3134 196,98,125,24,2, //vbroadcastss (%rdx),%ymm8
3135 197,60,92,199, //vsubps %ymm7,%ymm8,%ymm8
3136 197,188,89,192, //vmulps %ymm0,%ymm8,%ymm0
3137 197,252,88,196, //vaddps %ymm4,%ymm0,%ymm0
3138 197,188,89,201, //vmulps %ymm1,%ymm8,%ymm1
3139 197,244,88,205, //vaddps %ymm5,%ymm1,%ymm1
3140 197,188,89,210, //vmulps %ymm2,%ymm8,%ymm2
3141 197,236,88,214, //vaddps %ymm6,%ymm2,%ymm2
3142 197,188,89,219, //vmulps %ymm3,%ymm8,%ymm3
3143 197,228,88,223, //vaddps %ymm7,%ymm3,%ymm3
3144 72,173, //lods %ds:(%rsi),%rax
3145 255,224, //jmpq *%rax
3146};
3147
// sk_clamp_0_avx: generated x86-64 machine code (AVX), one instruction per row.
// Lower clamp: ymmN = max(ymmN, 0) for N = 0..3, using a zeroed ymm8.
3148CODE const uint8_t sk_clamp_0_avx[] = {
3149 196,65,60,87,192, //vxorps %ymm8,%ymm8,%ymm8
3150 196,193,124,95,192, //vmaxps %ymm8,%ymm0,%ymm0
3151 196,193,116,95,200, //vmaxps %ymm8,%ymm1,%ymm1
3152 196,193,108,95,208, //vmaxps %ymm8,%ymm2,%ymm2
3153 196,193,100,95,216, //vmaxps %ymm8,%ymm3,%ymm3
3154 72,173, //lods %ds:(%rsi),%rax
3155 255,224, //jmpq *%rax
3156};
3157
// sk_clamp_1_avx: generated x86-64 machine code (AVX), one instruction per row.
// Upper clamp: ymmN = min(ymmN, c) for N = 0..3, where c is the float broadcast from (%rdx).
// NOTE(review): c is presumably 1.0f (per the stage name) — confirm.
3158CODE const uint8_t sk_clamp_1_avx[] = {
3159 196,98,125,24,2, //vbroadcastss (%rdx),%ymm8
3160 196,193,124,93,192, //vminps %ymm8,%ymm0,%ymm0
3161 196,193,116,93,200, //vminps %ymm8,%ymm1,%ymm1
3162 196,193,108,93,208, //vminps %ymm8,%ymm2,%ymm2
3163 196,193,100,93,216, //vminps %ymm8,%ymm3,%ymm3
3164 72,173, //lods %ds:(%rsi),%rax
3165 255,224, //jmpq *%rax
3166};
3167
// sk_clamp_a_avx: generated x86-64 machine code (AVX), one instruction per row.
// First ymm3 = min(ymm3, c) with c broadcast from (%rdx); then ymm0, ymm1 and ymm2 are
// each limited to the already-clamped ymm3.
// NOTE(review): c is presumably 1.0f and ymm3 the alpha channel — confirm.
3168CODE const uint8_t sk_clamp_a_avx[] = {
3169 196,98,125,24,2, //vbroadcastss (%rdx),%ymm8
3170 196,193,100,93,216, //vminps %ymm8,%ymm3,%ymm3
3171 197,252,93,195, //vminps %ymm3,%ymm0,%ymm0
3172 197,244,93,203, //vminps %ymm3,%ymm1,%ymm1
3173 197,236,93,211, //vminps %ymm3,%ymm2,%ymm2
3174 72,173, //lods %ds:(%rsi),%rax
3175 255,224, //jmpq *%rax
3176};
3177
// sk_set_rgb_avx: generated x86-64 machine code (AVX), one instruction per row.
// Broadcasts the three consecutive floats at the pointer fetched by lods into
// ymm0, ymm1, ymm2 (offsets 0, 4, 8). ymm3 is left unchanged.
3178CODE const uint8_t sk_set_rgb_avx[] = {
3179 72,173, //lods %ds:(%rsi),%rax
3180 196,226,125,24,0, //vbroadcastss (%rax),%ymm0
3181 196,226,125,24,72,4, //vbroadcastss 0x4(%rax),%ymm1
3182 196,226,125,24,80,8, //vbroadcastss 0x8(%rax),%ymm2
3183 72,173, //lods %ds:(%rsi),%rax
3184 255,224, //jmpq *%rax
3185};
3186
// sk_swap_rb_avx: generated x86-64 machine code (AVX), one instruction per row.
// Exchanges ymm0 and ymm2 using ymm8 as the temporary; ymm1 and ymm3 are untouched.
3187CODE const uint8_t sk_swap_rb_avx[] = {
3188 197,124,40,192, //vmovaps %ymm0,%ymm8
3189 72,173, //lods %ds:(%rsi),%rax
3190 197,252,40,194, //vmovaps %ymm2,%ymm0
3191 197,124,41,194, //vmovaps %ymm8,%ymm2
3192 255,224, //jmpq *%rax
3193};
3194
// sk_swap_avx: generated x86-64 machine code (AVX), one instruction per row.
// Exchanges ymm0..ymm3 with ymm4..ymm7 pairwise (0<->4, 1<->5, 2<->6, 3<->7), parking
// the original ymm0..ymm3 in ymm11..ymm8 while the copies are made.
3195CODE const uint8_t sk_swap_avx[] = {
3196 197,124,40,195, //vmovaps %ymm3,%ymm8
3197 197,124,40,202, //vmovaps %ymm2,%ymm9
3198 197,124,40,209, //vmovaps %ymm1,%ymm10
3199 197,124,40,216, //vmovaps %ymm0,%ymm11
3200 72,173, //lods %ds:(%rsi),%rax
3201 197,252,40,196, //vmovaps %ymm4,%ymm0
3202 197,252,40,205, //vmovaps %ymm5,%ymm1
3203 197,252,40,214, //vmovaps %ymm6,%ymm2
3204 197,252,40,223, //vmovaps %ymm7,%ymm3
3205 197,124,41,220, //vmovaps %ymm11,%ymm4
3206 197,124,41,213, //vmovaps %ymm10,%ymm5
3207 197,124,41,206, //vmovaps %ymm9,%ymm6
3208 197,124,41,199, //vmovaps %ymm8,%ymm7
3209 255,224, //jmpq *%rax
3210};
3211
// sk_move_src_dst_avx: generated x86-64 machine code (AVX), one instruction per row.
// Copies ymm0..ymm3 into ymm4..ymm7; ymm0..ymm3 keep their values.
3212CODE const uint8_t sk_move_src_dst_avx[] = {
3213 72,173, //lods %ds:(%rsi),%rax
3214 197,252,40,224, //vmovaps %ymm0,%ymm4
3215 197,252,40,233, //vmovaps %ymm1,%ymm5
3216 197,252,40,242, //vmovaps %ymm2,%ymm6
3217 197,252,40,251, //vmovaps %ymm3,%ymm7
3218 255,224, //jmpq *%rax
3219};
3220
// sk_move_dst_src_avx: generated x86-64 machine code (AVX), one instruction per row.
// Copies ymm4..ymm7 into ymm0..ymm3; ymm4..ymm7 keep their values.
3221CODE const uint8_t sk_move_dst_src_avx[] = {
3222 72,173, //lods %ds:(%rsi),%rax
3223 197,252,40,196, //vmovaps %ymm4,%ymm0
3224 197,252,40,205, //vmovaps %ymm5,%ymm1
3225 197,252,40,214, //vmovaps %ymm6,%ymm2
3226 197,252,40,223, //vmovaps %ymm7,%ymm3
3227 255,224, //jmpq *%rax
3228};
3229
// sk_premul_avx: generated x86-64 machine code (AVX), one instruction per row.
// Multiplies ymm0, ymm1 and ymm2 by ymm3; ymm3 itself is unchanged.
3230CODE const uint8_t sk_premul_avx[] = {
3231 197,252,89,195, //vmulps %ymm3,%ymm0,%ymm0
3232 197,244,89,203, //vmulps %ymm3,%ymm1,%ymm1
3233 197,236,89,211, //vmulps %ymm3,%ymm2,%ymm2
3234 72,173, //lods %ds:(%rsi),%rax
3235 255,224, //jmpq *%rax
3236};
3237
// sk_unpremul_avx: generated x86-64 machine code (AVX), one instruction per row.
// Computes a per-lane scale s = c / ymm3 (c broadcast from (%rdx)), except that lanes
// where ymm3 == 0 get s = 0 via vblendvps, which avoids propagating the inf/NaN from
// the division. Then ymm0, ymm1 and ymm2 are multiplied by s; ymm3 is unchanged.
// NOTE(review): c is presumably 1.0f — confirm.
3238CODE const uint8_t sk_unpremul_avx[] = {
3239 196,65,60,87,192, //vxorps %ymm8,%ymm8,%ymm8
3240 196,65,100,194,200,0, //vcmpeqps %ymm8,%ymm3,%ymm9
3241 196,98,125,24,18, //vbroadcastss (%rdx),%ymm10
3242 197,44,94,211, //vdivps %ymm3,%ymm10,%ymm10
3243 196,67,45,74,192,144, //vblendvps %ymm9,%ymm8,%ymm10,%ymm8
3244 197,188,89,192, //vmulps %ymm0,%ymm8,%ymm0
3245 197,188,89,201, //vmulps %ymm1,%ymm8,%ymm1
3246 197,188,89,210, //vmulps %ymm2,%ymm8,%ymm2
3247 72,173, //lods %ds:(%rsi),%rax
3248 255,224, //jmpq *%rax
3249};
3250
// sk_from_srgb_avx: generated x86-64 machine code (AVX), one instruction per row.
// Applies the same piecewise function to ymm0, ymm1 and ymm2 (ymm3 is untouched).
// With the floats at byte offsets of %rdx named K34, K38, K3c, K40, K44, per lane x:
//   lo   = K40 * x
//   hi   = K34 + (x*x) * (K3c*x + K38)
//   x'   = (x < K44) ? lo : hi            (vcmpltps + vblendvps)
// NOTE(review): presumably an approximation of the sRGB-to-linear transfer curve, with
// K44 the linear-segment threshold; the constant values are not visible here — confirm.
3251CODE const uint8_t sk_from_srgb_avx[] = {
3252 196,98,125,24,66,64, //vbroadcastss 0x40(%rdx),%ymm8
3253 197,60,89,200, //vmulps %ymm0,%ymm8,%ymm9
3254 197,124,89,208, //vmulps %ymm0,%ymm0,%ymm10
3255 196,98,125,24,90,60, //vbroadcastss 0x3c(%rdx),%ymm11
3256 196,98,125,24,98,56, //vbroadcastss 0x38(%rdx),%ymm12
3257 197,36,89,232, //vmulps %ymm0,%ymm11,%ymm13
3258 196,65,20,88,236, //vaddps %ymm12,%ymm13,%ymm13
3259 196,98,125,24,114,52, //vbroadcastss 0x34(%rdx),%ymm14
3260 196,65,44,89,213, //vmulps %ymm13,%ymm10,%ymm10
3261 196,65,12,88,210, //vaddps %ymm10,%ymm14,%ymm10
3262 196,98,125,24,106,68, //vbroadcastss 0x44(%rdx),%ymm13
3263 196,193,124,194,197,1, //vcmpltps %ymm13,%ymm0,%ymm0
3264 196,195,45,74,193,0, //vblendvps %ymm0,%ymm9,%ymm10,%ymm0
3265 197,60,89,201, //vmulps %ymm1,%ymm8,%ymm9
3266 197,116,89,209, //vmulps %ymm1,%ymm1,%ymm10
3267 197,36,89,249, //vmulps %ymm1,%ymm11,%ymm15
3268 196,65,4,88,252, //vaddps %ymm12,%ymm15,%ymm15
3269 196,65,44,89,215, //vmulps %ymm15,%ymm10,%ymm10
3270 196,65,12,88,210, //vaddps %ymm10,%ymm14,%ymm10
3271 196,193,116,194,205,1, //vcmpltps %ymm13,%ymm1,%ymm1
3272 196,195,45,74,201,16, //vblendvps %ymm1,%ymm9,%ymm10,%ymm1
3273 197,60,89,194, //vmulps %ymm2,%ymm8,%ymm8
3274 197,108,89,202, //vmulps %ymm2,%ymm2,%ymm9
3275 197,36,89,210, //vmulps %ymm2,%ymm11,%ymm10
3276 196,65,44,88,212, //vaddps %ymm12,%ymm10,%ymm10
3277 196,65,52,89,202, //vmulps %ymm10,%ymm9,%ymm9
3278 196,65,12,88,201, //vaddps %ymm9,%ymm14,%ymm9
3279 196,193,108,194,213,1, //vcmpltps %ymm13,%ymm2,%ymm2
3280 196,195,53,74,208,32, //vblendvps %ymm2,%ymm8,%ymm9,%ymm2
3281 72,173, //lods %ds:(%rsi),%rax
3282 255,224, //jmpq *%rax
3283};
3284
// sk_to_srgb_avx: generated x86-64 machine code (AVX), one instruction per row.
// Applies the same piecewise function to ymm0, ymm1 and ymm2 (ymm3 is untouched).
// With the floats at byte offsets of %rdx named K0 (offset 0), K48, K4c, K50, K54, K58,
// per lane x:
//   r    = rsqrt(x)                       (vrsqrtps, approximate)
//   s    = rcp(r)                         (vrcpps, approximately sqrt(x))
//   q    = rsqrt(r)                       (approximately x^(1/4))
//   lo   = K48 * x
//   hi   = min(K0, K4c*q + K50*s + K54)
//   x'   = (x < K58) ? lo : hi            (vcmpltps + vblendvps)
// NOTE(review): presumably an approximation of the linear-to-sRGB transfer curve with
// K0 = 1.0f as the upper clamp; the constant values are not visible here — confirm.
3285CODE const uint8_t sk_to_srgb_avx[] = {
3286 197,124,82,192, //vrsqrtps %ymm0,%ymm8
3287 196,65,124,83,200, //vrcpps %ymm8,%ymm9
3288 196,65,124,82,208, //vrsqrtps %ymm8,%ymm10
3289 196,98,125,24,66,72, //vbroadcastss 0x48(%rdx),%ymm8
3290 197,60,89,216, //vmulps %ymm0,%ymm8,%ymm11
3291 196,98,125,24,34, //vbroadcastss (%rdx),%ymm12
3292 196,98,125,24,106,76, //vbroadcastss 0x4c(%rdx),%ymm13
3293 196,98,125,24,114,80, //vbroadcastss 0x50(%rdx),%ymm14
3294 196,98,125,24,122,84, //vbroadcastss 0x54(%rdx),%ymm15
3295 196,65,52,89,206, //vmulps %ymm14,%ymm9,%ymm9
3296 196,65,52,88,207, //vaddps %ymm15,%ymm9,%ymm9
3297 196,65,44,89,213, //vmulps %ymm13,%ymm10,%ymm10
3298 196,65,44,88,201, //vaddps %ymm9,%ymm10,%ymm9
3299 196,65,28,93,201, //vminps %ymm9,%ymm12,%ymm9
3300 196,98,125,24,82,88, //vbroadcastss 0x58(%rdx),%ymm10
3301 196,193,124,194,194,1, //vcmpltps %ymm10,%ymm0,%ymm0
3302 196,195,53,74,195,0, //vblendvps %ymm0,%ymm11,%ymm9,%ymm0
3303 197,124,82,201, //vrsqrtps %ymm1,%ymm9
3304 196,65,124,83,217, //vrcpps %ymm9,%ymm11
3305 196,65,124,82,201, //vrsqrtps %ymm9,%ymm9
3306 196,65,12,89,219, //vmulps %ymm11,%ymm14,%ymm11
3307 196,65,4,88,219, //vaddps %ymm11,%ymm15,%ymm11
3308 196,65,20,89,201, //vmulps %ymm9,%ymm13,%ymm9
3309 196,65,52,88,203, //vaddps %ymm11,%ymm9,%ymm9
3310 197,60,89,217, //vmulps %ymm1,%ymm8,%ymm11
3311 196,65,28,93,201, //vminps %ymm9,%ymm12,%ymm9
3312 196,193,116,194,202,1, //vcmpltps %ymm10,%ymm1,%ymm1
3313 196,195,53,74,203,16, //vblendvps %ymm1,%ymm11,%ymm9,%ymm1
3314 197,124,82,202, //vrsqrtps %ymm2,%ymm9
3315 196,65,124,83,217, //vrcpps %ymm9,%ymm11
3316 196,65,12,89,219, //vmulps %ymm11,%ymm14,%ymm11
3317 196,65,4,88,219, //vaddps %ymm11,%ymm15,%ymm11
3318 196,65,124,82,201, //vrsqrtps %ymm9,%ymm9
3319 196,65,20,89,201, //vmulps %ymm9,%ymm13,%ymm9
3320 196,65,52,88,203, //vaddps %ymm11,%ymm9,%ymm9
3321 196,65,28,93,201, //vminps %ymm9,%ymm12,%ymm9
3322 197,60,89,194, //vmulps %ymm2,%ymm8,%ymm8
3323 196,193,108,194,210,1, //vcmpltps %ymm10,%ymm2,%ymm2
3324 196,195,53,74,208,32, //vblendvps %ymm2,%ymm8,%ymm9,%ymm2
3325 72,173, //lods %ds:(%rsi),%rax
3326 255,224, //jmpq *%rax
3327};
3328
// sk_scale_1_float_avx: generated x86-64 machine code (AVX), one instruction per row.
// Multiplies ymm0..ymm3 by the single float broadcast from the pointer fetched by lods.
3329CODE const uint8_t sk_scale_1_float_avx[] = {
3330 72,173, //lods %ds:(%rsi),%rax
3331 196,98,125,24,0, //vbroadcastss (%rax),%ymm8
3332 197,188,89,192, //vmulps %ymm0,%ymm8,%ymm0
3333 197,188,89,201, //vmulps %ymm1,%ymm8,%ymm1
3334 197,188,89,210, //vmulps %ymm2,%ymm8,%ymm2
3335 197,188,89,219, //vmulps %ymm3,%ymm8,%ymm3
3336 72,173, //lods %ds:(%rsi),%rax
3337 255,224, //jmpq *%rax
3338};
3339
// sk_scale_u8_avx: generated x86-64 machine code (AVX), one instruction per row.
// Reads a base address through the pointer fetched by lods and adds %rdi, then loads
// eight bytes from there. The bytes are zero-extended to dwords four at a time
// (vpmovzxbd on the low dword, then on dword 1 after vpermilps $0xe5), joined into one
// ymm register, converted to float and multiplied by the scalar at 0xc(%rdx). The
// resulting per-lane factor multiplies ymm0..ymm3.
// %rcx is the tail count: when non-zero, only that many bytes are read, one at a time.
// NOTE(review): 0xc(%rdx) is presumably 1/255 so the factor lies in [0,1] — confirm.
3340CODE const uint8_t sk_scale_u8_avx[] = {
3341 73,137,200, //mov %rcx,%r8
3342 72,173, //lods %ds:(%rsi),%rax
3343 72,139,0, //mov (%rax),%rax
3344 72,1,248, //add %rdi,%rax
3345 77,133,192, //test %r8,%r8
3346 117,65, //jne 478 <_sk_scale_u8_avx+0x51>
3347 197,123,16,0, //vmovsd (%rax),%xmm8
3348 196,66,121,49,200, //vpmovzxbd %xmm8,%xmm9
3349 196,67,121,4,192,229, //vpermilps $0xe5,%xmm8,%xmm8
3350 196,66,121,49,192, //vpmovzxbd %xmm8,%xmm8
3351 196,67,53,24,192,1, //vinsertf128 $0x1,%xmm8,%ymm9,%ymm8
3352 196,65,124,91,192, //vcvtdq2ps %ymm8,%ymm8
3353 196,98,125,24,74,12, //vbroadcastss 0xc(%rdx),%ymm9
3354 196,65,60,89,193, //vmulps %ymm9,%ymm8,%ymm8
3355 197,188,89,192, //vmulps %ymm0,%ymm8,%ymm0
3356 197,188,89,201, //vmulps %ymm1,%ymm8,%ymm1
3357 197,188,89,210, //vmulps %ymm2,%ymm8,%ymm2
3358 197,188,89,219, //vmulps %ymm3,%ymm8,%ymm3
3359 72,173, //lods %ds:(%rsi),%rax
3360 76,137,193, //mov %r8,%rcx
3361 255,224, //jmpq *%rax
// Tail path (%r8 != 0): assemble the available bytes into %r9, byte i shifted left by
// 8*i, looping %r8 times; missing bytes stay zero. The result is moved into xmm8 and
// control rejoins the widening code at +0x14.
3362 49,201, //xor %ecx,%ecx
3363 77,137,194, //mov %r8,%r10
3364 69,49,201, //xor %r9d,%r9d
3365 68,15,182,24, //movzbl (%rax),%r11d
3366 72,255,192, //inc %rax
3367 73,211,227, //shl %cl,%r11
3368 77,9,217, //or %r11,%r9
3369 72,131,193,8, //add $0x8,%rcx
3370 73,255,202, //dec %r10
3371 117,234, //jne 480 <_sk_scale_u8_avx+0x59>
3372 196,65,249,110,193, //vmovq %r9,%xmm8
3373 235,158, //jmp 43b <_sk_scale_u8_avx+0x14>
3374};
3375
// sk_lerp_1_float_avx: linear interpolation with a single factor t.
// t is the float behind the pointer fetched by the first lods, broadcast to
// ymm8. For each register pair (ymm0,ymm4), (ymm1,ymm5), (ymm2,ymm6),
// (ymm3,ymm7) it computes  c = (c - d) * t + d  where c is the first register
// (overwritten) and d is the second (left untouched).
3376CODE const uint8_t sk_lerp_1_float_avx[] = {
3377 72,173, //lods %ds:(%rsi),%rax
3378 196,98,125,24,0, //vbroadcastss (%rax),%ymm8
3379 197,252,92,196, //vsubps %ymm4,%ymm0,%ymm0
3380 196,193,124,89,192, //vmulps %ymm8,%ymm0,%ymm0
3381 197,252,88,196, //vaddps %ymm4,%ymm0,%ymm0
3382 197,244,92,205, //vsubps %ymm5,%ymm1,%ymm1
3383 196,193,116,89,200, //vmulps %ymm8,%ymm1,%ymm1
3384 197,244,88,205, //vaddps %ymm5,%ymm1,%ymm1
3385 197,236,92,214, //vsubps %ymm6,%ymm2,%ymm2
3386 196,193,108,89,208, //vmulps %ymm8,%ymm2,%ymm2
3387 197,236,88,214, //vaddps %ymm6,%ymm2,%ymm2
3388 197,228,92,223, //vsubps %ymm7,%ymm3,%ymm3
3389 196,193,100,89,216, //vmulps %ymm8,%ymm3,%ymm3
3390 197,228,88,223, //vaddps %ymm7,%ymm3,%ymm3
3391 72,173, //lods %ds:(%rsi),%rax
3392 255,224, //jmpq *%rax
3393};
3394
// sk_lerp_u8_avx: linear interpolation with eight per-lane factors derived
// from eight consecutive bytes.
// The bytes are fetched from (pointer read through the lods'd value) + %rdi,
// zero-extended, converted to float and multiplied by the float at 0xc(%rdx)
// (presumably 1/255 -- TODO confirm) to form t in ymm8. Each of ymm0-ymm3 is
// then replaced by (c - d) * t + d, with d taken from ymm4-ymm7 respectively.
// %rcx is parked in %r8 and restored before the exit jump; the tail loop uses
// %rcx/%cl as its shift counter.
3395CODE const uint8_t sk_lerp_u8_avx[] = {
3396 73,137,200, //mov %rcx,%r8
3397 72,173, //lods %ds:(%rsi),%rax
3398 72,139,0, //mov (%rax),%rax
3399 72,1,248, //add %rdi,%rax
3400 77,133,192, //test %r8,%r8
3401 117,101, //jne 551 <_sk_lerp_u8_avx+0x75>
  // Fast path (%rcx == 0): one 8-byte load, then widen 4 bytes per 128-bit half.
3402 197,123,16,0, //vmovsd (%rax),%xmm8
3403 196,66,121,49,200, //vpmovzxbd %xmm8,%xmm9
3404 196,67,121,4,192,229, //vpermilps $0xe5,%xmm8,%xmm8
3405 196,66,121,49,192, //vpmovzxbd %xmm8,%xmm8
3406 196,67,53,24,192,1, //vinsertf128 $0x1,%xmm8,%ymm9,%ymm8
3407 196,65,124,91,192, //vcvtdq2ps %ymm8,%ymm8
3408 196,98,125,24,74,12, //vbroadcastss 0xc(%rdx),%ymm9
3409 196,65,60,89,193, //vmulps %ymm9,%ymm8,%ymm8
3410 197,252,92,196, //vsubps %ymm4,%ymm0,%ymm0
3411 196,193,124,89,192, //vmulps %ymm8,%ymm0,%ymm0
3412 197,252,88,196, //vaddps %ymm4,%ymm0,%ymm0
3413 197,244,92,205, //vsubps %ymm5,%ymm1,%ymm1
3414 196,193,116,89,200, //vmulps %ymm8,%ymm1,%ymm1
3415 197,244,88,205, //vaddps %ymm5,%ymm1,%ymm1
3416 197,236,92,214, //vsubps %ymm6,%ymm2,%ymm2
3417 196,193,108,89,208, //vmulps %ymm8,%ymm2,%ymm2
3418 197,236,88,214, //vaddps %ymm6,%ymm2,%ymm2
3419 197,228,92,223, //vsubps %ymm7,%ymm3,%ymm3
3420 196,193,100,89,216, //vmulps %ymm8,%ymm3,%ymm3
3421 197,228,88,223, //vaddps %ymm7,%ymm3,%ymm3
3422 72,173, //lods %ds:(%rsi),%rax
3423 76,137,193, //mov %r8,%rcx
3424 255,224, //jmpq *%rax
  // Tail path (%rcx != 0): read %r8 bytes one at a time, OR each into %r9 at
  // bit position 8*i, move the result into xmm8 and rejoin right after the
  // vmovsd above.
3425 49,201, //xor %ecx,%ecx
3426 77,137,194, //mov %r8,%r10
3427 69,49,201, //xor %r9d,%r9d
3428 68,15,182,24, //movzbl (%rax),%r11d
3429 72,255,192, //inc %rax
3430 73,211,227, //shl %cl,%r11
3431 77,9,217, //or %r11,%r9
3432 72,131,193,8, //add $0x8,%rcx
3433 73,255,202, //dec %r10
3434 117,234, //jne 559 <_sk_lerp_u8_avx+0x7d>
3435 196,65,249,110,193, //vmovq %r9,%xmm8
3436 233,119,255,255,255, //jmpq 4f0 <_sk_lerp_u8_avx+0x14>
3437};
3438
// sk_lerp_565_avx: linear interpolation of ymm0-ymm2 toward ymm4-ymm6 using
// three per-lane factors unpacked from eight 16-bit values.
// The 16-bit values are read from (pointer read through the lods'd value) +
// 2*%rdi and zero-extended to 32 bits. Three factors are produced by masking
// with the dwords at 0x68/0x6c/0x70(%rdx), converting to float and scaling by
// the floats at 0x74/0x78/0x7c(%rdx) (presumably the 5-6-5 field masks and
// their normalising scales -- TODO confirm against the constants layout).
// ymm0, ymm1, ymm2 become (c - d) * t + d with d = ymm4, ymm5, ymm6; ymm3 is
// overwritten with the float at (%rdx) broadcast to all lanes.
3439CODE const uint8_t sk_lerp_565_avx[] = {
3440 72,173, //lods %ds:(%rsi),%rax
3441 76,139,16, //mov (%rax),%r10
3442 72,133,201, //test %rcx,%rcx
3443 15,133,148,0,0,0, //jne 61b <_sk_lerp_565_avx+0xa2>
3444 196,65,122,111,4,122, //vmovdqu (%r10,%rdi,2),%xmm8
3445 197,225,239,219, //vpxor %xmm3,%xmm3,%xmm3
3446 197,185,105,219, //vpunpckhwd %xmm3,%xmm8,%xmm3
3447 196,66,121,51,192, //vpmovzxwd %xmm8,%xmm8
3448 196,227,61,24,219,1, //vinsertf128 $0x1,%xmm3,%ymm8,%ymm3
3449 196,98,125,24,66,104, //vbroadcastss 0x68(%rdx),%ymm8
3450 197,60,84,195, //vandps %ymm3,%ymm8,%ymm8
3451 196,65,124,91,192, //vcvtdq2ps %ymm8,%ymm8
3452 196,98,125,24,74,116, //vbroadcastss 0x74(%rdx),%ymm9
3453 196,65,52,89,192, //vmulps %ymm8,%ymm9,%ymm8
3454 196,98,125,24,74,108, //vbroadcastss 0x6c(%rdx),%ymm9
3455 197,52,84,203, //vandps %ymm3,%ymm9,%ymm9
3456 196,65,124,91,201, //vcvtdq2ps %ymm9,%ymm9
3457 196,98,125,24,82,120, //vbroadcastss 0x78(%rdx),%ymm10
3458 196,65,44,89,201, //vmulps %ymm9,%ymm10,%ymm9
3459 196,98,125,24,82,112, //vbroadcastss 0x70(%rdx),%ymm10
3460 197,172,84,219, //vandps %ymm3,%ymm10,%ymm3
3461 197,252,91,219, //vcvtdq2ps %ymm3,%ymm3
3462 196,98,125,24,82,124, //vbroadcastss 0x7c(%rdx),%ymm10
3463 197,172,89,219, //vmulps %ymm3,%ymm10,%ymm3
3464 197,252,92,196, //vsubps %ymm4,%ymm0,%ymm0
3465 196,193,124,89,192, //vmulps %ymm8,%ymm0,%ymm0
3466 197,252,88,196, //vaddps %ymm4,%ymm0,%ymm0
3467 197,244,92,205, //vsubps %ymm5,%ymm1,%ymm1
3468 196,193,116,89,201, //vmulps %ymm9,%ymm1,%ymm1
3469 197,244,88,205, //vaddps %ymm5,%ymm1,%ymm1
3470 197,236,92,214, //vsubps %ymm6,%ymm2,%ymm2
3471 197,236,89,211, //vmulps %ymm3,%ymm2,%ymm2
3472 197,236,88,214, //vaddps %ymm6,%ymm2,%ymm2
3473 196,226,125,24,26, //vbroadcastss (%rdx),%ymm3
3474 72,173, //lods %ds:(%rsi),%rax
3475 255,224, //jmpq *%rax
  // Tail path (%rcx != 0): index = (%cl & 7) - 1. An index above 6 rejoins
  // the main path with xmm8 zeroed; otherwise the jump table below selects an
  // entry point into the vpinsrw chain so that exactly index+1 16-bit values
  // are loaded, highest first, falling through to element 0.
3476 65,137,200, //mov %ecx,%r8d
3477 65,128,224,7, //and $0x7,%r8b
3478 196,65,57,239,192, //vpxor %xmm8,%xmm8,%xmm8
3479 65,254,200, //dec %r8b
3480 69,15,182,192, //movzbl %r8b,%r8d
3481 65,128,248,6, //cmp $0x6,%r8b
3482 15,135,85,255,255,255, //ja 58d <_sk_lerp_565_avx+0x14>
3483 76,141,13,73,0,0,0, //lea 0x49(%rip),%r9 # 688 <_sk_lerp_565_avx+0x10f>
3484 75,99,4,129, //movslq (%r9,%r8,4),%rax
3485 76,1,200, //add %r9,%rax
3486 255,224, //jmpq *%rax
3487 197,225,239,219, //vpxor %xmm3,%xmm3,%xmm3
3488 196,65,97,196,68,122,12,6, //vpinsrw $0x6,0xc(%r10,%rdi,2),%xmm3,%xmm8
3489 196,65,57,196,68,122,10,5, //vpinsrw $0x5,0xa(%r10,%rdi,2),%xmm8,%xmm8
3490 196,65,57,196,68,122,8,4, //vpinsrw $0x4,0x8(%r10,%rdi,2),%xmm8,%xmm8
3491 196,65,57,196,68,122,6,3, //vpinsrw $0x3,0x6(%r10,%rdi,2),%xmm8,%xmm8
3492 196,65,57,196,68,122,4,2, //vpinsrw $0x2,0x4(%r10,%rdi,2),%xmm8,%xmm8
3493 196,65,57,196,68,122,2,1, //vpinsrw $0x1,0x2(%r10,%rdi,2),%xmm8,%xmm8
3494 196,65,57,196,4,122,0, //vpinsrw $0x0,(%r10,%rdi,2),%xmm8,%xmm8
3495 233,5,255,255,255, //jmpq 58d <_sk_lerp_565_avx+0x14>
  // Jump table, not code: seven little-endian int32 offsets relative to the
  // table's own address (-12, -20, -28, -36, -44, -52, -64), read by the
  // movslq above. The mnemonics the disassembler printed for these bytes are
  // meaningless.
3496 244, //hlt
3497 255, //(bad)
3498 255, //(bad)
3499 255, //(bad)
3500 236, //in (%dx),%al
3501 255, //(bad)
3502 255, //(bad)
3503 255,228, //jmpq *%rsp
3504 255, //(bad)
3505 255, //(bad)
3506 255, //(bad)
3507 220,255, //fdivr %st,%st(7)
3508 255, //(bad)
3509 255,212, //callq *%rsp
3510 255, //(bad)
3511 255, //(bad)
3512 255,204, //dec %esp
3513 255, //(bad)
3514 255, //(bad)
3515 255,192, //inc %eax
3516 255, //(bad)
3517 255, //(bad)
3518 255, //.byte 0xff
3519};
3520
// sk_load_tables_avx: loads eight 32-bit values and maps three of their byte
// fields through lookup tables of floats.
// The value fetched by lods points at a block of four pointers: (%rax) is the
// source of the 32-bit values (indexed by %rdi, 4 bytes each), and 0x8(%rax),
// 0x10(%rax), 0x18(%rax) are three tables of 4-byte entries.
//   ymm0[i] = table@0x08[ v[i]        & mask ]
//   ymm1[i] = table@0x10[ (v[i] >> 8)  & mask ]
//   ymm2[i] = table@0x18[ (v[i] >> 16) & mask ]
//   ymm3[i] = float(v[i] >> 24) * float at 0xc(%rdx)
// where mask is the dword at 0x10(%rdx) (presumably 0xff -- TODO confirm).
// There is no gather instruction used here: each index is moved to a general
// register and the table entries are fetched with scalar vmovss/vinsertps.
// rbp, rbx and r12-r15 are pushed on entry and popped before the exit jump
// because they serve as index scratch registers.
3521CODE const uint8_t sk_load_tables_avx[] = {
3522 85, //push %rbp
3523 65,87, //push %r15
3524 65,86, //push %r14
3525 65,85, //push %r13
3526 65,84, //push %r12
3527 83, //push %rbx
3528 72,173, //lods %ds:(%rsi),%rax
3529 76,139,0, //mov (%rax),%r8
3530 72,133,201, //test %rcx,%rcx
3531 15,133,18,2,0,0, //jne 8ce <_sk_load_tables_avx+0x22a>
3532 196,65,124,16,4,184, //vmovups (%r8,%rdi,4),%ymm8
3533 196,98,125,24,74,16, //vbroadcastss 0x10(%rdx),%ymm9
  // Field 0 (bits 0-7 after masking): look up in the table at 0x8(%rax) -> ymm0.
3534 196,193,52,84,192, //vandps %ymm8,%ymm9,%ymm0
3535 196,193,249,126,193, //vmovq %xmm0,%r9
3536 69,137,203, //mov %r9d,%r11d
3537 196,195,249,22,194,1, //vpextrq $0x1,%xmm0,%r10
3538 69,137,214, //mov %r10d,%r14d
3539 73,193,234,32, //shr $0x20,%r10
3540 73,193,233,32, //shr $0x20,%r9
3541 196,227,125,25,192,1, //vextractf128 $0x1,%ymm0,%xmm0
3542 196,193,249,126,196, //vmovq %xmm0,%r12
3543 69,137,231, //mov %r12d,%r15d
3544 196,227,249,22,195,1, //vpextrq $0x1,%xmm0,%rbx
3545 65,137,221, //mov %ebx,%r13d
3546 72,193,235,32, //shr $0x20,%rbx
3547 73,193,236,32, //shr $0x20,%r12
3548 72,139,104,8, //mov 0x8(%rax),%rbp
3549 76,139,64,16, //mov 0x10(%rax),%r8
3550 196,161,122,16,68,189,0, //vmovss 0x0(%rbp,%r15,4),%xmm0
3551 196,163,121,33,68,165,0,16, //vinsertps $0x10,0x0(%rbp,%r12,4),%xmm0,%xmm0
3552 196,163,121,33,68,173,0,32, //vinsertps $0x20,0x0(%rbp,%r13,4),%xmm0,%xmm0
3553 197,250,16,76,157,0, //vmovss 0x0(%rbp,%rbx,4),%xmm1
3554 196,227,121,33,193,48, //vinsertps $0x30,%xmm1,%xmm0,%xmm0
3555 196,161,122,16,76,157,0, //vmovss 0x0(%rbp,%r11,4),%xmm1
3556 196,163,113,33,76,141,0,16, //vinsertps $0x10,0x0(%rbp,%r9,4),%xmm1,%xmm1
3557 196,163,113,33,76,181,0,32, //vinsertps $0x20,0x0(%rbp,%r14,4),%xmm1,%xmm1
3558 196,161,122,16,92,149,0, //vmovss 0x0(%rbp,%r10,4),%xmm3
3559 196,227,113,33,203,48, //vinsertps $0x30,%xmm3,%xmm1,%xmm1
3560 196,227,117,24,192,1, //vinsertf128 $0x1,%xmm0,%ymm1,%ymm0
  // Field 1 (value >> 8, masked): look up in the table at 0x10(%rax) -> ymm1.
3561 196,193,113,114,208,8, //vpsrld $0x8,%xmm8,%xmm1
3562 196,67,125,25,194,1, //vextractf128 $0x1,%ymm8,%xmm10
3563 196,193,105,114,210,8, //vpsrld $0x8,%xmm10,%xmm2
3564 196,227,117,24,202,1, //vinsertf128 $0x1,%xmm2,%ymm1,%ymm1
3565 197,180,84,201, //vandps %ymm1,%ymm9,%ymm1
3566 196,193,249,126,201, //vmovq %xmm1,%r9
3567 69,137,203, //mov %r9d,%r11d
3568 196,195,249,22,202,1, //vpextrq $0x1,%xmm1,%r10
3569 69,137,214, //mov %r10d,%r14d
3570 73,193,234,32, //shr $0x20,%r10
3571 73,193,233,32, //shr $0x20,%r9
3572 196,227,125,25,201,1, //vextractf128 $0x1,%ymm1,%xmm1
3573 196,225,249,126,205, //vmovq %xmm1,%rbp
3574 65,137,239, //mov %ebp,%r15d
3575 196,227,249,22,203,1, //vpextrq $0x1,%xmm1,%rbx
3576 65,137,220, //mov %ebx,%r12d
3577 72,193,235,32, //shr $0x20,%rbx
3578 72,193,237,32, //shr $0x20,%rbp
3579 196,129,122,16,12,184, //vmovss (%r8,%r15,4),%xmm1
3580 196,195,113,33,12,168,16, //vinsertps $0x10,(%r8,%rbp,4),%xmm1,%xmm1
3581 196,129,122,16,20,160, //vmovss (%r8,%r12,4),%xmm2
3582 196,227,113,33,202,32, //vinsertps $0x20,%xmm2,%xmm1,%xmm1
3583 196,193,122,16,20,152, //vmovss (%r8,%rbx,4),%xmm2
3584 196,227,113,33,202,48, //vinsertps $0x30,%xmm2,%xmm1,%xmm1
3585 196,129,122,16,20,152, //vmovss (%r8,%r11,4),%xmm2
3586 196,131,105,33,20,136,16, //vinsertps $0x10,(%r8,%r9,4),%xmm2,%xmm2
3587 196,129,122,16,28,176, //vmovss (%r8,%r14,4),%xmm3
3588 196,227,105,33,211,32, //vinsertps $0x20,%xmm3,%xmm2,%xmm2
3589 196,129,122,16,28,144, //vmovss (%r8,%r10,4),%xmm3
3590 196,227,105,33,211,48, //vinsertps $0x30,%xmm3,%xmm2,%xmm2
3591 196,227,109,24,201,1, //vinsertf128 $0x1,%xmm1,%ymm2,%ymm1
  // Field 2 (value >> 16, masked): look up in the table at 0x18(%rax) -> ymm2.
3592 72,139,64,24, //mov 0x18(%rax),%rax
3593 196,193,105,114,208,16, //vpsrld $0x10,%xmm8,%xmm2
3594 196,193,97,114,210,16, //vpsrld $0x10,%xmm10,%xmm3
3595 196,227,109,24,211,1, //vinsertf128 $0x1,%xmm3,%ymm2,%ymm2
3596 197,180,84,210, //vandps %ymm2,%ymm9,%ymm2
3597 196,193,249,126,208, //vmovq %xmm2,%r8
3598 69,137,194, //mov %r8d,%r10d
3599 196,195,249,22,209,1, //vpextrq $0x1,%xmm2,%r9
3600 69,137,203, //mov %r9d,%r11d
3601 73,193,233,32, //shr $0x20,%r9
3602 73,193,232,32, //shr $0x20,%r8
3603 196,227,125,25,210,1, //vextractf128 $0x1,%ymm2,%xmm2
3604 196,225,249,126,213, //vmovq %xmm2,%rbp
3605 65,137,238, //mov %ebp,%r14d
3606 196,227,249,22,211,1, //vpextrq $0x1,%xmm2,%rbx
3607 65,137,223, //mov %ebx,%r15d
3608 72,193,235,32, //shr $0x20,%rbx
3609 72,193,237,32, //shr $0x20,%rbp
3610 196,161,122,16,20,176, //vmovss (%rax,%r14,4),%xmm2
3611 196,227,105,33,20,168,16, //vinsertps $0x10,(%rax,%rbp,4),%xmm2,%xmm2
3612 196,161,122,16,28,184, //vmovss (%rax,%r15,4),%xmm3
3613 196,227,105,33,211,32, //vinsertps $0x20,%xmm3,%xmm2,%xmm2
3614 197,250,16,28,152, //vmovss (%rax,%rbx,4),%xmm3
3615 196,99,105,33,203,48, //vinsertps $0x30,%xmm3,%xmm2,%xmm9
3616 196,161,122,16,28,144, //vmovss (%rax,%r10,4),%xmm3
3617 196,163,97,33,28,128,16, //vinsertps $0x10,(%rax,%r8,4),%xmm3,%xmm3
3618 196,161,122,16,20,152, //vmovss (%rax,%r11,4),%xmm2
3619 196,227,97,33,210,32, //vinsertps $0x20,%xmm2,%xmm3,%xmm2
3620 196,161,122,16,28,136, //vmovss (%rax,%r9,4),%xmm3
3621 196,227,105,33,211,48, //vinsertps $0x30,%xmm3,%xmm2,%xmm2
3622 196,195,109,24,209,1, //vinsertf128 $0x1,%xmm9,%ymm2,%ymm2
  // Field 3 (value >> 24): no table, converted to float and scaled -> ymm3.
3623 196,193,57,114,208,24, //vpsrld $0x18,%xmm8,%xmm8
3624 196,193,97,114,210,24, //vpsrld $0x18,%xmm10,%xmm3
3625 196,227,61,24,219,1, //vinsertf128 $0x1,%xmm3,%ymm8,%ymm3
3626 197,252,91,219, //vcvtdq2ps %ymm3,%ymm3
3627 196,98,125,24,66,12, //vbroadcastss 0xc(%rdx),%ymm8
3628 196,193,100,89,216, //vmulps %ymm8,%ymm3,%ymm3
3629 72,173, //lods %ds:(%rsi),%rax
3630 91, //pop %rbx
3631 65,92, //pop %r12
3632 65,93, //pop %r13
3633 65,94, //pop %r14
3634 65,95, //pop %r15
3635 93, //pop %rbp
3636 255,224, //jmpq *%rax
  // Tail path (%rcx != 0): index = (%cl & 7) - 1. An index above 6 rejoins
  // the main path with ymm8 zeroed; otherwise the jump table below selects an
  // entry point into the chain that loads exactly index+1 dwords into ymm8,
  // highest element first, before rejoining after the vmovups.
3637 65,137,201, //mov %ecx,%r9d
3638 65,128,225,7, //and $0x7,%r9b
3639 196,65,60,87,192, //vxorps %ymm8,%ymm8,%ymm8
3640 65,254,201, //dec %r9b
3641 69,15,182,201, //movzbl %r9b,%r9d
3642 65,128,249,6, //cmp $0x6,%r9b
3643 15,135,215,253,255,255, //ja 6c2 <_sk_load_tables_avx+0x1e>
3644 76,141,21,138,0,0,0, //lea 0x8a(%rip),%r10 # 97c <_sk_load_tables_avx+0x2d8>
3645 79,99,12,138, //movslq (%r10,%r9,4),%r9
3646 77,1,209, //add %r10,%r9
3647 65,255,225, //jmpq *%r9
3648 196,193,121,110,68,184,24, //vmovd 0x18(%r8,%rdi,4),%xmm0
3649 197,249,112,192,68, //vpshufd $0x44,%xmm0,%xmm0
3650 196,227,125,24,192,1, //vinsertf128 $0x1,%xmm0,%ymm0,%ymm0
3651 197,244,87,201, //vxorps %ymm1,%ymm1,%ymm1
3652 196,99,117,12,192,64, //vblendps $0x40,%ymm0,%ymm1,%ymm8
3653 196,99,125,25,192,1, //vextractf128 $0x1,%ymm8,%xmm0
3654 196,195,121,34,68,184,20,1, //vpinsrd $0x1,0x14(%r8,%rdi,4),%xmm0,%xmm0
3655 196,99,61,24,192,1, //vinsertf128 $0x1,%xmm0,%ymm8,%ymm8
3656 196,99,125,25,192,1, //vextractf128 $0x1,%ymm8,%xmm0
3657 196,195,121,34,68,184,16,0, //vpinsrd $0x0,0x10(%r8,%rdi,4),%xmm0,%xmm0
3658 196,99,61,24,192,1, //vinsertf128 $0x1,%xmm0,%ymm8,%ymm8
3659 196,195,57,34,68,184,12,3, //vpinsrd $0x3,0xc(%r8,%rdi,4),%xmm8,%xmm0
3660 196,99,61,12,192,15, //vblendps $0xf,%ymm0,%ymm8,%ymm8
3661 196,195,57,34,68,184,8,2, //vpinsrd $0x2,0x8(%r8,%rdi,4),%xmm8,%xmm0
3662 196,99,61,12,192,15, //vblendps $0xf,%ymm0,%ymm8,%ymm8
3663 196,195,57,34,68,184,4,1, //vpinsrd $0x1,0x4(%r8,%rdi,4),%xmm8,%xmm0
3664 196,99,61,12,192,15, //vblendps $0xf,%ymm0,%ymm8,%ymm8
3665 196,195,57,34,4,184,0, //vpinsrd $0x0,(%r8,%rdi,4),%xmm8,%xmm0
3666 196,99,61,12,192,15, //vblendps $0xf,%ymm0,%ymm8,%ymm8
3667 233,70,253,255,255, //jmpq 6c2 <_sk_load_tables_avx+0x1e>
  // Jump table, not code: seven little-endian int32 offsets relative to the
  // table's own address (-18, -32, -46, -60, -80, -100, -128), read by the
  // movslq above. The mnemonics the disassembler printed for these bytes are
  // meaningless.
3668 238, //out %al,(%dx)
3669 255, //(bad)
3670 255, //(bad)
3671 255,224, //jmpq *%rax
3672 255, //(bad)
3673 255, //(bad)
3674 255,210, //callq *%rdx
3675 255, //(bad)
3676 255, //(bad)
3677 255,196, //inc %esp
3678 255, //(bad)
3679 255, //(bad)
3680 255,176,255,255,255,156, //pushq -0x63000001(%rax)
3681 255, //(bad)
3682 255, //(bad)
3683 255, //.byte 0xff
3684 128,255,255, //cmp $0xff,%bh
3685 255, //.byte 0xff
3686};
3687
// sk_load_a8_avx: loads eight bytes into ymm3 as floats and zeroes ymm0-ymm2.
// The bytes come from (pointer read through the lods'd value) + %rdi. Each is
// zero-extended, converted to float and multiplied by the float at 0xc(%rdx)
// (presumably 1/255 -- TODO confirm against the constants layout).
// %rcx is parked in %r8 and restored before the exit jump; the tail loop uses
// %rcx/%cl as its shift counter.
3688CODE const uint8_t sk_load_a8_avx[] = {
3689 73,137,200, //mov %rcx,%r8
3690 72,173, //lods %ds:(%rsi),%rax
3691 72,139,0, //mov (%rax),%rax
3692 72,1,248, //add %rdi,%rax
3693 77,133,192, //test %r8,%r8
3694 117,59, //jne 9e3 <_sk_load_a8_avx+0x4b>
  // Fast path (%rcx == 0): one 8-byte load, then widen 4 bytes per 128-bit half.
3695 197,251,16,0, //vmovsd (%rax),%xmm0
3696 196,226,121,49,200, //vpmovzxbd %xmm0,%xmm1
3697 196,227,121,4,192,229, //vpermilps $0xe5,%xmm0,%xmm0
3698 196,226,121,49,192, //vpmovzxbd %xmm0,%xmm0
3699 196,227,117,24,192,1, //vinsertf128 $0x1,%xmm0,%ymm1,%ymm0
3700 197,252,91,192, //vcvtdq2ps %ymm0,%ymm0
3701 196,226,125,24,74,12, //vbroadcastss 0xc(%rdx),%ymm1
3702 197,252,89,217, //vmulps %ymm1,%ymm0,%ymm3
3703 72,173, //lods %ds:(%rsi),%rax
3704 197,252,87,192, //vxorps %ymm0,%ymm0,%ymm0
3705 197,244,87,201, //vxorps %ymm1,%ymm1,%ymm1
3706 197,236,87,210, //vxorps %ymm2,%ymm2,%ymm2
3707 76,137,193, //mov %r8,%rcx
3708 255,224, //jmpq *%rax
  // Tail path (%rcx != 0): read %r8 bytes one at a time, OR each into %r9 at
  // bit position 8*i, move the result into xmm0 and rejoin right after the
  // vmovsd above.
3709 49,201, //xor %ecx,%ecx
3710 77,137,194, //mov %r8,%r10
3711 69,49,201, //xor %r9d,%r9d
3712 68,15,182,24, //movzbl (%rax),%r11d
3713 72,255,192, //inc %rax
3714 73,211,227, //shl %cl,%r11
3715 77,9,217, //or %r11,%r9
3716 72,131,193,8, //add $0x8,%rcx
3717 73,255,202, //dec %r10
3718 117,234, //jne 9eb <_sk_load_a8_avx+0x53>
3719 196,193,249,110,193, //vmovq %r9,%xmm0
3720 235,164, //jmp 9ac <_sk_load_a8_avx+0x14>
3721};
3722
// sk_store_a8_avx: converts ymm3 to eight bytes and stores them.
// ymm3 is multiplied by the float at 0x8(%rdx) (presumably 255 -- TODO
// confirm), converted to 32-bit integers, then narrowed with unsigned
// saturation to 16 and then 8 bits. The destination is (pointer read through
// the lods'd value) + %rdi. ymm0-ymm7 are not modified.
3723CODE const uint8_t sk_store_a8_avx[] = {
3724 72,173, //lods %ds:(%rsi),%rax
3725 76,139,8, //mov (%rax),%r9
3726 196,98,125,24,66,8, //vbroadcastss 0x8(%rdx),%ymm8
3727 197,60,89,195, //vmulps %ymm3,%ymm8,%ymm8
3728 196,65,125,91,192, //vcvtps2dq %ymm8,%ymm8
3729 196,67,125,25,193,1, //vextractf128 $0x1,%ymm8,%xmm9
3730 196,66,57,43,193, //vpackusdw %xmm9,%xmm8,%xmm8
3731 196,65,57,103,192, //vpackuswb %xmm8,%xmm8,%xmm8
3732 72,133,201, //test %rcx,%rcx
3733 117,10, //jne a3b <_sk_store_a8_avx+0x33>
  // Fast path (%rcx == 0): store all eight bytes at once.
3734 196,65,123,17,4,57, //vmovsd %xmm8,(%r9,%rdi,1)
3735 72,173, //lods %ds:(%rsi),%rax
3736 255,224, //jmpq *%rax
  // Tail path (%rcx != 0): index = (%cl & 7) - 1. An index above 6 stores
  // nothing and goes straight to the exit. Otherwise the bytes are widened to
  // words (hence the even vpextrb lane numbers) and the jump table selects an
  // entry point into the chain so that exactly index+1 bytes are written.
3737 137,200, //mov %ecx,%eax
3738 36,7, //and $0x7,%al
3739 254,200, //dec %al
3740 68,15,182,192, //movzbl %al,%r8d
3741 65,128,248,6, //cmp $0x6,%r8b
3742 119,236, //ja a37 <_sk_store_a8_avx+0x2f>
3743 196,66,121,48,192, //vpmovzxbw %xmm8,%xmm8
3744 76,141,21,69,0,0,0, //lea 0x45(%rip),%r10 # a9c <_sk_store_a8_avx+0x94>
3745 75,99,4,130, //movslq (%r10,%r8,4),%rax
3746 76,1,208, //add %r10,%rax
3747 255,224, //jmpq *%rax
3748 196,67,121,20,68,57,6,12, //vpextrb $0xc,%xmm8,0x6(%r9,%rdi,1)
3749 196,67,121,20,68,57,5,10, //vpextrb $0xa,%xmm8,0x5(%r9,%rdi,1)
3750 196,67,121,20,68,57,4,8, //vpextrb $0x8,%xmm8,0x4(%r9,%rdi,1)
3751 196,67,121,20,68,57,3,6, //vpextrb $0x6,%xmm8,0x3(%r9,%rdi,1)
3752 196,67,121,20,68,57,2,4, //vpextrb $0x4,%xmm8,0x2(%r9,%rdi,1)
3753 196,67,121,20,68,57,1,2, //vpextrb $0x2,%xmm8,0x1(%r9,%rdi,1)
3754 196,67,121,20,4,57,0, //vpextrb $0x0,%xmm8,(%r9,%rdi,1)
3755 235,158, //jmp a37 <_sk_store_a8_avx+0x2f>
  // Three bytes of padding so the table that follows starts at offset 0x94.
3756 15,31,0, //nopl (%rax)
  // Jump table, not code: seven little-endian int32 offsets relative to the
  // table's own address (-12, -20, -28, -36, -44, -52, -60), read by the
  // movslq above. The mnemonics the disassembler printed for these bytes are
  // meaningless.
3757 244, //hlt
3758 255, //(bad)
3759 255, //(bad)
3760 255, //(bad)
3761 236, //in (%dx),%al
3762 255, //(bad)
3763 255, //(bad)
3764 255,228, //jmpq *%rsp
3765 255, //(bad)
3766 255, //(bad)
3767 255, //(bad)
3768 220,255, //fdivr %st,%st(7)
3769 255, //(bad)
3770 255,212, //callq *%rsp
3771 255, //(bad)
3772 255, //(bad)
3773 255,204, //dec %esp
3774 255, //(bad)
3775 255, //(bad)
3776 255,196, //inc %esp
3777 255, //(bad)
3778 255, //(bad)
3779 255, //.byte 0xff
3780};
3781
// sk_load_565_avx: loads eight 16-bit values and unpacks three fields into
// ymm0-ymm2 as floats.
// The values are read from (pointer read through the lods'd value) + 2*%rdi
// and zero-extended to 32 bits. Each output is (value & mask) converted to
// float and multiplied by a scale: masks are the dwords at 0x68/0x6c/0x70(%rdx)
// and scales the floats at 0x74/0x78/0x7c(%rdx) (presumably the 5-6-5 field
// masks and their normalising scales -- TODO confirm). ymm3 is set to the
// float at (%rdx) broadcast to all lanes.
3782CODE const uint8_t sk_load_565_avx[] = {
3783 72,173, //lods %ds:(%rsi),%rax
3784 76,139,16, //mov (%rax),%r10
3785 72,133,201, //test %rcx,%rcx
3786 117,106, //jne b2c <_sk_load_565_avx+0x74>
3787 196,193,122,111,4,122, //vmovdqu (%r10,%rdi,2),%xmm0
3788 197,241,239,201, //vpxor %xmm1,%xmm1,%xmm1
3789 197,249,105,201, //vpunpckhwd %xmm1,%xmm0,%xmm1
3790 196,226,121,51,192, //vpmovzxwd %xmm0,%xmm0
3791 196,227,125,24,209,1, //vinsertf128 $0x1,%xmm1,%ymm0,%ymm2
3792 196,226,125,24,66,104, //vbroadcastss 0x68(%rdx),%ymm0
3793 197,252,84,194, //vandps %ymm2,%ymm0,%ymm0
3794 197,252,91,192, //vcvtdq2ps %ymm0,%ymm0
3795 196,226,125,24,74,116, //vbroadcastss 0x74(%rdx),%ymm1
3796 197,244,89,192, //vmulps %ymm0,%ymm1,%ymm0
3797 196,226,125,24,74,108, //vbroadcastss 0x6c(%rdx),%ymm1
3798 197,244,84,202, //vandps %ymm2,%ymm1,%ymm1
3799 197,252,91,201, //vcvtdq2ps %ymm1,%ymm1
3800 196,226,125,24,90,120, //vbroadcastss 0x78(%rdx),%ymm3
3801 197,228,89,201, //vmulps %ymm1,%ymm3,%ymm1
3802 196,226,125,24,90,112, //vbroadcastss 0x70(%rdx),%ymm3
3803 197,228,84,210, //vandps %ymm2,%ymm3,%ymm2
3804 197,252,91,210, //vcvtdq2ps %ymm2,%ymm2
3805 196,226,125,24,90,124, //vbroadcastss 0x7c(%rdx),%ymm3
3806 197,228,89,210, //vmulps %ymm2,%ymm3,%ymm2
3807 196,226,125,24,26, //vbroadcastss (%rdx),%ymm3
3808 72,173, //lods %ds:(%rsi),%rax
3809 255,224, //jmpq *%rax
  // Tail path (%rcx != 0): index = (%cl & 7) - 1. An index above 6 rejoins
  // the main path with xmm0 zeroed; otherwise the jump table below selects an
  // entry point into the vpinsrw chain so that exactly index+1 16-bit values
  // are loaded, highest first, falling through to element 0.
3810 65,137,200, //mov %ecx,%r8d
3811 65,128,224,7, //and $0x7,%r8b
3812 197,249,239,192, //vpxor %xmm0,%xmm0,%xmm0
3813 65,254,200, //dec %r8b
3814 69,15,182,192, //movzbl %r8b,%r8d
3815 65,128,248,6, //cmp $0x6,%r8b
3816 119,132, //ja ac8 <_sk_load_565_avx+0x10>
3817 76,141,13,73,0,0,0, //lea 0x49(%rip),%r9 # b94 <_sk_load_565_avx+0xdc>
3818 75,99,4,129, //movslq (%r9,%r8,4),%rax
3819 76,1,200, //add %r9,%rax
3820 255,224, //jmpq *%rax
3821 197,249,239,192, //vpxor %xmm0,%xmm0,%xmm0
3822 196,193,121,196,68,122,12,6, //vpinsrw $0x6,0xc(%r10,%rdi,2),%xmm0,%xmm0
3823 196,193,121,196,68,122,10,5, //vpinsrw $0x5,0xa(%r10,%rdi,2),%xmm0,%xmm0
3824 196,193,121,196,68,122,8,4, //vpinsrw $0x4,0x8(%r10,%rdi,2),%xmm0,%xmm0
3825 196,193,121,196,68,122,6,3, //vpinsrw $0x3,0x6(%r10,%rdi,2),%xmm0,%xmm0
3826 196,193,121,196,68,122,4,2, //vpinsrw $0x2,0x4(%r10,%rdi,2),%xmm0,%xmm0
3827 196,193,121,196,68,122,2,1, //vpinsrw $0x1,0x2(%r10,%rdi,2),%xmm0,%xmm0
3828 196,193,121,196,4,122,0, //vpinsrw $0x0,(%r10,%rdi,2),%xmm0,%xmm0
3829 233,52,255,255,255, //jmpq ac8 <_sk_load_565_avx+0x10>
  // Jump table, not code: seven little-endian int32 offsets relative to the
  // table's own address (-12, -20, -28, -36, -44, -52, -64), read by the
  // movslq above. The mnemonics the disassembler printed for these bytes are
  // meaningless.
3830 244, //hlt
3831 255, //(bad)
3832 255, //(bad)
3833 255, //(bad)
3834 236, //in (%dx),%al
3835 255, //(bad)
3836 255, //(bad)
3837 255,228, //jmpq *%rsp
3838 255, //(bad)
3839 255, //(bad)
3840 255, //(bad)
3841 220,255, //fdivr %st,%st(7)
3842 255, //(bad)
3843 255,212, //callq *%rsp
3844 255, //(bad)
3845 255, //(bad)
3846 255,204, //dec %esp
3847 255, //(bad)
3848 255, //(bad)
3849 255,192, //inc %eax
3850 255, //(bad)
3851 255, //(bad)
3852 255, //.byte 0xff
3853};
3854
// sk_store_565_avx: packs ymm0-ymm2 into eight 16-bit values and stores them.
// ymm0 and ymm2 are multiplied by the float at 0x80(%rdx), ymm1 by the float
// at 0x84(%rdx) (presumably 31 and 63 -- TODO confirm), and each product is
// converted to a 32-bit integer. The results are combined as
//   (from ymm0 << 11) | (from ymm1 << 5) | (from ymm2)
// then narrowed to 16 bits with unsigned saturation. The destination is
// (pointer read through the lods'd value) + 2*%rdi. ymm3 is not read, and
// ymm0-ymm7 are not modified.
3855CODE const uint8_t sk_store_565_avx[] = {
3856 72,173, //lods %ds:(%rsi),%rax
3857 76,139,8, //mov (%rax),%r9
3858 196,98,125,24,130,128,0,0,0, //vbroadcastss 0x80(%rdx),%ymm8
3859 197,60,89,200, //vmulps %ymm0,%ymm8,%ymm9
3860 196,65,125,91,201, //vcvtps2dq %ymm9,%ymm9
3861 196,193,41,114,241,11, //vpslld $0xb,%xmm9,%xmm10
3862 196,67,125,25,201,1, //vextractf128 $0x1,%ymm9,%xmm9
3863 196,193,49,114,241,11, //vpslld $0xb,%xmm9,%xmm9
3864 196,67,45,24,201,1, //vinsertf128 $0x1,%xmm9,%ymm10,%ymm9
3865 196,98,125,24,146,132,0,0,0, //vbroadcastss 0x84(%rdx),%ymm10
3866 197,44,89,209, //vmulps %ymm1,%ymm10,%ymm10
3867 196,65,125,91,210, //vcvtps2dq %ymm10,%ymm10
3868 196,193,33,114,242,5, //vpslld $0x5,%xmm10,%xmm11
3869 196,67,125,25,210,1, //vextractf128 $0x1,%ymm10,%xmm10
3870 196,193,41,114,242,5, //vpslld $0x5,%xmm10,%xmm10
3871 196,67,37,24,210,1, //vinsertf128 $0x1,%xmm10,%ymm11,%ymm10
3872 196,65,45,86,201, //vorpd %ymm9,%ymm10,%ymm9
3873 197,60,89,194, //vmulps %ymm2,%ymm8,%ymm8
3874 196,65,125,91,192, //vcvtps2dq %ymm8,%ymm8
3875 196,65,53,86,192, //vorpd %ymm8,%ymm9,%ymm8
3876 196,67,125,25,193,1, //vextractf128 $0x1,%ymm8,%xmm9
3877 196,66,57,43,193, //vpackusdw %xmm9,%xmm8,%xmm8
3878 72,133,201, //test %rcx,%rcx
3879 117,10, //jne c36 <_sk_store_565_avx+0x86>
  // Fast path (%rcx == 0): store all eight 16-bit values at once.
3880 196,65,122,127,4,121, //vmovdqu %xmm8,(%r9,%rdi,2)
3881 72,173, //lods %ds:(%rsi),%rax
3882 255,224, //jmpq *%rax
  // Tail path (%rcx != 0): index = (%cl & 7) - 1. An index above 6 stores
  // nothing and goes straight to the exit. Otherwise the jump table selects an
  // entry point into the chain so that exactly index+1 values are written.
3883 137,200, //mov %ecx,%eax
3884 36,7, //and $0x7,%al
3885 254,200, //dec %al
3886 68,15,182,192, //movzbl %al,%r8d
3887 65,128,248,6, //cmp $0x6,%r8b
3888 119,236, //ja c32 <_sk_store_565_avx+0x82>
3889 76,141,21,71,0,0,0, //lea 0x47(%rip),%r10 # c94 <_sk_store_565_avx+0xe4>
3890 75,99,4,130, //movslq (%r10,%r8,4),%rax
3891 76,1,208, //add %r10,%rax
3892 255,224, //jmpq *%rax
3893 196,67,121,21,68,121,12,6, //vpextrw $0x6,%xmm8,0xc(%r9,%rdi,2)
3894 196,67,121,21,68,121,10,5, //vpextrw $0x5,%xmm8,0xa(%r9,%rdi,2)
3895 196,67,121,21,68,121,8,4, //vpextrw $0x4,%xmm8,0x8(%r9,%rdi,2)
3896 196,67,121,21,68,121,6,3, //vpextrw $0x3,%xmm8,0x6(%r9,%rdi,2)
3897 196,67,121,21,68,121,4,2, //vpextrw $0x2,%xmm8,0x4(%r9,%rdi,2)
3898 196,67,121,21,68,121,2,1, //vpextrw $0x1,%xmm8,0x2(%r9,%rdi,2)
3899 197,121,126,192, //vmovd %xmm8,%eax
3900 102,65,137,4,121, //mov %ax,(%r9,%rdi,2)
3901 235,161, //jmp c32 <_sk_store_565_avx+0x82>
  // Three bytes of padding so the table that follows starts at offset 0xe4.
3902 15,31,0, //nopl (%rax)
  // Jump table, not code: seven little-endian int32 offsets relative to the
  // table's own address (-14, -22, -30, -38, -46, -54, -62), read by the
  // movslq above. The mnemonics the disassembler printed for these bytes are
  // meaningless.
3903 242,255, //repnz (bad)
3904 255, //(bad)
3905 255, //(bad)
3906 234, //(bad)
3907 255, //(bad)
3908 255, //(bad)
3909 255,226, //jmpq *%rdx
3910 255, //(bad)
3911 255, //(bad)
3912 255, //(bad)
3913 218,255, //(bad)
3914 255, //(bad)
3915 255,210, //callq *%rdx
3916 255, //(bad)
3917 255, //(bad)
3918 255,202, //dec %edx
3919 255, //(bad)
3920 255, //(bad)
3921 255,194, //inc %edx
3922 255, //(bad)
3923 255, //(bad)
3924 255, //.byte 0xff
3925};
3926
// sk_load_8888_avx: loads eight 32-bit values and unpacks their four bytes
// into ymm0-ymm3 as floats.
// The values are read from (pointer read through the lods'd value) + 4*%rdi.
//   ymm0 = float( v        & mask) * scale
//   ymm1 = float((v >> 8)  & mask) * scale
//   ymm2 = float((v >> 16) & mask) * scale
//   ymm3 = float( v >> 24)         * scale
// mask is the dword at 0x10(%rdx) and scale the float at 0xc(%rdx)
// (presumably 0xff and 1/255 -- TODO confirm against the constants layout).
3927CODE const uint8_t sk_load_8888_avx[] = {
3928 72,173, //lods %ds:(%rsi),%rax
3929 76,139,16, //mov (%rax),%r10
3930 72,133,201, //test %rcx,%rcx
3931 117,125, //jne d37 <_sk_load_8888_avx+0x87>
3932 196,65,124,16,12,186, //vmovups (%r10,%rdi,4),%ymm9
3933 196,98,125,24,90,16, //vbroadcastss 0x10(%rdx),%ymm11
3934 196,193,36,84,193, //vandps %ymm9,%ymm11,%ymm0
3935 197,252,91,192, //vcvtdq2ps %ymm0,%ymm0
3936 196,98,125,24,66,12, //vbroadcastss 0xc(%rdx),%ymm8
3937 197,188,89,192, //vmulps %ymm0,%ymm8,%ymm0
  // AVX has no 256-bit integer shifts, so each shift is done per 128-bit half
  // and the halves are recombined with vinsertf128.
3938 196,193,41,114,209,8, //vpsrld $0x8,%xmm9,%xmm10
3939 196,99,125,25,203,1, //vextractf128 $0x1,%ymm9,%xmm3
3940 197,241,114,211,8, //vpsrld $0x8,%xmm3,%xmm1
3941 196,227,45,24,201,1, //vinsertf128 $0x1,%xmm1,%ymm10,%ymm1
3942 197,164,84,201, //vandps %ymm1,%ymm11,%ymm1
3943 197,252,91,201, //vcvtdq2ps %ymm1,%ymm1
3944 197,188,89,201, //vmulps %ymm1,%ymm8,%ymm1
3945 196,193,41,114,209,16, //vpsrld $0x10,%xmm9,%xmm10
3946 197,233,114,211,16, //vpsrld $0x10,%xmm3,%xmm2
3947 196,227,45,24,210,1, //vinsertf128 $0x1,%xmm2,%ymm10,%ymm2
3948 197,164,84,210, //vandps %ymm2,%ymm11,%ymm2
3949 197,252,91,210, //vcvtdq2ps %ymm2,%ymm2
3950 197,188,89,210, //vmulps %ymm2,%ymm8,%ymm2
3951 196,193,49,114,209,24, //vpsrld $0x18,%xmm9,%xmm9
3952 197,225,114,211,24, //vpsrld $0x18,%xmm3,%xmm3
3953 196,227,53,24,219,1, //vinsertf128 $0x1,%xmm3,%ymm9,%ymm3
3954 197,252,91,219, //vcvtdq2ps %ymm3,%ymm3
3955 196,193,100,89,216, //vmulps %ymm8,%ymm3,%ymm3
3956 72,173, //lods %ds:(%rsi),%rax
3957 255,224, //jmpq *%rax
  // Tail path (%rcx != 0): index = (%cl & 7) - 1. An index above 6 rejoins
  // the main path with ymm9 zeroed; otherwise the jump table below selects an
  // entry point into the chain that loads exactly index+1 dwords into ymm9,
  // highest element first, before rejoining after the vmovups.
3958 65,137,200, //mov %ecx,%r8d
3959 65,128,224,7, //and $0x7,%r8b
3960 196,65,52,87,201, //vxorps %ymm9,%ymm9,%ymm9
3961 65,254,200, //dec %r8b
3962 69,15,182,192, //movzbl %r8b,%r8d
3963 65,128,248,6, //cmp $0x6,%r8b
3964 15,135,108,255,255,255, //ja cc0 <_sk_load_8888_avx+0x10>
3965 76,141,13,137,0,0,0, //lea 0x89(%rip),%r9 # de4 <_sk_load_8888_avx+0x134>
3966 75,99,4,129, //movslq (%r9,%r8,4),%rax
3967 76,1,200, //add %r9,%rax
3968 255,224, //jmpq *%rax
3969 196,193,121,110,68,186,24, //vmovd 0x18(%r10,%rdi,4),%xmm0
3970 197,249,112,192,68, //vpshufd $0x44,%xmm0,%xmm0
3971 196,227,125,24,192,1, //vinsertf128 $0x1,%xmm0,%ymm0,%ymm0
3972 197,244,87,201, //vxorps %ymm1,%ymm1,%ymm1
3973 196,99,117,12,200,64, //vblendps $0x40,%ymm0,%ymm1,%ymm9
3974 196,99,125,25,200,1, //vextractf128 $0x1,%ymm9,%xmm0
3975 196,195,121,34,68,186,20,1, //vpinsrd $0x1,0x14(%r10,%rdi,4),%xmm0,%xmm0
3976 196,99,53,24,200,1, //vinsertf128 $0x1,%xmm0,%ymm9,%ymm9
3977 196,99,125,25,200,1, //vextractf128 $0x1,%ymm9,%xmm0
3978 196,195,121,34,68,186,16,0, //vpinsrd $0x0,0x10(%r10,%rdi,4),%xmm0,%xmm0
3979 196,99,53,24,200,1, //vinsertf128 $0x1,%xmm0,%ymm9,%ymm9
3980 196,195,49,34,68,186,12,3, //vpinsrd $0x3,0xc(%r10,%rdi,4),%xmm9,%xmm0
3981 196,99,53,12,200,15, //vblendps $0xf,%ymm0,%ymm9,%ymm9
3982 196,195,49,34,68,186,8,2, //vpinsrd $0x2,0x8(%r10,%rdi,4),%xmm9,%xmm0
3983 196,99,53,12,200,15, //vblendps $0xf,%ymm0,%ymm9,%ymm9
3984 196,195,49,34,68,186,4,1, //vpinsrd $0x1,0x4(%r10,%rdi,4),%xmm9,%xmm0
3985 196,99,53,12,200,15, //vblendps $0xf,%ymm0,%ymm9,%ymm9
3986 196,195,49,34,4,186,0, //vpinsrd $0x0,(%r10,%rdi,4),%xmm9,%xmm0
3987 196,99,53,12,200,15, //vblendps $0xf,%ymm0,%ymm9,%ymm9
3988 233,220,254,255,255, //jmpq cc0 <_sk_load_8888_avx+0x10>
  // Jump table, not code: seven little-endian int32 offsets relative to the
  // table's own address (-18, -32, -46, -60, -80, -100, -128), read by the
  // movslq above. The mnemonics the disassembler printed for these bytes are
  // meaningless.
3989 238, //out %al,(%dx)
3990 255, //(bad)
3991 255, //(bad)
3992 255,224, //jmpq *%rax
3993 255, //(bad)
3994 255, //(bad)
3995 255,210, //callq *%rdx
3996 255, //(bad)
3997 255, //(bad)
3998 255,196, //inc %esp
3999 255, //(bad)
4000 255, //(bad)
4001 255,176,255,255,255,156, //pushq -0x63000001(%rax)
4002 255, //(bad)
4003 255, //(bad)
4004 255, //.byte 0xff
4005 128,255,255, //cmp $0xff,%bh
4006 255, //.byte 0xff
4007};
4008
// sk_store_8888_avx: packs ymm0-ymm3 into eight 32-bit values and stores them.
// Each of ymm0-ymm3 is multiplied by the float at 0x8(%rdx) (presumably 255 --
// TODO confirm) and converted to 32-bit integers. The results are combined as
//   (from ymm0) | (from ymm1 << 8) | (from ymm2 << 16) | (from ymm3 << 24)
// and written to (pointer read through the lods'd value) + 4*%rdi. No
// clamping is applied here before the shifts. ymm0-ymm7 are not modified.
4009CODE const uint8_t sk_store_8888_avx[] = {
4010 72,173, //lods %ds:(%rsi),%rax
4011 76,139,8, //mov (%rax),%r9
4012 196,98,125,24,66,8, //vbroadcastss 0x8(%rdx),%ymm8
4013 197,60,89,200, //vmulps %ymm0,%ymm8,%ymm9
4014 196,65,125,91,201, //vcvtps2dq %ymm9,%ymm9
4015 197,60,89,209, //vmulps %ymm1,%ymm8,%ymm10
4016 196,65,125,91,210, //vcvtps2dq %ymm10,%ymm10
  // AVX has no 256-bit integer shifts, so each shift is done per 128-bit half
  // and the halves are recombined with vinsertf128.
4017 196,193,33,114,242,8, //vpslld $0x8,%xmm10,%xmm11
4018 196,67,125,25,210,1, //vextractf128 $0x1,%ymm10,%xmm10
4019 196,193,41,114,242,8, //vpslld $0x8,%xmm10,%xmm10
4020 196,67,37,24,210,1, //vinsertf128 $0x1,%xmm10,%ymm11,%ymm10
4021 196,65,45,86,201, //vorpd %ymm9,%ymm10,%ymm9
4022 197,60,89,210, //vmulps %ymm2,%ymm8,%ymm10
4023 196,65,125,91,210, //vcvtps2dq %ymm10,%ymm10
4024 196,193,33,114,242,16, //vpslld $0x10,%xmm10,%xmm11
4025 196,67,125,25,210,1, //vextractf128 $0x1,%ymm10,%xmm10
4026 196,193,41,114,242,16, //vpslld $0x10,%xmm10,%xmm10
4027 196,67,37,24,210,1, //vinsertf128 $0x1,%xmm10,%ymm11,%ymm10
4028 197,60,89,195, //vmulps %ymm3,%ymm8,%ymm8
4029 196,65,125,91,192, //vcvtps2dq %ymm8,%ymm8
4030 196,193,33,114,240,24, //vpslld $0x18,%xmm8,%xmm11
4031 196,67,125,25,192,1, //vextractf128 $0x1,%ymm8,%xmm8
4032 196,193,57,114,240,24, //vpslld $0x18,%xmm8,%xmm8
4033 196,67,37,24,192,1, //vinsertf128 $0x1,%xmm8,%ymm11,%ymm8
4034 196,65,45,86,192, //vorpd %ymm8,%ymm10,%ymm8
4035 196,65,53,86,192, //vorpd %ymm8,%ymm9,%ymm8
4036 72,133,201, //test %rcx,%rcx
4037 117,10, //jne e95 <_sk_store_8888_avx+0x95>
  // Fast path (%rcx == 0): store all eight dwords at once.
4038 196,65,124,17,4,185, //vmovups %ymm8,(%r9,%rdi,4)
4039 72,173, //lods %ds:(%rsi),%rax
4040 255,224, //jmpq *%rax
  // Tail path (%rcx != 0): index = (%cl & 7) - 1. An index above 6 stores
  // nothing and goes straight to the exit. Otherwise the jump table selects an
  // entry point into the chain so that exactly index+1 dwords are written.
4041 137,200, //mov %ecx,%eax
4042 36,7, //and $0x7,%al
4043 254,200, //dec %al
4044 68,15,182,192, //movzbl %al,%r8d
4045 65,128,248,6, //cmp $0x6,%r8b
4046 119,236, //ja e91 <_sk_store_8888_avx+0x91>
4047 76,141,21,84,0,0,0, //lea 0x54(%rip),%r10 # f00 <_sk_store_8888_avx+0x100>
4048 75,99,4,130, //movslq (%r10,%r8,4),%rax
4049 76,1,208, //add %r10,%rax
4050 255,224, //jmpq *%rax
4051 196,67,125,25,193,1, //vextractf128 $0x1,%ymm8,%xmm9
4052 196,67,121,22,76,185,24,2, //vpextrd $0x2,%xmm9,0x18(%r9,%rdi,4)
4053 196,67,125,25,193,1, //vextractf128 $0x1,%ymm8,%xmm9
4054 196,67,121,22,76,185,20,1, //vpextrd $0x1,%xmm9,0x14(%r9,%rdi,4)
4055 196,67,125,25,193,1, //vextractf128 $0x1,%ymm8,%xmm9
4056 196,65,121,126,76,185,16, //vmovd %xmm9,0x10(%r9,%rdi,4)
4057 196,67,121,22,68,185,12,3, //vpextrd $0x3,%xmm8,0xc(%r9,%rdi,4)
4058 196,67,121,22,68,185,8,2, //vpextrd $0x2,%xmm8,0x8(%r9,%rdi,4)
4059 196,67,121,22,68,185,4,1, //vpextrd $0x1,%xmm8,0x4(%r9,%rdi,4)
4060 196,65,121,126,4,185, //vmovd %xmm8,(%r9,%rdi,4)
4061 235,147, //jmp e91 <_sk_store_8888_avx+0x91>
  // Two bytes of padding so the table that follows starts at offset 0x100.
4062 102,144, //xchg %ax,%ax
  // Jump table, not code: seven little-endian int32 offsets relative to the
  // table's own address (-10, -18, -26, -34, -47, -61, -75), read by the
  // movslq above. The mnemonics the disassembler printed for these bytes are
  // meaningless.
4063 246,255, //idiv %bh
4064 255, //(bad)
4065 255, //(bad)
4066 238, //out %al,(%dx)
4067 255, //(bad)
4068 255, //(bad)
4069 255,230, //jmpq *%rsi
4070 255, //(bad)
4071 255, //(bad)
4072 255, //(bad)
4073 222,255, //fdivrp %st,%st(7)
4074 255, //(bad)
4075 255,209, //callq *%rcx
4076 255, //(bad)
4077 255, //(bad)
4078 255,195, //inc %ebx
4079 255, //(bad)
4080 255, //(bad)
4081 255, //.byte 0xff
4082 181,255, //mov $0xff,%ch
4083 255, //(bad)
4084 255, //.byte 0xff
4085};
4086
// sk_load_f16_avx: raw x86-64 AVX machine code; each line's comment is its
// AT&T-syntax disassembly.
//
// Flow, as shown by the disassembly:
//   * lods pops a qword from the stream at (%rsi) into %rax, which is then
//     dereferenced to obtain the source base pointer; data is read from
//     %rax + %rdi*8.
//   * If %rcx == 0, four 16-byte vectors (64 bytes) are loaded. Otherwise the
//     tail path at +0xfe loads 8 bytes at a time, guided by compares of %rcx
//     against 1, 3, 5 and 7, zeroes the registers it does not fill, and
//     rejoins the main path at +0x25.
//   * The 16-bit words are interleaved with vpunpck{l,h}wd, then every 16-bit
//     lane for which the broadcast dword at 0x64(%rdx) compares greater
//     (signed, vpcmpgtw) is cleared via vpandn.
//   * Lanes are widened to 32 bits, shifted left by 13, and multiplied by the
//     float broadcast from 0x5c(%rdx). Results are left in %ymm0..%ymm3.
//   * Ends by popping the next qword from (%rsi) and jumping to it.
// NOTE(review): by its name this presumably converts half-precision pixels to
// float; the meaning of the constants at 0x5c/0x64(%rdx) is not visible here.
4087CODE const uint8_t sk_load_f16_avx[] = {
4088 72,173, //lods %ds:(%rsi),%rax
4089 72,139,0, //mov (%rax),%rax
4090 72,133,201, //test %rcx,%rcx
4091 15,133,240,0,0,0, //jne 101a <_sk_load_f16_avx+0xfe>
4092 197,249,16,12,248, //vmovupd (%rax,%rdi,8),%xmm1
4093 197,249,16,84,248,16, //vmovupd 0x10(%rax,%rdi,8),%xmm2
4094 197,249,16,92,248,32, //vmovupd 0x20(%rax,%rdi,8),%xmm3
4095 197,121,16,68,248,48, //vmovupd 0x30(%rax,%rdi,8),%xmm8
4096 197,241,97,194, //vpunpcklwd %xmm2,%xmm1,%xmm0
4097 197,241,105,202, //vpunpckhwd %xmm2,%xmm1,%xmm1
4098 196,193,97,97,208, //vpunpcklwd %xmm8,%xmm3,%xmm2
4099 196,193,97,105,216, //vpunpckhwd %xmm8,%xmm3,%xmm3
4100 197,121,97,193, //vpunpcklwd %xmm1,%xmm0,%xmm8
4101 197,249,105,193, //vpunpckhwd %xmm1,%xmm0,%xmm0
4102 197,233,97,203, //vpunpcklwd %xmm3,%xmm2,%xmm1
4103 197,105,105,203, //vpunpckhwd %xmm3,%xmm2,%xmm9
4104 197,249,110,90,100, //vmovd 0x64(%rdx),%xmm3
4105 197,249,112,219,0, //vpshufd $0x0,%xmm3,%xmm3
4106 196,193,97,101,208, //vpcmpgtw %xmm8,%xmm3,%xmm2
4107 196,65,105,223,192, //vpandn %xmm8,%xmm2,%xmm8
4108 197,225,101,208, //vpcmpgtw %xmm0,%xmm3,%xmm2
4109 197,233,223,192, //vpandn %xmm0,%xmm2,%xmm0
4110 197,225,101,209, //vpcmpgtw %xmm1,%xmm3,%xmm2
4111 197,233,223,201, //vpandn %xmm1,%xmm2,%xmm1
4112 196,193,97,101,209, //vpcmpgtw %xmm9,%xmm3,%xmm2
4113 196,193,105,223,209, //vpandn %xmm9,%xmm2,%xmm2
4114 196,66,121,51,208, //vpmovzxwd %xmm8,%xmm10
4115 196,98,121,51,201, //vpmovzxwd %xmm1,%xmm9
4116 197,225,239,219, //vpxor %xmm3,%xmm3,%xmm3
4117 197,57,105,195, //vpunpckhwd %xmm3,%xmm8,%xmm8
4118 197,241,105,203, //vpunpckhwd %xmm3,%xmm1,%xmm1
4119 196,98,121,51,216, //vpmovzxwd %xmm0,%xmm11
4120 196,98,121,51,226, //vpmovzxwd %xmm2,%xmm12
4121 197,121,105,235, //vpunpckhwd %xmm3,%xmm0,%xmm13
4122 197,105,105,243, //vpunpckhwd %xmm3,%xmm2,%xmm14
4123 196,193,121,114,242,13, //vpslld $0xd,%xmm10,%xmm0
4124 196,193,105,114,241,13, //vpslld $0xd,%xmm9,%xmm2
4125 196,227,125,24,194,1, //vinsertf128 $0x1,%xmm2,%ymm0,%ymm0
4126 196,98,125,24,74,92, //vbroadcastss 0x5c(%rdx),%ymm9
4127 197,180,89,192, //vmulps %ymm0,%ymm9,%ymm0
4128 196,193,105,114,240,13, //vpslld $0xd,%xmm8,%xmm2
4129 197,241,114,241,13, //vpslld $0xd,%xmm1,%xmm1
4130 196,227,109,24,201,1, //vinsertf128 $0x1,%xmm1,%ymm2,%ymm1
4131 197,180,89,201, //vmulps %ymm1,%ymm9,%ymm1
4132 196,193,105,114,243,13, //vpslld $0xd,%xmm11,%xmm2
4133 196,193,97,114,244,13, //vpslld $0xd,%xmm12,%xmm3
4134 196,227,109,24,211,1, //vinsertf128 $0x1,%xmm3,%ymm2,%ymm2
4135 197,180,89,210, //vmulps %ymm2,%ymm9,%ymm2
4136 196,193,57,114,245,13, //vpslld $0xd,%xmm13,%xmm8
4137 196,193,97,114,246,13, //vpslld $0xd,%xmm14,%xmm3
4138 196,227,61,24,219,1, //vinsertf128 $0x1,%xmm3,%ymm8,%ymm3
4139 197,180,89,219, //vmulps %ymm3,%ymm9,%ymm3
4140 72,173, //lods %ds:(%rsi),%rax
4141 255,224, //jmpq *%rax
// Tail path (+0xfe): entered when %rcx != 0. Loads 8 bytes per step and zeroes
// whichever of %xmm2/%xmm3/%xmm8 stay unfilled before jumping back to +0x25.
4142 197,251,16,12,248, //vmovsd (%rax,%rdi,8),%xmm1
4143 196,65,57,87,192, //vxorpd %xmm8,%xmm8,%xmm8
4144 72,131,249,1, //cmp $0x1,%rcx
4145 117,6, //jne 1030 <_sk_load_f16_avx+0x114>
4146 197,250,126,201, //vmovq %xmm1,%xmm1
4147 235,30, //jmp 104e <_sk_load_f16_avx+0x132>
4148 197,241,22,76,248,8, //vmovhpd 0x8(%rax,%rdi,8),%xmm1,%xmm1
4149 72,131,249,3, //cmp $0x3,%rcx
4150 114,18, //jb 104e <_sk_load_f16_avx+0x132>
4151 197,251,16,84,248,16, //vmovsd 0x10(%rax,%rdi,8),%xmm2
4152 72,131,249,3, //cmp $0x3,%rcx
4153 117,19, //jne 105b <_sk_load_f16_avx+0x13f>
4154 197,250,126,210, //vmovq %xmm2,%xmm2
4155 235,46, //jmp 107c <_sk_load_f16_avx+0x160>
4156 197,225,87,219, //vxorpd %xmm3,%xmm3,%xmm3
4157 197,233,87,210, //vxorpd %xmm2,%xmm2,%xmm2
4158 233,230,254,255,255, //jmpq f41 <_sk_load_f16_avx+0x25>
4159 197,233,22,84,248,24, //vmovhpd 0x18(%rax,%rdi,8),%xmm2,%xmm2
4160 72,131,249,5, //cmp $0x5,%rcx
4161 114,21, //jb 107c <_sk_load_f16_avx+0x160>
4162 197,251,16,92,248,32, //vmovsd 0x20(%rax,%rdi,8),%xmm3
4163 72,131,249,5, //cmp $0x5,%rcx
4164 117,18, //jne 1085 <_sk_load_f16_avx+0x169>
4165 197,250,126,219, //vmovq %xmm3,%xmm3
4166 233,197,254,255,255, //jmpq f41 <_sk_load_f16_avx+0x25>
4167 197,225,87,219, //vxorpd %xmm3,%xmm3,%xmm3
4168 233,188,254,255,255, //jmpq f41 <_sk_load_f16_avx+0x25>
4169 197,225,22,92,248,40, //vmovhpd 0x28(%rax,%rdi,8),%xmm3,%xmm3
4170 72,131,249,7, //cmp $0x7,%rcx
4171 15,130,172,254,255,255, //jb f41 <_sk_load_f16_avx+0x25>
4172 197,123,16,68,248,48, //vmovsd 0x30(%rax,%rdi,8),%xmm8
4173 233,161,254,255,255, //jmpq f41 <_sk_load_f16_avx+0x25>
4174};
4175
// sk_store_f16_avx: raw x86-64 AVX machine code; each line's comment is its
// AT&T-syntax disassembly.
//
// Flow, as shown by the disassembly:
//   * lods pops a qword from (%rsi) into %rax, which is dereferenced to get
//     the destination base pointer; data is written to %rax + %rdi*8.
//   * %ymm0..%ymm3 are each multiplied by the float broadcast from
//     0x60(%rdx); every 32-bit lane is then shifted right by 13 (per 128-bit
//     half).
//   * Pairs of registers are merged by byte-shifting one left by 2
//     (vpslldq $2) and OR-ing it with the other, then dwords are interleaved
//     with vpunpck{l,h}dq.
//   * If %rcx == 0, 64 bytes are stored and control jumps to the next qword
//     popped from (%rsi). Otherwise the tail path at +0xc3 stores 8 bytes at a
//     time and branches back to the shared exit at +0xbf.
//   * In the tail, each `je` that follows a vmovq store reuses the flags of
//     the preceding `cmp`; no instruction in between rewrites them.
// NOTE(review): by its name this presumably converts float pixels to
// half-precision; the meaning of the constant at 0x60(%rdx) is not visible here.
4176CODE const uint8_t sk_store_f16_avx[] = {
4177 72,173, //lods %ds:(%rsi),%rax
4178 72,139,0, //mov (%rax),%rax
4179 196,98,125,24,66,96, //vbroadcastss 0x60(%rdx),%ymm8
4180 197,60,89,200, //vmulps %ymm0,%ymm8,%ymm9
4181 196,67,125,25,202,1, //vextractf128 $0x1,%ymm9,%xmm10
4182 196,193,41,114,210,13, //vpsrld $0xd,%xmm10,%xmm10
4183 196,193,49,114,209,13, //vpsrld $0xd,%xmm9,%xmm9
4184 197,60,89,217, //vmulps %ymm1,%ymm8,%ymm11
4185 196,67,125,25,220,1, //vextractf128 $0x1,%ymm11,%xmm12
4186 196,193,25,114,212,13, //vpsrld $0xd,%xmm12,%xmm12
4187 196,193,33,114,211,13, //vpsrld $0xd,%xmm11,%xmm11
4188 197,60,89,234, //vmulps %ymm2,%ymm8,%ymm13
4189 196,67,125,25,238,1, //vextractf128 $0x1,%ymm13,%xmm14
4190 196,193,9,114,214,13, //vpsrld $0xd,%xmm14,%xmm14
4191 196,193,17,114,213,13, //vpsrld $0xd,%xmm13,%xmm13
4192 197,60,89,195, //vmulps %ymm3,%ymm8,%ymm8
4193 196,67,125,25,199,1, //vextractf128 $0x1,%ymm8,%xmm15
4194 196,193,1,114,215,13, //vpsrld $0xd,%xmm15,%xmm15
4195 196,193,57,114,208,13, //vpsrld $0xd,%xmm8,%xmm8
4196 196,193,33,115,251,2, //vpslldq $0x2,%xmm11,%xmm11
4197 196,65,33,235,201, //vpor %xmm9,%xmm11,%xmm9
4198 196,193,33,115,252,2, //vpslldq $0x2,%xmm12,%xmm11
4199 196,65,33,235,226, //vpor %xmm10,%xmm11,%xmm12
4200 196,193,57,115,248,2, //vpslldq $0x2,%xmm8,%xmm8
4201 196,65,57,235,197, //vpor %xmm13,%xmm8,%xmm8
4202 196,193,41,115,255,2, //vpslldq $0x2,%xmm15,%xmm10
4203 196,65,41,235,238, //vpor %xmm14,%xmm10,%xmm13
4204 196,65,49,98,216, //vpunpckldq %xmm8,%xmm9,%xmm11
4205 196,65,49,106,208, //vpunpckhdq %xmm8,%xmm9,%xmm10
4206 196,65,25,98,205, //vpunpckldq %xmm13,%xmm12,%xmm9
4207 196,65,25,106,197, //vpunpckhdq %xmm13,%xmm12,%xmm8
4208 72,133,201, //test %rcx,%rcx
4209 117,27, //jne 1163 <_sk_store_f16_avx+0xc3>
4210 197,120,17,28,248, //vmovups %xmm11,(%rax,%rdi,8)
4211 197,120,17,84,248,16, //vmovups %xmm10,0x10(%rax,%rdi,8)
4212 197,120,17,76,248,32, //vmovups %xmm9,0x20(%rax,%rdi,8)
4213 197,122,127,68,248,48, //vmovdqu %xmm8,0x30(%rax,%rdi,8)
4214 72,173, //lods %ds:(%rsi),%rax
4215 255,224, //jmpq *%rax
// Tail path (+0xc3): entered when %rcx != 0; stores 8 bytes per step.
4216 197,121,214,28,248, //vmovq %xmm11,(%rax,%rdi,8)
4217 72,131,249,1, //cmp $0x1,%rcx
4218 116,241, //je 115f <_sk_store_f16_avx+0xbf>
4219 197,121,23,92,248,8, //vmovhpd %xmm11,0x8(%rax,%rdi,8)
4220 72,131,249,3, //cmp $0x3,%rcx
4221 114,229, //jb 115f <_sk_store_f16_avx+0xbf>
4222 197,121,214,84,248,16, //vmovq %xmm10,0x10(%rax,%rdi,8)
4223 116,221, //je 115f <_sk_store_f16_avx+0xbf>
4224 197,121,23,84,248,24, //vmovhpd %xmm10,0x18(%rax,%rdi,8)
4225 72,131,249,5, //cmp $0x5,%rcx
4226 114,209, //jb 115f <_sk_store_f16_avx+0xbf>
4227 197,121,214,76,248,32, //vmovq %xmm9,0x20(%rax,%rdi,8)
4228 116,201, //je 115f <_sk_store_f16_avx+0xbf>
4229 197,121,23,76,248,40, //vmovhpd %xmm9,0x28(%rax,%rdi,8)
4230 72,131,249,7, //cmp $0x7,%rcx
4231 114,189, //jb 115f <_sk_store_f16_avx+0xbf>
4232 197,121,214,68,248,48, //vmovq %xmm8,0x30(%rax,%rdi,8)
4233 235,181, //jmp 115f <_sk_store_f16_avx+0xbf>
4234};
4235
// sk_store_f32_avx: raw x86-64 AVX machine code; each line's comment is its
// AT&T-syntax disassembly.
//
// Flow, as shown by the disassembly:
//   * lods pops a qword from (%rsi) into %rax; the pointer stored there is
//     loaded into %r8. %rax is then set to %rdi*4, so every store address
//     (%r8,%rax,4) equals %r8 + %rdi*16.
//   * %ymm0..%ymm3 are interleaved with vunpck{l,h}ps followed by
//     vunpck{l,h}pd.
//   * If %rcx == 0, the 128-bit halves are recombined with vinsertf128 /
//     vperm2f128 and 128 bytes are stored; control then jumps to the next
//     qword popped from (%rsi).
//   * Otherwise the tail path at +0x6d stores 16 bytes at a time, guided by
//     compares of %rcx against 1, 3, 5 and 7, and exits through +0x69.
//     Each `je` after a store reuses the flags of the preceding `cmp`.
4236CODE const uint8_t sk_store_f32_avx[] = {
4237 72,173, //lods %ds:(%rsi),%rax
4238 76,139,0, //mov (%rax),%r8
4239 72,141,4,189,0,0,0,0, //lea 0x0(,%rdi,4),%rax
4240 197,124,20,193, //vunpcklps %ymm1,%ymm0,%ymm8
4241 197,124,21,217, //vunpckhps %ymm1,%ymm0,%ymm11
4242 197,108,20,203, //vunpcklps %ymm3,%ymm2,%ymm9
4243 197,108,21,227, //vunpckhps %ymm3,%ymm2,%ymm12
4244 196,65,61,20,209, //vunpcklpd %ymm9,%ymm8,%ymm10
4245 196,65,61,21,201, //vunpckhpd %ymm9,%ymm8,%ymm9
4246 196,65,37,20,196, //vunpcklpd %ymm12,%ymm11,%ymm8
4247 196,65,37,21,220, //vunpckhpd %ymm12,%ymm11,%ymm11
4248 72,133,201, //test %rcx,%rcx
4249 117,55, //jne 1217 <_sk_store_f32_avx+0x6d>
4250 196,67,45,24,225,1, //vinsertf128 $0x1,%xmm9,%ymm10,%ymm12
4251 196,67,61,24,235,1, //vinsertf128 $0x1,%xmm11,%ymm8,%ymm13
4252 196,67,45,6,201,49, //vperm2f128 $0x31,%ymm9,%ymm10,%ymm9
4253 196,67,61,6,195,49, //vperm2f128 $0x31,%ymm11,%ymm8,%ymm8
4254 196,65,125,17,36,128, //vmovupd %ymm12,(%r8,%rax,4)
4255 196,65,125,17,108,128,32, //vmovupd %ymm13,0x20(%r8,%rax,4)
4256 196,65,125,17,76,128,64, //vmovupd %ymm9,0x40(%r8,%rax,4)
4257 196,65,125,17,68,128,96, //vmovupd %ymm8,0x60(%r8,%rax,4)
4258 72,173, //lods %ds:(%rsi),%rax
4259 255,224, //jmpq *%rax
// Tail path (+0x6d): entered when %rcx != 0; stores 16 bytes per step.
4260 196,65,121,17,20,128, //vmovupd %xmm10,(%r8,%rax,4)
4261 72,131,249,1, //cmp $0x1,%rcx
4262 116,240, //je 1213 <_sk_store_f32_avx+0x69>
4263 196,65,121,17,76,128,16, //vmovupd %xmm9,0x10(%r8,%rax,4)
4264 72,131,249,3, //cmp $0x3,%rcx
4265 114,227, //jb 1213 <_sk_store_f32_avx+0x69>
4266 196,65,121,17,68,128,32, //vmovupd %xmm8,0x20(%r8,%rax,4)
4267 116,218, //je 1213 <_sk_store_f32_avx+0x69>
4268 196,65,121,17,92,128,48, //vmovupd %xmm11,0x30(%r8,%rax,4)
4269 72,131,249,5, //cmp $0x5,%rcx
4270 114,205, //jb 1213 <_sk_store_f32_avx+0x69>
4271 196,67,125,25,84,128,64,1, //vextractf128 $0x1,%ymm10,0x40(%r8,%rax,4)
4272 116,195, //je 1213 <_sk_store_f32_avx+0x69>
4273 196,67,125,25,76,128,80,1, //vextractf128 $0x1,%ymm9,0x50(%r8,%rax,4)
4274 72,131,249,7, //cmp $0x7,%rcx
4275 114,181, //jb 1213 <_sk_store_f32_avx+0x69>
4276 196,67,125,25,68,128,96,1, //vextractf128 $0x1,%ymm8,0x60(%r8,%rax,4)
4277 235,171, //jmp 1213 <_sk_store_f32_avx+0x69>
4278};
4279
// sk_clamp_x_avx: raw x86-64 AVX machine code with AT&T disassembly comments.
//
// Computes %ymm0 = min(max(0, %ymm0), limit), where `limit` is the float
// broadcast from the address popped off (%rsi), with its 32-bit pattern
// decremented by one as an integer (vpcmpeqd builds all-ones, i.e. -1, which
// vpaddd adds). The integer add is done on each 128-bit half separately and
// the halves are rejoined with vinsertf128.
// Ends by popping the next qword from (%rsi) and jumping to it.
// NOTE(review): the integer decrement presumably yields the largest float
// below the limit for positive finite values — confirm against the generator.
4280CODE const uint8_t sk_clamp_x_avx[] = {
4281 72,173, //lods %ds:(%rsi),%rax
4282 196,65,60,87,192, //vxorps %ymm8,%ymm8,%ymm8
4283 197,60,95,200, //vmaxps %ymm0,%ymm8,%ymm9
4284 196,98,125,24,0, //vbroadcastss (%rax),%ymm8
4285 196,99,125,25,192,1, //vextractf128 $0x1,%ymm8,%xmm0
4286 196,65,41,118,210, //vpcmpeqd %xmm10,%xmm10,%xmm10
4287 196,193,121,254,194, //vpaddd %xmm10,%xmm0,%xmm0
4288 196,65,57,254,194, //vpaddd %xmm10,%xmm8,%xmm8
4289 196,227,61,24,192,1, //vinsertf128 $0x1,%xmm0,%ymm8,%ymm0
4290 197,180,93,192, //vminps %ymm0,%ymm9,%ymm0
4291 72,173, //lods %ds:(%rsi),%rax
4292 255,224, //jmpq *%rax
4293};
4294
// sk_clamp_y_avx: raw x86-64 AVX machine code with AT&T disassembly comments.
//
// Same instruction sequence as the x variant but operating on %ymm1:
// %ymm1 = min(max(0, %ymm1), limit), where `limit` is the float broadcast
// from the address popped off (%rsi) with its 32-bit pattern decremented by
// one as an integer (all-ones from vpcmpeqd added with vpaddd, per 128-bit
// half).
// Ends by popping the next qword from (%rsi) and jumping to it.
4295CODE const uint8_t sk_clamp_y_avx[] = {
4296 72,173, //lods %ds:(%rsi),%rax
4297 196,65,60,87,192, //vxorps %ymm8,%ymm8,%ymm8
4298 197,60,95,201, //vmaxps %ymm1,%ymm8,%ymm9
4299 196,98,125,24,0, //vbroadcastss (%rax),%ymm8
4300 196,99,125,25,193,1, //vextractf128 $0x1,%ymm8,%xmm1
4301 196,65,41,118,210, //vpcmpeqd %xmm10,%xmm10,%xmm10
4302 196,193,113,254,202, //vpaddd %xmm10,%xmm1,%xmm1
4303 196,65,57,254,194, //vpaddd %xmm10,%xmm8,%xmm8
4304 196,227,61,24,201,1, //vinsertf128 $0x1,%xmm1,%ymm8,%ymm1
4305 197,180,93,201, //vminps %ymm1,%ymm9,%ymm1
4306 72,173, //lods %ds:(%rsi),%rax
4307 255,224, //jmpq *%rax
4308};
4309
// sk_repeat_x_avx: raw x86-64 AVX machine code with AT&T disassembly comments.
//
// With L = the float broadcast from the address popped off (%rsi):
//   t     = %ymm0 - round_down(%ymm0 / L) * L   (vroundps imm 1 = toward -inf)
//   %ymm0 = min(t, limit)
// where `limit` is L with its 32-bit pattern decremented by one as an integer
// (all-ones from vpcmpeqd added with vpaddd, per 128-bit half).
// Ends by popping the next qword from (%rsi) and jumping to it.
4310CODE const uint8_t sk_repeat_x_avx[] = {
4311 72,173, //lods %ds:(%rsi),%rax
4312 196,98,125,24,0, //vbroadcastss (%rax),%ymm8
4313 196,65,124,94,200, //vdivps %ymm8,%ymm0,%ymm9
4314 196,67,125,8,201,1, //vroundps $0x1,%ymm9,%ymm9
4315 196,65,52,89,200, //vmulps %ymm8,%ymm9,%ymm9
4316 196,65,124,92,201, //vsubps %ymm9,%ymm0,%ymm9
4317 196,99,125,25,192,1, //vextractf128 $0x1,%ymm8,%xmm0
4318 196,65,41,118,210, //vpcmpeqd %xmm10,%xmm10,%xmm10
4319 196,193,121,254,194, //vpaddd %xmm10,%xmm0,%xmm0
4320 196,65,57,254,194, //vpaddd %xmm10,%xmm8,%xmm8
4321 196,227,61,24,192,1, //vinsertf128 $0x1,%xmm0,%ymm8,%ymm0
4322 197,180,93,192, //vminps %ymm0,%ymm9,%ymm0
4323 72,173, //lods %ds:(%rsi),%rax
4324 255,224, //jmpq *%rax
4325};
4326
// sk_repeat_y_avx: raw x86-64 AVX machine code with AT&T disassembly comments.
//
// Same instruction sequence as the x variant but operating on %ymm1.
// With L = the float broadcast from the address popped off (%rsi):
//   t     = %ymm1 - round_down(%ymm1 / L) * L   (vroundps imm 1 = toward -inf)
//   %ymm1 = min(t, limit)
// where `limit` is L with its 32-bit pattern decremented by one as an integer.
// Ends by popping the next qword from (%rsi) and jumping to it.
4327CODE const uint8_t sk_repeat_y_avx[] = {
4328 72,173, //lods %ds:(%rsi),%rax
4329 196,98,125,24,0, //vbroadcastss (%rax),%ymm8
4330 196,65,116,94,200, //vdivps %ymm8,%ymm1,%ymm9
4331 196,67,125,8,201,1, //vroundps $0x1,%ymm9,%ymm9
4332 196,65,52,89,200, //vmulps %ymm8,%ymm9,%ymm9
4333 196,65,116,92,201, //vsubps %ymm9,%ymm1,%ymm9
4334 196,99,125,25,193,1, //vextractf128 $0x1,%ymm8,%xmm1
4335 196,65,41,118,210, //vpcmpeqd %xmm10,%xmm10,%xmm10
4336 196,193,113,254,202, //vpaddd %xmm10,%xmm1,%xmm1
4337 196,65,57,254,194, //vpaddd %xmm10,%xmm8,%xmm8
4338 196,227,61,24,201,1, //vinsertf128 $0x1,%xmm1,%ymm8,%ymm1
4339 197,180,93,201, //vminps %ymm1,%ymm9,%ymm1
4340 72,173, //lods %ds:(%rsi),%rax
4341 255,224, //jmpq *%rax
4342};
4343
// sk_mirror_x_avx: raw x86-64 AVX machine code with AT&T disassembly comments.
//
// With L = the scalar float at the address popped off (%rsi), broadcast to
// all lanes:
//   d     = %ymm0 - L
//   m     = d - round_down(d / (L + L)) * (L + L)   (vroundps imm 1 = toward -inf)
//   v     = m - L
//   a     = (0 - v) & v       (bitwise AND of v and its negation)
//   %ymm0 = min(a, limit)
// where `limit` is L with its 32-bit pattern decremented by one as an integer
// (all-ones from vpcmpeqd added with vpaddd, per 128-bit half).
// Ends by popping the next qword from (%rsi) and jumping to it.
// NOTE(review): the AND of v and -v presumably serves as an absolute value,
// since only the sign bit differs between the two — confirm.
4344CODE const uint8_t sk_mirror_x_avx[] = {
4345 72,173, //lods %ds:(%rsi),%rax
4346 197,122,16,0, //vmovss (%rax),%xmm8
4347 196,65,121,112,200,0, //vpshufd $0x0,%xmm8,%xmm9
4348 196,67,53,24,201,1, //vinsertf128 $0x1,%xmm9,%ymm9,%ymm9
4349 196,65,124,92,209, //vsubps %ymm9,%ymm0,%ymm10
4350 196,193,58,88,192, //vaddss %xmm8,%xmm8,%xmm0
4351 196,227,121,4,192,0, //vpermilps $0x0,%xmm0,%xmm0
4352 196,227,125,24,192,1, //vinsertf128 $0x1,%xmm0,%ymm0,%ymm0
4353 197,44,94,192, //vdivps %ymm0,%ymm10,%ymm8
4354 196,67,125,8,192,1, //vroundps $0x1,%ymm8,%ymm8
4355 197,188,89,192, //vmulps %ymm0,%ymm8,%ymm0
4356 197,172,92,192, //vsubps %ymm0,%ymm10,%ymm0
4357 196,193,124,92,193, //vsubps %ymm9,%ymm0,%ymm0
4358 196,65,60,87,192, //vxorps %ymm8,%ymm8,%ymm8
4359 197,60,92,192, //vsubps %ymm0,%ymm8,%ymm8
4360 197,60,84,192, //vandps %ymm0,%ymm8,%ymm8
4361 196,99,125,25,200,1, //vextractf128 $0x1,%ymm9,%xmm0
4362 196,65,41,118,210, //vpcmpeqd %xmm10,%xmm10,%xmm10
4363 196,193,121,254,194, //vpaddd %xmm10,%xmm0,%xmm0
4364 196,65,49,254,202, //vpaddd %xmm10,%xmm9,%xmm9
4365 196,227,53,24,192,1, //vinsertf128 $0x1,%xmm0,%ymm9,%ymm0
4366 197,188,93,192, //vminps %ymm0,%ymm8,%ymm0
4367 72,173, //lods %ds:(%rsi),%rax
4368 255,224, //jmpq *%rax
4369};
4370
// sk_mirror_y_avx: raw x86-64 AVX machine code with AT&T disassembly comments.
//
// Same instruction sequence as the x variant but operating on %ymm1.
// With L = the scalar float at the address popped off (%rsi), broadcast to
// all lanes:
//   d     = %ymm1 - L
//   m     = d - round_down(d / (L + L)) * (L + L)   (vroundps imm 1 = toward -inf)
//   v     = m - L
//   a     = (0 - v) & v       (bitwise AND of v and its negation)
//   %ymm1 = min(a, limit)
// where `limit` is L with its 32-bit pattern decremented by one as an integer.
// Ends by popping the next qword from (%rsi) and jumping to it.
4371CODE const uint8_t sk_mirror_y_avx[] = {
4372 72,173, //lods %ds:(%rsi),%rax
4373 197,122,16,0, //vmovss (%rax),%xmm8
4374 196,65,121,112,200,0, //vpshufd $0x0,%xmm8,%xmm9
4375 196,67,53,24,201,1, //vinsertf128 $0x1,%xmm9,%ymm9,%ymm9
4376 196,65,116,92,209, //vsubps %ymm9,%ymm1,%ymm10
4377 196,193,58,88,200, //vaddss %xmm8,%xmm8,%xmm1
4378 196,227,121,4,201,0, //vpermilps $0x0,%xmm1,%xmm1
4379 196,227,117,24,201,1, //vinsertf128 $0x1,%xmm1,%ymm1,%ymm1
4380 197,44,94,193, //vdivps %ymm1,%ymm10,%ymm8
4381 196,67,125,8,192,1, //vroundps $0x1,%ymm8,%ymm8
4382 197,188,89,201, //vmulps %ymm1,%ymm8,%ymm1
4383 197,172,92,201, //vsubps %ymm1,%ymm10,%ymm1
4384 196,193,116,92,201, //vsubps %ymm9,%ymm1,%ymm1
4385 196,65,60,87,192, //vxorps %ymm8,%ymm8,%ymm8
4386 197,60,92,193, //vsubps %ymm1,%ymm8,%ymm8
4387 197,60,84,193, //vandps %ymm1,%ymm8,%ymm8
4388 196,99,125,25,201,1, //vextractf128 $0x1,%ymm9,%xmm1
4389 196,65,41,118,210, //vpcmpeqd %xmm10,%xmm10,%xmm10
4390 196,193,113,254,202, //vpaddd %xmm10,%xmm1,%xmm1
4391 196,65,49,254,202, //vpaddd %xmm10,%xmm9,%xmm9
4392 196,227,53,24,201,1, //vinsertf128 $0x1,%xmm1,%ymm9,%ymm1
4393 197,188,93,201, //vminps %ymm1,%ymm8,%ymm1
4394 72,173, //lods %ds:(%rsi),%rax
4395 255,224, //jmpq *%rax
4396};
4397
// sk_matrix_2x3_avx: raw x86-64 AVX machine code with AT&T disassembly comments.
//
// Reads six floats m[0..5] (byte offsets 0x0..0x14) from the address popped
// off (%rsi) and, with x = %ymm0 and y = %ymm1 on entry, computes
//   %ymm0 = m[0]*x + (m[2]*y + m[4])
//   %ymm1 = m[1]*x + (m[3]*y + m[5])
// The first result is held in %ymm8 until both inputs have been consumed and
// is moved into %ymm0 just before the jump to the next qword popped from
// (%rsi).
4398CODE const uint8_t sk_matrix_2x3_avx[] = {
4399 72,173, //lods %ds:(%rsi),%rax
4400 196,98,125,24,0, //vbroadcastss (%rax),%ymm8
4401 196,98,125,24,72,8, //vbroadcastss 0x8(%rax),%ymm9
4402 196,98,125,24,80,16, //vbroadcastss 0x10(%rax),%ymm10
4403 197,52,89,201, //vmulps %ymm1,%ymm9,%ymm9
4404 196,65,52,88,202, //vaddps %ymm10,%ymm9,%ymm9
4405 197,60,89,192, //vmulps %ymm0,%ymm8,%ymm8
4406 196,65,60,88,193, //vaddps %ymm9,%ymm8,%ymm8
4407 196,98,125,24,72,4, //vbroadcastss 0x4(%rax),%ymm9
4408 196,98,125,24,80,12, //vbroadcastss 0xc(%rax),%ymm10
4409 196,98,125,24,88,20, //vbroadcastss 0x14(%rax),%ymm11
4410 197,172,89,201, //vmulps %ymm1,%ymm10,%ymm1
4411 196,193,116,88,203, //vaddps %ymm11,%ymm1,%ymm1
4412 197,180,89,192, //vmulps %ymm0,%ymm9,%ymm0
4413 197,252,88,201, //vaddps %ymm1,%ymm0,%ymm1
4414 72,173, //lods %ds:(%rsi),%rax
4415 197,124,41,192, //vmovaps %ymm8,%ymm0
4416 255,224, //jmpq *%rax
4417};
4418
// sk_matrix_3x4_avx: raw x86-64 AVX machine code with AT&T disassembly comments.
//
// Reads twelve floats m[0..11] (byte offsets 0x0..0x2c) from the address
// popped off (%rsi) and, with x = %ymm0, y = %ymm1, z = %ymm2 on entry,
// computes
//   %ymm0 = m[0]*x + (m[3]*y + (m[6]*z + m[9]))
//   %ymm1 = m[1]*x + (m[4]*y + (m[7]*z + m[10]))
//   %ymm2 = m[2]*x + (m[5]*y + (m[8]*z + m[11]))
// The first two results are held in %ymm8/%ymm9 until all inputs have been
// consumed, then moved into place before the jump to the next qword popped
// from (%rsi).
4419CODE const uint8_t sk_matrix_3x4_avx[] = {
4420 72,173, //lods %ds:(%rsi),%rax
4421 196,98,125,24,0, //vbroadcastss (%rax),%ymm8
4422 196,98,125,24,72,12, //vbroadcastss 0xc(%rax),%ymm9
4423 196,98,125,24,80,24, //vbroadcastss 0x18(%rax),%ymm10
4424 196,98,125,24,88,36, //vbroadcastss 0x24(%rax),%ymm11
4425 197,44,89,210, //vmulps %ymm2,%ymm10,%ymm10
4426 196,65,44,88,211, //vaddps %ymm11,%ymm10,%ymm10
4427 197,52,89,201, //vmulps %ymm1,%ymm9,%ymm9
4428 196,65,52,88,202, //vaddps %ymm10,%ymm9,%ymm9
4429 197,60,89,192, //vmulps %ymm0,%ymm8,%ymm8
4430 196,65,60,88,193, //vaddps %ymm9,%ymm8,%ymm8
4431 196,98,125,24,72,4, //vbroadcastss 0x4(%rax),%ymm9
4432 196,98,125,24,80,16, //vbroadcastss 0x10(%rax),%ymm10
4433 196,98,125,24,88,28, //vbroadcastss 0x1c(%rax),%ymm11
4434 196,98,125,24,96,40, //vbroadcastss 0x28(%rax),%ymm12
4435 197,36,89,218, //vmulps %ymm2,%ymm11,%ymm11
4436 196,65,36,88,220, //vaddps %ymm12,%ymm11,%ymm11
4437 197,44,89,209, //vmulps %ymm1,%ymm10,%ymm10
4438 196,65,44,88,211, //vaddps %ymm11,%ymm10,%ymm10
4439 197,52,89,200, //vmulps %ymm0,%ymm9,%ymm9
4440 196,65,52,88,202, //vaddps %ymm10,%ymm9,%ymm9
4441 196,98,125,24,80,8, //vbroadcastss 0x8(%rax),%ymm10
4442 196,98,125,24,88,20, //vbroadcastss 0x14(%rax),%ymm11
4443 196,98,125,24,96,32, //vbroadcastss 0x20(%rax),%ymm12
4444 196,98,125,24,104,44, //vbroadcastss 0x2c(%rax),%ymm13
4445 197,156,89,210, //vmulps %ymm2,%ymm12,%ymm2
4446 196,193,108,88,213, //vaddps %ymm13,%ymm2,%ymm2
4447 197,164,89,201, //vmulps %ymm1,%ymm11,%ymm1
4448 197,244,88,202, //vaddps %ymm2,%ymm1,%ymm1
4449 197,172,89,192, //vmulps %ymm0,%ymm10,%ymm0
4450 197,252,88,209, //vaddps %ymm1,%ymm0,%ymm2
4451 72,173, //lods %ds:(%rsi),%rax
4452 197,124,41,192, //vmovaps %ymm8,%ymm0
4453 197,124,41,201, //vmovaps %ymm9,%ymm1
4454 255,224, //jmpq *%rax
4455};
4456
// sk_matrix_perspective_avx: raw x86-64 AVX machine code with AT&T
// disassembly comments.
//
// Reads nine floats m[0..8] (byte offsets 0x0..0x20) from the address popped
// off (%rsi) and, with x = %ymm0 and y = %ymm1 on entry, computes
//   X = m[0]*x + (m[1]*y + m[2])
//   Y = m[3]*x + (m[4]*y + m[5])
//   W = m[6]*x + (m[7]*y + m[8])
//   %ymm0 = X * rcp(W)
//   %ymm1 = Y * rcp(W)
// rcp is vrcpps, the hardware approximate reciprocal, so the division is not
// exact. Ends by popping the next qword from (%rsi) and jumping to it.
4457CODE const uint8_t sk_matrix_perspective_avx[] = {
4458 72,173, //lods %ds:(%rsi),%rax
4459 196,98,125,24,0, //vbroadcastss (%rax),%ymm8
4460 196,98,125,24,72,4, //vbroadcastss 0x4(%rax),%ymm9
4461 196,98,125,24,80,8, //vbroadcastss 0x8(%rax),%ymm10
4462 197,52,89,201, //vmulps %ymm1,%ymm9,%ymm9
4463 196,65,52,88,202, //vaddps %ymm10,%ymm9,%ymm9
4464 197,60,89,192, //vmulps %ymm0,%ymm8,%ymm8
4465 196,65,60,88,193, //vaddps %ymm9,%ymm8,%ymm8
4466 196,98,125,24,72,12, //vbroadcastss 0xc(%rax),%ymm9
4467 196,98,125,24,80,16, //vbroadcastss 0x10(%rax),%ymm10
4468 196,98,125,24,88,20, //vbroadcastss 0x14(%rax),%ymm11
4469 197,44,89,209, //vmulps %ymm1,%ymm10,%ymm10
4470 196,65,44,88,211, //vaddps %ymm11,%ymm10,%ymm10
4471 197,52,89,200, //vmulps %ymm0,%ymm9,%ymm9
4472 196,65,52,88,202, //vaddps %ymm10,%ymm9,%ymm9
4473 196,98,125,24,80,24, //vbroadcastss 0x18(%rax),%ymm10
4474 196,98,125,24,88,28, //vbroadcastss 0x1c(%rax),%ymm11
4475 196,98,125,24,96,32, //vbroadcastss 0x20(%rax),%ymm12
4476 197,164,89,201, //vmulps %ymm1,%ymm11,%ymm1
4477 196,193,116,88,204, //vaddps %ymm12,%ymm1,%ymm1
4478 197,172,89,192, //vmulps %ymm0,%ymm10,%ymm0
4479 197,252,88,193, //vaddps %ymm1,%ymm0,%ymm0
4480 197,252,83,200, //vrcpps %ymm0,%ymm1
4481 197,188,89,193, //vmulps %ymm1,%ymm8,%ymm0
4482 197,180,89,201, //vmulps %ymm1,%ymm9,%ymm1
4483 72,173, //lods %ds:(%rsi),%rax
4484 255,224, //jmpq *%rax
4485};
4486
// sk_linear_gradient_2stops_avx: raw x86-64 AVX machine code with AT&T
// disassembly comments.
//
// Reads eight floats c[0..7] (byte offsets 0x0..0x1c) from the address popped
// off (%rsi) and, with t = %ymm0 on entry, computes
//   %ymm0 = c[0] + c[4]*t
//   %ymm1 = c[1] + c[5]*t
//   %ymm2 = c[2] + c[6]*t
//   %ymm3 = c[3] + c[7]*t
// The first result is held in %ymm8 while t is still needed and is moved into
// %ymm0 just before the jump to the next qword popped from (%rsi).
4487CODE const uint8_t sk_linear_gradient_2stops_avx[] = {
4488 72,173, //lods %ds:(%rsi),%rax
4489 196,226,125,24,72,16, //vbroadcastss 0x10(%rax),%ymm1
4490 196,226,125,24,16, //vbroadcastss (%rax),%ymm2
4491 197,244,89,200, //vmulps %ymm0,%ymm1,%ymm1
4492 197,108,88,193, //vaddps %ymm1,%ymm2,%ymm8
4493 196,226,125,24,72,20, //vbroadcastss 0x14(%rax),%ymm1
4494 196,226,125,24,80,4, //vbroadcastss 0x4(%rax),%ymm2
4495 197,244,89,200, //vmulps %ymm0,%ymm1,%ymm1
4496 197,236,88,201, //vaddps %ymm1,%ymm2,%ymm1
4497 196,226,125,24,80,24, //vbroadcastss 0x18(%rax),%ymm2
4498 196,226,125,24,88,8, //vbroadcastss 0x8(%rax),%ymm3
4499 197,236,89,208, //vmulps %ymm0,%ymm2,%ymm2
4500 197,228,88,210, //vaddps %ymm2,%ymm3,%ymm2
4501 196,226,125,24,88,28, //vbroadcastss 0x1c(%rax),%ymm3
4502 196,98,125,24,72,12, //vbroadcastss 0xc(%rax),%ymm9
4503 197,228,89,192, //vmulps %ymm0,%ymm3,%ymm0
4504 197,180,88,216, //vaddps %ymm0,%ymm9,%ymm3
4505 72,173, //lods %ds:(%rsi),%rax
4506 197,124,41,192, //vmovaps %ymm8,%ymm0
4507 255,224, //jmpq *%rax
4508};
4509
// sk_start_pipeline_sse41: raw x86-64 machine code with AT&T disassembly
// comments. This is the loop driver that calls into a chain of stage routines.
//
// Register use, as shown by the disassembly:
//   %rbx = running index (initialised from %rdi)
//   %r15 = limit         (from %rcx)
//   %r14 = value passed to every call in %rdx (from the incoming %rdx)
//   %r12 = call target: the first qword read from (%rsi)
//   %r13 = %rsi after that read, passed to every call in %rsi
// %r15, %r14, %r13, %r12 and %rbx are pushed on entry and popped before ret.
//
// Loop: while %rbx + 4 <= %r15 (unsigned compare, jbe), zero %xmm0..%xmm7,
// call *%r12 with (%rdi=%rbx, %rsi=%r13, %rdx=%r14), then advance %rbx by 4.
// On return %rax holds the first index that was not processed.
4510CODE const uint8_t sk_start_pipeline_sse41[] = {
4511 65,87, //push %r15
4512 65,86, //push %r14
4513 65,85, //push %r13
4514 65,84, //push %r12
4515 83, //push %rbx
4516 73,137,207, //mov %rcx,%r15
4517 73,137,214, //mov %rdx,%r14
4518 72,137,251, //mov %rdi,%rbx
4519 72,173, //lods %ds:(%rsi),%rax
4520 73,137,196, //mov %rax,%r12
4521 73,137,245, //mov %rsi,%r13
4522 72,141,67,4, //lea 0x4(%rbx),%rax
4523 76,57,248, //cmp %r15,%rax
4524 118,5, //jbe 28 <_sk_start_pipeline_sse41+0x28>
4525 72,137,216, //mov %rbx,%rax
4526 235,52, //jmp 5c <_sk_start_pipeline_sse41+0x5c>
4527 15,87,192, //xorps %xmm0,%xmm0
4528 15,87,201, //xorps %xmm1,%xmm1
4529 15,87,210, //xorps %xmm2,%xmm2
4530 15,87,219, //xorps %xmm3,%xmm3
4531 15,87,228, //xorps %xmm4,%xmm4
4532 15,87,237, //xorps %xmm5,%xmm5
4533 15,87,246, //xorps %xmm6,%xmm6
4534 15,87,255, //xorps %xmm7,%xmm7
4535 72,137,223, //mov %rbx,%rdi
4536 76,137,238, //mov %r13,%rsi
4537 76,137,242, //mov %r14,%rdx
4538 65,255,212, //callq *%r12
4539 72,141,67,4, //lea 0x4(%rbx),%rax
4540 72,131,195,8, //add $0x8,%rbx
4541 76,57,251, //cmp %r15,%rbx
4542 72,137,195, //mov %rax,%rbx
4543 118,204, //jbe 28 <_sk_start_pipeline_sse41+0x28>
4544 91, //pop %rbx
4545 65,92, //pop %r12
4546 65,93, //pop %r13
4547 65,94, //pop %r14
4548 65,95, //pop %r15
4549 195, //retq
4550};
4551
// sk_just_return_sse41: a single `ret` instruction.
// NOTE(review): presumably the terminating stage whose `ret` returns to the
// `callq` in the start-pipeline loop — confirm against the pipeline builder.
4552CODE const uint8_t sk_just_return_sse41[] = {
4553 195, //retq
4554};
4555
// sk_seed_shader_sse41: raw x86-64 SSE machine code with AT&T disassembly
// comments.
//
// With `ctx` = the qword popped off (%rsi) and k = the float at 0x4(%rdx):
//   %xmm0 = float(broadcast int32 %edi) + k + the four floats at 0x14(%rdx)
//   %xmm1 = float(broadcast int32 at (ctx)) + k
//   %xmm2 = broadcast of the float at (%rdx)
//   %xmm3..%xmm7 = 0
// Ends by jumping to the next qword popped from (%rsi).
// NOTE(review): the constants behind the %rdx offsets are not visible here;
// presumably 1.0, 0.5 and a per-lane iota — confirm against the constants struct.
4556CODE const uint8_t sk_seed_shader_sse41[] = {
4557 72,173, //lods %ds:(%rsi),%rax
4558 102,15,110,199, //movd %edi,%xmm0
4559 102,15,112,192,0, //pshufd $0x0,%xmm0,%xmm0
4560 15,91,200, //cvtdq2ps %xmm0,%xmm1
4561 243,15,16,18, //movss (%rdx),%xmm2
4562 243,15,16,90,4, //movss 0x4(%rdx),%xmm3
4563 15,198,219,0, //shufps $0x0,%xmm3,%xmm3
4564 15,88,203, //addps %xmm3,%xmm1
4565 15,16,66,20, //movups 0x14(%rdx),%xmm0
4566 15,88,193, //addps %xmm1,%xmm0
4567 102,15,110,8, //movd (%rax),%xmm1
4568 102,15,112,201,0, //pshufd $0x0,%xmm1,%xmm1
4569 15,91,201, //cvtdq2ps %xmm1,%xmm1
4570 15,88,203, //addps %xmm3,%xmm1
4571 15,198,210,0, //shufps $0x0,%xmm2,%xmm2
4572 72,173, //lods %ds:(%rsi),%rax
4573 15,87,219, //xorps %xmm3,%xmm3
4574 15,87,228, //xorps %xmm4,%xmm4
4575 15,87,237, //xorps %xmm5,%xmm5
4576 15,87,246, //xorps %xmm6,%xmm6
4577 15,87,255, //xorps %xmm7,%xmm7
4578 255,224, //jmpq *%rax
4579};
4580
// sk_constant_color_sse41: raw x86-64 SSE machine code with AT&T disassembly
// comments.
//
// Loads four floats (unaligned, movups) from the address popped off (%rsi)
// and broadcasts element 0 into %xmm0, element 1 into %xmm1, element 2 into
// %xmm2 and element 3 into %xmm3 (shufps selectors 0x00, 0x55, 0xaa, 0xff).
// Ends by popping the next qword from (%rsi) and jumping to it.
4581CODE const uint8_t sk_constant_color_sse41[] = {
4582 72,173, //lods %ds:(%rsi),%rax
4583 15,16,24, //movups (%rax),%xmm3
4584 15,40,195, //movaps %xmm3,%xmm0
4585 15,198,192,0, //shufps $0x0,%xmm0,%xmm0
4586 15,40,203, //movaps %xmm3,%xmm1
4587 15,198,201,85, //shufps $0x55,%xmm1,%xmm1
4588 15,40,211, //movaps %xmm3,%xmm2
4589 15,198,210,170, //shufps $0xaa,%xmm2,%xmm2
4590 15,198,219,255, //shufps $0xff,%xmm3,%xmm3
4591 72,173, //lods %ds:(%rsi),%rax
4592 255,224, //jmpq *%rax
4593};
4594
// sk_clear_sse41: raw x86-64 SSE machine code with AT&T disassembly comments.
//
// Zeroes %xmm0..%xmm3. The single lods at the top fetches the jump target
// (the next qword at (%rsi)); this routine pops nothing else from the stream.
4595CODE const uint8_t sk_clear_sse41[] = {
4596 72,173, //lods %ds:(%rsi),%rax
4597 15,87,192, //xorps %xmm0,%xmm0
4598 15,87,201, //xorps %xmm1,%xmm1
4599 15,87,210, //xorps %xmm2,%xmm2
4600 15,87,219, //xorps %xmm3,%xmm3
4601 255,224, //jmpq *%rax
4602};
4603
// sk_plus__sse41: raw x86-64 SSE machine code with AT&T disassembly comments.
//
// Lane-wise add: %xmm0 += %xmm4, %xmm1 += %xmm5, %xmm2 += %xmm6,
// %xmm3 += %xmm7. Ends by popping the next qword from (%rsi) and jumping to it.
4604CODE const uint8_t sk_plus__sse41[] = {
4605 15,88,196, //addps %xmm4,%xmm0
4606 15,88,205, //addps %xmm5,%xmm1
4607 15,88,214, //addps %xmm6,%xmm2
4608 15,88,223, //addps %xmm7,%xmm3
4609 72,173, //lods %ds:(%rsi),%rax
4610 255,224, //jmpq *%rax
4611};
4612
// sk_srcover_sse41: raw x86-64 SSE machine code with AT&T disassembly comments.
//
// With k = the float at (%rdx) broadcast to all lanes and inv = k - %xmm3:
//   %xmm0 += %xmm4 * inv
//   %xmm1 += %xmm5 * inv
//   %xmm2 += %xmm6 * inv
//   %xmm3 += %xmm7 * inv   (inv was computed from the original %xmm3)
// Ends by popping the next qword from (%rsi) and jumping to it.
// NOTE(review): k is presumably 1.0f — confirm against the constants struct.
4613CODE const uint8_t sk_srcover_sse41[] = {
4614 243,68,15,16,2, //movss (%rdx),%xmm8
4615 69,15,198,192,0, //shufps $0x0,%xmm8,%xmm8
4616 68,15,92,195, //subps %xmm3,%xmm8
4617 69,15,40,200, //movaps %xmm8,%xmm9
4618 68,15,89,204, //mulps %xmm4,%xmm9
4619 65,15,88,193, //addps %xmm9,%xmm0
4620 69,15,40,200, //movaps %xmm8,%xmm9
4621 68,15,89,205, //mulps %xmm5,%xmm9
4622 65,15,88,201, //addps %xmm9,%xmm1
4623 69,15,40,200, //movaps %xmm8,%xmm9
4624 68,15,89,206, //mulps %xmm6,%xmm9
4625 65,15,88,209, //addps %xmm9,%xmm2
4626 68,15,89,199, //mulps %xmm7,%xmm8
4627 65,15,88,216, //addps %xmm8,%xmm3
4628 72,173, //lods %ds:(%rsi),%rax
4629 255,224, //jmpq *%rax
4630};
4631
// sk_dstover_sse41: raw x86-64 SSE machine code with AT&T disassembly comments.
//
// With k = the float at (%rdx) broadcast to all lanes and inv = k - %xmm7:
//   %xmm0 = %xmm0 * inv + %xmm4
//   %xmm1 = %xmm1 * inv + %xmm5
//   %xmm2 = %xmm2 * inv + %xmm6
//   %xmm3 = %xmm3 * inv + %xmm7
// Ends by popping the next qword from (%rsi) and jumping to it.
// NOTE(review): k is presumably 1.0f — confirm against the constants struct.
4632CODE const uint8_t sk_dstover_sse41[] = {
4633 243,68,15,16,2, //movss (%rdx),%xmm8
4634 69,15,198,192,0, //shufps $0x0,%xmm8,%xmm8
4635 68,15,92,199, //subps %xmm7,%xmm8
4636 65,15,89,192, //mulps %xmm8,%xmm0
4637 15,88,196, //addps %xmm4,%xmm0
4638 65,15,89,200, //mulps %xmm8,%xmm1
4639 15,88,205, //addps %xmm5,%xmm1
4640 65,15,89,208, //mulps %xmm8,%xmm2
4641 15,88,214, //addps %xmm6,%xmm2
4642 65,15,89,216, //mulps %xmm8,%xmm3
4643 15,88,223, //addps %xmm7,%xmm3
4644 72,173, //lods %ds:(%rsi),%rax
4645 255,224, //jmpq *%rax
4646};
4647
// sk_clamp_0_sse41: raw x86-64 SSE machine code with AT&T disassembly comments.
//
// Replaces each of %xmm0..%xmm3 with its lane-wise maximum against zero
// (maxps with a zeroed %xmm8). Ends by popping the next qword from (%rsi) and
// jumping to it.
4648CODE const uint8_t sk_clamp_0_sse41[] = {
4649 69,15,87,192, //xorps %xmm8,%xmm8
4650 65,15,95,192, //maxps %xmm8,%xmm0
4651 65,15,95,200, //maxps %xmm8,%xmm1
4652 65,15,95,208, //maxps %xmm8,%xmm2
4653 65,15,95,216, //maxps %xmm8,%xmm3
4654 72,173, //lods %ds:(%rsi),%rax
4655 255,224, //jmpq *%rax
4656};
4657
// sk_clamp_1_sse41: raw x86-64 SSE machine code with AT&T disassembly comments.
//
// Replaces each of %xmm0..%xmm3 with its lane-wise minimum against the float
// at (%rdx) broadcast to all lanes. Ends by popping the next qword from
// (%rsi) and jumping to it.
// NOTE(review): the float at (%rdx) is presumably 1.0f — confirm.
4658CODE const uint8_t sk_clamp_1_sse41[] = {
4659 243,68,15,16,2, //movss (%rdx),%xmm8
4660 69,15,198,192,0, //shufps $0x0,%xmm8,%xmm8
4661 65,15,93,192, //minps %xmm8,%xmm0
4662 65,15,93,200, //minps %xmm8,%xmm1
4663 65,15,93,208, //minps %xmm8,%xmm2
4664 65,15,93,216, //minps %xmm8,%xmm3
4665 72,173, //lods %ds:(%rsi),%rax
4666 255,224, //jmpq *%rax
4667};
4668
// sk_clamp_a_sse41: raw x86-64 SSE machine code with AT&T disassembly comments.
//
// First %xmm3 = min(%xmm3, k), where k is the float at (%rdx) broadcast to
// all lanes; then %xmm0, %xmm1 and %xmm2 are each replaced by their lane-wise
// minimum against the updated %xmm3. Ends by popping the next qword from
// (%rsi) and jumping to it.
// NOTE(review): k is presumably 1.0f — confirm against the constants struct.
4669CODE const uint8_t sk_clamp_a_sse41[] = {
4670 243,68,15,16,2, //movss (%rdx),%xmm8
4671 69,15,198,192,0, //shufps $0x0,%xmm8,%xmm8
4672 65,15,93,216, //minps %xmm8,%xmm3
4673 15,93,195, //minps %xmm3,%xmm0
4674 15,93,203, //minps %xmm3,%xmm1
4675 15,93,211, //minps %xmm3,%xmm2
4676 72,173, //lods %ds:(%rsi),%rax
4677 255,224, //jmpq *%rax
4678};
4679
// sk_set_rgb_sse41: raw x86-64 SSE machine code with AT&T disassembly comments.
//
// Reads three floats at offsets 0x0, 0x4 and 0x8 from the address popped off
// (%rsi) and broadcasts them into %xmm0, %xmm1 and %xmm2 respectively.
// %xmm3 is not written. Ends by popping the next qword from (%rsi) and
// jumping to it.
4680CODE const uint8_t sk_set_rgb_sse41[] = {
4681 72,173, //lods %ds:(%rsi),%rax
4682 243,15,16,0, //movss (%rax),%xmm0
4683 243,15,16,72,4, //movss 0x4(%rax),%xmm1
4684 15,198,192,0, //shufps $0x0,%xmm0,%xmm0
4685 15,198,201,0, //shufps $0x0,%xmm1,%xmm1
4686 243,15,16,80,8, //movss 0x8(%rax),%xmm2
4687 15,198,210,0, //shufps $0x0,%xmm2,%xmm2
4688 72,173, //lods %ds:(%rsi),%rax
4689 255,224, //jmpq *%rax
4690};
4691
// sk_swap_rb_sse41: raw x86-64 SSE machine code with AT&T disassembly comments.
//
// Exchanges %xmm0 and %xmm2, using %xmm8 as the temporary; %xmm1 and %xmm3
// are untouched. The lods in the middle fetches the jump target (the next
// qword at (%rsi)).
4692CODE const uint8_t sk_swap_rb_sse41[] = {
4693 68,15,40,192, //movaps %xmm0,%xmm8
4694 72,173, //lods %ds:(%rsi),%rax
4695 15,40,194, //movaps %xmm2,%xmm0
4696 65,15,40,208, //movaps %xmm8,%xmm2
4697 255,224, //jmpq *%rax
4698};
4699
// sk_swap_sse41: raw x86-64 SSE machine code with AT&T disassembly comments.
//
// Exchanges the register group %xmm0..%xmm3 with %xmm4..%xmm7 (0<->4, 1<->5,
// 2<->6, 3<->7), staging the original %xmm0..%xmm3 in %xmm11..%xmm8.
// The lods in the middle fetches the jump target (the next qword at (%rsi)).
4700CODE const uint8_t sk_swap_sse41[] = {
4701 68,15,40,195, //movaps %xmm3,%xmm8
4702 68,15,40,202, //movaps %xmm2,%xmm9
4703 68,15,40,209, //movaps %xmm1,%xmm10
4704 68,15,40,216, //movaps %xmm0,%xmm11
4705 72,173, //lods %ds:(%rsi),%rax
4706 15,40,196, //movaps %xmm4,%xmm0
4707 15,40,205, //movaps %xmm5,%xmm1
4708 15,40,214, //movaps %xmm6,%xmm2
4709 15,40,223, //movaps %xmm7,%xmm3
4710 65,15,40,227, //movaps %xmm11,%xmm4
4711 65,15,40,234, //movaps %xmm10,%xmm5
4712 65,15,40,241, //movaps %xmm9,%xmm6
4713 65,15,40,248, //movaps %xmm8,%xmm7
4714 255,224, //jmpq *%rax
4715};
4716
// sk_move_src_dst_sse41: raw x86-64 SSE machine code with AT&T disassembly
// comments.
//
// Copies %xmm0..%xmm3 into %xmm4..%xmm7; the sources are left unchanged.
// The lods at the top fetches the jump target (the next qword at (%rsi)).
4717CODE const uint8_t sk_move_src_dst_sse41[] = {
4718 72,173, //lods %ds:(%rsi),%rax
4719 15,40,224, //movaps %xmm0,%xmm4
4720 15,40,233, //movaps %xmm1,%xmm5
4721 15,40,242, //movaps %xmm2,%xmm6
4722 15,40,251, //movaps %xmm3,%xmm7
4723 255,224, //jmpq *%rax
4724};
4725
// sk_move_dst_src_sse41: raw x86-64 SSE machine code with AT&T disassembly
// comments.
//
// Copies %xmm4..%xmm7 into %xmm0..%xmm3; the sources are left unchanged.
// The lods at the top fetches the jump target (the next qword at (%rsi)).
4726CODE const uint8_t sk_move_dst_src_sse41[] = {
4727 72,173, //lods %ds:(%rsi),%rax
4728 15,40,196, //movaps %xmm4,%xmm0
4729 15,40,205, //movaps %xmm5,%xmm1
4730 15,40,214, //movaps %xmm6,%xmm2
4731 15,40,223, //movaps %xmm7,%xmm3
4732 255,224, //jmpq *%rax
4733};
4734
// sk_premul_sse41: raw x86-64 SSE machine code with AT&T disassembly comments.
//
// Multiplies %xmm0, %xmm1 and %xmm2 lane-wise by %xmm3; %xmm3 itself is
// unchanged. Ends by popping the next qword from (%rsi) and jumping to it.
4735CODE const uint8_t sk_premul_sse41[] = {
4736 15,89,195, //mulps %xmm3,%xmm0
4737 15,89,203, //mulps %xmm3,%xmm1
4738 15,89,211, //mulps %xmm3,%xmm2
4739 72,173, //lods %ds:(%rsi),%rax
4740 255,224, //jmpq *%rax
4741};
4742
// sk_unpremul_sse41: raw x86-64 SSE4.1 machine code with AT&T disassembly
// comments.
//
// With k = the float at (%rdx) broadcast to all lanes:
//   scale = (%xmm3 == 0) ? 0 : k / %xmm3      (lane-wise, via blendvps)
//   %xmm0 *= scale;  %xmm1 *= scale;  %xmm2 *= scale
// blendvps uses %xmm0 as its implicit mask register, so the incoming %xmm0 is
// parked in %xmm8 first and the product is moved back into %xmm0 at the end.
// %xmm3 is unchanged. Ends by jumping to the next qword popped from (%rsi).
// NOTE(review): k is presumably 1.0f — confirm against the constants struct.
4743CODE const uint8_t sk_unpremul_sse41[] = {
4744 68,15,40,192, //movaps %xmm0,%xmm8
4745 69,15,87,201, //xorps %xmm9,%xmm9
4746 243,68,15,16,18, //movss (%rdx),%xmm10
4747 69,15,198,210,0, //shufps $0x0,%xmm10,%xmm10
4748 68,15,94,211, //divps %xmm3,%xmm10
4749 15,40,195, //movaps %xmm3,%xmm0
4750 65,15,194,193,0, //cmpeqps %xmm9,%xmm0
4751 102,69,15,56,20,209, //blendvps %xmm0,%xmm9,%xmm10
4752 69,15,89,194, //mulps %xmm10,%xmm8
4753 65,15,89,202, //mulps %xmm10,%xmm1
4754 65,15,89,210, //mulps %xmm10,%xmm2
4755 72,173, //lods %ds:(%rsi),%rax
4756 65,15,40,192, //movaps %xmm8,%xmm0
4757 255,224, //jmpq *%rax
4758};
4759
// sk_from_srgb_sse41: raw x86-64 SSE4.1 machine code with AT&T disassembly
// comments.
//
// Applies the same piecewise function to each of %xmm0, %xmm1 and %xmm2.
// With the broadcast floats A = 0x34(%rdx), B = 0x38(%rdx), C = 0x3c(%rdx),
// D = 0x40(%rdx), T = 0x44(%rdx) and v the input lane:
//   lo     = D * v
//   hi     = (C * v + B) * (v * v) + A
//   result = (v < T) ? lo : hi                 (cmpltps + blendvps)
// blendvps uses %xmm0 as its implicit mask, so intermediate results live in
// %xmm9/%xmm10 (and the incoming %xmm2 in %xmm8) and are moved into
// %xmm0/%xmm1 at the end. %xmm3 is not written.
// Ends by jumping to the next qword popped from (%rsi).
// NOTE(review): by its name this presumably approximates sRGB-to-linear
// decoding; the constant values are not visible here.
4760CODE const uint8_t sk_from_srgb_sse41[] = {
4761 68,15,40,194, //movaps %xmm2,%xmm8
4762 243,68,15,16,90,64, //movss 0x40(%rdx),%xmm11
4763 69,15,198,219,0, //shufps $0x0,%xmm11,%xmm11
4764 69,15,40,211, //movaps %xmm11,%xmm10
4765 68,15,89,208, //mulps %xmm0,%xmm10
4766 68,15,40,240, //movaps %xmm0,%xmm14
4767 69,15,89,246, //mulps %xmm14,%xmm14
4768 243,15,16,82,60, //movss 0x3c(%rdx),%xmm2
4769 15,198,210,0, //shufps $0x0,%xmm2,%xmm2
4770 243,68,15,16,98,52, //movss 0x34(%rdx),%xmm12
4771 243,68,15,16,106,56, //movss 0x38(%rdx),%xmm13
4772 69,15,198,237,0, //shufps $0x0,%xmm13,%xmm13
4773 68,15,40,202, //movaps %xmm2,%xmm9
4774 68,15,89,200, //mulps %xmm0,%xmm9
4775 69,15,88,205, //addps %xmm13,%xmm9
4776 69,15,198,228,0, //shufps $0x0,%xmm12,%xmm12
4777 69,15,89,206, //mulps %xmm14,%xmm9
4778 69,15,88,204, //addps %xmm12,%xmm9
4779 243,68,15,16,114,68, //movss 0x44(%rdx),%xmm14
4780 69,15,198,246,0, //shufps $0x0,%xmm14,%xmm14
4781 65,15,194,198,1, //cmpltps %xmm14,%xmm0
4782 102,69,15,56,20,202, //blendvps %xmm0,%xmm10,%xmm9
4783 69,15,40,251, //movaps %xmm11,%xmm15
4784 68,15,89,249, //mulps %xmm1,%xmm15
4785 15,40,193, //movaps %xmm1,%xmm0
4786 15,89,192, //mulps %xmm0,%xmm0
4787 68,15,40,210, //movaps %xmm2,%xmm10
4788 68,15,89,209, //mulps %xmm1,%xmm10
4789 69,15,88,213, //addps %xmm13,%xmm10
4790 68,15,89,208, //mulps %xmm0,%xmm10
4791 69,15,88,212, //addps %xmm12,%xmm10
4792 65,15,194,206,1, //cmpltps %xmm14,%xmm1
4793 15,40,193, //movaps %xmm1,%xmm0
4794 102,69,15,56,20,215, //blendvps %xmm0,%xmm15,%xmm10
4795 69,15,89,216, //mulps %xmm8,%xmm11
4796 65,15,40,192, //movaps %xmm8,%xmm0
4797 15,89,192, //mulps %xmm0,%xmm0
4798 65,15,89,208, //mulps %xmm8,%xmm2
4799 65,15,88,213, //addps %xmm13,%xmm2
4800 15,89,208, //mulps %xmm0,%xmm2
4801 65,15,88,212, //addps %xmm12,%xmm2
4802 69,15,194,198,1, //cmpltps %xmm14,%xmm8
4803 65,15,40,192, //movaps %xmm8,%xmm0
4804 102,65,15,56,20,211, //blendvps %xmm0,%xmm11,%xmm2
4805 72,173, //lods %ds:(%rsi),%rax
4806 65,15,40,193, //movaps %xmm9,%xmm0
4807 65,15,40,202, //movaps %xmm10,%xmm1
4808 255,224, //jmpq *%rax
4809};
4810
// sk_to_srgb_sse41: raw x86-64 SSE4.1 machine code with AT&T disassembly
// comments.
//
// Register shuffling: reserves 0x18 bytes of stack, spills %xmm7 to (%rsp)
// and moves %xmm3..%xmm6 up into %xmm4..%xmm7 to free working registers;
// the reverse moves and the reload restore them before the final jump.
// The spill and reload use movaps, which needs (%rsp) 16-byte aligned after
// the `sub` — NOTE(review): this relies on the caller's stack alignment.
//
// Applies the same piecewise function to each of the incoming %xmm0, %xmm1
// and %xmm2. With the broadcast floats ONE = (%rdx), P = 0x48(%rdx),
// Q = 0x4c(%rdx), R = 0x50(%rdx), S = 0x54(%rdx), T = 0x58(%rdx) and v the
// input lane:
//   r      = rsqrtps(v)
//   hi     = min(ONE, rcpps(r) * R + S + rsqrtps(r) * Q)
//   lo     = P * v
//   result = (v < T) ? lo : hi                 (cmpltps + blendvps)
// rsqrtps and rcpps are hardware approximations. blendvps uses %xmm0 as its
// implicit mask, hence the copies into %xmm0 before each blend.
// Ends by jumping to the next qword popped from (%rsi).
// NOTE(review): by its name this presumably approximates linear-to-sRGB
// encoding; the constant values are not visible here.
4811CODE const uint8_t sk_to_srgb_sse41[] = {
4812 72,131,236,24, //sub $0x18,%rsp
4813 15,41,60,36, //movaps %xmm7,(%rsp)
4814 15,40,254, //movaps %xmm6,%xmm7
4815 15,40,245, //movaps %xmm5,%xmm6
4816 15,40,236, //movaps %xmm4,%xmm5
4817 15,40,227, //movaps %xmm3,%xmm4
4818 68,15,40,194, //movaps %xmm2,%xmm8
4819 15,40,217, //movaps %xmm1,%xmm3
4820 15,82,208, //rsqrtps %xmm0,%xmm2
4821 68,15,83,202, //rcpps %xmm2,%xmm9
4822 68,15,82,210, //rsqrtps %xmm2,%xmm10
4823 243,15,16,18, //movss (%rdx),%xmm2
4824 243,68,15,16,90,72, //movss 0x48(%rdx),%xmm11
4825 69,15,198,219,0, //shufps $0x0,%xmm11,%xmm11
4826 65,15,40,203, //movaps %xmm11,%xmm1
4827 15,89,200, //mulps %xmm0,%xmm1
4828 15,198,210,0, //shufps $0x0,%xmm2,%xmm2
4829 243,68,15,16,98,76, //movss 0x4c(%rdx),%xmm12
4830 69,15,198,228,0, //shufps $0x0,%xmm12,%xmm12
4831 243,68,15,16,106,80, //movss 0x50(%rdx),%xmm13
4832 69,15,198,237,0, //shufps $0x0,%xmm13,%xmm13
4833 243,68,15,16,114,84, //movss 0x54(%rdx),%xmm14
4834 69,15,198,246,0, //shufps $0x0,%xmm14,%xmm14
4835 69,15,89,205, //mulps %xmm13,%xmm9
4836 69,15,88,206, //addps %xmm14,%xmm9
4837 69,15,89,212, //mulps %xmm12,%xmm10
4838 69,15,88,209, //addps %xmm9,%xmm10
4839 68,15,40,202, //movaps %xmm2,%xmm9
4840 69,15,93,202, //minps %xmm10,%xmm9
4841 243,68,15,16,122,88, //movss 0x58(%rdx),%xmm15
4842 69,15,198,255,0, //shufps $0x0,%xmm15,%xmm15
4843 65,15,194,199,1, //cmpltps %xmm15,%xmm0
4844 102,68,15,56,20,201, //blendvps %xmm0,%xmm1,%xmm9
4845 15,82,195, //rsqrtps %xmm3,%xmm0
4846 15,83,200, //rcpps %xmm0,%xmm1
4847 15,82,192, //rsqrtps %xmm0,%xmm0
4848 65,15,89,205, //mulps %xmm13,%xmm1
4849 65,15,88,206, //addps %xmm14,%xmm1
4850 65,15,89,196, //mulps %xmm12,%xmm0
4851 15,88,193, //addps %xmm1,%xmm0
4852 68,15,40,210, //movaps %xmm2,%xmm10
4853 68,15,93,208, //minps %xmm0,%xmm10
4854 65,15,40,203, //movaps %xmm11,%xmm1
4855 15,89,203, //mulps %xmm3,%xmm1
4856 65,15,194,223,1, //cmpltps %xmm15,%xmm3
4857 15,40,195, //movaps %xmm3,%xmm0
4858 102,68,15,56,20,209, //blendvps %xmm0,%xmm1,%xmm10
4859 65,15,82,192, //rsqrtps %xmm8,%xmm0
4860 15,83,200, //rcpps %xmm0,%xmm1
4861 65,15,89,205, //mulps %xmm13,%xmm1
4862 65,15,88,206, //addps %xmm14,%xmm1
4863 15,82,192, //rsqrtps %xmm0,%xmm0
4864 65,15,89,196, //mulps %xmm12,%xmm0
4865 15,88,193, //addps %xmm1,%xmm0
4866 15,93,208, //minps %xmm0,%xmm2
4867 69,15,89,216, //mulps %xmm8,%xmm11
4868 69,15,194,199,1, //cmpltps %xmm15,%xmm8
4869 65,15,40,192, //movaps %xmm8,%xmm0
4870 102,65,15,56,20,211, //blendvps %xmm0,%xmm11,%xmm2
4871 72,173, //lods %ds:(%rsi),%rax
4872 65,15,40,193, //movaps %xmm9,%xmm0
4873 65,15,40,202, //movaps %xmm10,%xmm1
4874 15,40,220, //movaps %xmm4,%xmm3
4875 15,40,229, //movaps %xmm5,%xmm4
4876 15,40,238, //movaps %xmm6,%xmm5
4877 15,40,247, //movaps %xmm7,%xmm6
4878 15,40,60,36, //movaps (%rsp),%xmm7
4879 72,131,196,24, //add $0x18,%rsp
4880 255,224, //jmpq *%rax
4881};
4882
// sk_scale_1_float_sse41: raw x86-64 SSE machine code with AT&T disassembly
// comments.
//
// Multiplies each of %xmm0..%xmm3 by the single float found at the address
// popped off (%rsi), broadcast to all lanes. Ends by popping the next qword
// from (%rsi) and jumping to it.
4883CODE const uint8_t sk_scale_1_float_sse41[] = {
4884 72,173, //lods %ds:(%rsi),%rax
4885 243,68,15,16,0, //movss (%rax),%xmm8
4886 69,15,198,192,0, //shufps $0x0,%xmm8,%xmm8
4887 65,15,89,192, //mulps %xmm8,%xmm0
4888 65,15,89,200, //mulps %xmm8,%xmm1
4889 65,15,89,208, //mulps %xmm8,%xmm2
4890 65,15,89,216, //mulps %xmm8,%xmm3
4891 72,173, //lods %ds:(%rsi),%rax
4892 255,224, //jmpq *%rax
4893};
4894
// sk_scale_u8_sse41: raw x86-64 SSE4.1 machine code with AT&T disassembly
// comments.
//
// Pops a qword from (%rsi), dereferences it to get a byte pointer, then
// reads four bytes at pointer + %rdi, zero-extends them to 32-bit lanes
// (pmovzxbd) and converts them to float. That vector is multiplied by the
// float at 0xc(%rdx) (broadcast), and the product scales each of
// %xmm0..%xmm3. Ends by popping the next qword from (%rsi) and jumping to it.
// NOTE(review): the constant at 0xc(%rdx) is presumably 1/255 — confirm
// against the constants struct.
4895CODE const uint8_t sk_scale_u8_sse41[] = {
4896 72,173, //lods %ds:(%rsi),%rax
4897 72,139,0, //mov (%rax),%rax
4898 102,68,15,56,49,4,56, //pmovzxbd (%rax,%rdi,1),%xmm8
4899 69,15,91,192, //cvtdq2ps %xmm8,%xmm8
4900 243,68,15,16,74,12, //movss 0xc(%rdx),%xmm9
4901 69,15,198,201,0, //shufps $0x0,%xmm9,%xmm9
4902 69,15,89,200, //mulps %xmm8,%xmm9
4903 65,15,89,193, //mulps %xmm9,%xmm0
4904 65,15,89,201, //mulps %xmm9,%xmm1
4905 65,15,89,209, //mulps %xmm9,%xmm2
4906 65,15,89,217, //mulps %xmm9,%xmm3
4907 72,173, //lods %ds:(%rsi),%rax
4908 255,224, //jmpq *%rax
4909};
4910
// sk_lerp_1_float (SSE4.1 build): linear interpolation with one float t.
//   - lods reads a pointer from the (%rsi) stream; the float it points at is
//     broadcast to all lanes of xmm8 (t).
//   - For each pair (xmm0,xmm4), (xmm1,xmm5), (xmm2,xmm6), (xmm3,xmm7):
//         xmmN = (xmmN - xmmM) * t + xmmM
//   - The final lods/jmpq fetches the next stream entry and jumps to it.
// xmm4-xmm7 are read but not modified. Writes rax, rsi and xmm8.
4911CODE const uint8_t sk_lerp_1_float_sse41[] = {
4912 72,173, //lods %ds:(%rsi),%rax
4913 243,68,15,16,0, //movss (%rax),%xmm8
4914 69,15,198,192,0, //shufps $0x0,%xmm8,%xmm8
4915 15,92,196, //subps %xmm4,%xmm0
4916 65,15,89,192, //mulps %xmm8,%xmm0
4917 15,88,196, //addps %xmm4,%xmm0
4918 15,92,205, //subps %xmm5,%xmm1
4919 65,15,89,200, //mulps %xmm8,%xmm1
4920 15,88,205, //addps %xmm5,%xmm1
4921 15,92,214, //subps %xmm6,%xmm2
4922 65,15,89,208, //mulps %xmm8,%xmm2
4923 15,88,214, //addps %xmm6,%xmm2
4924 15,92,223, //subps %xmm7,%xmm3
4925 65,15,89,216, //mulps %xmm8,%xmm3
4926 15,88,223, //addps %xmm7,%xmm3
4927 72,173, //lods %ds:(%rsi),%rax
4928 255,224, //jmpq *%rax
4929};
4930
// sk_lerp_u8 (SSE4.1 build): linear interpolation with a per-lane t derived
// from four bytes in memory.
//   - lods reads a pointer from the (%rsi) stream; the 8-byte value it points
//     at is used as a base address.
//   - Four bytes at base + %rdi are zero-extended, converted to float and
//     multiplied by the broadcast float at 0xc(%rdx) (presumably 1/255 --
//     confirm against the constants layout). The product (xmm9) is t.
//   - For each pair (xmm0,xmm4), (xmm1,xmm5), (xmm2,xmm6), (xmm3,xmm7):
//         xmmN = (xmmN - xmmM) * t + xmmM
//   - The final lods/jmpq fetches the next stream entry and jumps to it.
// xmm4-xmm7 are read but not modified. Writes rax, rsi, xmm8 and xmm9.
4931CODE const uint8_t sk_lerp_u8_sse41[] = {
4932 72,173, //lods %ds:(%rsi),%rax
4933 72,139,0, //mov (%rax),%rax
4934 102,68,15,56,49,4,56, //pmovzxbd (%rax,%rdi,1),%xmm8
4935 69,15,91,192, //cvtdq2ps %xmm8,%xmm8
4936 243,68,15,16,74,12, //movss 0xc(%rdx),%xmm9
4937 69,15,198,201,0, //shufps $0x0,%xmm9,%xmm9
4938 69,15,89,200, //mulps %xmm8,%xmm9
4939 15,92,196, //subps %xmm4,%xmm0
4940 65,15,89,193, //mulps %xmm9,%xmm0
4941 15,88,196, //addps %xmm4,%xmm0
4942 15,92,205, //subps %xmm5,%xmm1
4943 65,15,89,201, //mulps %xmm9,%xmm1
4944 15,88,205, //addps %xmm5,%xmm1
4945 15,92,214, //subps %xmm6,%xmm2
4946 65,15,89,209, //mulps %xmm9,%xmm2
4947 15,88,214, //addps %xmm6,%xmm2
4948 15,92,223, //subps %xmm7,%xmm3
4949 65,15,89,217, //mulps %xmm9,%xmm3
4950 15,88,223, //addps %xmm7,%xmm3
4951 72,173, //lods %ds:(%rsi),%rax
4952 255,224, //jmpq *%rax
4953};
4954
// sk_lerp_565 (SSE4.1 build): linear interpolation of xmm0-xmm2 toward
// xmm4-xmm6 with three per-lane factors unpacked from 16-bit values.
//   - lods reads a pointer from the (%rsi) stream; the 8-byte value it points
//     at is used as a base address.
//   - Four 16-bit values at base + 2*%rdi are zero-extended to 32 bits (xmm8).
//   - Three factors are built the same way: AND with a broadcast dword mask,
//     convert to float, multiply by a broadcast float scale:
//         xmm10: mask 0x68(%rdx), scale 0x74(%rdx)
//         xmm11: mask 0x6c(%rdx), scale 0x78(%rdx)
//         xmm9 : mask 0x70(%rdx), scale 0x7c(%rdx)
//     (presumably the three 5/6/5 bit-field masks and their normalizing
//     scales -- confirm against the constants layout).
//   - xmm0 = (xmm0 - xmm4) * xmm10 + xmm4, and likewise xmm1/xmm5 with xmm11
//     and xmm2/xmm6 with xmm9.
//   - xmm3 is not interpolated: it is overwritten with the float at (%rdx)
//     broadcast to all lanes (presumably 1.0f -- confirm).
//   - The final lods/jmpq fetches the next stream entry and jumps to it.
// Writes rax, rsi and xmm8-xmm11 in addition to xmm0-xmm3.
4955CODE const uint8_t sk_lerp_565_sse41[] = {
4956 72,173, //lods %ds:(%rsi),%rax
4957 72,139,0, //mov (%rax),%rax
4958 102,68,15,56,51,4,120, //pmovzxwd (%rax,%rdi,2),%xmm8
4959 102,15,110,90,104, //movd 0x68(%rdx),%xmm3
4960 102,15,112,219,0, //pshufd $0x0,%xmm3,%xmm3
4961 102,65,15,219,216, //pand %xmm8,%xmm3
4962 68,15,91,203, //cvtdq2ps %xmm3,%xmm9
  // xmm3 is used as scratch above; here it receives the scalar from (%rdx),
  // which is broadcast just before the exit.
4963 243,15,16,26, //movss (%rdx),%xmm3
4964 243,68,15,16,82,116, //movss 0x74(%rdx),%xmm10
4965 69,15,198,210,0, //shufps $0x0,%xmm10,%xmm10
4966 69,15,89,209, //mulps %xmm9,%xmm10
4967 102,68,15,110,74,108, //movd 0x6c(%rdx),%xmm9
4968 102,69,15,112,201,0, //pshufd $0x0,%xmm9,%xmm9
4969 102,69,15,219,200, //pand %xmm8,%xmm9
4970 69,15,91,201, //cvtdq2ps %xmm9,%xmm9
4971 243,68,15,16,90,120, //movss 0x78(%rdx),%xmm11
4972 69,15,198,219,0, //shufps $0x0,%xmm11,%xmm11
4973 69,15,89,217, //mulps %xmm9,%xmm11
4974 102,68,15,110,74,112, //movd 0x70(%rdx),%xmm9
4975 102,69,15,112,201,0, //pshufd $0x0,%xmm9,%xmm9
4976 102,69,15,219,200, //pand %xmm8,%xmm9
4977 69,15,91,193, //cvtdq2ps %xmm9,%xmm8
4978 243,68,15,16,74,124, //movss 0x7c(%rdx),%xmm9
4979 69,15,198,201,0, //shufps $0x0,%xmm9,%xmm9
4980 69,15,89,200, //mulps %xmm8,%xmm9
4981 15,92,196, //subps %xmm4,%xmm0
4982 65,15,89,194, //mulps %xmm10,%xmm0
4983 15,88,196, //addps %xmm4,%xmm0
4984 15,92,205, //subps %xmm5,%xmm1
4985 65,15,89,203, //mulps %xmm11,%xmm1
4986 15,88,205, //addps %xmm5,%xmm1
4987 15,92,214, //subps %xmm6,%xmm2
4988 65,15,89,209, //mulps %xmm9,%xmm2
4989 15,88,214, //addps %xmm6,%xmm2
4990 15,198,219,0, //shufps $0x0,%xmm3,%xmm3
4991 72,173, //lods %ds:(%rsi),%rax
4992 255,224, //jmpq *%rax
4993};
4994
// sk_load_tables (SSE4.1 build): loads four 32-bit values and maps three of
// their byte fields through float lookup tables.
//   - lods reads a pointer from the (%rsi) stream into %rax. Four 8-byte
//     fields are read through it: 0x0 (source base), 0x8, 0x10 and 0x18
//     (bases of three tables of 4-byte floats).
//   - 16 bytes at source + 4*%rdi are loaded unaligned into xmm8.
//   - With the dword at 0x10(%rdx) broadcast as a mask (presumably 0xff --
//     confirm), three index vectors are formed:
//         v & mask, (v >> 8) & mask, (v >> 16) & mask
//   - Each index vector is moved to general registers two lanes at a time
//     (pextrq/movq, then split into low and high 32 bits), and the four
//     floats table[index] are gathered with movss/insertps:
//         xmm0 <- table at 0x8(%rax),  indexed by v & mask
//         xmm1 <- table at 0x10(%rax), indexed by (v >> 8) & mask
//         xmm2 <- table at 0x18(%rax), indexed by (v >> 16) & mask
//   - xmm3 = float(v >> 24) * broadcast float at 0xc(%rdx) (presumably
//     1/255 -- confirm).
//   - The final lods/jmpq fetches the next stream entry and jumps to it.
// Writes rax, rcx, rsi, r8-r11 and xmm8 in addition to xmm0-xmm3.
4995CODE const uint8_t sk_load_tables_sse41[] = {
4996 72,173, //lods %ds:(%rsi),%rax
4997 72,139,8, //mov (%rax),%rcx
4998 76,139,64,8, //mov 0x8(%rax),%r8
4999 243,68,15,111,4,185, //movdqu (%rcx,%rdi,4),%xmm8
5000 102,15,110,66,16, //movd 0x10(%rdx),%xmm0
5001 102,15,112,192,0, //pshufd $0x0,%xmm0,%xmm0
5002 102,65,15,111,200, //movdqa %xmm8,%xmm1
5003 102,15,114,209,8, //psrld $0x8,%xmm1
5004 102,15,219,200, //pand %xmm0,%xmm1
5005 102,65,15,111,208, //movdqa %xmm8,%xmm2
5006 102,15,114,210,16, //psrld $0x10,%xmm2
5007 102,15,219,208, //pand %xmm0,%xmm2
5008 102,65,15,219,192, //pand %xmm8,%xmm0
  // First gather: indices in xmm0, table base in r8, result in xmm0.
  // r11 = lane 0, r10 = lane 1, r9 = lane 2, rcx = lane 3.
5009 102,72,15,58,22,193,1, //pextrq $0x1,%xmm0,%rcx
5010 65,137,201, //mov %ecx,%r9d
5011 72,193,233,32, //shr $0x20,%rcx
5012 102,73,15,126,194, //movq %xmm0,%r10
5013 69,137,211, //mov %r10d,%r11d
5014 73,193,234,32, //shr $0x20,%r10
5015 243,67,15,16,4,152, //movss (%r8,%r11,4),%xmm0
5016 102,67,15,58,33,4,144,16, //insertps $0x10,(%r8,%r10,4),%xmm0
5017 102,67,15,58,33,4,136,32, //insertps $0x20,(%r8,%r9,4),%xmm0
5018 102,65,15,58,33,4,136,48, //insertps $0x30,(%r8,%rcx,4),%xmm0
  // Second gather: indices in xmm1, table base in rcx, result in xmm1.
  // xmm3 is used as a temporary for lanes 2 and 3.
5019 72,139,72,16, //mov 0x10(%rax),%rcx
5020 102,73,15,58,22,200,1, //pextrq $0x1,%xmm1,%r8
5021 69,137,193, //mov %r8d,%r9d
5022 73,193,232,32, //shr $0x20,%r8
5023 102,73,15,126,202, //movq %xmm1,%r10
5024 69,137,211, //mov %r10d,%r11d
5025 73,193,234,32, //shr $0x20,%r10
5026 243,66,15,16,12,153, //movss (%rcx,%r11,4),%xmm1
5027 102,66,15,58,33,12,145,16, //insertps $0x10,(%rcx,%r10,4),%xmm1
5028 243,66,15,16,28,137, //movss (%rcx,%r9,4),%xmm3
5029 102,15,58,33,203,32, //insertps $0x20,%xmm3,%xmm1
5030 243,66,15,16,28,129, //movss (%rcx,%r8,4),%xmm3
5031 102,15,58,33,203,48, //insertps $0x30,%xmm3,%xmm1
  // Third gather: indices in xmm2, table base in rax, result in xmm2.
5032 72,139,64,24, //mov 0x18(%rax),%rax
5033 102,72,15,58,22,209,1, //pextrq $0x1,%xmm2,%rcx
5034 65,137,200, //mov %ecx,%r8d
5035 72,193,233,32, //shr $0x20,%rcx
5036 102,73,15,126,209, //movq %xmm2,%r9
5037 69,137,202, //mov %r9d,%r10d
5038 73,193,233,32, //shr $0x20,%r9
5039 243,66,15,16,20,144, //movss (%rax,%r10,4),%xmm2
5040 102,66,15,58,33,20,136,16, //insertps $0x10,(%rax,%r9,4),%xmm2
5041 243,66,15,16,28,128, //movss (%rax,%r8,4),%xmm3
5042 102,15,58,33,211,32, //insertps $0x20,%xmm3,%xmm2
5043 243,15,16,28,136, //movss (%rax,%rcx,4),%xmm3
5044 102,15,58,33,211,48, //insertps $0x30,%xmm3,%xmm2
  // Top byte of each loaded dword, converted to float and scaled, becomes xmm3.
5045 102,65,15,114,208,24, //psrld $0x18,%xmm8
5046 69,15,91,192, //cvtdq2ps %xmm8,%xmm8
5047 243,15,16,90,12, //movss 0xc(%rdx),%xmm3
5048 15,198,219,0, //shufps $0x0,%xmm3,%xmm3
5049 65,15,89,216, //mulps %xmm8,%xmm3
5050 72,173, //lods %ds:(%rsi),%rax
5051 255,224, //jmpq *%rax
5052};
5053
// sk_load_a8 (SSE4.1 build): loads four bytes into xmm3 as scaled floats and
// zeroes xmm0-xmm2.
//   - lods reads a pointer from the (%rsi) stream; the 8-byte value it points
//     at is used as a base address.
//   - Four bytes at base + %rdi are zero-extended and converted to float.
//   - xmm3 = those floats * broadcast float at 0xc(%rdx) (presumably 1/255 --
//     confirm against the constants layout).
//   - xmm0, xmm1 and xmm2 are cleared with xorps.
//   - The second lods reads the next stream entry, which is then jumped to.
// Writes rax and rsi in addition to xmm0-xmm3.
5054CODE const uint8_t sk_load_a8_sse41[] = {
5055 72,173, //lods %ds:(%rsi),%rax
5056 72,139,0, //mov (%rax),%rax
5057 102,15,56,49,4,56, //pmovzxbd (%rax,%rdi,1),%xmm0
5058 15,91,192, //cvtdq2ps %xmm0,%xmm0
5059 243,15,16,90,12, //movss 0xc(%rdx),%xmm3
5060 15,198,219,0, //shufps $0x0,%xmm3,%xmm3
5061 15,89,216, //mulps %xmm0,%xmm3
5062 72,173, //lods %ds:(%rsi),%rax
5063 15,87,192, //xorps %xmm0,%xmm0
5064 15,87,201, //xorps %xmm1,%xmm1
5065 15,87,210, //xorps %xmm2,%xmm2
5066 255,224, //jmpq *%rax
5067};
5068
// sk_store_a8 (SSE4.1 build): stores xmm3 as four bytes.
//   - lods reads a pointer from the (%rsi) stream; the 8-byte value it points
//     at is used as a base address.
//   - xmm3 is multiplied by the broadcast float at 0x8(%rdx) (presumably
//     255.0f -- confirm) and converted to int32 with cvtps2dq.
//   - packusdw then packuswb narrow the values to bytes with unsigned
//     saturation.
//   - The low 4 bytes are written to base + %rdi.
//   - The final lods/jmpq fetches the next stream entry and jumps to it.
// xmm0-xmm3 are not modified. Writes rax, rsi and xmm8.
5069CODE const uint8_t sk_store_a8_sse41[] = {
5070 72,173, //lods %ds:(%rsi),%rax
5071 72,139,0, //mov (%rax),%rax
5072 243,68,15,16,66,8, //movss 0x8(%rdx),%xmm8
5073 69,15,198,192,0, //shufps $0x0,%xmm8,%xmm8
5074 68,15,89,195, //mulps %xmm3,%xmm8
5075 102,69,15,91,192, //cvtps2dq %xmm8,%xmm8
5076 102,69,15,56,43,192, //packusdw %xmm8,%xmm8
5077 102,69,15,103,192, //packuswb %xmm8,%xmm8
5078 102,68,15,126,4,56, //movd %xmm8,(%rax,%rdi,1)
5079 72,173, //lods %ds:(%rsi),%rax
5080 255,224, //jmpq *%rax
5081};
5082
// sk_load_565 (SSE4.1 build): loads four 16-bit values and unpacks three
// bit fields into xmm0-xmm2 as scaled floats.
//   - lods reads a pointer from the (%rsi) stream; the 8-byte value it points
//     at is used as a base address.
//   - Four 16-bit values at base + 2*%rdi are zero-extended to 32 bits (xmm9).
//   - Each output is: (value AND broadcast dword mask) -> float -> multiplied
//     by a broadcast float scale:
//         xmm0: mask 0x68(%rdx), scale 0x74(%rdx)
//         xmm1: mask 0x6c(%rdx), scale 0x78(%rdx)
//         xmm2: mask 0x70(%rdx), scale 0x7c(%rdx)
//     (presumably the 5/6/5 bit-field masks and their normalizing scales --
//     confirm against the constants layout).
//   - xmm3 = float at (%rdx) broadcast to all lanes (presumably 1.0f --
//     confirm).
//   - The final lods/jmpq fetches the next stream entry and jumps to it.
// Writes rax, rsi, xmm8 and xmm9 in addition to xmm0-xmm3.
5083CODE const uint8_t sk_load_565_sse41[] = {
5084 72,173, //lods %ds:(%rsi),%rax
5085 72,139,0, //mov (%rax),%rax
5086 102,68,15,56,51,12,120, //pmovzxwd (%rax,%rdi,2),%xmm9
5087 102,15,110,66,104, //movd 0x68(%rdx),%xmm0
5088 102,15,112,192,0, //pshufd $0x0,%xmm0,%xmm0
5089 102,65,15,219,193, //pand %xmm9,%xmm0
5090 15,91,200, //cvtdq2ps %xmm0,%xmm1
5091 243,15,16,26, //movss (%rdx),%xmm3
5092 243,15,16,66,116, //movss 0x74(%rdx),%xmm0
5093 15,198,192,0, //shufps $0x0,%xmm0,%xmm0
5094 15,89,193, //mulps %xmm1,%xmm0
5095 102,15,110,74,108, //movd 0x6c(%rdx),%xmm1
5096 102,15,112,201,0, //pshufd $0x0,%xmm1,%xmm1
5097 102,65,15,219,201, //pand %xmm9,%xmm1
5098 68,15,91,193, //cvtdq2ps %xmm1,%xmm8
5099 243,15,16,74,120, //movss 0x78(%rdx),%xmm1
5100 15,198,201,0, //shufps $0x0,%xmm1,%xmm1
5101 65,15,89,200, //mulps %xmm8,%xmm1
5102 102,15,110,82,112, //movd 0x70(%rdx),%xmm2
5103 102,15,112,210,0, //pshufd $0x0,%xmm2,%xmm2
5104 102,65,15,219,209, //pand %xmm9,%xmm2
5105 68,15,91,194, //cvtdq2ps %xmm2,%xmm8
5106 243,15,16,82,124, //movss 0x7c(%rdx),%xmm2
5107 15,198,210,0, //shufps $0x0,%xmm2,%xmm2
5108 65,15,89,208, //mulps %xmm8,%xmm2
5109 15,198,219,0, //shufps $0x0,%xmm3,%xmm3
5110 72,173, //lods %ds:(%rsi),%rax
5111 255,224, //jmpq *%rax
5112};
5113
// sk_store_565 (SSE4.1 build): packs xmm0-xmm2 into four 16-bit values and
// stores them.
//   - lods reads a pointer from the (%rsi) stream; the 8-byte value it points
//     at is used as a base address.
//   - Two broadcast scales are used: A = float at 0x80(%rdx) and
//     B = float at 0x84(%rdx) (presumably 31.0f and 63.0f -- confirm).
//   - Per lane, with cvt = cvtps2dq (float -> int32):
//         packed = (cvt(xmm0 * A) << 11) | (cvt(xmm1 * B) << 5) | cvt(xmm2 * A)
//     The last OR is encoded as orpd, which is a plain bitwise OR of the
//     registers.
//   - packusdw narrows the four dwords to words with unsigned saturation and
//     the low 8 bytes are written to base + 2*%rdi.
//   - The final lods/jmpq fetches the next stream entry and jumps to it.
// xmm0-xmm3 are not modified. Writes rax, rsi and xmm8-xmm10.
5114CODE const uint8_t sk_store_565_sse41[] = {
5115 72,173, //lods %ds:(%rsi),%rax
5116 72,139,0, //mov (%rax),%rax
5117 243,68,15,16,130,128,0,0,0, //movss 0x80(%rdx),%xmm8
5118 243,68,15,16,138,132,0,0,0, //movss 0x84(%rdx),%xmm9
5119 69,15,198,192,0, //shufps $0x0,%xmm8,%xmm8
5120 69,15,40,208, //movaps %xmm8,%xmm10
5121 68,15,89,208, //mulps %xmm0,%xmm10
5122 102,69,15,91,210, //cvtps2dq %xmm10,%xmm10
5123 102,65,15,114,242,11, //pslld $0xb,%xmm10
5124 69,15,198,201,0, //shufps $0x0,%xmm9,%xmm9
5125 68,15,89,201, //mulps %xmm1,%xmm9
5126 102,69,15,91,201, //cvtps2dq %xmm9,%xmm9
5127 102,65,15,114,241,5, //pslld $0x5,%xmm9
5128 102,69,15,235,202, //por %xmm10,%xmm9
5129 68,15,89,194, //mulps %xmm2,%xmm8
5130 102,69,15,91,192, //cvtps2dq %xmm8,%xmm8
5131 102,69,15,86,193, //orpd %xmm9,%xmm8
5132 102,69,15,56,43,192, //packusdw %xmm8,%xmm8
5133 102,68,15,214,4,120, //movq %xmm8,(%rax,%rdi,2)
5134 72,173, //lods %ds:(%rsi),%rax
5135 255,224, //jmpq *%rax
5136};
5137
// sk_load_8888 (SSE4.1 build): loads four 32-bit values and splits each into
// four byte fields, converted to scaled floats in xmm0-xmm3.
//   - lods reads a pointer from the (%rsi) stream; the 8-byte value it points
//     at is used as a base address.
//   - 16 bytes at base + 4*%rdi are loaded unaligned (v).
//   - With the dword at 0x10(%rdx) broadcast as a mask (presumably 0xff --
//     confirm) and the float at 0xc(%rdx) broadcast as a scale (presumably
//     1/255 -- confirm):
//         xmm0 = float(v & mask)         * scale
//         xmm1 = float((v >> 8) & mask)  * scale
//         xmm2 = float((v >> 16) & mask) * scale
//         xmm3 = float(v >> 24)          * scale
//   - The final lods/jmpq fetches the next stream entry and jumps to it.
// Writes rax, rsi and xmm8 in addition to xmm0-xmm3.
5138CODE const uint8_t sk_load_8888_sse41[] = {
5139 72,173, //lods %ds:(%rsi),%rax
5140 72,139,0, //mov (%rax),%rax
5141 243,15,111,28,184, //movdqu (%rax,%rdi,4),%xmm3
5142 102,15,110,66,16, //movd 0x10(%rdx),%xmm0
5143 102,15,112,192,0, //pshufd $0x0,%xmm0,%xmm0
5144 102,15,111,203, //movdqa %xmm3,%xmm1
5145 102,15,114,209,8, //psrld $0x8,%xmm1
5146 102,15,219,200, //pand %xmm0,%xmm1
5147 102,15,111,211, //movdqa %xmm3,%xmm2
5148 102,15,114,210,16, //psrld $0x10,%xmm2
5149 102,15,219,208, //pand %xmm0,%xmm2
5150 102,15,219,195, //pand %xmm3,%xmm0
5151 15,91,192, //cvtdq2ps %xmm0,%xmm0
5152 243,68,15,16,66,12, //movss 0xc(%rdx),%xmm8
5153 69,15,198,192,0, //shufps $0x0,%xmm8,%xmm8
5154 65,15,89,192, //mulps %xmm8,%xmm0
5155 15,91,201, //cvtdq2ps %xmm1,%xmm1
5156 65,15,89,200, //mulps %xmm8,%xmm1
5157 15,91,210, //cvtdq2ps %xmm2,%xmm2
5158 65,15,89,208, //mulps %xmm8,%xmm2
5159 102,15,114,211,24, //psrld $0x18,%xmm3
5160 15,91,219, //cvtdq2ps %xmm3,%xmm3
5161 65,15,89,216, //mulps %xmm8,%xmm3
5162 72,173, //lods %ds:(%rsi),%rax
5163 255,224, //jmpq *%rax
5164};
5165
// sk_store_8888 (SSE4.1 build): packs xmm0-xmm3 into four 32-bit values and
// stores them.
//   - lods reads a pointer from the (%rsi) stream; the 8-byte value it points
//     at is used as a base address.
//   - With S = float at 0x8(%rdx) broadcast to all lanes (presumably 255.0f --
//     confirm) and cvt = cvtps2dq (float -> int32), per lane:
//         packed = cvt(xmm0 * S)
//                | (cvt(xmm1 * S) << 8)
//                | (cvt(xmm2 * S) << 16)
//                | (cvt(xmm3 * S) << 24)
//     No masking or saturation is applied before the shifts, so the fields
//     only stay separate if each converted value fits in 8 bits.
//   - The 16-byte result is written unaligned to base + 4*%rdi.
//   - The final lods/jmpq fetches the next stream entry and jumps to it.
// xmm0-xmm3 are not modified. Writes rax, rsi and xmm8-xmm10.
5166CODE const uint8_t sk_store_8888_sse41[] = {
5167 72,173, //lods %ds:(%rsi),%rax
5168 72,139,0, //mov (%rax),%rax
5169 243,68,15,16,66,8, //movss 0x8(%rdx),%xmm8
5170 69,15,198,192,0, //shufps $0x0,%xmm8,%xmm8
5171 69,15,40,200, //movaps %xmm8,%xmm9
5172 68,15,89,200, //mulps %xmm0,%xmm9
5173 102,69,15,91,201, //cvtps2dq %xmm9,%xmm9
5174 69,15,40,208, //movaps %xmm8,%xmm10
5175 68,15,89,209, //mulps %xmm1,%xmm10
5176 102,69,15,91,210, //cvtps2dq %xmm10,%xmm10
5177 102,65,15,114,242,8, //pslld $0x8,%xmm10
5178 102,69,15,235,209, //por %xmm9,%xmm10
5179 69,15,40,200, //movaps %xmm8,%xmm9
5180 68,15,89,202, //mulps %xmm2,%xmm9
5181 102,69,15,91,201, //cvtps2dq %xmm9,%xmm9
5182 102,65,15,114,241,16, //pslld $0x10,%xmm9
5183 68,15,89,195, //mulps %xmm3,%xmm8
5184 102,69,15,91,192, //cvtps2dq %xmm8,%xmm8
5185 102,65,15,114,240,24, //pslld $0x18,%xmm8
5186 102,69,15,235,193, //por %xmm9,%xmm8
5187 102,69,15,235,194, //por %xmm10,%xmm8
5188 243,68,15,127,4,184, //movdqu %xmm8,(%rax,%rdi,4)
5189 72,173, //lods %ds:(%rsi),%rax
5190 255,224, //jmpq *%rax
5191};
5192
// sk_load_f16 (SSE4.1 build): loads four 8-byte elements (four 16-bit words
// each), de-interleaves them and expands each word to a 32-bit float.
//   - lods reads a pointer from the (%rsi) stream; the 8-byte value it points
//     at is used as a base address.
//   - 32 bytes at base + 8*%rdi are loaded unaligned in two halves.
//   - Two rounds of punpcklwd/punpckhwd transpose the words so that xmm8
//     holds word 0 of all four elements followed by word 1 of all four, and
//     xmm2 holds word 2 followed by word 3.
//   - The dword at 0x64(%rdx) is broadcast and compared with pcmpgtw (signed
//     16-bit); pandn then zeroes every word for which the constant compared
//     greater (presumably flushing half-float denormals to zero -- confirm).
//   - Each group of four words is widened to 32 bits, shifted left by 13 and
//     multiplied (as float) by the broadcast dword at 0x5c(%rdx) (presumably
//     the exponent-rebias factor for half -> float -- confirm):
//         xmm0 <- word 0, xmm1 <- word 1, xmm2 <- word 2, xmm3 <- word 3
//   - The final lods/jmpq fetches the next stream entry and jumps to it.
// Writes rax, rsi, xmm8 and xmm9 in addition to xmm0-xmm3.
5193CODE const uint8_t sk_load_f16_sse41[] = {
5194 72,173, //lods %ds:(%rsi),%rax
5195 72,139,0, //mov (%rax),%rax
5196 243,15,111,4,248, //movdqu (%rax,%rdi,8),%xmm0
5197 243,15,111,76,248,16, //movdqu 0x10(%rax,%rdi,8),%xmm1
5198 102,15,111,208, //movdqa %xmm0,%xmm2
5199 102,15,97,209, //punpcklwd %xmm1,%xmm2
5200 102,15,105,193, //punpckhwd %xmm1,%xmm0
5201 102,68,15,111,194, //movdqa %xmm2,%xmm8
5202 102,68,15,97,192, //punpcklwd %xmm0,%xmm8
5203 102,15,105,208, //punpckhwd %xmm0,%xmm2
5204 102,15,110,66,100, //movd 0x64(%rdx),%xmm0
5205 102,15,112,216,0, //pshufd $0x0,%xmm0,%xmm3
5206 102,15,111,203, //movdqa %xmm3,%xmm1
5207 102,65,15,101,200, //pcmpgtw %xmm8,%xmm1
5208 102,65,15,223,200, //pandn %xmm8,%xmm1
5209 102,15,101,218, //pcmpgtw %xmm2,%xmm3
5210 102,15,223,218, //pandn %xmm2,%xmm3
5211 102,15,56,51,193, //pmovzxwd %xmm1,%xmm0
5212 102,15,114,240,13, //pslld $0xd,%xmm0
5213 102,15,110,82,92, //movd 0x5c(%rdx),%xmm2
5214 102,68,15,112,194,0, //pshufd $0x0,%xmm2,%xmm8
5215 65,15,89,192, //mulps %xmm8,%xmm0
  // xmm9 = 0, used to zero-extend the upper four words via punpckhwd.
5216 102,69,15,239,201, //pxor %xmm9,%xmm9
5217 102,65,15,105,201, //punpckhwd %xmm9,%xmm1
5218 102,15,114,241,13, //pslld $0xd,%xmm1
5219 65,15,89,200, //mulps %xmm8,%xmm1
5220 102,15,56,51,211, //pmovzxwd %xmm3,%xmm2
5221 102,15,114,242,13, //pslld $0xd,%xmm2
5222 65,15,89,208, //mulps %xmm8,%xmm2
5223 102,65,15,105,217, //punpckhwd %xmm9,%xmm3
5224 102,15,114,243,13, //pslld $0xd,%xmm3
5225 65,15,89,216, //mulps %xmm8,%xmm3
5226 72,173, //lods %ds:(%rsi),%rax
5227 255,224, //jmpq *%rax
5228};
5229
// sk_store_f16 (SSE4.1 build): converts xmm0-xmm3 to 16-bit words,
// interleaves them and stores four 8-byte elements.
//   - lods reads a pointer from the (%rsi) stream; the 8-byte value it points
//     at is used as a base address.
//   - Each of xmm0-xmm3 is multiplied (as float) by the broadcast dword at
//     0x60(%rdx) and its bit pattern is shifted right by 13 (presumably the
//     float -> half exponent rebias and mantissa truncation -- confirm).
//   - pslldq $2 followed by por merges pairs of results so that each dword
//     holds one result in its low word and the next in its high word
//     (xmm0|xmm1 in xmm10, xmm2|xmm3 in xmm8). This relies on each shifted
//     value fitting in 16 bits.
//   - punpckldq/punpckhdq interleave the two registers and the 32-byte result
//     is written unaligned to base + 8*%rdi.
//   - The final lods/jmpq fetches the next stream entry and jumps to it.
// xmm0-xmm3 are not modified. Writes rax, rsi and xmm8-xmm11.
5230CODE const uint8_t sk_store_f16_sse41[] = {
5231 72,173, //lods %ds:(%rsi),%rax
5232 72,139,0, //mov (%rax),%rax
5233 102,68,15,110,66,96, //movd 0x60(%rdx),%xmm8
5234 102,69,15,112,192,0, //pshufd $0x0,%xmm8,%xmm8
5235 102,69,15,111,200, //movdqa %xmm8,%xmm9
5236 68,15,89,200, //mulps %xmm0,%xmm9
5237 102,65,15,114,209,13, //psrld $0xd,%xmm9
5238 102,69,15,111,208, //movdqa %xmm8,%xmm10
5239 68,15,89,209, //mulps %xmm1,%xmm10
5240 102,65,15,114,210,13, //psrld $0xd,%xmm10
5241 102,69,15,111,216, //movdqa %xmm8,%xmm11
5242 68,15,89,218, //mulps %xmm2,%xmm11
5243 102,65,15,114,211,13, //psrld $0xd,%xmm11
5244 68,15,89,195, //mulps %xmm3,%xmm8
5245 102,65,15,114,208,13, //psrld $0xd,%xmm8
5246 102,65,15,115,250,2, //pslldq $0x2,%xmm10
5247 102,69,15,235,209, //por %xmm9,%xmm10
5248 102,65,15,115,248,2, //pslldq $0x2,%xmm8
5249 102,69,15,235,195, //por %xmm11,%xmm8
5250 102,69,15,111,202, //movdqa %xmm10,%xmm9
5251 102,69,15,98,200, //punpckldq %xmm8,%xmm9
5252 243,68,15,127,12,248, //movdqu %xmm9,(%rax,%rdi,8)
5253 102,69,15,106,208, //punpckhdq %xmm8,%xmm10
5254 243,68,15,127,84,248,16, //movdqu %xmm10,0x10(%rax,%rdi,8)
5255 72,173, //lods %ds:(%rsi),%rax
5256 255,224, //jmpq *%rax
5257};
5258
// sk_store_f32 (SSE4.1 build): transposes xmm0-xmm3 and stores 16 floats.
//   - lods reads a pointer from the (%rsi) stream; the 8-byte value it points
//     at is used as a base address.
//   - rcx = %rdi << 4, i.e. a 16-byte stride per index.
//   - unpcklps/unpckhps followed by unpcklpd/unpckhpd perform a 4x4 transpose
//     so that each output register holds lane i of xmm0, xmm1, xmm2, xmm3 in
//     that order.
//   - The four registers (64 bytes) are written unaligned to base + rcx at
//     offsets 0x00, 0x10, 0x20 and 0x30, for lanes 0, 1, 2 and 3.
//   - The final lods/jmpq fetches the next stream entry and jumps to it.
// xmm0-xmm3 are not modified. Writes rax, rcx, rsi and xmm8-xmm12.
5259CODE const uint8_t sk_store_f32_sse41[] = {
5260 72,173, //lods %ds:(%rsi),%rax
5261 72,139,0, //mov (%rax),%rax
5262 72,137,249, //mov %rdi,%rcx
5263 72,193,225,4, //shl $0x4,%rcx
5264 68,15,40,192, //movaps %xmm0,%xmm8
5265 68,15,40,200, //movaps %xmm0,%xmm9
5266 68,15,20,201, //unpcklps %xmm1,%xmm9
5267 68,15,40,210, //movaps %xmm2,%xmm10
5268 68,15,40,218, //movaps %xmm2,%xmm11
5269 68,15,20,219, //unpcklps %xmm3,%xmm11
5270 68,15,21,193, //unpckhps %xmm1,%xmm8
5271 68,15,21,211, //unpckhps %xmm3,%xmm10
5272 69,15,40,225, //movaps %xmm9,%xmm12
5273 102,69,15,20,227, //unpcklpd %xmm11,%xmm12
5274 102,69,15,21,203, //unpckhpd %xmm11,%xmm9
5275 69,15,40,216, //movaps %xmm8,%xmm11
5276 102,69,15,20,218, //unpcklpd %xmm10,%xmm11
5277 102,69,15,21,194, //unpckhpd %xmm10,%xmm8
5278 102,68,15,17,36,8, //movupd %xmm12,(%rax,%rcx,1)
5279 102,68,15,17,76,8,16, //movupd %xmm9,0x10(%rax,%rcx,1)
5280 102,68,15,17,92,8,32, //movupd %xmm11,0x20(%rax,%rcx,1)
5281 102,68,15,17,68,8,48, //movupd %xmm8,0x30(%rax,%rcx,1)
5282 72,173, //lods %ds:(%rsi),%rax
5283 255,224, //jmpq *%rax
5284};
5285
// sk_clamp_x (SSE4.1 build): clamps xmm0 to [0, limit).
//   - lods reads a pointer from the (%rsi) stream; the float it points at is
//     broadcast as the limit.
//   - xmm8 = max(0, xmm0).
//   - pcmpeqd produces all-ones (-1 per dword); paddd adds it to the limit's
//     bit pattern, i.e. integer-decrements the float's bits. For a positive
//     finite limit that is the largest float strictly below it.
//   - xmm0 = min(xmm8, decremented limit).
//   - The second lods reads the next stream entry, which is then jumped to.
// Writes rax, rsi, xmm8 and xmm9 in addition to xmm0.
5286CODE const uint8_t sk_clamp_x_sse41[] = {
5287 72,173, //lods %ds:(%rsi),%rax
5288 69,15,87,192, //xorps %xmm8,%xmm8
5289 68,15,95,192, //maxps %xmm0,%xmm8
5290 243,68,15,16,8, //movss (%rax),%xmm9
5291 69,15,198,201,0, //shufps $0x0,%xmm9,%xmm9
5292 102,15,118,192, //pcmpeqd %xmm0,%xmm0
5293 102,65,15,254,193, //paddd %xmm9,%xmm0
5294 68,15,93,192, //minps %xmm0,%xmm8
5295 72,173, //lods %ds:(%rsi),%rax
5296 65,15,40,192, //movaps %xmm8,%xmm0
5297 255,224, //jmpq *%rax
5298};
5299
// sk_clamp_y (SSE4.1 build): clamps xmm1 to [0, limit).
//   - lods reads a pointer from the (%rsi) stream; the float it points at is
//     broadcast as the limit.
//   - xmm8 = max(0, xmm1).
//   - pcmpeqd produces all-ones (-1 per dword); paddd adds it to the limit's
//     bit pattern, i.e. integer-decrements the float's bits. For a positive
//     finite limit that is the largest float strictly below it.
//   - xmm1 = min(xmm8, decremented limit).
//   - The second lods reads the next stream entry, which is then jumped to.
// Writes rax, rsi, xmm8 and xmm9 in addition to xmm1.
5300CODE const uint8_t sk_clamp_y_sse41[] = {
5301 72,173, //lods %ds:(%rsi),%rax
5302 69,15,87,192, //xorps %xmm8,%xmm8
5303 68,15,95,193, //maxps %xmm1,%xmm8
5304 243,68,15,16,8, //movss (%rax),%xmm9
5305 69,15,198,201,0, //shufps $0x0,%xmm9,%xmm9
5306 102,15,118,201, //pcmpeqd %xmm1,%xmm1
5307 102,65,15,254,201, //paddd %xmm9,%xmm1
5308 68,15,93,193, //minps %xmm1,%xmm8
5309 72,173, //lods %ds:(%rsi),%rax
5310 65,15,40,200, //movaps %xmm8,%xmm1
5311 255,224, //jmpq *%rax
5312};
5313
// sk_repeat_x (SSE4.1 build): wraps xmm0 into [0, limit) by subtracting a
// whole number of periods.
//   - lods reads a pointer from the (%rsi) stream; the float it points at is
//     broadcast as the limit (xmm8).
//   - xmm0 = xmm0 - floor(xmm0 / limit) * limit
//     (roundps with immediate 1 rounds toward negative infinity).
//   - The result is then capped with minps at the limit's bit pattern minus
//     one (pcmpeqd all-ones + paddd), which for a positive finite limit is
//     the largest float strictly below it.
//   - The final lods/jmpq fetches the next stream entry and jumps to it.
// Writes rax, rsi, xmm8 and xmm9 in addition to xmm0.
5314CODE const uint8_t sk_repeat_x_sse41[] = {
5315 72,173, //lods %ds:(%rsi),%rax
5316 243,68,15,16,0, //movss (%rax),%xmm8
5317 69,15,198,192,0, //shufps $0x0,%xmm8,%xmm8
5318 68,15,40,200, //movaps %xmm0,%xmm9
5319 69,15,94,200, //divps %xmm8,%xmm9
5320 102,69,15,58,8,201,1, //roundps $0x1,%xmm9,%xmm9
5321 69,15,89,200, //mulps %xmm8,%xmm9
5322 65,15,92,193, //subps %xmm9,%xmm0
5323 102,69,15,118,201, //pcmpeqd %xmm9,%xmm9
5324 102,69,15,254,200, //paddd %xmm8,%xmm9
5325 65,15,93,193, //minps %xmm9,%xmm0
5326 72,173, //lods %ds:(%rsi),%rax
5327 255,224, //jmpq *%rax
5328};
5329
// sk_repeat_y (SSE4.1 build): wraps xmm1 into [0, limit) by subtracting a
// whole number of periods.
//   - lods reads a pointer from the (%rsi) stream; the float it points at is
//     broadcast as the limit (xmm8).
//   - xmm1 = xmm1 - floor(xmm1 / limit) * limit
//     (roundps with immediate 1 rounds toward negative infinity).
//   - The result is then capped with minps at the limit's bit pattern minus
//     one (pcmpeqd all-ones + paddd), which for a positive finite limit is
//     the largest float strictly below it.
//   - The final lods/jmpq fetches the next stream entry and jumps to it.
// Writes rax, rsi, xmm8 and xmm9 in addition to xmm1.
5330CODE const uint8_t sk_repeat_y_sse41[] = {
5331 72,173, //lods %ds:(%rsi),%rax
5332 243,68,15,16,0, //movss (%rax),%xmm8
5333 69,15,198,192,0, //shufps $0x0,%xmm8,%xmm8
5334 68,15,40,201, //movaps %xmm1,%xmm9
5335 69,15,94,200, //divps %xmm8,%xmm9
5336 102,69,15,58,8,201,1, //roundps $0x1,%xmm9,%xmm9
5337 69,15,89,200, //mulps %xmm8,%xmm9
5338 65,15,92,201, //subps %xmm9,%xmm1
5339 102,69,15,118,201, //pcmpeqd %xmm9,%xmm9
5340 102,69,15,254,200, //paddd %xmm8,%xmm9
5341 65,15,93,201, //minps %xmm9,%xmm1
5342 72,173, //lods %ds:(%rsi),%rax
5343 255,224, //jmpq *%rax
5344};
5345
// sk_mirror_x (SSE4.1 build): reflects xmm0 back and forth over a period of
// twice the limit.
//   - lods reads a pointer from the (%rsi) stream; the float it points at is
//     the limit L (broadcast in xmm9); 2*L is formed with addss and broadcast
//     in xmm8.
//   - With v = xmm0 - L:
//         v = v - floor(v / 2L) * 2L     (roundps immediate 1 = round down)
//         v = v - L
//         xmm0 = v AND (0 - v)           (the two differ only in the sign bit
//                                         for ordinary values, so this is |v|)
//   - The result is capped with minps at L's bit pattern minus one
//     (pcmpeqd all-ones + paddd), which for a positive finite L is the
//     largest float strictly below it.
//   - The final lods/jmpq fetches the next stream entry and jumps to it.
// Writes rax, rsi and xmm8-xmm10 in addition to xmm0.
5346CODE const uint8_t sk_mirror_x_sse41[] = {
5347 72,173, //lods %ds:(%rsi),%rax
5348 243,68,15,16,0, //movss (%rax),%xmm8
5349 69,15,40,200, //movaps %xmm8,%xmm9
5350 69,15,198,201,0, //shufps $0x0,%xmm9,%xmm9
5351 65,15,92,193, //subps %xmm9,%xmm0
5352 243,69,15,88,192, //addss %xmm8,%xmm8
5353 69,15,198,192,0, //shufps $0x0,%xmm8,%xmm8
5354 68,15,40,208, //movaps %xmm0,%xmm10
5355 69,15,94,208, //divps %xmm8,%xmm10
5356 102,69,15,58,8,210,1, //roundps $0x1,%xmm10,%xmm10
5357 69,15,89,208, //mulps %xmm8,%xmm10
5358 65,15,92,194, //subps %xmm10,%xmm0
5359 65,15,92,193, //subps %xmm9,%xmm0
5360 69,15,87,192, //xorps %xmm8,%xmm8
5361 68,15,92,192, //subps %xmm0,%xmm8
5362 65,15,84,192, //andps %xmm8,%xmm0
5363 102,69,15,118,192, //pcmpeqd %xmm8,%xmm8
5364 102,69,15,254,193, //paddd %xmm9,%xmm8
5365 65,15,93,192, //minps %xmm8,%xmm0
5366 72,173, //lods %ds:(%rsi),%rax
5367 255,224, //jmpq *%rax
5368};
5369
// sk_mirror_y (SSE4.1 build): reflects xmm1 back and forth over a period of
// twice the limit.
//   - lods reads a pointer from the (%rsi) stream; the float it points at is
//     the limit L (broadcast in xmm9); 2*L is formed with addss and broadcast
//     in xmm8.
//   - With v = xmm1 - L:
//         v = v - floor(v / 2L) * 2L     (roundps immediate 1 = round down)
//         v = v - L
//         xmm1 = v AND (0 - v)           (the two differ only in the sign bit
//                                         for ordinary values, so this is |v|)
//   - The result is capped with minps at L's bit pattern minus one
//     (pcmpeqd all-ones + paddd), which for a positive finite L is the
//     largest float strictly below it.
//   - The final lods/jmpq fetches the next stream entry and jumps to it.
// Writes rax, rsi and xmm8-xmm10 in addition to xmm1.
5370CODE const uint8_t sk_mirror_y_sse41[] = {
5371 72,173, //lods %ds:(%rsi),%rax
5372 243,68,15,16,0, //movss (%rax),%xmm8
5373 69,15,40,200, //movaps %xmm8,%xmm9
5374 69,15,198,201,0, //shufps $0x0,%xmm9,%xmm9
5375 65,15,92,201, //subps %xmm9,%xmm1
5376 243,69,15,88,192, //addss %xmm8,%xmm8
5377 69,15,198,192,0, //shufps $0x0,%xmm8,%xmm8
5378 68,15,40,209, //movaps %xmm1,%xmm10
5379 69,15,94,208, //divps %xmm8,%xmm10
5380 102,69,15,58,8,210,1, //roundps $0x1,%xmm10,%xmm10
5381 69,15,89,208, //mulps %xmm8,%xmm10
5382 65,15,92,202, //subps %xmm10,%xmm1
5383 65,15,92,201, //subps %xmm9,%xmm1
5384 69,15,87,192, //xorps %xmm8,%xmm8
5385 68,15,92,193, //subps %xmm1,%xmm8
5386 65,15,84,200, //andps %xmm8,%xmm1
5387 102,69,15,118,192, //pcmpeqd %xmm8,%xmm8
5388 102,69,15,254,193, //paddd %xmm9,%xmm8
5389 65,15,93,200, //minps %xmm8,%xmm1
5390 72,173, //lods %ds:(%rsi),%rax
5391 255,224, //jmpq *%rax
5392};
5393
// sk_matrix_2x3 (SSE4.1 build): applies a 2x3 affine transform to
// (xmm0, xmm1).
//   - The incoming xmm0 (x) and xmm1 (y) are copied to xmm8 and xmm9.
//   - lods reads a pointer m from the (%rsi) stream; six floats are read
//     through it at byte offsets 0x00..0x14 (m[0]..m[5]), each broadcast.
//   - Results:
//         xmm0 = m[0]*x + (m[2]*y + m[4])
//         xmm1 = m[1]*x + (m[3]*y + m[5])
//   - The final lods/jmpq fetches the next stream entry and jumps to it.
// Writes rax, rsi and xmm8-xmm11 in addition to xmm0 and xmm1.
5394CODE const uint8_t sk_matrix_2x3_sse41[] = {
5395 68,15,40,201, //movaps %xmm1,%xmm9
5396 68,15,40,192, //movaps %xmm0,%xmm8
5397 72,173, //lods %ds:(%rsi),%rax
5398 243,15,16,0, //movss (%rax),%xmm0
5399 243,15,16,72,4, //movss 0x4(%rax),%xmm1
5400 15,198,192,0, //shufps $0x0,%xmm0,%xmm0
5401 243,68,15,16,80,8, //movss 0x8(%rax),%xmm10
5402 69,15,198,210,0, //shufps $0x0,%xmm10,%xmm10
5403 243,68,15,16,88,16, //movss 0x10(%rax),%xmm11
5404 69,15,198,219,0, //shufps $0x0,%xmm11,%xmm11
5405 69,15,89,209, //mulps %xmm9,%xmm10
5406 69,15,88,211, //addps %xmm11,%xmm10
5407 65,15,89,192, //mulps %xmm8,%xmm0
5408 65,15,88,194, //addps %xmm10,%xmm0
5409 15,198,201,0, //shufps $0x0,%xmm1,%xmm1
5410 243,68,15,16,80,12, //movss 0xc(%rax),%xmm10
5411 69,15,198,210,0, //shufps $0x0,%xmm10,%xmm10
5412 243,68,15,16,88,20, //movss 0x14(%rax),%xmm11
5413 69,15,198,219,0, //shufps $0x0,%xmm11,%xmm11
5414 69,15,89,209, //mulps %xmm9,%xmm10
5415 69,15,88,211, //addps %xmm11,%xmm10
5416 65,15,89,200, //mulps %xmm8,%xmm1
5417 65,15,88,202, //addps %xmm10,%xmm1
5418 72,173, //lods %ds:(%rsi),%rax
5419 255,224, //jmpq *%rax
5420};
5421
// sk_matrix_3x4 (SSE4.1 build): applies a 3x4 affine transform to
// (xmm0, xmm1, xmm2).
//   - The incoming xmm0 (x) and xmm1 (y) are copied to xmm8 and xmm9; the
//     incoming xmm2 (z) is read in place and only overwritten at the end.
//   - lods reads a pointer m from the (%rsi) stream; twelve floats are read
//     through it at byte offsets 0x00..0x2c (m[0]..m[11]), each broadcast.
//   - Results:
//         xmm0 = m[0]*x + (m[3]*y + (m[6]*z + m[9]))
//         xmm1 = m[1]*x + (m[4]*y + (m[7]*z + m[10]))
//         xmm2 = m[2]*x + (m[5]*y + (m[8]*z + m[11]))
//   - The final lods/jmpq fetches the next stream entry and jumps to it.
// Writes rax, rsi and xmm8-xmm13 in addition to xmm0-xmm2.
5422CODE const uint8_t sk_matrix_3x4_sse41[] = {
5423 68,15,40,201, //movaps %xmm1,%xmm9
5424 68,15,40,192, //movaps %xmm0,%xmm8
5425 72,173, //lods %ds:(%rsi),%rax
5426 243,15,16,0, //movss (%rax),%xmm0
5427 243,15,16,72,4, //movss 0x4(%rax),%xmm1
5428 15,198,192,0, //shufps $0x0,%xmm0,%xmm0
5429 243,68,15,16,80,12, //movss 0xc(%rax),%xmm10
5430 69,15,198,210,0, //shufps $0x0,%xmm10,%xmm10
5431 243,68,15,16,88,24, //movss 0x18(%rax),%xmm11
5432 69,15,198,219,0, //shufps $0x0,%xmm11,%xmm11
5433 243,68,15,16,96,36, //movss 0x24(%rax),%xmm12
5434 69,15,198,228,0, //shufps $0x0,%xmm12,%xmm12
5435 68,15,89,218, //mulps %xmm2,%xmm11
5436 69,15,88,220, //addps %xmm12,%xmm11
5437 69,15,89,209, //mulps %xmm9,%xmm10
5438 69,15,88,211, //addps %xmm11,%xmm10
5439 65,15,89,192, //mulps %xmm8,%xmm0
5440 65,15,88,194, //addps %xmm10,%xmm0
5441 15,198,201,0, //shufps $0x0,%xmm1,%xmm1
5442 243,68,15,16,80,16, //movss 0x10(%rax),%xmm10
5443 69,15,198,210,0, //shufps $0x0,%xmm10,%xmm10
5444 243,68,15,16,88,28, //movss 0x1c(%rax),%xmm11
5445 69,15,198,219,0, //shufps $0x0,%xmm11,%xmm11
5446 243,68,15,16,96,40, //movss 0x28(%rax),%xmm12
5447 69,15,198,228,0, //shufps $0x0,%xmm12,%xmm12
5448 68,15,89,218, //mulps %xmm2,%xmm11
5449 69,15,88,220, //addps %xmm12,%xmm11
5450 69,15,89,209, //mulps %xmm9,%xmm10
5451 69,15,88,211, //addps %xmm11,%xmm10
5452 65,15,89,200, //mulps %xmm8,%xmm1
5453 65,15,88,202, //addps %xmm10,%xmm1
5454 243,68,15,16,80,8, //movss 0x8(%rax),%xmm10
5455 69,15,198,210,0, //shufps $0x0,%xmm10,%xmm10
5456 243,68,15,16,88,20, //movss 0x14(%rax),%xmm11
5457 69,15,198,219,0, //shufps $0x0,%xmm11,%xmm11
5458 243,68,15,16,96,32, //movss 0x20(%rax),%xmm12
5459 69,15,198,228,0, //shufps $0x0,%xmm12,%xmm12
5460 243,68,15,16,104,44, //movss 0x2c(%rax),%xmm13
5461 69,15,198,237,0, //shufps $0x0,%xmm13,%xmm13
5462 68,15,89,226, //mulps %xmm2,%xmm12
5463 69,15,88,229, //addps %xmm13,%xmm12
5464 69,15,89,217, //mulps %xmm9,%xmm11
5465 69,15,88,220, //addps %xmm12,%xmm11
5466 69,15,89,208, //mulps %xmm8,%xmm10
5467 69,15,88,211, //addps %xmm11,%xmm10
5468 72,173, //lods %ds:(%rsi),%rax
5469 65,15,40,210, //movaps %xmm10,%xmm2
5470 255,224, //jmpq *%rax
5471};
5472
// sk_matrix_perspective (SSE4.1 build): applies a 3x3 projective transform
// to (xmm0, xmm1).
//   - The incoming xmm0 (x) is copied to xmm8; the incoming xmm1 (y) is read
//     in place until it is overwritten by the reciprocal.
//   - lods reads a pointer m from the (%rsi) stream; nine floats are read
//     through it at byte offsets 0x00..0x20 (m[0]..m[8]), each broadcast.
//   - Intermediate values:
//         X = m[0]*x + (m[1]*y + m[2])
//         Y = m[3]*x + (m[4]*y + m[5])
//         W = m[6]*x + (m[7]*y + m[8])
//   - Results: xmm0 = X * rcpps(W), xmm1 = Y * rcpps(W). rcpps is the SSE
//     approximate reciprocal, so the divide is not exact.
//   - The final lods/jmpq fetches the next stream entry and jumps to it.
// Writes rax, rsi and xmm8-xmm12 in addition to xmm0 and xmm1.
5473CODE const uint8_t sk_matrix_perspective_sse41[] = {
5474 68,15,40,192, //movaps %xmm0,%xmm8
5475 72,173, //lods %ds:(%rsi),%rax
5476 243,15,16,0, //movss (%rax),%xmm0
5477 243,68,15,16,72,4, //movss 0x4(%rax),%xmm9
5478 15,198,192,0, //shufps $0x0,%xmm0,%xmm0
5479 69,15,198,201,0, //shufps $0x0,%xmm9,%xmm9
5480 243,68,15,16,80,8, //movss 0x8(%rax),%xmm10
5481 69,15,198,210,0, //shufps $0x0,%xmm10,%xmm10
5482 68,15,89,201, //mulps %xmm1,%xmm9
5483 69,15,88,202, //addps %xmm10,%xmm9
5484 65,15,89,192, //mulps %xmm8,%xmm0
5485 65,15,88,193, //addps %xmm9,%xmm0
5486 243,68,15,16,72,12, //movss 0xc(%rax),%xmm9
5487 69,15,198,201,0, //shufps $0x0,%xmm9,%xmm9
5488 243,68,15,16,80,16, //movss 0x10(%rax),%xmm10
5489 69,15,198,210,0, //shufps $0x0,%xmm10,%xmm10
5490 243,68,15,16,88,20, //movss 0x14(%rax),%xmm11
5491 69,15,198,219,0, //shufps $0x0,%xmm11,%xmm11
5492 68,15,89,209, //mulps %xmm1,%xmm10
5493 69,15,88,211, //addps %xmm11,%xmm10
5494 69,15,89,200, //mulps %xmm8,%xmm9
5495 69,15,88,202, //addps %xmm10,%xmm9
5496 243,68,15,16,80,24, //movss 0x18(%rax),%xmm10
5497 69,15,198,210,0, //shufps $0x0,%xmm10,%xmm10
5498 243,68,15,16,88,28, //movss 0x1c(%rax),%xmm11
5499 69,15,198,219,0, //shufps $0x0,%xmm11,%xmm11
5500 243,68,15,16,96,32, //movss 0x20(%rax),%xmm12
5501 69,15,198,228,0, //shufps $0x0,%xmm12,%xmm12
5502 68,15,89,217, //mulps %xmm1,%xmm11
5503 69,15,88,220, //addps %xmm12,%xmm11
5504 69,15,89,208, //mulps %xmm8,%xmm10
5505 69,15,88,211, //addps %xmm11,%xmm10
5506 65,15,83,202, //rcpps %xmm10,%xmm1
5507 15,89,193, //mulps %xmm1,%xmm0
5508 68,15,89,201, //mulps %xmm1,%xmm9
5509 72,173, //lods %ds:(%rsi),%rax
5510 65,15,40,201, //movaps %xmm9,%xmm1
5511 255,224, //jmpq *%rax
5512};
5513
// sk_linear_gradient_2stops (SSE4.1 build): evaluates four linear functions
// of xmm0 and writes them to xmm0-xmm3.
//   - lods reads a pointer from the (%rsi) stream. Two unaligned groups of
//     four floats are loaded through it: c0 at offset 0x00 (xmm9) and c1 at
//     offset 0x10 (xmm3).
//   - With t = the incoming xmm0, each output i in 0..3 is
//         xmm<i> = broadcast(c1[i]) * t + broadcast(c0[i])
//     (shufps immediates 0x00, 0x55, 0xaa and 0xff select lanes 0..3).
//   - The xmm0 result is built in xmm8 and only moved into xmm0 at the end,
//     because t is still needed for the other three outputs.
//   - The final lods/jmpq fetches the next stream entry and jumps to it.
// Writes rax, rsi and xmm8-xmm10 in addition to xmm0-xmm3.
5514CODE const uint8_t sk_linear_gradient_2stops_sse41[] = {
5515 72,173, //lods %ds:(%rsi),%rax
5516 68,15,16,8, //movups (%rax),%xmm9
5517 15,16,88,16, //movups 0x10(%rax),%xmm3
5518 68,15,40,195, //movaps %xmm3,%xmm8
5519 69,15,198,192,0, //shufps $0x0,%xmm8,%xmm8
5520 65,15,40,201, //movaps %xmm9,%xmm1
5521 15,198,201,0, //shufps $0x0,%xmm1,%xmm1
5522 68,15,89,192, //mulps %xmm0,%xmm8
5523 68,15,88,193, //addps %xmm1,%xmm8
5524 15,40,203, //movaps %xmm3,%xmm1
5525 15,198,201,85, //shufps $0x55,%xmm1,%xmm1
5526 65,15,40,209, //movaps %xmm9,%xmm2
5527 15,198,210,85, //shufps $0x55,%xmm2,%xmm2
5528 15,89,200, //mulps %xmm0,%xmm1
5529 15,88,202, //addps %xmm2,%xmm1
5530 15,40,211, //movaps %xmm3,%xmm2
5531 15,198,210,170, //shufps $0xaa,%xmm2,%xmm2
5532 69,15,40,209, //movaps %xmm9,%xmm10
5533 69,15,198,210,170, //shufps $0xaa,%xmm10,%xmm10
5534 15,89,208, //mulps %xmm0,%xmm2
5535 65,15,88,210, //addps %xmm10,%xmm2
5536 15,198,219,255, //shufps $0xff,%xmm3,%xmm3
5537 69,15,198,201,255, //shufps $0xff,%xmm9,%xmm9
5538 15,89,216, //mulps %xmm0,%xmm3
5539 65,15,88,217, //addps %xmm9,%xmm3
5540 72,173, //lods %ds:(%rsi),%rax
5541 65,15,40,192, //movaps %xmm8,%xmm0
5542 255,224, //jmpq *%rax
5543};
5544
// sk_start_pipeline (SSE2 build): driver loop that repeatedly calls the first
// entry of the (%rsi) stream, advancing an index by 4 each time.
//
// Inputs (as read by the code): rdi = starting index x, rsi = pointer to a
// stream of 8-byte entries, rdx = a value passed through unchanged to every
// call, rcx = upper limit for the index.
//
// Behavior:
//   - Saves r15, r14, r13, r12 and rbx, then keeps limit in r15, the rdx
//     value in r14 and x in rbx.
//   - lods reads the first stream entry into r12 (the call target); the
//     advanced rsi is kept in r13.
//   - If x + 4 > limit (unsigned compare) nothing is called and x is returned.
//   - Otherwise, each iteration zeroes xmm0-xmm7, sets rdi = x, rsi = r13,
//     rdx = r14, calls *r12, then advances x by 4. The loop continues while
//     the new x + 4 <= limit (unsigned).
//   - Returns in rax the first index that was not processed, and restores the
//     saved registers.
5545CODE const uint8_t sk_start_pipeline_sse2[] = {
5546 65,87, //push %r15
5547 65,86, //push %r14
5548 65,85, //push %r13
5549 65,84, //push %r12
5550 83, //push %rbx
5551 73,137,207, //mov %rcx,%r15
5552 73,137,214, //mov %rdx,%r14
5553 72,137,251, //mov %rdi,%rbx
5554 72,173, //lods %ds:(%rsi),%rax
5555 73,137,196, //mov %rax,%r12
5556 73,137,245, //mov %rsi,%r13
5557 72,141,67,4, //lea 0x4(%rbx),%rax
5558 76,57,248, //cmp %r15,%rax
5559 118,5, //jbe 28 <_sk_start_pipeline_sse2+0x28>
  // Not even one group of 4 fits: return x unchanged via the epilogue at 0x5c.
5560 72,137,216, //mov %rbx,%rax
5561 235,52, //jmp 5c <_sk_start_pipeline_sse2+0x5c>
  // Offset 0x28: loop body.
5562 15,87,192, //xorps %xmm0,%xmm0
5563 15,87,201, //xorps %xmm1,%xmm1
5564 15,87,210, //xorps %xmm2,%xmm2
5565 15,87,219, //xorps %xmm3,%xmm3
5566 15,87,228, //xorps %xmm4,%xmm4
5567 15,87,237, //xorps %xmm5,%xmm5
5568 15,87,246, //xorps %xmm6,%xmm6
5569 15,87,255, //xorps %xmm7,%xmm7
5570 72,137,223, //mov %rbx,%rdi
5571 76,137,238, //mov %r13,%rsi
5572 76,137,242, //mov %r14,%rdx
5573 65,255,212, //callq *%r12
  // rax = x + 4 (the next x); rbx = x + 8 is only used for the compare and is
  // then replaced by rax. The mov does not alter the flags tested by jbe.
5574 72,141,67,4, //lea 0x4(%rbx),%rax
5575 72,131,195,8, //add $0x8,%rbx
5576 76,57,251, //cmp %r15,%rbx
5577 72,137,195, //mov %rax,%rbx
5578 118,204, //jbe 28 <_sk_start_pipeline_sse2+0x28>
  // Offset 0x5c: epilogue.
5579 91, //pop %rbx
5580 65,92, //pop %r12
5581 65,93, //pop %r13
5582 65,94, //pop %r14
5583 65,95, //pop %r15
5584 195, //retq
5585};
5586
// sk_just_return (SSE2 build): a single retq. Unlike the other entries it
// does not fetch and jump to a next stream entry; it returns to whoever
// issued the call.
5587CODE const uint8_t sk_just_return_sse2[] = {
5588 195, //retq
5589};
5590
// sk_seed_shader (SSE2 build): initializes xmm0-xmm7 from the index in %edi,
// one integer read through the stream pointer, and constants at %rdx.
//   - The first lods reads a pointer from the (%rsi) stream into %rax.
//   - xmm0 = float(%edi broadcast) + broadcast float at 0x4(%rdx)
//            + the four floats at 0x14(%rdx)
//     (presumably 0.5f and a {0,1,2,3} ramp, giving per-lane pixel centers --
//     confirm against the constants layout).
//   - xmm1 = float(int32 at (%rax), broadcast) + broadcast float at 0x4(%rdx).
//   - xmm2 = float at (%rdx) broadcast to all lanes (presumably 1.0f --
//     confirm).
//   - xmm3, xmm4, xmm5, xmm6 and xmm7 are cleared with xorps.
//   - The second lods reads the next stream entry, which is then jumped to.
// Writes rax and rsi in addition to xmm0-xmm7.
5591CODE const uint8_t sk_seed_shader_sse2[] = {
5592 72,173, //lods %ds:(%rsi),%rax
5593 102,15,110,199, //movd %edi,%xmm0
5594 102,15,112,192,0, //pshufd $0x0,%xmm0,%xmm0
5595 15,91,200, //cvtdq2ps %xmm0,%xmm1
5596 243,15,16,18, //movss (%rdx),%xmm2
5597 243,15,16,90,4, //movss 0x4(%rdx),%xmm3
5598 15,198,219,0, //shufps $0x0,%xmm3,%xmm3
5599 15,88,203, //addps %xmm3,%xmm1
5600 15,16,66,20, //movups 0x14(%rdx),%xmm0
5601 15,88,193, //addps %xmm1,%xmm0
5602 102,15,110,8, //movd (%rax),%xmm1
5603 102,15,112,201,0, //pshufd $0x0,%xmm1,%xmm1
5604 15,91,201, //cvtdq2ps %xmm1,%xmm1
5605 15,88,203, //addps %xmm3,%xmm1
5606 15,198,210,0, //shufps $0x0,%xmm2,%xmm2
5607 72,173, //lods %ds:(%rsi),%rax
5608 15,87,219, //xorps %xmm3,%xmm3
5609 15,87,228, //xorps %xmm4,%xmm4
5610 15,87,237, //xorps %xmm5,%xmm5
5611 15,87,246, //xorps %xmm6,%xmm6
5612 15,87,255, //xorps %xmm7,%xmm7
5613 255,224, //jmpq *%rax
5614};
5615
// sk_constant_color (SSE2 build): fills xmm0-xmm3 with four constants.
//   - lods reads a pointer from the (%rsi) stream; four floats are loaded
//     unaligned through it.
//   - xmm0, xmm1, xmm2 and xmm3 each receive one of those floats broadcast to
//     all lanes (element 0, 1, 2 and 3 respectively; shufps immediates 0x00,
//     0x55, 0xaa, 0xff).
//   - The second lods reads the next stream entry, which is then jumped to.
// Writes rax and rsi in addition to xmm0-xmm3.
5616CODE const uint8_t sk_constant_color_sse2[] = {
5617 72,173, //lods %ds:(%rsi),%rax
5618 15,16,24, //movups (%rax),%xmm3
5619 15,40,195, //movaps %xmm3,%xmm0
5620 15,198,192,0, //shufps $0x0,%xmm0,%xmm0
5621 15,40,203, //movaps %xmm3,%xmm1
5622 15,198,201,85, //shufps $0x55,%xmm1,%xmm1
5623 15,40,211, //movaps %xmm3,%xmm2
5624 15,198,210,170, //shufps $0xaa,%xmm2,%xmm2
5625 15,198,219,255, //shufps $0xff,%xmm3,%xmm3
5626 72,173, //lods %ds:(%rsi),%rax
5627 255,224, //jmpq *%rax
5628};
5629
// sk_clear (SSE2 build): zeroes xmm0, xmm1, xmm2 and xmm3 with xorps.
// The single lods reads the next entry of the (%rsi) stream into %rax, which
// is jumped to at the end. Writes rax and rsi in addition to xmm0-xmm3.
5630CODE const uint8_t sk_clear_sse2[] = {
5631 72,173, //lods %ds:(%rsi),%rax
5632 15,87,192, //xorps %xmm0,%xmm0
5633 15,87,201, //xorps %xmm1,%xmm1
5634 15,87,210, //xorps %xmm2,%xmm2
5635 15,87,219, //xorps %xmm3,%xmm3
5636 255,224, //jmpq *%rax
5637};
5638
// sk_plus_ (SSE2 build): lane-wise addition of two register groups:
//     xmm0 += xmm4, xmm1 += xmm5, xmm2 += xmm6, xmm3 += xmm7
// xmm4-xmm7 are read but not modified. The lods/jmpq pair then reads the
// next entry of the (%rsi) stream into %rax and jumps to it.
5639CODE const uint8_t sk_plus__sse2[] = {
5640 15,88,196, //addps %xmm4,%xmm0
5641 15,88,205, //addps %xmm5,%xmm1
5642 15,88,214, //addps %xmm6,%xmm2
5643 15,88,223, //addps %xmm7,%xmm3
5644 72,173, //lods %ds:(%rsi),%rax
5645 255,224, //jmpq *%rax
5646};
5647
// sk_srcover (SSE2 build): blends xmm4-xmm7 into xmm0-xmm3.
//   - k = (float at (%rdx), broadcast) - xmm3
//     (the constant is presumably 1.0f, making k = 1 - xmm3 -- confirm).
//   - xmm0 += k * xmm4, xmm1 += k * xmm5, xmm2 += k * xmm6, xmm3 += k * xmm7.
//     k is computed from the incoming xmm3, before xmm3 itself is updated.
//   - The lods/jmpq pair reads the next entry of the (%rsi) stream into %rax
//     and jumps to it.
// xmm4-xmm7 are read but not modified. Writes rax, rsi, xmm8 and xmm9.
5648CODE const uint8_t sk_srcover_sse2[] = {
5649 243,68,15,16,2, //movss (%rdx),%xmm8
5650 69,15,198,192,0, //shufps $0x0,%xmm8,%xmm8
5651 68,15,92,195, //subps %xmm3,%xmm8
5652 69,15,40,200, //movaps %xmm8,%xmm9
5653 68,15,89,204, //mulps %xmm4,%xmm9
5654 65,15,88,193, //addps %xmm9,%xmm0
5655 69,15,40,200, //movaps %xmm8,%xmm9
5656 68,15,89,205, //mulps %xmm5,%xmm9
5657 65,15,88,201, //addps %xmm9,%xmm1
5658 69,15,40,200, //movaps %xmm8,%xmm9
5659 68,15,89,206, //mulps %xmm6,%xmm9
5660 65,15,88,209, //addps %xmm9,%xmm2
5661 68,15,89,199, //mulps %xmm7,%xmm8
5662 65,15,88,216, //addps %xmm8,%xmm3
5663 72,173, //lods %ds:(%rsi),%rax
5664 255,224, //jmpq *%rax
5665};
5666
// sk_dstover (SSE2 build): blends xmm0-xmm3 underneath xmm4-xmm7.
//   - k = (float at (%rdx), broadcast) - xmm7
//     (the constant is presumably 1.0f, making k = 1 - xmm7 -- confirm).
//   - xmm0 = xmm0 * k + xmm4, xmm1 = xmm1 * k + xmm5,
//     xmm2 = xmm2 * k + xmm6, xmm3 = xmm3 * k + xmm7.
//   - The lods/jmpq pair reads the next entry of the (%rsi) stream into %rax
//     and jumps to it.
// xmm4-xmm7 are read but not modified. Writes rax, rsi and xmm8.
5667CODE const uint8_t sk_dstover_sse2[] = {
5668 243,68,15,16,2, //movss (%rdx),%xmm8
5669 69,15,198,192,0, //shufps $0x0,%xmm8,%xmm8
5670 68,15,92,199, //subps %xmm7,%xmm8
5671 65,15,89,192, //mulps %xmm8,%xmm0
5672 15,88,196, //addps %xmm4,%xmm0
5673 65,15,89,200, //mulps %xmm8,%xmm1
5674 15,88,205, //addps %xmm5,%xmm1
5675 65,15,89,208, //mulps %xmm8,%xmm2
5676 15,88,214, //addps %xmm6,%xmm2
5677 65,15,89,216, //mulps %xmm8,%xmm3
5678 15,88,223, //addps %xmm7,%xmm3
5679 72,173, //lods %ds:(%rsi),%rax
5680 255,224, //jmpq *%rax
5681};
5682
// sk_clamp_0 (SSE2 build): clamps xmm0-xmm3 from below at zero.
// xmm8 is zeroed with xorps and each of xmm0, xmm1, xmm2, xmm3 is replaced by
// maxps(reg, 0). The lods/jmpq pair then reads the next entry of the (%rsi)
// stream into %rax and jumps to it. Writes rax, rsi and xmm8.
5683CODE const uint8_t sk_clamp_0_sse2[] = {
5684 69,15,87,192, //xorps %xmm8,%xmm8
5685 65,15,95,192, //maxps %xmm8,%xmm0
5686 65,15,95,200, //maxps %xmm8,%xmm1
5687 65,15,95,208, //maxps %xmm8,%xmm2
5688 65,15,95,216, //maxps %xmm8,%xmm3
5689 72,173, //lods %ds:(%rsi),%rax
5690 255,224, //jmpq *%rax
5691};
5692
// sk_clamp_1 (SSE2 build): clamps xmm0-xmm3 from above.
// The float at (%rdx) is broadcast into xmm8 (presumably 1.0f -- confirm
// against the constants layout) and each of xmm0, xmm1, xmm2, xmm3 is
// replaced by minps(reg, xmm8). The lods/jmpq pair then reads the next entry
// of the (%rsi) stream into %rax and jumps to it. Writes rax, rsi and xmm8.
5693CODE const uint8_t sk_clamp_1_sse2[] = {
5694 243,68,15,16,2, //movss (%rdx),%xmm8
5695 69,15,198,192,0, //shufps $0x0,%xmm8,%xmm8
5696 65,15,93,192, //minps %xmm8,%xmm0
5697 65,15,93,200, //minps %xmm8,%xmm1
5698 65,15,93,208, //minps %xmm8,%xmm2
5699 65,15,93,216, //minps %xmm8,%xmm3
5700 72,173, //lods %ds:(%rsi),%rax
5701 255,224, //jmpq *%rax
5702};
5703
// sk_clamp_a (SSE2 build): clamps xmm3 from above, then clamps xmm0-xmm2 to
// the clamped xmm3.
//   - xmm3 = min(xmm3, float at (%rdx) broadcast) (presumably 1.0f --
//     confirm against the constants layout).
//   - xmm0 = min(xmm0, xmm3), xmm1 = min(xmm1, xmm3), xmm2 = min(xmm2, xmm3).
//   - The lods/jmpq pair reads the next entry of the (%rsi) stream into %rax
//     and jumps to it.
// Writes rax, rsi and xmm8 in addition to xmm0-xmm3.
5704CODE const uint8_t sk_clamp_a_sse2[] = {
5705 243,68,15,16,2, //movss (%rdx),%xmm8
5706 69,15,198,192,0, //shufps $0x0,%xmm8,%xmm8
5707 65,15,93,216, //minps %xmm8,%xmm3
5708 15,93,195, //minps %xmm3,%xmm0
5709 15,93,203, //minps %xmm3,%xmm1
5710 15,93,211, //minps %xmm3,%xmm2
5711 72,173, //lods %ds:(%rsi),%rax
5712 255,224, //jmpq *%rax
5713};
5714
// sk_set_rgb (SSE2 build): overwrites xmm0-xmm2 with three constants.
//   - The first lods reads a pointer from the (%rsi) stream into %rax.
//   - xmm0, xmm1 and xmm2 receive the floats at byte offsets 0x0, 0x4 and 0x8
//     from that pointer, each broadcast to all four lanes.
//   - xmm3 is left untouched.
//   - The second lods reads the next stream entry, which is then jumped to.
// Writes rax and rsi in addition to xmm0-xmm2.
5715CODE const uint8_t sk_set_rgb_sse2[] = {
5716 72,173, //lods %ds:(%rsi),%rax
5717 243,15,16,0, //movss (%rax),%xmm0
5718 243,15,16,72,4, //movss 0x4(%rax),%xmm1
5719 15,198,192,0, //shufps $0x0,%xmm0,%xmm0
5720 15,198,201,0, //shufps $0x0,%xmm1,%xmm1
5721 243,15,16,80,8, //movss 0x8(%rax),%xmm2
5722 15,198,210,0, //shufps $0x0,%xmm2,%xmm2
5723 72,173, //lods %ds:(%rsi),%rax
5724 255,224, //jmpq *%rax
5725};
5726
// sk_swap_rb (SSE2 build): exchanges xmm0 and xmm2, using xmm8 as the
// temporary. xmm1 and xmm3 are untouched. The lods reads the next entry of
// the (%rsi) stream into %rax, which is jumped to at the end.
// Writes rax, rsi and xmm8 in addition to xmm0 and xmm2.
5727CODE const uint8_t sk_swap_rb_sse2[] = {
5728 68,15,40,192, //movaps %xmm0,%xmm8
5729 72,173, //lods %ds:(%rsi),%rax
5730 15,40,194, //movaps %xmm2,%xmm0
5731 65,15,40,208, //movaps %xmm8,%xmm2
5732 255,224, //jmpq *%rax
5733};
5734
5735CODE const uint8_t sk_swap_sse2[] = {
5736 68,15,40,195, //movaps %xmm3,%xmm8
5737 68,15,40,202, //movaps %xmm2,%xmm9
5738 68,15,40,209, //movaps %xmm1,%xmm10
5739 68,15,40,216, //movaps %xmm0,%xmm11
5740 72,173, //lods %ds:(%rsi),%rax
5741 15,40,196, //movaps %xmm4,%xmm0
5742 15,40,205, //movaps %xmm5,%xmm1
5743 15,40,214, //movaps %xmm6,%xmm2
5744 15,40,223, //movaps %xmm7,%xmm3
5745 65,15,40,227, //movaps %xmm11,%xmm4
5746 65,15,40,234, //movaps %xmm10,%xmm5
5747 65,15,40,241, //movaps %xmm9,%xmm6
5748 65,15,40,248, //movaps %xmm8,%xmm7
5749 255,224, //jmpq *%rax
5750};
5751
5752CODE const uint8_t sk_move_src_dst_sse2[] = {
5753 72,173, //lods %ds:(%rsi),%rax
5754 15,40,224, //movaps %xmm0,%xmm4
5755 15,40,233, //movaps %xmm1,%xmm5
5756 15,40,242, //movaps %xmm2,%xmm6
5757 15,40,251, //movaps %xmm3,%xmm7
5758 255,224, //jmpq *%rax
5759};
5760
5761CODE const uint8_t sk_move_dst_src_sse2[] = {
5762 72,173, //lods %ds:(%rsi),%rax
5763 15,40,196, //movaps %xmm4,%xmm0
5764 15,40,205, //movaps %xmm5,%xmm1
5765 15,40,214, //movaps %xmm6,%xmm2
5766 15,40,223, //movaps %xmm7,%xmm3
5767 255,224, //jmpq *%rax
5768};
5769
5770CODE const uint8_t sk_premul_sse2[] = {
5771 15,89,195, //mulps %xmm3,%xmm0
5772 15,89,203, //mulps %xmm3,%xmm1
5773 15,89,211, //mulps %xmm3,%xmm2
5774 72,173, //lods %ds:(%rsi),%rax
5775 255,224, //jmpq *%rax
5776};
5777
5778CODE const uint8_t sk_unpremul_sse2[] = {
5779 69,15,87,192, //xorps %xmm8,%xmm8
5780 68,15,194,195,0, //cmpeqps %xmm3,%xmm8
5781 243,68,15,16,10, //movss (%rdx),%xmm9
5782 69,15,198,201,0, //shufps $0x0,%xmm9,%xmm9
5783 68,15,94,203, //divps %xmm3,%xmm9
5784 69,15,85,193, //andnps %xmm9,%xmm8
5785 65,15,89,192, //mulps %xmm8,%xmm0
5786 65,15,89,200, //mulps %xmm8,%xmm1
5787 65,15,89,208, //mulps %xmm8,%xmm2
5788 72,173, //lods %ds:(%rsi),%rax
5789 255,224, //jmpq *%rax
5790};
5791
5792CODE const uint8_t sk_from_srgb_sse2[] = {
5793 243,68,15,16,66,64, //movss 0x40(%rdx),%xmm8
5794 69,15,198,192,0, //shufps $0x0,%xmm8,%xmm8
5795 69,15,40,232, //movaps %xmm8,%xmm13
5796 68,15,89,232, //mulps %xmm0,%xmm13
5797 68,15,40,224, //movaps %xmm0,%xmm12
5798 69,15,89,228, //mulps %xmm12,%xmm12
5799 243,68,15,16,74,60, //movss 0x3c(%rdx),%xmm9
5800 69,15,198,201,0, //shufps $0x0,%xmm9,%xmm9
5801 243,68,15,16,82,52, //movss 0x34(%rdx),%xmm10
5802 243,68,15,16,90,56, //movss 0x38(%rdx),%xmm11
5803 69,15,198,219,0, //shufps $0x0,%xmm11,%xmm11
5804 69,15,40,241, //movaps %xmm9,%xmm14
5805 68,15,89,240, //mulps %xmm0,%xmm14
5806 69,15,88,243, //addps %xmm11,%xmm14
5807 69,15,198,210,0, //shufps $0x0,%xmm10,%xmm10
5808 69,15,89,244, //mulps %xmm12,%xmm14
5809 69,15,88,242, //addps %xmm10,%xmm14
5810 243,68,15,16,98,68, //movss 0x44(%rdx),%xmm12
5811 69,15,198,228,0, //shufps $0x0,%xmm12,%xmm12
5812 65,15,194,196,1, //cmpltps %xmm12,%xmm0
5813 68,15,84,232, //andps %xmm0,%xmm13
5814 65,15,85,198, //andnps %xmm14,%xmm0
5815 65,15,86,197, //orps %xmm13,%xmm0
5816 69,15,40,232, //movaps %xmm8,%xmm13
5817 68,15,89,233, //mulps %xmm1,%xmm13
5818 68,15,40,241, //movaps %xmm1,%xmm14
5819 69,15,89,246, //mulps %xmm14,%xmm14
5820 69,15,40,249, //movaps %xmm9,%xmm15
5821 68,15,89,249, //mulps %xmm1,%xmm15
5822 69,15,88,251, //addps %xmm11,%xmm15
5823 69,15,89,254, //mulps %xmm14,%xmm15
5824 69,15,88,250, //addps %xmm10,%xmm15
5825 65,15,194,204,1, //cmpltps %xmm12,%xmm1
5826 68,15,84,233, //andps %xmm1,%xmm13
5827 65,15,85,207, //andnps %xmm15,%xmm1
5828 65,15,86,205, //orps %xmm13,%xmm1
5829 68,15,89,194, //mulps %xmm2,%xmm8
5830 68,15,40,234, //movaps %xmm2,%xmm13
5831 69,15,89,237, //mulps %xmm13,%xmm13
5832 68,15,89,202, //mulps %xmm2,%xmm9
5833 69,15,88,203, //addps %xmm11,%xmm9
5834 69,15,89,205, //mulps %xmm13,%xmm9
5835 69,15,88,202, //addps %xmm10,%xmm9
5836 65,15,194,212,1, //cmpltps %xmm12,%xmm2
5837 68,15,84,194, //andps %xmm2,%xmm8
5838 65,15,85,209, //andnps %xmm9,%xmm2
5839 65,15,86,208, //orps %xmm8,%xmm2
5840 72,173, //lods %ds:(%rsi),%rax
5841 255,224, //jmpq *%rax
5842};
5843
5844CODE const uint8_t sk_to_srgb_sse2[] = {
5845 72,131,236,40, //sub $0x28,%rsp
5846 15,41,124,36,16, //movaps %xmm7,0x10(%rsp)
5847 15,41,52,36, //movaps %xmm6,(%rsp)
5848 15,40,245, //movaps %xmm5,%xmm6
5849 15,40,236, //movaps %xmm4,%xmm5
5850 15,40,227, //movaps %xmm3,%xmm4
5851 68,15,82,192, //rsqrtps %xmm0,%xmm8
5852 69,15,83,232, //rcpps %xmm8,%xmm13
5853 69,15,82,248, //rsqrtps %xmm8,%xmm15
5854 243,15,16,26, //movss (%rdx),%xmm3
5855 243,68,15,16,66,72, //movss 0x48(%rdx),%xmm8
5856 69,15,198,192,0, //shufps $0x0,%xmm8,%xmm8
5857 69,15,40,240, //movaps %xmm8,%xmm14
5858 68,15,89,240, //mulps %xmm0,%xmm14
5859 15,198,219,0, //shufps $0x0,%xmm3,%xmm3
5860 243,68,15,16,82,76, //movss 0x4c(%rdx),%xmm10
5861 69,15,198,210,0, //shufps $0x0,%xmm10,%xmm10
5862 243,68,15,16,90,80, //movss 0x50(%rdx),%xmm11
5863 69,15,198,219,0, //shufps $0x0,%xmm11,%xmm11
5864 243,68,15,16,98,84, //movss 0x54(%rdx),%xmm12
5865 69,15,198,228,0, //shufps $0x0,%xmm12,%xmm12
5866 69,15,89,235, //mulps %xmm11,%xmm13
5867 69,15,88,236, //addps %xmm12,%xmm13
5868 69,15,89,250, //mulps %xmm10,%xmm15
5869 69,15,88,253, //addps %xmm13,%xmm15
5870 68,15,40,203, //movaps %xmm3,%xmm9
5871 69,15,93,207, //minps %xmm15,%xmm9
5872 243,68,15,16,106,88, //movss 0x58(%rdx),%xmm13
5873 69,15,198,237,0, //shufps $0x0,%xmm13,%xmm13
5874 65,15,194,197,1, //cmpltps %xmm13,%xmm0
5875 68,15,84,240, //andps %xmm0,%xmm14
5876 65,15,85,193, //andnps %xmm9,%xmm0
5877 65,15,86,198, //orps %xmm14,%xmm0
5878 68,15,82,201, //rsqrtps %xmm1,%xmm9
5879 69,15,83,241, //rcpps %xmm9,%xmm14
5880 69,15,82,201, //rsqrtps %xmm9,%xmm9
5881 69,15,89,243, //mulps %xmm11,%xmm14
5882 69,15,88,244, //addps %xmm12,%xmm14
5883 69,15,89,202, //mulps %xmm10,%xmm9
5884 69,15,88,206, //addps %xmm14,%xmm9
5885 68,15,40,243, //movaps %xmm3,%xmm14
5886 69,15,93,241, //minps %xmm9,%xmm14
5887 69,15,40,200, //movaps %xmm8,%xmm9
5888 68,15,89,201, //mulps %xmm1,%xmm9
5889 65,15,194,205,1, //cmpltps %xmm13,%xmm1
5890 68,15,84,201, //andps %xmm1,%xmm9
5891 65,15,85,206, //andnps %xmm14,%xmm1
5892 65,15,86,201, //orps %xmm9,%xmm1
5893 68,15,82,202, //rsqrtps %xmm2,%xmm9
5894 69,15,83,241, //rcpps %xmm9,%xmm14
5895 69,15,89,243, //mulps %xmm11,%xmm14
5896 69,15,88,244, //addps %xmm12,%xmm14
5897 65,15,82,249, //rsqrtps %xmm9,%xmm7
5898 65,15,89,250, //mulps %xmm10,%xmm7
5899 65,15,88,254, //addps %xmm14,%xmm7
5900 15,93,223, //minps %xmm7,%xmm3
5901 68,15,89,194, //mulps %xmm2,%xmm8
5902 65,15,194,213,1, //cmpltps %xmm13,%xmm2
5903 68,15,84,194, //andps %xmm2,%xmm8
5904 15,85,211, //andnps %xmm3,%xmm2
5905 65,15,86,208, //orps %xmm8,%xmm2
5906 72,173, //lods %ds:(%rsi),%rax
5907 15,40,220, //movaps %xmm4,%xmm3
5908 15,40,229, //movaps %xmm5,%xmm4
5909 15,40,238, //movaps %xmm6,%xmm5
5910 15,40,52,36, //movaps (%rsp),%xmm6
5911 15,40,124,36,16, //movaps 0x10(%rsp),%xmm7
5912 72,131,196,40, //add $0x28,%rsp
5913 255,224, //jmpq *%rax
5914};
5915
5916CODE const uint8_t sk_scale_1_float_sse2[] = {
5917 72,173, //lods %ds:(%rsi),%rax
5918 243,68,15,16,0, //movss (%rax),%xmm8
5919 69,15,198,192,0, //shufps $0x0,%xmm8,%xmm8
5920 65,15,89,192, //mulps %xmm8,%xmm0
5921 65,15,89,200, //mulps %xmm8,%xmm1
5922 65,15,89,208, //mulps %xmm8,%xmm2
5923 65,15,89,216, //mulps %xmm8,%xmm3
5924 72,173, //lods %ds:(%rsi),%rax
5925 255,224, //jmpq *%rax
5926};
5927
5928CODE const uint8_t sk_scale_u8_sse2[] = {
5929 72,173, //lods %ds:(%rsi),%rax
5930 72,139,0, //mov (%rax),%rax
5931 102,68,15,110,4,56, //movd (%rax,%rdi,1),%xmm8
5932 102,69,15,239,201, //pxor %xmm9,%xmm9
5933 102,69,15,96,193, //punpcklbw %xmm9,%xmm8
5934 102,69,15,97,193, //punpcklwd %xmm9,%xmm8
5935 69,15,91,192, //cvtdq2ps %xmm8,%xmm8
5936 243,68,15,16,74,12, //movss 0xc(%rdx),%xmm9
5937 69,15,198,201,0, //shufps $0x0,%xmm9,%xmm9
5938 69,15,89,200, //mulps %xmm8,%xmm9
5939 65,15,89,193, //mulps %xmm9,%xmm0
5940 65,15,89,201, //mulps %xmm9,%xmm1
5941 65,15,89,209, //mulps %xmm9,%xmm2
5942 65,15,89,217, //mulps %xmm9,%xmm3
5943 72,173, //lods %ds:(%rsi),%rax
5944 255,224, //jmpq *%rax
5945};
5946
5947CODE const uint8_t sk_lerp_1_float_sse2[] = {
5948 72,173, //lods %ds:(%rsi),%rax
5949 243,68,15,16,0, //movss (%rax),%xmm8
5950 69,15,198,192,0, //shufps $0x0,%xmm8,%xmm8
5951 15,92,196, //subps %xmm4,%xmm0
5952 65,15,89,192, //mulps %xmm8,%xmm0
5953 15,88,196, //addps %xmm4,%xmm0
5954 15,92,205, //subps %xmm5,%xmm1
5955 65,15,89,200, //mulps %xmm8,%xmm1
5956 15,88,205, //addps %xmm5,%xmm1
5957 15,92,214, //subps %xmm6,%xmm2
5958 65,15,89,208, //mulps %xmm8,%xmm2
5959 15,88,214, //addps %xmm6,%xmm2
5960 15,92,223, //subps %xmm7,%xmm3
5961 65,15,89,216, //mulps %xmm8,%xmm3
5962 15,88,223, //addps %xmm7,%xmm3
5963 72,173, //lods %ds:(%rsi),%rax
5964 255,224, //jmpq *%rax
5965};
5966
5967CODE const uint8_t sk_lerp_u8_sse2[] = {
5968 72,173, //lods %ds:(%rsi),%rax
5969 72,139,0, //mov (%rax),%rax
5970 102,68,15,110,4,56, //movd (%rax,%rdi,1),%xmm8
5971 102,69,15,239,201, //pxor %xmm9,%xmm9
5972 102,69,15,96,193, //punpcklbw %xmm9,%xmm8
5973 102,69,15,97,193, //punpcklwd %xmm9,%xmm8
5974 69,15,91,192, //cvtdq2ps %xmm8,%xmm8
5975 243,68,15,16,74,12, //movss 0xc(%rdx),%xmm9
5976 69,15,198,201,0, //shufps $0x0,%xmm9,%xmm9
5977 69,15,89,200, //mulps %xmm8,%xmm9
5978 15,92,196, //subps %xmm4,%xmm0
5979 65,15,89,193, //mulps %xmm9,%xmm0
5980 15,88,196, //addps %xmm4,%xmm0
5981 15,92,205, //subps %xmm5,%xmm1
5982 65,15,89,201, //mulps %xmm9,%xmm1
5983 15,88,205, //addps %xmm5,%xmm1
5984 15,92,214, //subps %xmm6,%xmm2
5985 65,15,89,209, //mulps %xmm9,%xmm2
5986 15,88,214, //addps %xmm6,%xmm2
5987 15,92,223, //subps %xmm7,%xmm3
5988 65,15,89,217, //mulps %xmm9,%xmm3
5989 15,88,223, //addps %xmm7,%xmm3
5990 72,173, //lods %ds:(%rsi),%rax
5991 255,224, //jmpq *%rax
5992};
5993
5994CODE const uint8_t sk_lerp_565_sse2[] = {
5995 72,173, //lods %ds:(%rsi),%rax
5996 72,139,0, //mov (%rax),%rax
5997 243,68,15,126,4,120, //movq (%rax,%rdi,2),%xmm8
5998 102,15,239,219, //pxor %xmm3,%xmm3
5999 102,68,15,97,195, //punpcklwd %xmm3,%xmm8
6000 102,15,110,90,104, //movd 0x68(%rdx),%xmm3
6001 102,15,112,219,0, //pshufd $0x0,%xmm3,%xmm3
6002 102,65,15,219,216, //pand %xmm8,%xmm3
6003 68,15,91,203, //cvtdq2ps %xmm3,%xmm9
6004 243,15,16,26, //movss (%rdx),%xmm3
6005 243,68,15,16,82,116, //movss 0x74(%rdx),%xmm10
6006 69,15,198,210,0, //shufps $0x0,%xmm10,%xmm10
6007 69,15,89,209, //mulps %xmm9,%xmm10
6008 102,68,15,110,74,108, //movd 0x6c(%rdx),%xmm9
6009 102,69,15,112,201,0, //pshufd $0x0,%xmm9,%xmm9
6010 102,69,15,219,200, //pand %xmm8,%xmm9
6011 69,15,91,201, //cvtdq2ps %xmm9,%xmm9
6012 243,68,15,16,90,120, //movss 0x78(%rdx),%xmm11
6013 69,15,198,219,0, //shufps $0x0,%xmm11,%xmm11
6014 69,15,89,217, //mulps %xmm9,%xmm11
6015 102,68,15,110,74,112, //movd 0x70(%rdx),%xmm9
6016 102,69,15,112,201,0, //pshufd $0x0,%xmm9,%xmm9
6017 102,69,15,219,200, //pand %xmm8,%xmm9
6018 69,15,91,193, //cvtdq2ps %xmm9,%xmm8
6019 243,68,15,16,74,124, //movss 0x7c(%rdx),%xmm9
6020 69,15,198,201,0, //shufps $0x0,%xmm9,%xmm9
6021 69,15,89,200, //mulps %xmm8,%xmm9
6022 15,92,196, //subps %xmm4,%xmm0
6023 65,15,89,194, //mulps %xmm10,%xmm0
6024 15,88,196, //addps %xmm4,%xmm0
6025 15,92,205, //subps %xmm5,%xmm1
6026 65,15,89,203, //mulps %xmm11,%xmm1
6027 15,88,205, //addps %xmm5,%xmm1
6028 15,92,214, //subps %xmm6,%xmm2
6029 65,15,89,209, //mulps %xmm9,%xmm2
6030 15,88,214, //addps %xmm6,%xmm2
6031 15,198,219,0, //shufps $0x0,%xmm3,%xmm3
6032 72,173, //lods %ds:(%rsi),%rax
6033 255,224, //jmpq *%rax
6034};
6035
6036CODE const uint8_t sk_load_tables_sse2[] = {
6037 72,173, //lods %ds:(%rsi),%rax
6038 72,139,8, //mov (%rax),%rcx
6039 76,139,64,8, //mov 0x8(%rax),%r8
6040 243,68,15,111,4,185, //movdqu (%rcx,%rdi,4),%xmm8
6041 102,15,110,66,16, //movd 0x10(%rdx),%xmm0
6042 102,15,112,192,0, //pshufd $0x0,%xmm0,%xmm0
6043 102,69,15,111,200, //movdqa %xmm8,%xmm9
6044 102,65,15,114,209,8, //psrld $0x8,%xmm9
6045 102,68,15,219,200, //pand %xmm0,%xmm9
6046 102,69,15,111,208, //movdqa %xmm8,%xmm10
6047 102,65,15,114,210,16, //psrld $0x10,%xmm10
6048 102,68,15,219,208, //pand %xmm0,%xmm10
6049 102,65,15,219,192, //pand %xmm8,%xmm0
6050 102,15,112,216,78, //pshufd $0x4e,%xmm0,%xmm3
6051 102,72,15,126,217, //movq %xmm3,%rcx
6052 65,137,201, //mov %ecx,%r9d
6053 72,193,233,32, //shr $0x20,%rcx
6054 102,73,15,126,194, //movq %xmm0,%r10
6055 69,137,211, //mov %r10d,%r11d
6056 73,193,234,32, //shr $0x20,%r10
6057 243,67,15,16,28,144, //movss (%r8,%r10,4),%xmm3
6058 243,65,15,16,4,136, //movss (%r8,%rcx,4),%xmm0
6059 15,20,216, //unpcklps %xmm0,%xmm3
6060 243,67,15,16,4,152, //movss (%r8,%r11,4),%xmm0
6061 243,67,15,16,12,136, //movss (%r8,%r9,4),%xmm1
6062 15,20,193, //unpcklps %xmm1,%xmm0
6063 15,20,195, //unpcklps %xmm3,%xmm0
6064 72,139,72,16, //mov 0x10(%rax),%rcx
6065 102,65,15,112,201,78, //pshufd $0x4e,%xmm9,%xmm1
6066 102,73,15,126,200, //movq %xmm1,%r8
6067 69,137,193, //mov %r8d,%r9d
6068 73,193,232,32, //shr $0x20,%r8
6069 102,77,15,126,202, //movq %xmm9,%r10
6070 69,137,211, //mov %r10d,%r11d
6071 73,193,234,32, //shr $0x20,%r10
6072 243,66,15,16,28,145, //movss (%rcx,%r10,4),%xmm3
6073 243,66,15,16,12,129, //movss (%rcx,%r8,4),%xmm1
6074 15,20,217, //unpcklps %xmm1,%xmm3
6075 243,66,15,16,12,153, //movss (%rcx,%r11,4),%xmm1
6076 243,66,15,16,20,137, //movss (%rcx,%r9,4),%xmm2
6077 15,20,202, //unpcklps %xmm2,%xmm1
6078 15,20,203, //unpcklps %xmm3,%xmm1
6079 72,139,64,24, //mov 0x18(%rax),%rax
6080 102,65,15,112,210,78, //pshufd $0x4e,%xmm10,%xmm2
6081 102,72,15,126,209, //movq %xmm2,%rcx
6082 65,137,200, //mov %ecx,%r8d
6083 72,193,233,32, //shr $0x20,%rcx
6084 102,77,15,126,209, //movq %xmm10,%r9
6085 69,137,202, //mov %r9d,%r10d
6086 73,193,233,32, //shr $0x20,%r9
6087 243,70,15,16,12,136, //movss (%rax,%r9,4),%xmm9
6088 243,15,16,20,136, //movss (%rax,%rcx,4),%xmm2
6089 68,15,20,202, //unpcklps %xmm2,%xmm9
6090 243,66,15,16,20,144, //movss (%rax,%r10,4),%xmm2
6091 243,66,15,16,28,128, //movss (%rax,%r8,4),%xmm3
6092 15,20,211, //unpcklps %xmm3,%xmm2
6093 65,15,20,209, //unpcklps %xmm9,%xmm2
6094 102,65,15,114,208,24, //psrld $0x18,%xmm8
6095 69,15,91,192, //cvtdq2ps %xmm8,%xmm8
6096 243,15,16,90,12, //movss 0xc(%rdx),%xmm3
6097 15,198,219,0, //shufps $0x0,%xmm3,%xmm3
6098 65,15,89,216, //mulps %xmm8,%xmm3
6099 72,173, //lods %ds:(%rsi),%rax
6100 255,224, //jmpq *%rax
6101};
6102
6103CODE const uint8_t sk_load_a8_sse2[] = {
6104 72,173, //lods %ds:(%rsi),%rax
6105 72,139,0, //mov (%rax),%rax
6106 102,15,110,4,56, //movd (%rax,%rdi,1),%xmm0
6107 102,15,239,201, //pxor %xmm1,%xmm1
6108 102,15,96,193, //punpcklbw %xmm1,%xmm0
6109 102,15,97,193, //punpcklwd %xmm1,%xmm0
6110 15,91,192, //cvtdq2ps %xmm0,%xmm0
6111 243,15,16,90,12, //movss 0xc(%rdx),%xmm3
6112 15,198,219,0, //shufps $0x0,%xmm3,%xmm3
6113 15,89,216, //mulps %xmm0,%xmm3
6114 72,173, //lods %ds:(%rsi),%rax
6115 15,87,192, //xorps %xmm0,%xmm0
6116 102,15,239,201, //pxor %xmm1,%xmm1
6117 15,87,210, //xorps %xmm2,%xmm2
6118 255,224, //jmpq *%rax
6119};
6120
6121CODE const uint8_t sk_store_a8_sse2[] = {
6122 72,173, //lods %ds:(%rsi),%rax
6123 72,139,0, //mov (%rax),%rax
6124 243,68,15,16,66,8, //movss 0x8(%rdx),%xmm8
6125 69,15,198,192,0, //shufps $0x0,%xmm8,%xmm8
6126 68,15,89,195, //mulps %xmm3,%xmm8
6127 102,69,15,91,192, //cvtps2dq %xmm8,%xmm8
6128 102,65,15,114,240,16, //pslld $0x10,%xmm8
6129 102,65,15,114,224,16, //psrad $0x10,%xmm8
6130 102,69,15,107,192, //packssdw %xmm8,%xmm8
6131 102,69,15,103,192, //packuswb %xmm8,%xmm8
6132 102,68,15,126,4,56, //movd %xmm8,(%rax,%rdi,1)
6133 72,173, //lods %ds:(%rsi),%rax
6134 255,224, //jmpq *%rax
6135};
6136
6137CODE const uint8_t sk_load_565_sse2[] = {
6138 72,173, //lods %ds:(%rsi),%rax
6139 72,139,0, //mov (%rax),%rax
6140 243,68,15,126,12,120, //movq (%rax,%rdi,2),%xmm9
6141 102,15,239,192, //pxor %xmm0,%xmm0
6142 102,68,15,97,200, //punpcklwd %xmm0,%xmm9
6143 102,15,110,66,104, //movd 0x68(%rdx),%xmm0
6144 102,15,112,192,0, //pshufd $0x0,%xmm0,%xmm0
6145 102,65,15,219,193, //pand %xmm9,%xmm0
6146 15,91,200, //cvtdq2ps %xmm0,%xmm1
6147 243,15,16,26, //movss (%rdx),%xmm3
6148 243,15,16,66,116, //movss 0x74(%rdx),%xmm0
6149 15,198,192,0, //shufps $0x0,%xmm0,%xmm0
6150 15,89,193, //mulps %xmm1,%xmm0
6151 102,15,110,74,108, //movd 0x6c(%rdx),%xmm1
6152 102,15,112,201,0, //pshufd $0x0,%xmm1,%xmm1
6153 102,65,15,219,201, //pand %xmm9,%xmm1
6154 68,15,91,193, //cvtdq2ps %xmm1,%xmm8
6155 243,15,16,74,120, //movss 0x78(%rdx),%xmm1
6156 15,198,201,0, //shufps $0x0,%xmm1,%xmm1
6157 65,15,89,200, //mulps %xmm8,%xmm1
6158 102,15,110,82,112, //movd 0x70(%rdx),%xmm2
6159 102,15,112,210,0, //pshufd $0x0,%xmm2,%xmm2
6160 102,65,15,219,209, //pand %xmm9,%xmm2
6161 68,15,91,194, //cvtdq2ps %xmm2,%xmm8
6162 243,15,16,82,124, //movss 0x7c(%rdx),%xmm2
6163 15,198,210,0, //shufps $0x0,%xmm2,%xmm2
6164 65,15,89,208, //mulps %xmm8,%xmm2
6165 15,198,219,0, //shufps $0x0,%xmm3,%xmm3
6166 72,173, //lods %ds:(%rsi),%rax
6167 255,224, //jmpq *%rax
6168};
6169
6170CODE const uint8_t sk_store_565_sse2[] = {
6171 72,173, //lods %ds:(%rsi),%rax
6172 72,139,0, //mov (%rax),%rax
6173 243,68,15,16,130,128,0,0,0, //movss 0x80(%rdx),%xmm8
6174 243,68,15,16,138,132,0,0,0, //movss 0x84(%rdx),%xmm9
6175 69,15,198,192,0, //shufps $0x0,%xmm8,%xmm8
6176 69,15,40,208, //movaps %xmm8,%xmm10
6177 68,15,89,208, //mulps %xmm0,%xmm10
6178 102,69,15,91,210, //cvtps2dq %xmm10,%xmm10
6179 102,65,15,114,242,11, //pslld $0xb,%xmm10
6180 69,15,198,201,0, //shufps $0x0,%xmm9,%xmm9
6181 68,15,89,201, //mulps %xmm1,%xmm9
6182 102,69,15,91,201, //cvtps2dq %xmm9,%xmm9
6183 102,65,15,114,241,5, //pslld $0x5,%xmm9
6184 102,69,15,235,202, //por %xmm10,%xmm9
6185 68,15,89,194, //mulps %xmm2,%xmm8
6186 102,69,15,91,192, //cvtps2dq %xmm8,%xmm8
6187 102,69,15,86,193, //orpd %xmm9,%xmm8
6188 102,65,15,114,240,16, //pslld $0x10,%xmm8
6189 102,65,15,114,224,16, //psrad $0x10,%xmm8
6190 102,69,15,107,192, //packssdw %xmm8,%xmm8
6191 102,68,15,214,4,120, //movq %xmm8,(%rax,%rdi,2)
6192 72,173, //lods %ds:(%rsi),%rax
6193 255,224, //jmpq *%rax
6194};
6195
6196CODE const uint8_t sk_load_8888_sse2[] = {
6197 72,173, //lods %ds:(%rsi),%rax
6198 72,139,0, //mov (%rax),%rax
6199 243,15,111,28,184, //movdqu (%rax,%rdi,4),%xmm3
6200 102,15,110,66,16, //movd 0x10(%rdx),%xmm0
6201 102,15,112,192,0, //pshufd $0x0,%xmm0,%xmm0
6202 102,15,111,203, //movdqa %xmm3,%xmm1
6203 102,15,114,209,8, //psrld $0x8,%xmm1
6204 102,15,219,200, //pand %xmm0,%xmm1
6205 102,15,111,211, //movdqa %xmm3,%xmm2
6206 102,15,114,210,16, //psrld $0x10,%xmm2
6207 102,15,219,208, //pand %xmm0,%xmm2
6208 102,15,219,195, //pand %xmm3,%xmm0
6209 15,91,192, //cvtdq2ps %xmm0,%xmm0
6210 243,68,15,16,66,12, //movss 0xc(%rdx),%xmm8
6211 69,15,198,192,0, //shufps $0x0,%xmm8,%xmm8
6212 65,15,89,192, //mulps %xmm8,%xmm0
6213 15,91,201, //cvtdq2ps %xmm1,%xmm1
6214 65,15,89,200, //mulps %xmm8,%xmm1
6215 15,91,210, //cvtdq2ps %xmm2,%xmm2
6216 65,15,89,208, //mulps %xmm8,%xmm2
6217 102,15,114,211,24, //psrld $0x18,%xmm3
6218 15,91,219, //cvtdq2ps %xmm3,%xmm3
6219 65,15,89,216, //mulps %xmm8,%xmm3
6220 72,173, //lods %ds:(%rsi),%rax
6221 255,224, //jmpq *%rax
6222};
6223
6224CODE const uint8_t sk_store_8888_sse2[] = {
6225 72,173, //lods %ds:(%rsi),%rax
6226 72,139,0, //mov (%rax),%rax
6227 243,68,15,16,66,8, //movss 0x8(%rdx),%xmm8
6228 69,15,198,192,0, //shufps $0x0,%xmm8,%xmm8
6229 69,15,40,200, //movaps %xmm8,%xmm9
6230 68,15,89,200, //mulps %xmm0,%xmm9
6231 102,69,15,91,201, //cvtps2dq %xmm9,%xmm9
6232 69,15,40,208, //movaps %xmm8,%xmm10
6233 68,15,89,209, //mulps %xmm1,%xmm10
6234 102,69,15,91,210, //cvtps2dq %xmm10,%xmm10
6235 102,65,15,114,242,8, //pslld $0x8,%xmm10
6236 102,69,15,235,209, //por %xmm9,%xmm10
6237 69,15,40,200, //movaps %xmm8,%xmm9
6238 68,15,89,202, //mulps %xmm2,%xmm9
6239 102,69,15,91,201, //cvtps2dq %xmm9,%xmm9
6240 102,65,15,114,241,16, //pslld $0x10,%xmm9
6241 68,15,89,195, //mulps %xmm3,%xmm8
6242 102,69,15,91,192, //cvtps2dq %xmm8,%xmm8
6243 102,65,15,114,240,24, //pslld $0x18,%xmm8
6244 102,69,15,235,193, //por %xmm9,%xmm8
6245 102,69,15,235,194, //por %xmm10,%xmm8
6246 243,68,15,127,4,184, //movdqu %xmm8,(%rax,%rdi,4)
6247 72,173, //lods %ds:(%rsi),%rax
6248 255,224, //jmpq *%rax
6249};
6250
6251CODE const uint8_t sk_load_f16_sse2[] = {
6252 72,173, //lods %ds:(%rsi),%rax
6253 72,139,0, //mov (%rax),%rax
6254 243,15,111,4,248, //movdqu (%rax,%rdi,8),%xmm0
6255 243,15,111,76,248,16, //movdqu 0x10(%rax,%rdi,8),%xmm1
6256 102,15,111,208, //movdqa %xmm0,%xmm2
6257 102,15,97,209, //punpcklwd %xmm1,%xmm2
6258 102,15,105,193, //punpckhwd %xmm1,%xmm0
6259 102,68,15,111,194, //movdqa %xmm2,%xmm8
6260 102,68,15,97,192, //punpcklwd %xmm0,%xmm8
6261 102,15,105,208, //punpckhwd %xmm0,%xmm2
6262 102,15,110,66,100, //movd 0x64(%rdx),%xmm0
6263 102,15,112,216,0, //pshufd $0x0,%xmm0,%xmm3
6264 102,15,111,203, //movdqa %xmm3,%xmm1
6265 102,65,15,101,200, //pcmpgtw %xmm8,%xmm1
6266 102,65,15,223,200, //pandn %xmm8,%xmm1
6267 102,15,101,218, //pcmpgtw %xmm2,%xmm3
6268 102,15,223,218, //pandn %xmm2,%xmm3
6269 102,69,15,239,192, //pxor %xmm8,%xmm8
6270 102,15,111,193, //movdqa %xmm1,%xmm0
6271 102,65,15,97,192, //punpcklwd %xmm8,%xmm0
6272 102,15,114,240,13, //pslld $0xd,%xmm0
6273 102,15,110,82,92, //movd 0x5c(%rdx),%xmm2
6274 102,68,15,112,202,0, //pshufd $0x0,%xmm2,%xmm9
6275 65,15,89,193, //mulps %xmm9,%xmm0
6276 102,65,15,105,200, //punpckhwd %xmm8,%xmm1
6277 102,15,114,241,13, //pslld $0xd,%xmm1
6278 65,15,89,201, //mulps %xmm9,%xmm1
6279 102,15,111,211, //movdqa %xmm3,%xmm2
6280 102,65,15,97,208, //punpcklwd %xmm8,%xmm2
6281 102,15,114,242,13, //pslld $0xd,%xmm2
6282 65,15,89,209, //mulps %xmm9,%xmm2
6283 102,65,15,105,216, //punpckhwd %xmm8,%xmm3
6284 102,15,114,243,13, //pslld $0xd,%xmm3
6285 65,15,89,217, //mulps %xmm9,%xmm3
6286 72,173, //lods %ds:(%rsi),%rax
6287 255,224, //jmpq *%rax
6288};
6289
6290CODE const uint8_t sk_store_f16_sse2[] = {
6291 72,173, //lods %ds:(%rsi),%rax
6292 72,139,0, //mov (%rax),%rax
6293 102,68,15,110,66,96, //movd 0x60(%rdx),%xmm8
6294 102,69,15,112,192,0, //pshufd $0x0,%xmm8,%xmm8
6295 102,69,15,111,200, //movdqa %xmm8,%xmm9
6296 68,15,89,200, //mulps %xmm0,%xmm9
6297 102,65,15,114,209,13, //psrld $0xd,%xmm9
6298 102,69,15,111,208, //movdqa %xmm8,%xmm10
6299 68,15,89,209, //mulps %xmm1,%xmm10
6300 102,65,15,114,210,13, //psrld $0xd,%xmm10
6301 102,69,15,111,216, //movdqa %xmm8,%xmm11
6302 68,15,89,218, //mulps %xmm2,%xmm11
6303 102,65,15,114,211,13, //psrld $0xd,%xmm11
6304 68,15,89,195, //mulps %xmm3,%xmm8
6305 102,65,15,114,208,13, //psrld $0xd,%xmm8
6306 102,65,15,115,250,2, //pslldq $0x2,%xmm10
6307 102,69,15,235,209, //por %xmm9,%xmm10
6308 102,65,15,115,248,2, //pslldq $0x2,%xmm8
6309 102,69,15,235,195, //por %xmm11,%xmm8
6310 102,69,15,111,202, //movdqa %xmm10,%xmm9
6311 102,69,15,98,200, //punpckldq %xmm8,%xmm9
6312 243,68,15,127,12,248, //movdqu %xmm9,(%rax,%rdi,8)
6313 102,69,15,106,208, //punpckhdq %xmm8,%xmm10
6314 243,68,15,127,84,248,16, //movdqu %xmm10,0x10(%rax,%rdi,8)
6315 72,173, //lods %ds:(%rsi),%rax
6316 255,224, //jmpq *%rax
6317};
6318
6319CODE const uint8_t sk_store_f32_sse2[] = {
6320 72,173, //lods %ds:(%rsi),%rax
6321 72,139,0, //mov (%rax),%rax
6322 72,137,249, //mov %rdi,%rcx
6323 72,193,225,4, //shl $0x4,%rcx
6324 68,15,40,192, //movaps %xmm0,%xmm8
6325 68,15,40,200, //movaps %xmm0,%xmm9
6326 68,15,20,201, //unpcklps %xmm1,%xmm9
6327 68,15,40,210, //movaps %xmm2,%xmm10
6328 68,15,40,218, //movaps %xmm2,%xmm11
6329 68,15,20,219, //unpcklps %xmm3,%xmm11
6330 68,15,21,193, //unpckhps %xmm1,%xmm8
6331 68,15,21,211, //unpckhps %xmm3,%xmm10
6332 69,15,40,225, //movaps %xmm9,%xmm12
6333 102,69,15,20,227, //unpcklpd %xmm11,%xmm12
6334 102,69,15,21,203, //unpckhpd %xmm11,%xmm9
6335 69,15,40,216, //movaps %xmm8,%xmm11
6336 102,69,15,20,218, //unpcklpd %xmm10,%xmm11
6337 102,69,15,21,194, //unpckhpd %xmm10,%xmm8
6338 102,68,15,17,36,8, //movupd %xmm12,(%rax,%rcx,1)
6339 102,68,15,17,76,8,16, //movupd %xmm9,0x10(%rax,%rcx,1)
6340 102,68,15,17,92,8,32, //movupd %xmm11,0x20(%rax,%rcx,1)
6341 102,68,15,17,68,8,48, //movupd %xmm8,0x30(%rax,%rcx,1)
6342 72,173, //lods %ds:(%rsi),%rax
6343 255,224, //jmpq *%rax
6344};
6345
6346CODE const uint8_t sk_clamp_x_sse2[] = {
6347 72,173, //lods %ds:(%rsi),%rax
6348 69,15,87,192, //xorps %xmm8,%xmm8
6349 68,15,95,192, //maxps %xmm0,%xmm8
6350 243,68,15,16,8, //movss (%rax),%xmm9
6351 69,15,198,201,0, //shufps $0x0,%xmm9,%xmm9
6352 102,15,118,192, //pcmpeqd %xmm0,%xmm0
6353 102,65,15,254,193, //paddd %xmm9,%xmm0
6354 68,15,93,192, //minps %xmm0,%xmm8
6355 72,173, //lods %ds:(%rsi),%rax
6356 65,15,40,192, //movaps %xmm8,%xmm0
6357 255,224, //jmpq *%rax
6358};
6359
6360CODE const uint8_t sk_clamp_y_sse2[] = {
6361 72,173, //lods %ds:(%rsi),%rax
6362 69,15,87,192, //xorps %xmm8,%xmm8
6363 68,15,95,193, //maxps %xmm1,%xmm8
6364 243,68,15,16,8, //movss (%rax),%xmm9
6365 69,15,198,201,0, //shufps $0x0,%xmm9,%xmm9
6366 102,15,118,201, //pcmpeqd %xmm1,%xmm1
6367 102,65,15,254,201, //paddd %xmm9,%xmm1
6368 68,15,93,193, //minps %xmm1,%xmm8
6369 72,173, //lods %ds:(%rsi),%rax
6370 65,15,40,200, //movaps %xmm8,%xmm1
6371 255,224, //jmpq *%rax
6372};
6373
6374CODE const uint8_t sk_repeat_x_sse2[] = {
6375 72,173, //lods %ds:(%rsi),%rax
6376 243,68,15,16,0, //movss (%rax),%xmm8
6377 69,15,198,192,0, //shufps $0x0,%xmm8,%xmm8
6378 68,15,40,200, //movaps %xmm0,%xmm9
6379 69,15,94,200, //divps %xmm8,%xmm9
6380 243,69,15,91,209, //cvttps2dq %xmm9,%xmm10
6381 69,15,91,210, //cvtdq2ps %xmm10,%xmm10
6382 69,15,194,202,1, //cmpltps %xmm10,%xmm9
6383 243,68,15,16,26, //movss (%rdx),%xmm11
6384 69,15,198,219,0, //shufps $0x0,%xmm11,%xmm11
6385 69,15,84,217, //andps %xmm9,%xmm11
6386 69,15,92,211, //subps %xmm11,%xmm10
6387 69,15,89,208, //mulps %xmm8,%xmm10
6388 65,15,92,194, //subps %xmm10,%xmm0
6389 102,69,15,118,201, //pcmpeqd %xmm9,%xmm9
6390 102,69,15,254,200, //paddd %xmm8,%xmm9
6391 65,15,93,193, //minps %xmm9,%xmm0
6392 72,173, //lods %ds:(%rsi),%rax
6393 255,224, //jmpq *%rax
6394};
6395
6396CODE const uint8_t sk_repeat_y_sse2[] = {
6397 72,173, //lods %ds:(%rsi),%rax
6398 243,68,15,16,0, //movss (%rax),%xmm8
6399 69,15,198,192,0, //shufps $0x0,%xmm8,%xmm8
6400 68,15,40,201, //movaps %xmm1,%xmm9
6401 69,15,94,200, //divps %xmm8,%xmm9
6402 243,69,15,91,209, //cvttps2dq %xmm9,%xmm10
6403 69,15,91,210, //cvtdq2ps %xmm10,%xmm10
6404 69,15,194,202,1, //cmpltps %xmm10,%xmm9
6405 243,68,15,16,26, //movss (%rdx),%xmm11
6406 69,15,198,219,0, //shufps $0x0,%xmm11,%xmm11
6407 69,15,84,217, //andps %xmm9,%xmm11
6408 69,15,92,211, //subps %xmm11,%xmm10
6409 69,15,89,208, //mulps %xmm8,%xmm10
6410 65,15,92,202, //subps %xmm10,%xmm1
6411 102,69,15,118,201, //pcmpeqd %xmm9,%xmm9
6412 102,69,15,254,200, //paddd %xmm8,%xmm9
6413 65,15,93,201, //minps %xmm9,%xmm1
6414 72,173, //lods %ds:(%rsi),%rax
6415 255,224, //jmpq *%rax
6416};
6417
6418CODE const uint8_t sk_mirror_x_sse2[] = {
6419 72,173, //lods %ds:(%rsi),%rax
6420 243,68,15,16,8, //movss (%rax),%xmm9
6421 69,15,40,193, //movaps %xmm9,%xmm8
6422 69,15,198,192,0, //shufps $0x0,%xmm8,%xmm8
6423 65,15,92,192, //subps %xmm8,%xmm0
6424 243,69,15,88,201, //addss %xmm9,%xmm9
6425 69,15,198,201,0, //shufps $0x0,%xmm9,%xmm9
6426 68,15,40,208, //movaps %xmm0,%xmm10
6427 69,15,94,209, //divps %xmm9,%xmm10
6428 243,69,15,91,218, //cvttps2dq %xmm10,%xmm11
6429 69,15,91,219, //cvtdq2ps %xmm11,%xmm11
6430 69,15,194,211,1, //cmpltps %xmm11,%xmm10
6431 243,68,15,16,34, //movss (%rdx),%xmm12
6432 69,15,198,228,0, //shufps $0x0,%xmm12,%xmm12
6433 69,15,84,226, //andps %xmm10,%xmm12
6434 69,15,87,210, //xorps %xmm10,%xmm10
6435 69,15,92,220, //subps %xmm12,%xmm11
6436 69,15,89,217, //mulps %xmm9,%xmm11
6437 65,15,92,195, //subps %xmm11,%xmm0
6438 65,15,92,192, //subps %xmm8,%xmm0
6439 68,15,92,208, //subps %xmm0,%xmm10
6440 65,15,84,194, //andps %xmm10,%xmm0
6441 102,69,15,118,201, //pcmpeqd %xmm9,%xmm9
6442 102,69,15,254,200, //paddd %xmm8,%xmm9
6443 65,15,93,193, //minps %xmm9,%xmm0
6444 72,173, //lods %ds:(%rsi),%rax
6445 255,224, //jmpq *%rax
6446};
6447
6448CODE const uint8_t sk_mirror_y_sse2[] = {
6449 72,173, //lods %ds:(%rsi),%rax
6450 243,68,15,16,8, //movss (%rax),%xmm9
6451 69,15,40,193, //movaps %xmm9,%xmm8
6452 69,15,198,192,0, //shufps $0x0,%xmm8,%xmm8
6453 65,15,92,200, //subps %xmm8,%xmm1
6454 243,69,15,88,201, //addss %xmm9,%xmm9
6455 69,15,198,201,0, //shufps $0x0,%xmm9,%xmm9
6456 68,15,40,209, //movaps %xmm1,%xmm10
6457 69,15,94,209, //divps %xmm9,%xmm10
6458 243,69,15,91,218, //cvttps2dq %xmm10,%xmm11
6459 69,15,91,219, //cvtdq2ps %xmm11,%xmm11
6460 69,15,194,211,1, //cmpltps %xmm11,%xmm10
6461 243,68,15,16,34, //movss (%rdx),%xmm12
6462 69,15,198,228,0, //shufps $0x0,%xmm12,%xmm12
6463 69,15,84,226, //andps %xmm10,%xmm12
6464 69,15,87,210, //xorps %xmm10,%xmm10
6465 69,15,92,220, //subps %xmm12,%xmm11
6466 69,15,89,217, //mulps %xmm9,%xmm11
6467 65,15,92,203, //subps %xmm11,%xmm1
6468 65,15,92,200, //subps %xmm8,%xmm1
6469 68,15,92,209, //subps %xmm1,%xmm10
6470 65,15,84,202, //andps %xmm10,%xmm1
6471 102,69,15,118,201, //pcmpeqd %xmm9,%xmm9
6472 102,69,15,254,200, //paddd %xmm8,%xmm9
6473 65,15,93,201, //minps %xmm9,%xmm1
6474 72,173, //lods %ds:(%rsi),%rax
6475 255,224, //jmpq *%rax
6476};
6477
6478CODE const uint8_t sk_matrix_2x3_sse2[] = {
6479 68,15,40,201, //movaps %xmm1,%xmm9
6480 68,15,40,192, //movaps %xmm0,%xmm8
6481 72,173, //lods %ds:(%rsi),%rax
6482 243,15,16,0, //movss (%rax),%xmm0
6483 243,15,16,72,4, //movss 0x4(%rax),%xmm1
6484 15,198,192,0, //shufps $0x0,%xmm0,%xmm0
6485 243,68,15,16,80,8, //movss 0x8(%rax),%xmm10
6486 69,15,198,210,0, //shufps $0x0,%xmm10,%xmm10
6487 243,68,15,16,88,16, //movss 0x10(%rax),%xmm11
6488 69,15,198,219,0, //shufps $0x0,%xmm11,%xmm11
6489 69,15,89,209, //mulps %xmm9,%xmm10
6490 69,15,88,211, //addps %xmm11,%xmm10
6491 65,15,89,192, //mulps %xmm8,%xmm0
6492 65,15,88,194, //addps %xmm10,%xmm0
6493 15,198,201,0, //shufps $0x0,%xmm1,%xmm1
6494 243,68,15,16,80,12, //movss 0xc(%rax),%xmm10
6495 69,15,198,210,0, //shufps $0x0,%xmm10,%xmm10
6496 243,68,15,16,88,20, //movss 0x14(%rax),%xmm11
6497 69,15,198,219,0, //shufps $0x0,%xmm11,%xmm11
6498 69,15,89,209, //mulps %xmm9,%xmm10
6499 69,15,88,211, //addps %xmm11,%xmm10
6500 65,15,89,200, //mulps %xmm8,%xmm1
6501 65,15,88,202, //addps %xmm10,%xmm1
6502 72,173, //lods %ds:(%rsi),%rax
6503 255,224, //jmpq *%rax
6504};
6505
// sk_matrix_3x4_sse2: multiplies the vector (xmm0, xmm1, xmm2) by a 3x4 matrix, four lanes at a time.
// The first lods loads a pointer m to 12 floats into %rax. Every scalar is loaded with movss and
// broadcast to all four lanes with shufps $0x0. With x, y, z the incoming xmm0, xmm1, xmm2:
//   xmm0 = x*m[0] + y*m[3] + z*m[6] + m[9]
//   xmm1 = x*m[1] + y*m[4] + z*m[7] + m[10]
//   xmm2 = x*m[2] + y*m[5] + z*m[8] + m[11]
// xmm8-xmm13 are used as scratch. The second lods loads the next address from (%rsi) and the
// code ends by jumping to it.
6506CODE const uint8_t sk_matrix_3x4_sse2[] = {
6507 68,15,40,201, //movaps %xmm1,%xmm9
6508 68,15,40,192, //movaps %xmm0,%xmm8
6509 72,173, //lods %ds:(%rsi),%rax
// First output (xmm0): coefficients at byte offsets 0x0, 0xc, 0x18, 0x24.
6510 243,15,16,0, //movss (%rax),%xmm0
6511 243,15,16,72,4, //movss 0x4(%rax),%xmm1
6512 15,198,192,0, //shufps $0x0,%xmm0,%xmm0
6513 243,68,15,16,80,12, //movss 0xc(%rax),%xmm10
6514 69,15,198,210,0, //shufps $0x0,%xmm10,%xmm10
6515 243,68,15,16,88,24, //movss 0x18(%rax),%xmm11
6516 69,15,198,219,0, //shufps $0x0,%xmm11,%xmm11
6517 243,68,15,16,96,36, //movss 0x24(%rax),%xmm12
6518 69,15,198,228,0, //shufps $0x0,%xmm12,%xmm12
6519 68,15,89,218, //mulps %xmm2,%xmm11
6520 69,15,88,220, //addps %xmm12,%xmm11
6521 69,15,89,209, //mulps %xmm9,%xmm10
6522 69,15,88,211, //addps %xmm11,%xmm10
6523 65,15,89,192, //mulps %xmm8,%xmm0
6524 65,15,88,194, //addps %xmm10,%xmm0
// Second output (xmm1): coefficients at byte offsets 0x4 (loaded above), 0x10, 0x1c, 0x28.
6525 15,198,201,0, //shufps $0x0,%xmm1,%xmm1
6526 243,68,15,16,80,16, //movss 0x10(%rax),%xmm10
6527 69,15,198,210,0, //shufps $0x0,%xmm10,%xmm10
6528 243,68,15,16,88,28, //movss 0x1c(%rax),%xmm11
6529 69,15,198,219,0, //shufps $0x0,%xmm11,%xmm11
6530 243,68,15,16,96,40, //movss 0x28(%rax),%xmm12
6531 69,15,198,228,0, //shufps $0x0,%xmm12,%xmm12
6532 68,15,89,218, //mulps %xmm2,%xmm11
6533 69,15,88,220, //addps %xmm12,%xmm11
6534 69,15,89,209, //mulps %xmm9,%xmm10
6535 69,15,88,211, //addps %xmm11,%xmm10
6536 65,15,89,200, //mulps %xmm8,%xmm1
6537 65,15,88,202, //addps %xmm10,%xmm1
// Third output: coefficients at byte offsets 0x8, 0x14, 0x20, 0x2c; accumulated in xmm10
// because xmm2 is still needed as an input until the first mulps below.
6538 243,68,15,16,80,8, //movss 0x8(%rax),%xmm10
6539 69,15,198,210,0, //shufps $0x0,%xmm10,%xmm10
6540 243,68,15,16,88,20, //movss 0x14(%rax),%xmm11
6541 69,15,198,219,0, //shufps $0x0,%xmm11,%xmm11
6542 243,68,15,16,96,32, //movss 0x20(%rax),%xmm12
6543 69,15,198,228,0, //shufps $0x0,%xmm12,%xmm12
6544 243,68,15,16,104,44, //movss 0x2c(%rax),%xmm13
6545 69,15,198,237,0, //shufps $0x0,%xmm13,%xmm13
6546 68,15,89,226, //mulps %xmm2,%xmm12
6547 69,15,88,229, //addps %xmm13,%xmm12
6548 69,15,89,217, //mulps %xmm9,%xmm11
6549 69,15,88,220, //addps %xmm12,%xmm11
6550 69,15,89,208, //mulps %xmm8,%xmm10
6551 69,15,88,211, //addps %xmm11,%xmm10
6552 72,173, //lods %ds:(%rsi),%rax
6553 65,15,40,210, //movaps %xmm10,%xmm2
6554 255,224, //jmpq *%rax
6555};
6556
// sk_matrix_perspective_sse2: applies a 3x3 matrix to (xmm0, xmm1) and divides by the third row.
// The first lods loads a pointer m to 9 floats into %rax; each scalar is broadcast with
// shufps $0x0. With x, y the incoming xmm0, xmm1:
//   X = x*m[0] + y*m[1] + m[2]
//   Y = x*m[3] + y*m[4] + m[5]
//   W = x*m[6] + y*m[7] + m[8]
//   xmm0 = X * rcpps(W),  xmm1 = Y * rcpps(W)
// rcpps is the hardware approximate reciprocal, so the divide is not exact.
// xmm8-xmm12 are used as scratch. Ends by jumping to the next address loaded from (%rsi).
6557CODE const uint8_t sk_matrix_perspective_sse2[] = {
6558 68,15,40,192, //movaps %xmm0,%xmm8
6559 72,173, //lods %ds:(%rsi),%rax
// X -> xmm0
6560 243,15,16,0, //movss (%rax),%xmm0
6561 243,68,15,16,72,4, //movss 0x4(%rax),%xmm9
6562 15,198,192,0, //shufps $0x0,%xmm0,%xmm0
6563 69,15,198,201,0, //shufps $0x0,%xmm9,%xmm9
6564 243,68,15,16,80,8, //movss 0x8(%rax),%xmm10
6565 69,15,198,210,0, //shufps $0x0,%xmm10,%xmm10
6566 68,15,89,201, //mulps %xmm1,%xmm9
6567 69,15,88,202, //addps %xmm10,%xmm9
6568 65,15,89,192, //mulps %xmm8,%xmm0
6569 65,15,88,193, //addps %xmm9,%xmm0
// Y -> xmm9
6570 243,68,15,16,72,12, //movss 0xc(%rax),%xmm9
6571 69,15,198,201,0, //shufps $0x0,%xmm9,%xmm9
6572 243,68,15,16,80,16, //movss 0x10(%rax),%xmm10
6573 69,15,198,210,0, //shufps $0x0,%xmm10,%xmm10
6574 243,68,15,16,88,20, //movss 0x14(%rax),%xmm11
6575 69,15,198,219,0, //shufps $0x0,%xmm11,%xmm11
6576 68,15,89,209, //mulps %xmm1,%xmm10
6577 69,15,88,211, //addps %xmm11,%xmm10
6578 69,15,89,200, //mulps %xmm8,%xmm9
6579 69,15,88,202, //addps %xmm10,%xmm9
// W -> xmm10
6580 243,68,15,16,80,24, //movss 0x18(%rax),%xmm10
6581 69,15,198,210,0, //shufps $0x0,%xmm10,%xmm10
6582 243,68,15,16,88,28, //movss 0x1c(%rax),%xmm11
6583 69,15,198,219,0, //shufps $0x0,%xmm11,%xmm11
6584 243,68,15,16,96,32, //movss 0x20(%rax),%xmm12
6585 69,15,198,228,0, //shufps $0x0,%xmm12,%xmm12
6586 68,15,89,217, //mulps %xmm1,%xmm11
6587 69,15,88,220, //addps %xmm12,%xmm11
6588 69,15,89,208, //mulps %xmm8,%xmm10
6589 69,15,88,211, //addps %xmm11,%xmm10
// Scale X and Y by the approximate reciprocal of W.
6590 65,15,83,202, //rcpps %xmm10,%xmm1
6591 15,89,193, //mulps %xmm1,%xmm0
6592 68,15,89,201, //mulps %xmm1,%xmm9
6593 72,173, //lods %ds:(%rsi),%rax
6594 65,15,40,201, //movaps %xmm9,%xmm1
6595 255,224, //jmpq *%rax
6596};
6597
// sk_linear_gradient_2stops_sse2: evaluates four linear functions of xmm0.
// The first lods loads a pointer into %rax. Two groups of four floats are read from it with
// unaligned loads: c0 at offset 0 and c1 at offset 0x10. With t the incoming xmm0:
//   xmm0 = t*c1[0] + c0[0]
//   xmm1 = t*c1[1] + c0[1]
//   xmm2 = t*c1[2] + c0[2]
//   xmm3 = t*c1[3] + c0[3]
// Lane i of c0/c1 is broadcast with shufps $0x00/$0x55/$0xaa/$0xff. The first result is built in
// xmm8 and moved into xmm0 last, because xmm0 (t) is an input to all four products.
// Ends by jumping to the next address loaded from (%rsi).
6598CODE const uint8_t sk_linear_gradient_2stops_sse2[] = {
6599 72,173, //lods %ds:(%rsi),%rax
6600 68,15,16,8, //movups (%rax),%xmm9
6601 15,16,88,16, //movups 0x10(%rax),%xmm3
6602 68,15,40,195, //movaps %xmm3,%xmm8
6603 69,15,198,192,0, //shufps $0x0,%xmm8,%xmm8
6604 65,15,40,201, //movaps %xmm9,%xmm1
6605 15,198,201,0, //shufps $0x0,%xmm1,%xmm1
6606 68,15,89,192, //mulps %xmm0,%xmm8
6607 68,15,88,193, //addps %xmm1,%xmm8
6608 15,40,203, //movaps %xmm3,%xmm1
6609 15,198,201,85, //shufps $0x55,%xmm1,%xmm1
6610 65,15,40,209, //movaps %xmm9,%xmm2
6611 15,198,210,85, //shufps $0x55,%xmm2,%xmm2
6612 15,89,200, //mulps %xmm0,%xmm1
6613 15,88,202, //addps %xmm2,%xmm1
6614 15,40,211, //movaps %xmm3,%xmm2
6615 15,198,210,170, //shufps $0xaa,%xmm2,%xmm2
6616 69,15,40,209, //movaps %xmm9,%xmm10
6617 69,15,198,210,170, //shufps $0xaa,%xmm10,%xmm10
6618 15,89,208, //mulps %xmm0,%xmm2
6619 65,15,88,210, //addps %xmm10,%xmm2
6620 15,198,219,255, //shufps $0xff,%xmm3,%xmm3
6621 69,15,198,201,255, //shufps $0xff,%xmm9,%xmm9
6622 15,89,216, //mulps %xmm0,%xmm3
6623 65,15,88,217, //addps %xmm9,%xmm3
6624 72,173, //lods %ds:(%rsi),%rax
6625 65,15,40,192, //movaps %xmm8,%xmm0
6626 255,224, //jmpq *%rax
6627};
6628#elif defined(_M_X64)
6629
// sk_start_pipeline_hsw (built under _M_X64): entry point that drives a chain of stages.
// Incoming arguments are read from %rcx, %rdx, %r8 and %r9, the Microsoft x64 argument registers:
//   %rcx -> %rbx  starting index x
//   %rdx -> %rsi  pointer to a list of 8-byte words; the first word becomes the call target (%r15)
//                 and the address of the following word is kept in %r12
//   %r8  -> %r14  passed through to every call in %rdx
//   %r9  -> %r13  limit
// The prologue saves %r15, %r14, %r13, %r12, %rsi, %rdi, %rbx and xmm6-xmm15, all of which the code
// below overwrites or lets the callee overwrite. The callee is invoked with %rdi = x, %rsi = %r12,
// %rdx = %r14, %rcx = count and ymm0-ymm7 zeroed.
// Main loop: while x + 8 <= limit, call with %rcx = 0 and advance x by 8.
// Tail: if limit - x is nonzero, make one more call with %rcx = limit - x.
// Returns the limit in %rax.
6630CODE const uint8_t sk_start_pipeline_hsw[] = {
// Prologue: save integer registers, then xmm6-xmm15 in a 0xa0-byte stack area.
6631 65,87, //push %r15
6632 65,86, //push %r14
6633 65,85, //push %r13
6634 65,84, //push %r12
6635 86, //push %rsi
6636 87, //push %rdi
6637 83, //push %rbx
6638 72,129,236,160,0,0,0, //sub $0xa0,%rsp
6639 197,120,41,188,36,144,0,0,0, //vmovaps %xmm15,0x90(%rsp)
6640 197,120,41,180,36,128,0,0,0, //vmovaps %xmm14,0x80(%rsp)
6641 197,120,41,108,36,112, //vmovaps %xmm13,0x70(%rsp)
6642 197,120,41,100,36,96, //vmovaps %xmm12,0x60(%rsp)
6643 197,120,41,92,36,80, //vmovaps %xmm11,0x50(%rsp)
6644 197,120,41,84,36,64, //vmovaps %xmm10,0x40(%rsp)
6645 197,120,41,76,36,48, //vmovaps %xmm9,0x30(%rsp)
6646 197,120,41,68,36,32, //vmovaps %xmm8,0x20(%rsp)
6647 197,248,41,124,36,16, //vmovaps %xmm7,0x10(%rsp)
6648 197,248,41,52,36, //vmovaps %xmm6,(%rsp)
// Move the arguments into registers that survive the calls.
6649 77,137,205, //mov %r9,%r13
6650 77,137,198, //mov %r8,%r14
6651 72,137,203, //mov %rcx,%rbx
6652 72,137,214, //mov %rdx,%rsi
6653 72,173, //lods %ds:(%rsi),%rax
6654 73,137,199, //mov %rax,%r15
6655 73,137,244, //mov %rsi,%r12
// Skip the main loop entirely when x + 8 > limit.
6656 72,141,67,8, //lea 0x8(%rbx),%rax
6657 76,57,232, //cmp %r13,%rax
6658 118,5, //jbe 75 <_sk_start_pipeline_hsw+0x75>
6659 72,137,223, //mov %rbx,%rdi
6660 235,65, //jmp b6 <_sk_start_pipeline_hsw+0xb6>
// Main loop body (+0x75): full step, %rcx = 0.
6661 185,0,0,0,0, //mov $0x0,%ecx
6662 197,252,87,192, //vxorps %ymm0,%ymm0,%ymm0
6663 197,244,87,201, //vxorps %ymm1,%ymm1,%ymm1
6664 197,236,87,210, //vxorps %ymm2,%ymm2,%ymm2
6665 197,228,87,219, //vxorps %ymm3,%ymm3,%ymm3
6666 197,220,87,228, //vxorps %ymm4,%ymm4,%ymm4
6667 197,212,87,237, //vxorps %ymm5,%ymm5,%ymm5
6668 197,204,87,246, //vxorps %ymm6,%ymm6,%ymm6
6669 197,196,87,255, //vxorps %ymm7,%ymm7,%ymm7
6670 72,137,223, //mov %rbx,%rdi
6671 76,137,230, //mov %r12,%rsi
6672 76,137,242, //mov %r14,%rdx
6673 65,255,215, //callq *%r15
// %rdi = x + 8 (the next x); loop again while that next x + 8 <= limit.
6674 72,141,123,8, //lea 0x8(%rbx),%rdi
6675 72,131,195,16, //add $0x10,%rbx
6676 76,57,235, //cmp %r13,%rbx
6677 72,137,251, //mov %rdi,%rbx
6678 118,191, //jbe 75 <_sk_start_pipeline_hsw+0x75>
// Tail (+0xb6): %rcx = limit - x; nothing left to do when it is zero.
6679 76,137,233, //mov %r13,%rcx
6680 72,41,249, //sub %rdi,%rcx
6681 116,41, //je e7 <_sk_start_pipeline_hsw+0xe7>
6682 197,252,87,192, //vxorps %ymm0,%ymm0,%ymm0
6683 197,244,87,201, //vxorps %ymm1,%ymm1,%ymm1
6684 197,236,87,210, //vxorps %ymm2,%ymm2,%ymm2
6685 197,228,87,219, //vxorps %ymm3,%ymm3,%ymm3
6686 197,220,87,228, //vxorps %ymm4,%ymm4,%ymm4
6687 197,212,87,237, //vxorps %ymm5,%ymm5,%ymm5
6688 197,204,87,246, //vxorps %ymm6,%ymm6,%ymm6
6689 197,196,87,255, //vxorps %ymm7,%ymm7,%ymm7
6690 76,137,230, //mov %r12,%rsi
6691 76,137,242, //mov %r14,%rdx
6692 65,255,215, //callq *%r15
// Epilogue (+0xe7): return the limit, restore everything saved by the prologue.
6693 76,137,232, //mov %r13,%rax
6694 197,248,40,52,36, //vmovaps (%rsp),%xmm6
6695 197,248,40,124,36,16, //vmovaps 0x10(%rsp),%xmm7
6696 197,120,40,68,36,32, //vmovaps 0x20(%rsp),%xmm8
6697 197,120,40,76,36,48, //vmovaps 0x30(%rsp),%xmm9
6698 197,120,40,84,36,64, //vmovaps 0x40(%rsp),%xmm10
6699 197,120,40,92,36,80, //vmovaps 0x50(%rsp),%xmm11
6700 197,120,40,100,36,96, //vmovaps 0x60(%rsp),%xmm12
6701 197,120,40,108,36,112, //vmovaps 0x70(%rsp),%xmm13
6702 197,120,40,180,36,128,0,0,0, //vmovaps 0x80(%rsp),%xmm14
6703 197,120,40,188,36,144,0,0,0, //vmovaps 0x90(%rsp),%xmm15
6704 72,129,196,160,0,0,0, //add $0xa0,%rsp
6705 91, //pop %rbx
6706 95, //pop %rdi
6707 94, //pop %rsi
6708 65,92, //pop %r12
6709 65,93, //pop %r13
6710 65,94, //pop %r14
6711 65,95, //pop %r15
// Clear the upper halves of the ymm registers before returning to the caller.
6712 197,248,119, //vzeroupper
6713 195, //retq
6714};
6715
// sk_just_return_hsw: a single ret. Unlike the other arrays here it does not jump onward, so
// control returns to whoever issued the most recent call.
6716CODE const uint8_t sk_just_return_hsw[] = {
6717 195, //retq
6718};
6719
// sk_seed_shader_hsw: initialises ymm0-ymm7 from %edi, one 32-bit integer at (%rax) and
// floats read through %rdx.
//   ymm0 = float(%edi) + k[0x4] + (8 floats stored at 0x14(%rdx))
//   ymm1 = float(int32 at (%rax)) + k[0x4]
//   ymm2 = k[0x0]
//   ymm3..ymm7 = 0
// where k[off] is the float at off(%rdx) broadcast to all 8 lanes and %rax is the pointer
// loaded by the first lods. The second lods loads the next address, which is jumped to.
6720CODE const uint8_t sk_seed_shader_hsw[] = {
6721 72,173, //lods %ds:(%rsi),%rax
6722 197,249,110,199, //vmovd %edi,%xmm0
6723 196,226,125,24,192, //vbroadcastss %xmm0,%ymm0
6724 197,252,91,192, //vcvtdq2ps %ymm0,%ymm0
6725 196,226,125,24,74,4, //vbroadcastss 0x4(%rdx),%ymm1
6726 197,252,88,193, //vaddps %ymm1,%ymm0,%ymm0
6727 197,252,88,66,20, //vaddps 0x14(%rdx),%ymm0,%ymm0
6728 196,226,125,24,16, //vbroadcastss (%rax),%ymm2
6729 197,252,91,210, //vcvtdq2ps %ymm2,%ymm2
6730 197,236,88,201, //vaddps %ymm1,%ymm2,%ymm1
6731 196,226,125,24,18, //vbroadcastss (%rdx),%ymm2
6732 72,173, //lods %ds:(%rsi),%rax
6733 197,228,87,219, //vxorps %ymm3,%ymm3,%ymm3
6734 197,220,87,228, //vxorps %ymm4,%ymm4,%ymm4
6735 197,212,87,237, //vxorps %ymm5,%ymm5,%ymm5
6736 197,204,87,246, //vxorps %ymm6,%ymm6,%ymm6
6737 197,196,87,255, //vxorps %ymm7,%ymm7,%ymm7
6738 255,224, //jmpq *%rax
6739};
6740
// sk_constant_color_hsw: the first lods loads a pointer to four floats into %rax; each float is
// broadcast to all 8 lanes of ymm0, ymm1, ymm2 and ymm3 respectively. The second lods loads the
// next address, which is jumped to.
6741CODE const uint8_t sk_constant_color_hsw[] = {
6742 72,173, //lods %ds:(%rsi),%rax
6743 196,226,125,24,0, //vbroadcastss (%rax),%ymm0
6744 196,226,125,24,72,4, //vbroadcastss 0x4(%rax),%ymm1
6745 196,226,125,24,80,8, //vbroadcastss 0x8(%rax),%ymm2
6746 196,226,125,24,88,12, //vbroadcastss 0xc(%rax),%ymm3
6747 72,173, //lods %ds:(%rsi),%rax
6748 255,224, //jmpq *%rax
6749};
6750
// sk_clear_hsw: zeroes ymm0-ymm3, then jumps to the address loaded from (%rsi) by the lods.
// There is only one lods here: nothing is read beyond the next address.
6751CODE const uint8_t sk_clear_hsw[] = {
6752 72,173, //lods %ds:(%rsi),%rax
6753 197,252,87,192, //vxorps %ymm0,%ymm0,%ymm0
6754 197,244,87,201, //vxorps %ymm1,%ymm1,%ymm1
6755 197,236,87,210, //vxorps %ymm2,%ymm2,%ymm2
6756 197,228,87,219, //vxorps %ymm3,%ymm3,%ymm3
6757 255,224, //jmpq *%rax
6758};
6759
// sk_plus__hsw: lane-wise addition, ymm0..ymm3 += ymm4..ymm7 (ymm0 += ymm4, ymm1 += ymm5, ...).
// Ends by jumping to the next address loaded from (%rsi).
6760CODE const uint8_t sk_plus__hsw[] = {
6761 197,252,88,196, //vaddps %ymm4,%ymm0,%ymm0
6762 197,244,88,205, //vaddps %ymm5,%ymm1,%ymm1
6763 197,236,88,214, //vaddps %ymm6,%ymm2,%ymm2
6764 197,228,88,223, //vaddps %ymm7,%ymm3,%ymm3
6765 72,173, //lods %ds:(%rsi),%rax
6766 255,224, //jmpq *%rax
6767};
6768
// sk_srcover_hsw: with k the float at (%rdx) broadcast to all lanes and s = k - ymm3,
//   ymm0 += ymm4*s,  ymm1 += ymm5*s,  ymm2 += ymm6*s,  ymm3 += ymm7*s
// each done as a single fused multiply-add. s is computed from the incoming ymm3 before ymm3
// is updated. NOTE(review): k is presumably 1.0f, giving the usual src + dst*(1 - src alpha);
// the constant's value is not visible here.
// Ends by jumping to the next address loaded from (%rsi).
6769CODE const uint8_t sk_srcover_hsw[] = {
6770 196,98,125,24,2, //vbroadcastss (%rdx),%ymm8
6771 197,60,92,195, //vsubps %ymm3,%ymm8,%ymm8
6772 196,194,93,184,192, //vfmadd231ps %ymm8,%ymm4,%ymm0
6773 196,194,85,184,200, //vfmadd231ps %ymm8,%ymm5,%ymm1
6774 196,194,77,184,208, //vfmadd231ps %ymm8,%ymm6,%ymm2
6775 196,194,69,184,216, //vfmadd231ps %ymm8,%ymm7,%ymm3
6776 72,173, //lods %ds:(%rsi),%rax
6777 255,224, //jmpq *%rax
6778};
6779
// sk_dstover_hsw: with k the float at (%rdx) broadcast to all lanes and s = k - ymm7,
//   ymm0 = ymm0*s + ymm4,  ymm1 = ymm1*s + ymm5,  ymm2 = ymm2*s + ymm6,  ymm3 = ymm3*s + ymm7
// each done as a single fused multiply-add. ymm4-ymm7 are only read.
// NOTE(review): k is presumably 1.0f; the constant's value is not visible here.
// Ends by jumping to the next address loaded from (%rsi).
6780CODE const uint8_t sk_dstover_hsw[] = {
6781 196,98,125,24,2, //vbroadcastss (%rdx),%ymm8
6782 197,60,92,199, //vsubps %ymm7,%ymm8,%ymm8
6783 196,226,61,168,196, //vfmadd213ps %ymm4,%ymm8,%ymm0
6784 196,226,61,168,205, //vfmadd213ps %ymm5,%ymm8,%ymm1
6785 196,226,61,168,214, //vfmadd213ps %ymm6,%ymm8,%ymm2
6786 196,226,61,168,223, //vfmadd213ps %ymm7,%ymm8,%ymm3
6787 72,173, //lods %ds:(%rsi),%rax
6788 255,224, //jmpq *%rax
6789};
6790
// sk_clamp_0_hsw: lower clamp, ymmN = max(ymmN, 0) for N = 0..3, using a zeroed ymm8.
// Ends by jumping to the next address loaded from (%rsi).
6791CODE const uint8_t sk_clamp_0_hsw[] = {
6792 196,65,60,87,192, //vxorps %ymm8,%ymm8,%ymm8
6793 196,193,124,95,192, //vmaxps %ymm8,%ymm0,%ymm0
6794 196,193,116,95,200, //vmaxps %ymm8,%ymm1,%ymm1
6795 196,193,108,95,208, //vmaxps %ymm8,%ymm2,%ymm2
6796 196,193,100,95,216, //vmaxps %ymm8,%ymm3,%ymm3
6797 72,173, //lods %ds:(%rsi),%rax
6798 255,224, //jmpq *%rax
6799};
6800
// sk_clamp_1_hsw: upper clamp, ymmN = min(ymmN, k) for N = 0..3, where k is the float at (%rdx)
// broadcast to all lanes. NOTE(review): k is presumably 1.0f; its value is not visible here.
// Ends by jumping to the next address loaded from (%rsi).
6801CODE const uint8_t sk_clamp_1_hsw[] = {
6802 196,98,125,24,2, //vbroadcastss (%rdx),%ymm8
6803 196,193,124,93,192, //vminps %ymm8,%ymm0,%ymm0
6804 196,193,116,93,200, //vminps %ymm8,%ymm1,%ymm1
6805 196,193,108,93,208, //vminps %ymm8,%ymm2,%ymm2
6806 196,193,100,93,216, //vminps %ymm8,%ymm3,%ymm3
6807 72,173, //lods %ds:(%rsi),%rax
6808 255,224, //jmpq *%rax
6809};
6810
// sk_clamp_a_hsw: first ymm3 = min(ymm3, k), with k the float at (%rdx) broadcast to all lanes;
// then ymm0, ymm1 and ymm2 are each limited to at most the clamped ymm3.
// Ends by jumping to the next address loaded from (%rsi).
6811CODE const uint8_t sk_clamp_a_hsw[] = {
6812 196,98,125,24,2, //vbroadcastss (%rdx),%ymm8
6813 196,193,100,93,216, //vminps %ymm8,%ymm3,%ymm3
6814 197,252,93,195, //vminps %ymm3,%ymm0,%ymm0
6815 197,244,93,203, //vminps %ymm3,%ymm1,%ymm1
6816 197,236,93,211, //vminps %ymm3,%ymm2,%ymm2
6817 72,173, //lods %ds:(%rsi),%rax
6818 255,224, //jmpq *%rax
6819};
6820
// sk_set_rgb_hsw: the first lods loads a pointer to three floats into %rax; they are broadcast
// to all lanes of ymm0, ymm1 and ymm2. ymm3 is left unchanged.
// The second lods loads the next address, which is jumped to.
6821CODE const uint8_t sk_set_rgb_hsw[] = {
6822 72,173, //lods %ds:(%rsi),%rax
6823 196,226,125,24,0, //vbroadcastss (%rax),%ymm0
6824 196,226,125,24,72,4, //vbroadcastss 0x4(%rax),%ymm1
6825 196,226,125,24,80,8, //vbroadcastss 0x8(%rax),%ymm2
6826 72,173, //lods %ds:(%rsi),%rax
6827 255,224, //jmpq *%rax
6828};
6829
// sk_swap_rb_hsw: exchanges ymm0 and ymm2, using ymm8 as the temporary. ymm1 and ymm3 are
// untouched. Ends by jumping to the next address loaded from (%rsi).
6830CODE const uint8_t sk_swap_rb_hsw[] = {
6831 197,124,40,192, //vmovaps %ymm0,%ymm8
6832 72,173, //lods %ds:(%rsi),%rax
6833 197,252,40,194, //vmovaps %ymm2,%ymm0
6834 197,124,41,194, //vmovaps %ymm8,%ymm2
6835 255,224, //jmpq *%rax
6836};
6837
// sk_swap_hsw: exchanges ymm0..ymm3 with ymm4..ymm7 (ymm0<->ymm4, ymm1<->ymm5, ymm2<->ymm6,
// ymm3<->ymm7). The old ymm0-ymm3 are parked in ymm11, ymm10, ymm9, ymm8 while the copy is made.
// Ends by jumping to the next address loaded from (%rsi).
6838CODE const uint8_t sk_swap_hsw[] = {
6839 197,124,40,195, //vmovaps %ymm3,%ymm8
6840 197,124,40,202, //vmovaps %ymm2,%ymm9
6841 197,124,40,209, //vmovaps %ymm1,%ymm10
6842 197,124,40,216, //vmovaps %ymm0,%ymm11
6843 72,173, //lods %ds:(%rsi),%rax
6844 197,252,40,196, //vmovaps %ymm4,%ymm0
6845 197,252,40,205, //vmovaps %ymm5,%ymm1
6846 197,252,40,214, //vmovaps %ymm6,%ymm2
6847 197,252,40,223, //vmovaps %ymm7,%ymm3
6848 197,124,41,220, //vmovaps %ymm11,%ymm4
6849 197,124,41,213, //vmovaps %ymm10,%ymm5
6850 197,124,41,206, //vmovaps %ymm9,%ymm6
6851 197,124,41,199, //vmovaps %ymm8,%ymm7
6852 255,224, //jmpq *%rax
6853};
6854
// sk_move_src_dst_hsw: copies ymm0..ymm3 into ymm4..ymm7; ymm0-ymm3 keep their values.
// Ends by jumping to the next address loaded from (%rsi).
6855CODE const uint8_t sk_move_src_dst_hsw[] = {
6856 72,173, //lods %ds:(%rsi),%rax
6857 197,252,40,224, //vmovaps %ymm0,%ymm4
6858 197,252,40,233, //vmovaps %ymm1,%ymm5
6859 197,252,40,242, //vmovaps %ymm2,%ymm6
6860 197,252,40,251, //vmovaps %ymm3,%ymm7
6861 255,224, //jmpq *%rax
6862};
6863
// sk_move_dst_src_hsw: copies ymm4..ymm7 into ymm0..ymm3; ymm4-ymm7 keep their values.
// Ends by jumping to the next address loaded from (%rsi).
6864CODE const uint8_t sk_move_dst_src_hsw[] = {
6865 72,173, //lods %ds:(%rsi),%rax
6866 197,252,40,196, //vmovaps %ymm4,%ymm0
6867 197,252,40,205, //vmovaps %ymm5,%ymm1
6868 197,252,40,214, //vmovaps %ymm6,%ymm2
6869 197,252,40,223, //vmovaps %ymm7,%ymm3
6870 255,224, //jmpq *%rax
6871};
6872
// sk_premul_hsw: multiplies ymm0, ymm1 and ymm2 by ymm3; ymm3 itself is unchanged.
// Ends by jumping to the next address loaded from (%rsi).
6873CODE const uint8_t sk_premul_hsw[] = {
6874 197,252,89,195, //vmulps %ymm3,%ymm0,%ymm0
6875 197,244,89,203, //vmulps %ymm3,%ymm1,%ymm1
6876 197,236,89,211, //vmulps %ymm3,%ymm2,%ymm2
6877 72,173, //lods %ds:(%rsi),%rax
6878 255,224, //jmpq *%rax
6879};
6880
// sk_unpremul_hsw: scales ymm0, ymm1 and ymm2 by s, where per lane
//   s = (ymm3 == 0) ? 0 : k / ymm3
// and k is the float at (%rdx) broadcast to all lanes. The compare-and-blend replaces the
// result of the division by zero in lanes where ymm3 is 0. ymm3 is unchanged.
// NOTE(review): k is presumably 1.0f; its value is not visible here.
// Ends by jumping to the next address loaded from (%rsi).
6881CODE const uint8_t sk_unpremul_hsw[] = {
6882 196,65,60,87,192, //vxorps %ymm8,%ymm8,%ymm8
6883 196,65,100,194,200,0, //vcmpeqps %ymm8,%ymm3,%ymm9
6884 196,98,125,24,18, //vbroadcastss (%rdx),%ymm10
6885 197,44,94,211, //vdivps %ymm3,%ymm10,%ymm10
6886 196,67,45,74,192,144, //vblendvps %ymm9,%ymm8,%ymm10,%ymm8
6887 197,188,89,192, //vmulps %ymm0,%ymm8,%ymm0
6888 197,188,89,201, //vmulps %ymm1,%ymm8,%ymm1
6889 197,188,89,210, //vmulps %ymm2,%ymm8,%ymm2
6890 72,173, //lods %ds:(%rsi),%rax
6891 255,224, //jmpq *%rax
6892};
6893
// sk_from_srgb_hsw: applies the same piecewise function to ymm0, ymm1 and ymm2; ymm3 is untouched.
// With k[off] the float at off(%rdx) broadcast to all lanes, for each input v:
//   v < k[0x44] ?  k[0x40]*v
//               :  (v*v)*(k[0x3c]*v + k[0x38]) + k[0x34]
// Both branches are computed for every lane and vblendvps picks one using the compare mask.
// The constants' values are not visible here. ymm8-ymm15 are used as scratch.
// Ends by jumping to the next address loaded from (%rsi).
6894CODE const uint8_t sk_from_srgb_hsw[] = {
// ymm0
6895 196,98,125,24,66,64, //vbroadcastss 0x40(%rdx),%ymm8
6896 197,60,89,200, //vmulps %ymm0,%ymm8,%ymm9
6897 197,124,89,208, //vmulps %ymm0,%ymm0,%ymm10
6898 196,98,125,24,90,60, //vbroadcastss 0x3c(%rdx),%ymm11
6899 196,98,125,24,98,56, //vbroadcastss 0x38(%rdx),%ymm12
6900 196,65,124,40,235, //vmovaps %ymm11,%ymm13
6901 196,66,125,168,236, //vfmadd213ps %ymm12,%ymm0,%ymm13
6902 196,98,125,24,114,52, //vbroadcastss 0x34(%rdx),%ymm14
6903 196,66,45,168,238, //vfmadd213ps %ymm14,%ymm10,%ymm13
6904 196,98,125,24,82,68, //vbroadcastss 0x44(%rdx),%ymm10
6905 196,193,124,194,194,1, //vcmpltps %ymm10,%ymm0,%ymm0
6906 196,195,21,74,193,0, //vblendvps %ymm0,%ymm9,%ymm13,%ymm0
// ymm1
6907 197,60,89,201, //vmulps %ymm1,%ymm8,%ymm9
6908 197,116,89,233, //vmulps %ymm1,%ymm1,%ymm13
6909 196,65,124,40,251, //vmovaps %ymm11,%ymm15
6910 196,66,117,168,252, //vfmadd213ps %ymm12,%ymm1,%ymm15
6911 196,66,21,168,254, //vfmadd213ps %ymm14,%ymm13,%ymm15
6912 196,193,116,194,202,1, //vcmpltps %ymm10,%ymm1,%ymm1
6913 196,195,5,74,201,16, //vblendvps %ymm1,%ymm9,%ymm15,%ymm1
// ymm2 (the polynomial is accumulated directly in ymm11, which is no longer needed afterwards)
6914 197,60,89,194, //vmulps %ymm2,%ymm8,%ymm8
6915 197,108,89,202, //vmulps %ymm2,%ymm2,%ymm9
6916 196,66,109,168,220, //vfmadd213ps %ymm12,%ymm2,%ymm11
6917 196,66,53,168,222, //vfmadd213ps %ymm14,%ymm9,%ymm11
6918 196,193,108,194,210,1, //vcmpltps %ymm10,%ymm2,%ymm2
6919 196,195,37,74,208,32, //vblendvps %ymm2,%ymm8,%ymm11,%ymm2
6920 72,173, //lods %ds:(%rsi),%rax
6921 255,224, //jmpq *%rax
6922};
6923
// sk_to_srgb_hsw: applies the same piecewise function to ymm0, ymm1 and ymm2; ymm3 is untouched.
// With k[off] the float at off(%rdx) broadcast to all lanes, for each input v:
//   r  = vrsqrtps(v)                  (approximate 1/sqrt(v))
//   s  = vrcpps(r)                    (approximately sqrt(v))
//   q  = vrsqrtps(r)                  (approximately the fourth root of v)
//   hi = min(k[0x0], k[0x50]*s + k[0x54] + k[0x4c]*q)
//   lo = k[0x48]*v
//   result = v < k[0x58] ? lo : hi
// Both branches are computed for every lane and vblendvps picks one using the compare mask.
// vrsqrtps/vrcpps are hardware approximations, so the result is approximate.
// The constants' values are not visible here. ymm8-ymm15 are used as scratch.
// Ends by jumping to the next address loaded from (%rsi).
6924CODE const uint8_t sk_to_srgb_hsw[] = {
// ymm0, interleaved with the loads of the shared constants
6925 197,124,82,192, //vrsqrtps %ymm0,%ymm8
6926 196,65,124,83,200, //vrcpps %ymm8,%ymm9
6927 196,65,124,82,208, //vrsqrtps %ymm8,%ymm10
6928 196,98,125,24,66,72, //vbroadcastss 0x48(%rdx),%ymm8
6929 197,60,89,216, //vmulps %ymm0,%ymm8,%ymm11
6930 196,98,125,24,34, //vbroadcastss (%rdx),%ymm12
6931 196,98,125,24,106,76, //vbroadcastss 0x4c(%rdx),%ymm13
6932 196,98,125,24,114,80, //vbroadcastss 0x50(%rdx),%ymm14
6933 196,98,125,24,122,84, //vbroadcastss 0x54(%rdx),%ymm15
6934 196,66,13,168,207, //vfmadd213ps %ymm15,%ymm14,%ymm9
6935 196,66,21,184,202, //vfmadd231ps %ymm10,%ymm13,%ymm9
6936 196,65,28,93,201, //vminps %ymm9,%ymm12,%ymm9
6937 196,98,125,24,82,88, //vbroadcastss 0x58(%rdx),%ymm10
6938 196,193,124,194,194,1, //vcmpltps %ymm10,%ymm0,%ymm0
6939 196,195,53,74,195,0, //vblendvps %ymm0,%ymm11,%ymm9,%ymm0
// ymm1
6940 197,124,82,201, //vrsqrtps %ymm1,%ymm9
6941 196,65,124,83,217, //vrcpps %ymm9,%ymm11
6942 196,65,124,82,201, //vrsqrtps %ymm9,%ymm9
6943 196,66,13,168,223, //vfmadd213ps %ymm15,%ymm14,%ymm11
6944 196,66,21,184,217, //vfmadd231ps %ymm9,%ymm13,%ymm11
6945 197,60,89,201, //vmulps %ymm1,%ymm8,%ymm9
6946 196,65,28,93,219, //vminps %ymm11,%ymm12,%ymm11
6947 196,193,116,194,202,1, //vcmpltps %ymm10,%ymm1,%ymm1
6948 196,195,37,74,201,16, //vblendvps %ymm1,%ymm9,%ymm11,%ymm1
// ymm2
6949 197,124,82,202, //vrsqrtps %ymm2,%ymm9
6950 196,65,124,83,217, //vrcpps %ymm9,%ymm11
6951 196,66,13,168,223, //vfmadd213ps %ymm15,%ymm14,%ymm11
6952 196,65,124,82,201, //vrsqrtps %ymm9,%ymm9
6953 196,66,21,184,217, //vfmadd231ps %ymm9,%ymm13,%ymm11
6954 196,65,28,93,203, //vminps %ymm11,%ymm12,%ymm9
6955 197,60,89,194, //vmulps %ymm2,%ymm8,%ymm8
6956 196,193,108,194,210,1, //vcmpltps %ymm10,%ymm2,%ymm2
6957 196,195,53,74,208,32, //vblendvps %ymm2,%ymm8,%ymm9,%ymm2
6958 72,173, //lods %ds:(%rsi),%rax
6959 255,224, //jmpq *%rax
6960};
6961
// sk_scale_1_float_hsw: the first lods loads a pointer to one float into %rax; that float is
// broadcast to all lanes and ymm0, ymm1, ymm2 and ymm3 are each multiplied by it.
// The second lods loads the next address, which is jumped to.
6962CODE const uint8_t sk_scale_1_float_hsw[] = {
6963 72,173, //lods %ds:(%rsi),%rax
6964 196,98,125,24,0, //vbroadcastss (%rax),%ymm8
6965 197,188,89,192, //vmulps %ymm0,%ymm8,%ymm0
6966 197,188,89,201, //vmulps %ymm1,%ymm8,%ymm1
6967 197,188,89,210, //vmulps %ymm2,%ymm8,%ymm2
6968 197,188,89,219, //vmulps %ymm3,%ymm8,%ymm3
6969 72,173, //lods %ds:(%rsi),%rax
6970 255,224, //jmpq *%rax
6971};
6972
// sk_scale_u8_hsw: multiplies ymm0-ymm3 by a per-lane factor read from bytes in memory.
// The first lods loads a pointer into %rax; the 8-byte base address stored there plus %rdi is
// the address of the bytes. The incoming %rcx is kept in %r8 and restored before the final jump,
// because the tail loop below uses %cl as its shift count.
//   %rcx == 0: load 8 bytes at once.
//   %rcx != 0: load exactly %rcx bytes one at a time (code at +0x40), packing byte i into bits
//              8*i.. of %r9 so the remaining bytes read as 0, then rejoin the main path at +0x14.
// Main path: zero-extend the 8 bytes to 32-bit lanes, convert to float, multiply by the float at
// 0xc(%rdx), and scale ymm0, ymm1, ymm2, ymm3 by the result.
// NOTE(review): the constant at 0xc(%rdx) is presumably 1/255; its value is not visible here.
6973CODE const uint8_t sk_scale_u8_hsw[] = {
6974 73,137,200, //mov %rcx,%r8
6975 72,173, //lods %ds:(%rsi),%rax
6976 72,139,0, //mov (%rax),%rax
6977 72,1,248, //add %rdi,%rax
6978 77,133,192, //test %r8,%r8
6979 117,48, //jne 4b1 <_sk_scale_u8_hsw+0x40>
6980 197,123,16,0, //vmovsd (%rax),%xmm8
// +0x14: common path for the full and the partial load.
6981 196,66,125,49,192, //vpmovzxbd %xmm8,%ymm8
6982 196,65,124,91,192, //vcvtdq2ps %ymm8,%ymm8
6983 196,98,125,24,74,12, //vbroadcastss 0xc(%rdx),%ymm9
6984 196,65,60,89,193, //vmulps %ymm9,%ymm8,%ymm8
6985 197,188,89,192, //vmulps %ymm0,%ymm8,%ymm0
6986 197,188,89,201, //vmulps %ymm1,%ymm8,%ymm1
6987 197,188,89,210, //vmulps %ymm2,%ymm8,%ymm2
6988 197,188,89,219, //vmulps %ymm3,%ymm8,%ymm3
6989 72,173, //lods %ds:(%rsi),%rax
6990 76,137,193, //mov %r8,%rcx
6991 255,224, //jmpq *%rax
// +0x40: partial load. %rcx = shift amount, %r10 = bytes remaining, %r9 = accumulated bytes.
6992 49,201, //xor %ecx,%ecx
6993 77,137,194, //mov %r8,%r10
6994 69,49,201, //xor %r9d,%r9d
6995 68,15,182,24, //movzbl (%rax),%r11d
6996 72,255,192, //inc %rax
6997 73,211,227, //shl %cl,%r11
6998 77,9,217, //or %r11,%r9
6999 72,131,193,8, //add $0x8,%rcx
7000 73,255,202, //dec %r10
7001 117,234, //jne 4b9 <_sk_scale_u8_hsw+0x48>
7002 196,65,249,110,193, //vmovq %r9,%xmm8
7003 235,175, //jmp 485 <_sk_scale_u8_hsw+0x14>
7004};
7005
// sk_lerp_1_float_hsw: the first lods loads a pointer to one float t into %rax. For N = 0..3,
//   ymmN = (ymmN - ymm(N+4))*t + ymm(N+4)
// i.e. a linear interpolation from ymm4..ymm7 (t = 0) to ymm0..ymm3 (t = 1), with the multiply
// and add fused. The second lods loads the next address, which is jumped to.
7006CODE const uint8_t sk_lerp_1_float_hsw[] = {
7007 72,173, //lods %ds:(%rsi),%rax
7008 196,98,125,24,0, //vbroadcastss (%rax),%ymm8
7009 197,252,92,196, //vsubps %ymm4,%ymm0,%ymm0
7010 196,226,61,168,196, //vfmadd213ps %ymm4,%ymm8,%ymm0
7011 197,244,92,205, //vsubps %ymm5,%ymm1,%ymm1
7012 196,226,61,168,205, //vfmadd213ps %ymm5,%ymm8,%ymm1
7013 197,236,92,214, //vsubps %ymm6,%ymm2,%ymm2
7014 196,226,61,168,214, //vfmadd213ps %ymm6,%ymm8,%ymm2
7015 197,228,92,223, //vsubps %ymm7,%ymm3,%ymm3
7016 196,226,61,168,223, //vfmadd213ps %ymm7,%ymm8,%ymm3
7017 72,173, //lods %ds:(%rsi),%rax
7018 255,224, //jmpq *%rax
7019};
7020
// sk_lerp_u8_hsw: linear interpolation with a per-lane factor t read from bytes in memory.
// The first lods loads a pointer into %rax; the 8-byte base address stored there plus %rdi is
// the address of the bytes. The incoming %rcx is kept in %r8 and restored before the final jump,
// because the tail loop below uses %cl as its shift count.
//   %rcx == 0: load 8 bytes at once.
//   %rcx != 0: load exactly %rcx bytes one at a time (code at +0x54), packing byte i into bits
//              8*i.. of %r9 so the remaining bytes read as 0, then rejoin the main path at +0x14.
// Main path: t = float(byte) * (float at 0xc(%rdx)); then for N = 0..3,
//   ymmN = (ymmN - ymm(N+4))*t + ymm(N+4)
// NOTE(review): the constant at 0xc(%rdx) is presumably 1/255; its value is not visible here.
7021CODE const uint8_t sk_lerp_u8_hsw[] = {
7022 73,137,200, //mov %rcx,%r8
7023 72,173, //lods %ds:(%rsi),%rax
7024 72,139,0, //mov (%rax),%rax
7025 72,1,248, //add %rdi,%rax
7026 77,133,192, //test %r8,%r8
7027 117,68, //jne 559 <_sk_lerp_u8_hsw+0x54>
7028 197,123,16,0, //vmovsd (%rax),%xmm8
// +0x14: common path for the full and the partial load.
7029 196,66,125,49,192, //vpmovzxbd %xmm8,%ymm8
7030 196,65,124,91,192, //vcvtdq2ps %ymm8,%ymm8
7031 196,98,125,24,74,12, //vbroadcastss 0xc(%rdx),%ymm9
7032 196,65,60,89,193, //vmulps %ymm9,%ymm8,%ymm8
7033 197,252,92,196, //vsubps %ymm4,%ymm0,%ymm0
7034 196,226,61,168,196, //vfmadd213ps %ymm4,%ymm8,%ymm0
7035 197,244,92,205, //vsubps %ymm5,%ymm1,%ymm1
7036 196,226,61,168,205, //vfmadd213ps %ymm5,%ymm8,%ymm1
7037 197,236,92,214, //vsubps %ymm6,%ymm2,%ymm2
7038 196,226,61,168,214, //vfmadd213ps %ymm6,%ymm8,%ymm2
7039 197,228,92,223, //vsubps %ymm7,%ymm3,%ymm3
7040 196,226,61,168,223, //vfmadd213ps %ymm7,%ymm8,%ymm3
7041 72,173, //lods %ds:(%rsi),%rax
7042 76,137,193, //mov %r8,%rcx
7043 255,224, //jmpq *%rax
// +0x54: partial load. %rcx = shift amount, %r10 = bytes remaining, %r9 = accumulated bytes.
7044 49,201, //xor %ecx,%ecx
7045 77,137,194, //mov %r8,%r10
7046 69,49,201, //xor %r9d,%r9d
7047 68,15,182,24, //movzbl (%rax),%r11d
7048 72,255,192, //inc %rax
7049 73,211,227, //shl %cl,%r11
7050 77,9,217, //or %r11,%r9
7051 72,131,193,8, //add $0x8,%rcx
7052 73,255,202, //dec %r10
7053 117,234, //jne 561 <_sk_lerp_u8_hsw+0x5c>
7054 196,65,249,110,193, //vmovq %r9,%xmm8
7055 235,155, //jmp 519 <_sk_lerp_u8_hsw+0x14>
7056};
7057
// sk_lerp_565_hsw: linear interpolation of ymm0, ymm1, ymm2 with three factors unpacked from
// 16-bit values in memory. The lods loads a pointer into %rax; the base address stored there,
// indexed by %rdi*2, is the address of the 16-bit values.
//   %rcx == 0: load eight 16-bit values at once.
//   %rcx != 0: load only (%rcx & 7) values, one vpinsrw each, leaving the other lanes 0
//              (code at +0x85), then rejoin the main path at +0x10.
// Main path, with v the values zero-extended to 32 bits and k[off] the constant at off(%rdx):
//   t0 = float(v & k[0x68]) * k[0x74]
//   t1 = float(v & k[0x6c]) * k[0x78]
//   t2 = float(v & k[0x70]) * k[0x7c]
//   ymmN = (ymmN - ymm(N+4))*tN + ymm(N+4)   for N = 0, 1, 2
//   ymm3 = k[0x0]
// The constants' values (presumably the 5/6/5-bit field masks and their scale factors) are not
// visible here. Ends by jumping to the next address loaded from (%rsi).
7058CODE const uint8_t sk_lerp_565_hsw[] = {
7059 72,173, //lods %ds:(%rsi),%rax
7060 76,139,16, //mov (%rax),%r10
7061 72,133,201, //test %rcx,%rcx
7062 117,123, //jne 603 <_sk_lerp_565_hsw+0x85>
7063 196,193,122,111,28,122, //vmovdqu (%r10,%rdi,2),%xmm3
// +0x10: common path for the full and the partial load.
7064 196,226,125,51,219, //vpmovzxwd %xmm3,%ymm3
7065 196,98,125,88,66,104, //vpbroadcastd 0x68(%rdx),%ymm8
7066 197,61,219,195, //vpand %ymm3,%ymm8,%ymm8
7067 196,65,124,91,192, //vcvtdq2ps %ymm8,%ymm8
7068 196,98,125,24,74,116, //vbroadcastss 0x74(%rdx),%ymm9
7069 196,65,52,89,192, //vmulps %ymm8,%ymm9,%ymm8
7070 196,98,125,88,74,108, //vpbroadcastd 0x6c(%rdx),%ymm9
7071 197,53,219,203, //vpand %ymm3,%ymm9,%ymm9
7072 196,65,124,91,201, //vcvtdq2ps %ymm9,%ymm9
7073 196,98,125,24,82,120, //vbroadcastss 0x78(%rdx),%ymm10
7074 196,65,44,89,201, //vmulps %ymm9,%ymm10,%ymm9
7075 196,98,125,88,82,112, //vpbroadcastd 0x70(%rdx),%ymm10
7076 197,173,219,219, //vpand %ymm3,%ymm10,%ymm3
7077 197,252,91,219, //vcvtdq2ps %ymm3,%ymm3
7078 196,98,125,24,82,124, //vbroadcastss 0x7c(%rdx),%ymm10
7079 197,172,89,219, //vmulps %ymm3,%ymm10,%ymm3
7080 197,252,92,196, //vsubps %ymm4,%ymm0,%ymm0
7081 196,226,61,168,196, //vfmadd213ps %ymm4,%ymm8,%ymm0
7082 197,244,92,205, //vsubps %ymm5,%ymm1,%ymm1
7083 196,226,53,168,205, //vfmadd213ps %ymm5,%ymm9,%ymm1
7084 197,236,92,214, //vsubps %ymm6,%ymm2,%ymm2
7085 196,226,101,168,214, //vfmadd213ps %ymm6,%ymm3,%ymm2
7086 196,226,125,24,26, //vbroadcastss (%rdx),%ymm3
7087 72,173, //lods %ds:(%rsi),%rax
7088 255,224, //jmpq *%rax
// +0x85: partial load. %r8 = (%rcx & 7) - 1 selects an entry in the jump table at +0xf2;
// an out-of-range index (> 6) skips the loads and continues with xmm3 = 0.
7089 65,137,200, //mov %ecx,%r8d
7090 65,128,224,7, //and $0x7,%r8b
7091 197,225,239,219, //vpxor %xmm3,%xmm3,%xmm3
7092 65,254,200, //dec %r8b
7093 69,15,182,192, //movzbl %r8b,%r8d
7094 65,128,248,6, //cmp $0x6,%r8b
7095 15,135,111,255,255,255, //ja 58e <_sk_lerp_565_hsw+0x10>
7096 76,141,13,74,0,0,0, //lea 0x4a(%rip),%r9 # 670 <_sk_lerp_565_hsw+0xf2>
7097 75,99,4,129, //movslq (%r9,%r8,4),%rax
7098 76,1,200, //add %r9,%rax
7099 255,224, //jmpq *%rax
// Fall-through chain: entry i of the table lands on the vpinsrw for word i, so words i..0 are loaded.
7100 197,225,239,219, //vpxor %xmm3,%xmm3,%xmm3
7101 196,193,97,196,92,122,12,6, //vpinsrw $0x6,0xc(%r10,%rdi,2),%xmm3,%xmm3
7102 196,193,97,196,92,122,10,5, //vpinsrw $0x5,0xa(%r10,%rdi,2),%xmm3,%xmm3
7103 196,193,97,196,92,122,8,4, //vpinsrw $0x4,0x8(%r10,%rdi,2),%xmm3,%xmm3
7104 196,193,97,196,92,122,6,3, //vpinsrw $0x3,0x6(%r10,%rdi,2),%xmm3,%xmm3
7105 196,193,97,196,92,122,4,2, //vpinsrw $0x2,0x4(%r10,%rdi,2),%xmm3,%xmm3
7106 196,193,97,196,92,122,2,1, //vpinsrw $0x1,0x2(%r10,%rdi,2),%xmm3,%xmm3
7107 196,193,97,196,28,122,0, //vpinsrw $0x0,(%r10,%rdi,2),%xmm3,%xmm3
7108 233,31,255,255,255, //jmpq 58e <_sk_lerp_565_hsw+0x10>
7109 144, //nop
// +0xf2: jump table, NOT code. Seven little-endian int32 offsets relative to the table's own
// address (-13, -21, -29, -37, -45, -53, -65), read by the movslq above. The disassembly shown
// for these bytes is an artifact of decoding data as instructions.
7110 243,255, //repz (bad)
7111 255, //(bad)
7112 255, //(bad)
7113 235,255, //jmp 675 <_sk_lerp_565_hsw+0xf7>
7114 255, //(bad)
7115 255,227, //jmpq *%rbx
7116 255, //(bad)
7117 255, //(bad)
7118 255, //(bad)
7119 219,255, //(bad)
7120 255, //(bad)
7121 255,211, //callq *%rbx
7122 255, //(bad)
7123 255, //(bad)
7124 255,203, //dec %ebx
7125 255, //(bad)
7126 255, //(bad)
7127 255, //(bad)
7128 191, //.byte 0xbf
7129 255, //(bad)
7130 255, //(bad)
7131 255, //.byte 0xff
7132};
7133
// sk_load_tables_hsw: loads 32-bit values and uses three of their byte fields as indices into
// three float tables. The lods loads a pointer p into %rax; the code reads four 8-byte fields:
//   p[0x00] base address of the 32-bit values (indexed by %rdi*4)
//   p[0x08], p[0x10], p[0x18] base addresses of the tables gathered into ymm0, ymm1, ymm2
// The incoming %rcx is kept in %r8 and restored before the final jump, because %rcx is reused
// as a table pointer and %cl as a shift count.
//   %rcx == 0: load eight 32-bit values with one unaligned load.
//   %rcx != 0: build a lane mask with the low %rcx lanes set and use vpmaskmovd so that only
//              those lanes are read (code at +0x7f), then rejoin the main path at +0x1a.
// Main path, with v the loaded values and m the 32-bit constant at 0x10(%rdx):
//   ymm0 = table0[v & m],  ymm1 = table1[(v >> 8) & m],  ymm2 = table2[(v >> 16) & m]
//   ymm3 = float(v >> 24) * (float at 0xc(%rdx))
// The all-ones registers produced by vpcmpeqd are the gather masks (every lane enabled).
// NOTE(review): m is presumably 0xff and the float presumably 1/255; neither value is visible here.
7134CODE const uint8_t sk_load_tables_hsw[] = {
7135 73,137,200, //mov %rcx,%r8
7136 72,173, //lods %ds:(%rsi),%rax
7137 76,141,12,189,0,0,0,0, //lea 0x0(,%rdi,4),%r9
7138 76,3,8, //add (%rax),%r9
7139 77,133,192, //test %r8,%r8
7140 117,106, //jne 70b <_sk_load_tables_hsw+0x7f>
7141 196,193,126,111,25, //vmovdqu (%r9),%ymm3
// +0x1a: common path for the full and the masked load.
7142 196,226,125,88,82,16, //vpbroadcastd 0x10(%rdx),%ymm2
7143 197,237,219,203, //vpand %ymm3,%ymm2,%ymm1
7144 196,65,61,118,192, //vpcmpeqd %ymm8,%ymm8,%ymm8
7145 72,139,72,8, //mov 0x8(%rax),%rcx
7146 76,139,72,16, //mov 0x10(%rax),%r9
7147 196,65,53,118,201, //vpcmpeqd %ymm9,%ymm9,%ymm9
7148 196,226,53,146,4,137, //vgatherdps %ymm9,(%rcx,%ymm1,4),%ymm0
7149 197,245,114,211,8, //vpsrld $0x8,%ymm3,%ymm1
7150 197,109,219,201, //vpand %ymm1,%ymm2,%ymm9
7151 196,65,45,118,210, //vpcmpeqd %ymm10,%ymm10,%ymm10
7152 196,130,45,146,12,137, //vgatherdps %ymm10,(%r9,%ymm9,4),%ymm1
7153 72,139,64,24, //mov 0x18(%rax),%rax
7154 197,181,114,211,16, //vpsrld $0x10,%ymm3,%ymm9
7155 196,65,109,219,201, //vpand %ymm9,%ymm2,%ymm9
7156 196,162,61,146,20,136, //vgatherdps %ymm8,(%rax,%ymm9,4),%ymm2
7157 197,229,114,211,24, //vpsrld $0x18,%ymm3,%ymm3
7158 197,252,91,219, //vcvtdq2ps %ymm3,%ymm3
7159 196,98,125,24,66,12, //vbroadcastss 0xc(%rdx),%ymm8
7160 196,193,100,89,216, //vmulps %ymm8,%ymm3,%ymm3
7161 72,173, //lods %ds:(%rsi),%rax
7162 76,137,193, //mov %r8,%rcx
7163 255,224, //jmpq *%rax
// +0x7f: masked load. Shifting all-ones right by 8*(8 - count) bits leaves `count` 0xff bytes,
// which vpmovsxbd widens into `count` all-ones lanes.
7164 185,8,0,0,0, //mov $0x8,%ecx
7165 68,41,193, //sub %r8d,%ecx
7166 192,225,3, //shl $0x3,%cl
7167 73,199,194,255,255,255,255, //mov $0xffffffffffffffff,%r10
7168 73,211,234, //shr %cl,%r10
7169 196,193,249,110,194, //vmovq %r10,%xmm0
7170 196,226,125,33,192, //vpmovsxbd %xmm0,%ymm0
7171 196,194,125,140,25, //vpmaskmovd (%r9),%ymm0,%ymm3
7172 233,114,255,255,255, //jmpq 6a6 <_sk_load_tables_hsw+0x1a>
7173};
7174
// sk_load_a8_hsw: loads bytes from memory into ymm3 as scaled floats and zeroes ymm0-ymm2.
// The first lods loads a pointer into %rax; the 8-byte base address stored there plus %rdi is
// the address of the bytes. The incoming %rcx is kept in %r8 and restored before the final jump,
// because the tail loop below uses %cl as its shift count.
//   %rcx == 0: load 8 bytes at once.
//   %rcx != 0: load exactly %rcx bytes one at a time (code at +0x3a), packing byte i into bits
//              8*i.. of %r9 so the remaining bytes read as 0, then rejoin the main path at +0x14.
// Main path: ymm3 = float(byte) * (float at 0xc(%rdx)); ymm0 = ymm1 = ymm2 = 0.
// NOTE(review): the constant at 0xc(%rdx) is presumably 1/255; its value is not visible here.
7175CODE const uint8_t sk_load_a8_hsw[] = {
7176 73,137,200, //mov %rcx,%r8
7177 72,173, //lods %ds:(%rsi),%rax
7178 72,139,0, //mov (%rax),%rax
7179 72,1,248, //add %rdi,%rax
7180 77,133,192, //test %r8,%r8
7181 117,42, //jne 76e <_sk_load_a8_hsw+0x3a>
7182 197,251,16,0, //vmovsd (%rax),%xmm0
// +0x14: common path for the full and the partial load.
7183 196,226,125,49,192, //vpmovzxbd %xmm0,%ymm0
7184 197,252,91,192, //vcvtdq2ps %ymm0,%ymm0
7185 196,226,125,24,74,12, //vbroadcastss 0xc(%rdx),%ymm1
7186 197,252,89,217, //vmulps %ymm1,%ymm0,%ymm3
7187 72,173, //lods %ds:(%rsi),%rax
7188 197,252,87,192, //vxorps %ymm0,%ymm0,%ymm0
7189 197,244,87,201, //vxorps %ymm1,%ymm1,%ymm1
7190 197,236,87,210, //vxorps %ymm2,%ymm2,%ymm2
7191 76,137,193, //mov %r8,%rcx
7192 255,224, //jmpq *%rax
// +0x3a: partial load. %rcx = shift amount, %r10 = bytes remaining, %r9 = accumulated bytes.
7193 49,201, //xor %ecx,%ecx
7194 77,137,194, //mov %r8,%r10
7195 69,49,201, //xor %r9d,%r9d
7196 68,15,182,24, //movzbl (%rax),%r11d
7197 72,255,192, //inc %rax
7198 73,211,227, //shl %cl,%r11
7199 77,9,217, //or %r11,%r9
7200 72,131,193,8, //add $0x8,%rcx
7201 73,255,202, //dec %r10
7202 117,234, //jne 776 <_sk_load_a8_hsw+0x42>
7203 196,193,249,110,193, //vmovq %r9,%xmm0
7204 235,181, //jmp 748 <_sk_load_a8_hsw+0x14>
7205};
7206
// sk_store_a8_hsw: converts ymm3 to bytes and stores them. The first lods loads a pointer into
// %rax; the 8-byte base address stored there plus %rdi is the destination.
// Conversion: ymm3 * (float at 0x8(%rdx)) -> 32-bit integers (vcvtps2dq) -> 16-bit and then
// 8-bit values via the unsigned-saturating packs.
//   %rcx == 0: store all 8 bytes with one vmovsd.
//   %rcx != 0: store only (%rcx & 7) bytes, one vpextrb each (code at +0x33).
// ymm0-ymm3 are not modified; ymm8 and ymm9 are scratch.
// NOTE(review): the constant at 0x8(%rdx) is presumably 255; its value is not visible here.
// Ends by jumping to the next address loaded from (%rsi).
7207CODE const uint8_t sk_store_a8_hsw[] = {
7208 72,173, //lods %ds:(%rsi),%rax
7209 76,139,8, //mov (%rax),%r9
7210 196,98,125,24,66,8, //vbroadcastss 0x8(%rdx),%ymm8
7211 197,60,89,195, //vmulps %ymm3,%ymm8,%ymm8
7212 196,65,125,91,192, //vcvtps2dq %ymm8,%ymm8
7213 196,67,125,25,193,1, //vextractf128 $0x1,%ymm8,%xmm9
7214 196,66,57,43,193, //vpackusdw %xmm9,%xmm8,%xmm8
7215 196,65,57,103,192, //vpackuswb %xmm8,%xmm8,%xmm8
7216 72,133,201, //test %rcx,%rcx
7217 117,10, //jne 7c6 <_sk_store_a8_hsw+0x33>
7218 196,65,123,17,4,57, //vmovsd %xmm8,(%r9,%rdi,1)
// +0x2f: common exit.
7219 72,173, //lods %ds:(%rsi),%rax
7220 255,224, //jmpq *%rax
// +0x33: partial store. %r8 = (%rcx & 7) - 1 selects an entry in the jump table at +0x91;
// an out-of-range index (> 6) stores nothing.
7221 137,200, //mov %ecx,%eax
7222 36,7, //and $0x7,%al
7223 254,200, //dec %al
7224 68,15,182,192, //movzbl %al,%r8d
7225 65,128,248,6, //cmp $0x6,%r8b
7226 119,236, //ja 7c2 <_sk_store_a8_hsw+0x2f>
// Widen bytes to words, so result byte i sits at byte index 2*i for the vpextrb chain below.
7227 196,66,121,48,192, //vpmovzxbw %xmm8,%xmm8
7228 76,141,21,66,0,0,0, //lea 0x42(%rip),%r10 # 824 <_sk_store_a8_hsw+0x91>
7229 75,99,4,130, //movslq (%r10,%r8,4),%rax
7230 76,1,208, //add %r10,%rax
7231 255,224, //jmpq *%rax
// Fall-through chain: entry i of the table lands on the store of byte i, so bytes i..0 are written.
7232 196,67,121,20,68,57,6,12, //vpextrb $0xc,%xmm8,0x6(%r9,%rdi,1)
7233 196,67,121,20,68,57,5,10, //vpextrb $0xa,%xmm8,0x5(%r9,%rdi,1)
7234 196,67,121,20,68,57,4,8, //vpextrb $0x8,%xmm8,0x4(%r9,%rdi,1)
7235 196,67,121,20,68,57,3,6, //vpextrb $0x6,%xmm8,0x3(%r9,%rdi,1)
7236 196,67,121,20,68,57,2,4, //vpextrb $0x4,%xmm8,0x2(%r9,%rdi,1)
7237 196,67,121,20,68,57,1,2, //vpextrb $0x2,%xmm8,0x1(%r9,%rdi,1)
7238 196,67,121,20,4,57,0, //vpextrb $0x0,%xmm8,(%r9,%rdi,1)
7239 235,158, //jmp 7c2 <_sk_store_a8_hsw+0x2f>
// +0x91: jump table, NOT code. Seven little-endian int32 offsets relative to the table's own
// address (-9, -17, -25, -33, -41, -49, -57), read by the movslq above. The disassembly shown
// for these bytes is an artifact of decoding data as instructions.
7240 247,255, //idiv %edi
7241 255, //(bad)
7242 255, //(bad)
7243 239, //out %eax,(%dx)
7244 255, //(bad)
7245 255, //(bad)
7246 255,231, //jmpq *%rdi
7247 255, //(bad)
7248 255, //(bad)
7249 255, //(bad)
7250 223,255, //(bad)
7251 255, //(bad)
7252 255,215, //callq *%rdi
7253 255, //(bad)
7254 255, //(bad)
7255 255,207, //dec %edi
7256 255, //(bad)
7257 255, //(bad)
7258 255,199, //inc %edi
7259 255, //(bad)
7260 255, //(bad)
7261 255, //.byte 0xff
7262};
7263
// sk_load_565_hsw: loads 16-bit values and unpacks three fields into ymm0, ymm1, ymm2 as floats.
// The lods loads a pointer into %rax; the base address stored there, indexed by %rdi*2, is the
// address of the 16-bit values.
//   %rcx == 0: load eight 16-bit values at once.
//   %rcx != 0: load only (%rcx & 7) values, one vpinsrw each, leaving the other lanes 0
//              (code at +0x66), then rejoin the main path at +0x10.
// Main path, with v the values zero-extended to 32 bits and k[off] the constant at off(%rdx):
//   ymm0 = float(v & k[0x68]) * k[0x74]
//   ymm1 = float(v & k[0x6c]) * k[0x78]
//   ymm2 = float(v & k[0x70]) * k[0x7c]
//   ymm3 = k[0x0]
// The constants' values (presumably the 5/6/5-bit field masks and their scale factors) are not
// visible here. Ends by jumping to the next address loaded from (%rsi).
7264CODE const uint8_t sk_load_565_hsw[] = {
7265 72,173, //lods %ds:(%rsi),%rax
7266 76,139,16, //mov (%rax),%r10
7267 72,133,201, //test %rcx,%rcx
7268 117,92, //jne 8a6 <_sk_load_565_hsw+0x66>
7269 196,193,122,111,4,122, //vmovdqu (%r10,%rdi,2),%xmm0
// +0x10: common path for the full and the partial load.
7270 196,226,125,51,208, //vpmovzxwd %xmm0,%ymm2
7271 196,226,125,88,66,104, //vpbroadcastd 0x68(%rdx),%ymm0
7272 197,253,219,194, //vpand %ymm2,%ymm0,%ymm0
7273 197,252,91,192, //vcvtdq2ps %ymm0,%ymm0
7274 196,226,125,24,74,116, //vbroadcastss 0x74(%rdx),%ymm1
7275 197,244,89,192, //vmulps %ymm0,%ymm1,%ymm0
7276 196,226,125,88,74,108, //vpbroadcastd 0x6c(%rdx),%ymm1
7277 197,245,219,202, //vpand %ymm2,%ymm1,%ymm1
7278 197,252,91,201, //vcvtdq2ps %ymm1,%ymm1
7279 196,226,125,24,90,120, //vbroadcastss 0x78(%rdx),%ymm3
7280 197,228,89,201, //vmulps %ymm1,%ymm3,%ymm1
7281 196,226,125,88,90,112, //vpbroadcastd 0x70(%rdx),%ymm3
7282 197,229,219,210, //vpand %ymm2,%ymm3,%ymm2
7283 197,252,91,210, //vcvtdq2ps %ymm2,%ymm2
7284 196,226,125,24,90,124, //vbroadcastss 0x7c(%rdx),%ymm3
7285 197,228,89,210, //vmulps %ymm2,%ymm3,%ymm2
7286 196,226,125,24,26, //vbroadcastss (%rdx),%ymm3
7287 72,173, //lods %ds:(%rsi),%rax
7288 255,224, //jmpq *%rax
// +0x66: partial load. %r8 = (%rcx & 7) - 1 selects an entry in the jump table at +0xd0;
// an out-of-range index (> 6) skips the loads and continues with xmm0 = 0.
7289 65,137,200, //mov %ecx,%r8d
7290 65,128,224,7, //and $0x7,%r8b
7291 197,249,239,192, //vpxor %xmm0,%xmm0,%xmm0
7292 65,254,200, //dec %r8b
7293 69,15,182,192, //movzbl %r8b,%r8d
7294 65,128,248,6, //cmp $0x6,%r8b
7295 119,146, //ja 850 <_sk_load_565_hsw+0x10>
7296 76,141,13,75,0,0,0, //lea 0x4b(%rip),%r9 # 910 <_sk_load_565_hsw+0xd0>
7297 75,99,4,129, //movslq (%r9,%r8,4),%rax
7298 76,1,200, //add %r9,%rax
7299 255,224, //jmpq *%rax
// Fall-through chain: entry i of the table lands on the vpinsrw for word i, so words i..0 are loaded.
7300 197,249,239,192, //vpxor %xmm0,%xmm0,%xmm0
7301 196,193,121,196,68,122,12,6, //vpinsrw $0x6,0xc(%r10,%rdi,2),%xmm0,%xmm0
7302 196,193,121,196,68,122,10,5, //vpinsrw $0x5,0xa(%r10,%rdi,2),%xmm0,%xmm0
7303 196,193,121,196,68,122,8,4, //vpinsrw $0x4,0x8(%r10,%rdi,2),%xmm0,%xmm0
7304 196,193,121,196,68,122,6,3, //vpinsrw $0x3,0x6(%r10,%rdi,2),%xmm0,%xmm0
7305 196,193,121,196,68,122,4,2, //vpinsrw $0x2,0x4(%r10,%rdi,2),%xmm0,%xmm0
7306 196,193,121,196,68,122,2,1, //vpinsrw $0x1,0x2(%r10,%rdi,2),%xmm0,%xmm0
7307 196,193,121,196,4,122,0, //vpinsrw $0x0,(%r10,%rdi,2),%xmm0,%xmm0
7308 233,66,255,255,255, //jmpq 850 <_sk_load_565_hsw+0x10>
7309 102,144, //xchg %ax,%ax
// +0xd0: jump table, NOT code. Seven little-endian int32 offsets relative to the table's own
// address (-14, -22, -30, -38, -46, -54, -66), read by the movslq above. The disassembly shown
// for these bytes is an artifact of decoding data as instructions.
7310 242,255, //repnz (bad)
7311 255, //(bad)
7312 255, //(bad)
7313 234, //(bad)
7314 255, //(bad)
7315 255, //(bad)
7316 255,226, //jmpq *%rdx
7317 255, //(bad)
7318 255, //(bad)
7319 255, //(bad)
7320 218,255, //(bad)
7321 255, //(bad)
7322 255,210, //callq *%rdx
7323 255, //(bad)
7324 255, //(bad)
7325 255,202, //dec %edx
7326 255, //(bad)
7327 255, //(bad)
7328 255, //(bad)
7329 190, //.byte 0xbe
7330 255, //(bad)
7331 255, //(bad)
7332 255, //.byte 0xff
7333};
7334
// sk_store_565_hsw: raw x86-64 machine code; the trailing comments are its AT&T-syntax disassembly.
// Multiplies ymm0 and ymm2 by the float broadcast from 0x80(%rdx) and ymm1 by the one from
// 0x84(%rdx), converts each to int32, combines them as (ymm0 << 11) | (ymm1 << 5) | ymm2,
// narrows the eight results to 16 bits and writes them to (%r9,%rdi,2), where r9 is read through
// the pointer fetched from the (%rsi) stream. When rcx is non-zero only (rcx & 7) values are written.
// NOTE(review): presumably ymm0..2 are r,g,b, rdi is the pixel index and rcx the tail count -- confirm.
7335CODE const uint8_t sk_store_565_hsw[] = {
7336 72,173, //lods %ds:(%rsi),%rax
7337 76,139,8, //mov (%rax),%r9
7338 196,98,125,24,130,128,0,0,0, //vbroadcastss 0x80(%rdx),%ymm8
7339 197,60,89,200, //vmulps %ymm0,%ymm8,%ymm9
7340 196,65,125,91,201, //vcvtps2dq %ymm9,%ymm9
7341 196,193,53,114,241,11, //vpslld $0xb,%ymm9,%ymm9
7342 196,98,125,24,146,132,0,0,0, //vbroadcastss 0x84(%rdx),%ymm10
7343 197,44,89,209, //vmulps %ymm1,%ymm10,%ymm10
7344 196,65,125,91,210, //vcvtps2dq %ymm10,%ymm10
7345 196,193,45,114,242,5, //vpslld $0x5,%ymm10,%ymm10
7346 196,65,45,235,201, //vpor %ymm9,%ymm10,%ymm9
7347 197,60,89,194, //vmulps %ymm2,%ymm8,%ymm8
7348 196,65,125,91,192, //vcvtps2dq %ymm8,%ymm8
7349 196,65,53,235,192, //vpor %ymm8,%ymm9,%ymm8
7350 196,67,125,57,193,1, //vextracti128 $0x1,%ymm8,%xmm9
7351 196,66,57,43,193, //vpackusdw %xmm9,%xmm8,%xmm8
// rcx == 0: store all eight 16-bit values, then fetch the next stream entry and jump to it.
7352 72,133,201, //test %rcx,%rcx
7353 117,10, //jne 98e <_sk_store_565_hsw+0x62>
7354 196,65,122,127,4,121, //vmovdqu %xmm8,(%r9,%rdi,2)
7355 72,173, //lods %ds:(%rsi),%rax
7356 255,224, //jmpq *%rax
// Partial store: table index = (rcx & 7) - 1; an index above 6 goes straight to the exit at +0x5e.
7357 137,200, //mov %ecx,%eax
7358 36,7, //and $0x7,%al
7359 254,200, //dec %al
7360 68,15,182,192, //movzbl %al,%r8d
7361 65,128,248,6, //cmp $0x6,%r8b
7362 119,236, //ja 98a <_sk_store_565_hsw+0x5e>
7363 76,141,21,71,0,0,0, //lea 0x47(%rip),%r10 # 9ec <_sk_store_565_hsw+0xc0>
7364 75,99,4,130, //movslq (%r10,%r8,4),%rax
7365 76,1,208, //add %r10,%rax
7366 255,224, //jmpq *%rax
// Jump-table targets: entry i lands on the store of element i and falls through to elements i-1..0.
7367 196,67,121,21,68,121,12,6, //vpextrw $0x6,%xmm8,0xc(%r9,%rdi,2)
7368 196,67,121,21,68,121,10,5, //vpextrw $0x5,%xmm8,0xa(%r9,%rdi,2)
7369 196,67,121,21,68,121,8,4, //vpextrw $0x4,%xmm8,0x8(%r9,%rdi,2)
7370 196,67,121,21,68,121,6,3, //vpextrw $0x3,%xmm8,0x6(%r9,%rdi,2)
7371 196,67,121,21,68,121,4,2, //vpextrw $0x2,%xmm8,0x4(%r9,%rdi,2)
7372 196,67,121,21,68,121,2,1, //vpextrw $0x1,%xmm8,0x2(%r9,%rdi,2)
7373 197,121,126,192, //vmovd %xmm8,%eax
7374 102,65,137,4,121, //mov %ax,(%r9,%rdi,2)
7375 235,161, //jmp 98a <_sk_store_565_hsw+0x5e>
// Padding, then the jump table at +0xc0: seven signed 32-bit little-endian offsets relative to
// the table base. These bytes are data; the "(bad)" decodings below are disassembler noise.
7376 15,31,0, //nopl (%rax)
7377 242,255, //repnz (bad)
7378 255, //(bad)
7379 255, //(bad)
7380 234, //(bad)
7381 255, //(bad)
7382 255, //(bad)
7383 255,226, //jmpq *%rdx
7384 255, //(bad)
7385 255, //(bad)
7386 255, //(bad)
7387 218,255, //(bad)
7388 255, //(bad)
7389 255,210, //callq *%rdx
7390 255, //(bad)
7391 255, //(bad)
7392 255,202, //dec %edx
7393 255, //(bad)
7394 255, //(bad)
7395 255,194, //inc %edx
7396 255, //(bad)
7397 255, //(bad)
7398 255, //.byte 0xff
7399};
7400
// sk_load_8888_hsw: raw x86-64 machine code; the trailing comments are its AT&T-syntax disassembly.
// Loads eight 32-bit values from r9 = rdi*4 + (pointer read through the (%rsi) stream entry) and
// splits each into four fields: bits 0-7 -> ymm0, 8-15 -> ymm1, 16-23 -> ymm2, 24-31 -> ymm3.
// The first three are masked with the dword broadcast from 0x10(%rdx); all four are converted to
// float and multiplied by the float broadcast from 0xc(%rdx).
// rcx is parked in r8 for the duration and restored before jumping to the next stream entry.
// NOTE(review): presumably rdi is the pixel index and rcx the tail count -- confirm.
7401CODE const uint8_t sk_load_8888_hsw[] = {
7402 73,137,200, //mov %rcx,%r8
7403 72,173, //lods %ds:(%rsi),%rax
7404 76,141,12,189,0,0,0,0, //lea 0x0(,%rdi,4),%r9
7405 76,3,8, //add (%rax),%r9
7406 77,133,192, //test %r8,%r8
7407 117,85, //jne a72 <_sk_load_8888_hsw+0x6a>
7408 196,193,126,111,25, //vmovdqu (%r9),%ymm3
// +0x1a: shared unpack path, also entered from the masked load below.
7409 196,226,125,88,82,16, //vpbroadcastd 0x10(%rdx),%ymm2
7410 197,237,219,195, //vpand %ymm3,%ymm2,%ymm0
7411 197,252,91,192, //vcvtdq2ps %ymm0,%ymm0
7412 196,98,125,24,66,12, //vbroadcastss 0xc(%rdx),%ymm8
7413 197,188,89,192, //vmulps %ymm0,%ymm8,%ymm0
7414 197,245,114,211,8, //vpsrld $0x8,%ymm3,%ymm1
7415 197,237,219,201, //vpand %ymm1,%ymm2,%ymm1
7416 197,252,91,201, //vcvtdq2ps %ymm1,%ymm1
7417 197,188,89,201, //vmulps %ymm1,%ymm8,%ymm1
7418 197,181,114,211,16, //vpsrld $0x10,%ymm3,%ymm9
7419 196,193,109,219,209, //vpand %ymm9,%ymm2,%ymm2
7420 197,252,91,210, //vcvtdq2ps %ymm2,%ymm2
7421 197,188,89,210, //vmulps %ymm2,%ymm8,%ymm2
7422 197,229,114,211,24, //vpsrld $0x18,%ymm3,%ymm3
7423 197,252,91,219, //vcvtdq2ps %ymm3,%ymm3
7424 196,193,100,89,216, //vmulps %ymm8,%ymm3,%ymm3
7425 72,173, //lods %ds:(%rsi),%rax
7426 76,137,193, //mov %r8,%rcx
7427 255,224, //jmpq *%rax
// r8 != 0: build a lane mask. rax = ~0 >> (8*(8 - r8)) keeps r8 low bytes of 0xff; sign-extending
// each byte to a dword gives all-ones in the low r8 lanes, which drives the masked load.
7428 185,8,0,0,0, //mov $0x8,%ecx
7429 68,41,193, //sub %r8d,%ecx
7430 192,225,3, //shl $0x3,%cl
7431 72,199,192,255,255,255,255, //mov $0xffffffffffffffff,%rax
7432 72,211,232, //shr %cl,%rax
7433 196,225,249,110,192, //vmovq %rax,%xmm0
7434 196,226,125,33,192, //vpmovsxbd %xmm0,%ymm0
7435 196,194,125,140,25, //vpmaskmovd (%r9),%ymm0,%ymm3
7436 235,138, //jmp a22 <_sk_load_8888_hsw+0x1a>
7437};
7438
// sk_store_8888_hsw: raw x86-64 machine code; the trailing comments are its AT&T-syntax disassembly.
// Multiplies ymm0..ymm3 by the float broadcast from 0x8(%rdx), converts each to int32 and packs
// them as ymm0 | (ymm1 << 8) | (ymm2 << 16) | (ymm3 << 24). The eight packed dwords are written
// to r9 = rdi*4 + (pointer read through the (%rsi) stream entry).
// rcx is parked in r8 for the duration and restored before jumping to the next stream entry.
// NOTE(review): presumably rdi is the pixel index and rcx the tail count -- confirm.
7439CODE const uint8_t sk_store_8888_hsw[] = {
7440 73,137,200, //mov %rcx,%r8
7441 72,173, //lods %ds:(%rsi),%rax
7442 76,141,12,189,0,0,0,0, //lea 0x0(,%rdi,4),%r9
7443 76,3,8, //add (%rax),%r9
7444 196,98,125,24,66,8, //vbroadcastss 0x8(%rdx),%ymm8
7445 197,60,89,200, //vmulps %ymm0,%ymm8,%ymm9
7446 196,65,125,91,201, //vcvtps2dq %ymm9,%ymm9
7447 197,60,89,209, //vmulps %ymm1,%ymm8,%ymm10
7448 196,65,125,91,210, //vcvtps2dq %ymm10,%ymm10
7449 196,193,45,114,242,8, //vpslld $0x8,%ymm10,%ymm10
7450 196,65,45,235,201, //vpor %ymm9,%ymm10,%ymm9
7451 197,60,89,210, //vmulps %ymm2,%ymm8,%ymm10
7452 196,65,125,91,210, //vcvtps2dq %ymm10,%ymm10
7453 196,193,45,114,242,16, //vpslld $0x10,%ymm10,%ymm10
7454 197,60,89,195, //vmulps %ymm3,%ymm8,%ymm8
7455 196,65,125,91,192, //vcvtps2dq %ymm8,%ymm8
7456 196,193,61,114,240,24, //vpslld $0x18,%ymm8,%ymm8
7457 196,65,45,235,192, //vpor %ymm8,%ymm10,%ymm8
7458 196,65,53,235,192, //vpor %ymm8,%ymm9,%ymm8
// r8 == 0: plain 32-byte store. Otherwise take the masked-store path below.
7459 77,133,192, //test %r8,%r8
7460 117,12, //jne b04 <_sk_store_8888_hsw+0x6c>
7461 196,65,126,127,1, //vmovdqu %ymm8,(%r9)
// +0x65: common exit, also reached from the masked store.
7462 72,173, //lods %ds:(%rsi),%rax
7463 76,137,193, //mov %r8,%rcx
7464 255,224, //jmpq *%rax
// Lane mask: rax = ~0 >> (8*(8 - r8)) keeps r8 low bytes of 0xff; sign-extending each byte to a
// dword yields all-ones in the low r8 lanes, so only those dwords are written.
7465 185,8,0,0,0, //mov $0x8,%ecx
7466 68,41,193, //sub %r8d,%ecx
7467 192,225,3, //shl $0x3,%cl
7468 72,199,192,255,255,255,255, //mov $0xffffffffffffffff,%rax
7469 72,211,232, //shr %cl,%rax
7470 196,97,249,110,200, //vmovq %rax,%xmm9
7471 196,66,125,33,201, //vpmovsxbd %xmm9,%ymm9
7472 196,66,53,142,1, //vpmaskmovd %ymm8,%ymm9,(%r9)
7473 235,211, //jmp afd <_sk_store_8888_hsw+0x65>
7474};
7475
// sk_load_f16_hsw: raw x86-64 machine code; the trailing comments are its AT&T-syntax disassembly.
// Reads 8-byte elements starting at (%rax,%rdi,8), where rax is loaded through the pointer taken
// from the (%rsi) stream. Each element holds four 16-bit values. The unpack sequence transposes
// them so that the first/second/third/fourth value of every element ends up together, and
// vcvtph2ps widens those half floats into ymm0/ymm1/ymm2/ymm3.
// rcx == 0 loads eight elements (64 bytes); otherwise the code after the first jmpq loads rcx
// elements one at a time, zero-fills the rest, and re-enters the unpack sequence at +0x21.
// NOTE(review): presumably rcx is a tail count in 1..7 -- confirm against the caller.
7476CODE const uint8_t sk_load_f16_hsw[] = {
7477 72,173, //lods %ds:(%rsi),%rax
7478 72,139,0, //mov (%rax),%rax
7479 72,133,201, //test %rcx,%rcx
7480 117,97, //jne b95 <_sk_load_f16_hsw+0x6b>
7481 197,249,16,12,248, //vmovupd (%rax,%rdi,8),%xmm1
7482 197,249,16,84,248,16, //vmovupd 0x10(%rax,%rdi,8),%xmm2
7483 197,249,16,92,248,32, //vmovupd 0x20(%rax,%rdi,8),%xmm3
7484 197,121,16,68,248,48, //vmovupd 0x30(%rax,%rdi,8),%xmm8
// +0x21: transpose; expects element pairs in xmm1, xmm2, xmm3, xmm8.
7485 197,241,97,194, //vpunpcklwd %xmm2,%xmm1,%xmm0
7486 197,241,105,202, //vpunpckhwd %xmm2,%xmm1,%xmm1
7487 196,193,97,97,208, //vpunpcklwd %xmm8,%xmm3,%xmm2
7488 196,193,97,105,216, //vpunpckhwd %xmm8,%xmm3,%xmm3
7489 197,121,97,193, //vpunpcklwd %xmm1,%xmm0,%xmm8
7490 197,121,105,201, //vpunpckhwd %xmm1,%xmm0,%xmm9
7491 197,233,97,203, //vpunpcklwd %xmm3,%xmm2,%xmm1
7492 197,233,105,219, //vpunpckhwd %xmm3,%xmm2,%xmm3
7493 197,185,108,193, //vpunpcklqdq %xmm1,%xmm8,%xmm0
7494 196,226,125,19,192, //vcvtph2ps %xmm0,%ymm0
7495 197,185,109,201, //vpunpckhqdq %xmm1,%xmm8,%xmm1
7496 196,226,125,19,201, //vcvtph2ps %xmm1,%ymm1
7497 197,177,108,211, //vpunpcklqdq %xmm3,%xmm9,%xmm2
7498 196,226,125,19,210, //vcvtph2ps %xmm2,%ymm2
7499 197,177,109,219, //vpunpckhqdq %xmm3,%xmm9,%xmm3
7500 196,226,125,19,219, //vcvtph2ps %xmm3,%ymm3
7501 72,173, //lods %ds:(%rsi),%rax
7502 255,224, //jmpq *%rax
// Partial load (rcx != 0): element 0 always, then elements 1..6 as rcx allows. vmovsd/vmovq fill
// the low half of a register and clear the high half; vmovhpd adds the high half.
7503 197,251,16,12,248, //vmovsd (%rax,%rdi,8),%xmm1
7504 196,65,57,87,192, //vxorpd %xmm8,%xmm8,%xmm8
7505 72,131,249,1, //cmp $0x1,%rcx
7506 117,6, //jne bab <_sk_load_f16_hsw+0x81>
7507 197,250,126,201, //vmovq %xmm1,%xmm1
7508 235,30, //jmp bc9 <_sk_load_f16_hsw+0x9f>
7509 197,241,22,76,248,8, //vmovhpd 0x8(%rax,%rdi,8),%xmm1,%xmm1
7510 72,131,249,3, //cmp $0x3,%rcx
7511 114,18, //jb bc9 <_sk_load_f16_hsw+0x9f>
7512 197,251,16,84,248,16, //vmovsd 0x10(%rax,%rdi,8),%xmm2
7513 72,131,249,3, //cmp $0x3,%rcx
7514 117,19, //jne bd6 <_sk_load_f16_hsw+0xac>
7515 197,250,126,210, //vmovq %xmm2,%xmm2
7516 235,46, //jmp bf7 <_sk_load_f16_hsw+0xcd>
// +0x9f: fewer than three elements loaded -- zero xmm3 and xmm2 before unpacking.
7517 197,225,87,219, //vxorpd %xmm3,%xmm3,%xmm3
7518 197,233,87,210, //vxorpd %xmm2,%xmm2,%xmm2
7519 233,117,255,255,255, //jmpq b4b <_sk_load_f16_hsw+0x21>
7520 197,233,22,84,248,24, //vmovhpd 0x18(%rax,%rdi,8),%xmm2,%xmm2
7521 72,131,249,5, //cmp $0x5,%rcx
7522 114,21, //jb bf7 <_sk_load_f16_hsw+0xcd>
7523 197,251,16,92,248,32, //vmovsd 0x20(%rax,%rdi,8),%xmm3
7524 72,131,249,5, //cmp $0x5,%rcx
7525 117,18, //jne c00 <_sk_load_f16_hsw+0xd6>
7526 197,250,126,219, //vmovq %xmm3,%xmm3
7527 233,84,255,255,255, //jmpq b4b <_sk_load_f16_hsw+0x21>
// +0xcd: fewer than five elements loaded -- zero xmm3 before unpacking.
7528 197,225,87,219, //vxorpd %xmm3,%xmm3,%xmm3
7529 233,75,255,255,255, //jmpq b4b <_sk_load_f16_hsw+0x21>
7530 197,225,22,92,248,40, //vmovhpd 0x28(%rax,%rdi,8),%xmm3,%xmm3
7531 72,131,249,7, //cmp $0x7,%rcx
7532 15,130,59,255,255,255, //jb b4b <_sk_load_f16_hsw+0x21>
7533 197,123,16,68,248,48, //vmovsd 0x30(%rax,%rdi,8),%xmm8
7534 233,48,255,255,255, //jmpq b4b <_sk_load_f16_hsw+0x21>
7535};
7536
// sk_store_f16_hsw: raw x86-64 machine code; the trailing comments are its AT&T-syntax disassembly.
// Narrows ymm0..ymm3 to half floats with vcvtps2ph (immediate 4 selects the MXCSR rounding mode),
// interleaves them so each 8-byte element holds one value from each of the four registers, and
// writes the elements at (%rax,%rdi,8), where rax is loaded through the pointer taken from the
// (%rsi) stream. rcx == 0 writes eight elements (64 bytes); otherwise rcx elements are written
// one at a time.
// NOTE(review): presumably rcx is a tail count in 1..7 -- confirm against the caller.
7537CODE const uint8_t sk_store_f16_hsw[] = {
7538 72,173, //lods %ds:(%rsi),%rax
7539 72,139,0, //mov (%rax),%rax
7540 196,195,125,29,192,4, //vcvtps2ph $0x4,%ymm0,%xmm8
7541 196,195,125,29,201,4, //vcvtps2ph $0x4,%ymm1,%xmm9
7542 196,195,125,29,210,4, //vcvtps2ph $0x4,%ymm2,%xmm10
7543 196,195,125,29,219,4, //vcvtps2ph $0x4,%ymm3,%xmm11
7544 196,65,57,97,225, //vpunpcklwd %xmm9,%xmm8,%xmm12
7545 196,65,57,105,193, //vpunpckhwd %xmm9,%xmm8,%xmm8
7546 196,65,41,97,203, //vpunpcklwd %xmm11,%xmm10,%xmm9
7547 196,65,41,105,235, //vpunpckhwd %xmm11,%xmm10,%xmm13
7548 196,65,25,98,217, //vpunpckldq %xmm9,%xmm12,%xmm11
7549 196,65,25,106,209, //vpunpckhdq %xmm9,%xmm12,%xmm10
7550 196,65,57,98,205, //vpunpckldq %xmm13,%xmm8,%xmm9
7551 196,65,57,106,197, //vpunpckhdq %xmm13,%xmm8,%xmm8
// After the interleave: xmm11 = elements 0-1, xmm10 = 2-3, xmm9 = 4-5, xmm8 = 6-7.
7552 72,133,201, //test %rcx,%rcx
7553 117,27, //jne c80 <_sk_store_f16_hsw+0x65>
7554 197,120,17,28,248, //vmovups %xmm11,(%rax,%rdi,8)
7555 197,120,17,84,248,16, //vmovups %xmm10,0x10(%rax,%rdi,8)
7556 197,120,17,76,248,32, //vmovups %xmm9,0x20(%rax,%rdi,8)
7557 197,122,127,68,248,48, //vmovdqu %xmm8,0x30(%rax,%rdi,8)
// +0x61: common exit.
7558 72,173, //lods %ds:(%rsi),%rax
7559 255,224, //jmpq *%rax
// Partial store. Each "je" that directly follows a vmovq reuses the flags of the preceding cmp:
// the vector store in between does not modify flags.
7560 197,121,214,28,248, //vmovq %xmm11,(%rax,%rdi,8)
7561 72,131,249,1, //cmp $0x1,%rcx
7562 116,241, //je c7c <_sk_store_f16_hsw+0x61>
7563 197,121,23,92,248,8, //vmovhpd %xmm11,0x8(%rax,%rdi,8)
7564 72,131,249,3, //cmp $0x3,%rcx
7565 114,229, //jb c7c <_sk_store_f16_hsw+0x61>
7566 197,121,214,84,248,16, //vmovq %xmm10,0x10(%rax,%rdi,8)
7567 116,221, //je c7c <_sk_store_f16_hsw+0x61>
7568 197,121,23,84,248,24, //vmovhpd %xmm10,0x18(%rax,%rdi,8)
7569 72,131,249,5, //cmp $0x5,%rcx
7570 114,209, //jb c7c <_sk_store_f16_hsw+0x61>
7571 197,121,214,76,248,32, //vmovq %xmm9,0x20(%rax,%rdi,8)
7572 116,201, //je c7c <_sk_store_f16_hsw+0x61>
7573 197,121,23,76,248,40, //vmovhpd %xmm9,0x28(%rax,%rdi,8)
7574 72,131,249,7, //cmp $0x7,%rcx
7575 114,189, //jb c7c <_sk_store_f16_hsw+0x61>
7576 197,121,214,68,248,48, //vmovq %xmm8,0x30(%rax,%rdi,8)
7577 235,181, //jmp c7c <_sk_store_f16_hsw+0x61>
7578};
7579
// sk_store_f32_hsw: raw x86-64 machine code; the trailing comments are its AT&T-syntax disassembly.
// Transposes ymm0..ymm3 so that each 16-byte element holds one float from each register, in that
// order, and writes the elements at (%r8,%rax,4). r8 is loaded through the pointer taken from the
// (%rsi) stream and rax = rdi*4, so the byte offset is rdi*16.
// rcx == 0 writes eight elements (128 bytes); otherwise rcx elements are written one at a time.
// NOTE(review): presumably rcx is a tail count in 1..7 -- confirm against the caller.
7580CODE const uint8_t sk_store_f32_hsw[] = {
7581 72,173, //lods %ds:(%rsi),%rax
7582 76,139,0, //mov (%rax),%r8
7583 72,141,4,189,0,0,0,0, //lea 0x0(,%rdi,4),%rax
7584 197,124,20,193, //vunpcklps %ymm1,%ymm0,%ymm8
7585 197,124,21,217, //vunpckhps %ymm1,%ymm0,%ymm11
7586 197,108,20,203, //vunpcklps %ymm3,%ymm2,%ymm9
7587 197,108,21,227, //vunpckhps %ymm3,%ymm2,%ymm12
7588 196,65,61,20,209, //vunpcklpd %ymm9,%ymm8,%ymm10
7589 196,65,61,21,201, //vunpckhpd %ymm9,%ymm8,%ymm9
7590 196,65,37,20,196, //vunpcklpd %ymm12,%ymm11,%ymm8
7591 196,65,37,21,220, //vunpckhpd %ymm12,%ymm11,%ymm11
// Unpacks work per 128-bit lane: ymm10/ymm9/ymm8/ymm11 hold elements 0/1/2/3 in their low lanes
// and elements 4/5/6/7 in their high lanes.
7592 72,133,201, //test %rcx,%rcx
7593 117,55, //jne d34 <_sk_store_f32_hsw+0x6d>
7594 196,67,45,24,225,1, //vinsertf128 $0x1,%xmm9,%ymm10,%ymm12
7595 196,67,61,24,235,1, //vinsertf128 $0x1,%xmm11,%ymm8,%ymm13
7596 196,67,45,6,201,49, //vperm2f128 $0x31,%ymm9,%ymm10,%ymm9
7597 196,67,61,6,195,49, //vperm2f128 $0x31,%ymm11,%ymm8,%ymm8
7598 196,65,125,17,36,128, //vmovupd %ymm12,(%r8,%rax,4)
7599 196,65,125,17,108,128,32, //vmovupd %ymm13,0x20(%r8,%rax,4)
7600 196,65,125,17,76,128,64, //vmovupd %ymm9,0x40(%r8,%rax,4)
7601 196,65,125,17,68,128,96, //vmovupd %ymm8,0x60(%r8,%rax,4)
// +0x69: common exit.
7602 72,173, //lods %ds:(%rsi),%rax
7603 255,224, //jmpq *%rax
// Partial store. Each "je" that directly follows a vector store reuses the flags of the
// preceding cmp: the store in between does not modify flags.
7604 196,65,121,17,20,128, //vmovupd %xmm10,(%r8,%rax,4)
7605 72,131,249,1, //cmp $0x1,%rcx
7606 116,240, //je d30 <_sk_store_f32_hsw+0x69>
7607 196,65,121,17,76,128,16, //vmovupd %xmm9,0x10(%r8,%rax,4)
7608 72,131,249,3, //cmp $0x3,%rcx
7609 114,227, //jb d30 <_sk_store_f32_hsw+0x69>
7610 196,65,121,17,68,128,32, //vmovupd %xmm8,0x20(%r8,%rax,4)
7611 116,218, //je d30 <_sk_store_f32_hsw+0x69>
7612 196,65,121,17,92,128,48, //vmovupd %xmm11,0x30(%r8,%rax,4)
7613 72,131,249,5, //cmp $0x5,%rcx
7614 114,205, //jb d30 <_sk_store_f32_hsw+0x69>
7615 196,67,125,25,84,128,64,1, //vextractf128 $0x1,%ymm10,0x40(%r8,%rax,4)
7616 116,195, //je d30 <_sk_store_f32_hsw+0x69>
7617 196,67,125,25,76,128,80,1, //vextractf128 $0x1,%ymm9,0x50(%r8,%rax,4)
7618 72,131,249,7, //cmp $0x7,%rcx
7619 114,181, //jb d30 <_sk_store_f32_hsw+0x69>
7620 196,67,125,25,68,128,96,1, //vextractf128 $0x1,%ymm8,0x60(%r8,%rax,4)
7621 235,171, //jmp d30 <_sk_store_f32_hsw+0x69>
7622};
7623
// sk_clamp_x_hsw: raw x86-64 machine code; the trailing comments are its AT&T-syntax disassembly.
// ymm0 = min(max(0, ymm0), L'), where L' is the dword at the address taken from the (%rsi) stream
// with 1 subtracted from its integer bit pattern (all-ones from vpcmpeqd is added as -1).
// NOTE(review): presumably the dword is a positive float limit, so L' is the largest float below
// it -- confirm against the stage source.
7624CODE const uint8_t sk_clamp_x_hsw[] = {
7625 72,173, //lods %ds:(%rsi),%rax
7626 196,65,60,87,192, //vxorps %ymm8,%ymm8,%ymm8
7627 197,188,95,192, //vmaxps %ymm0,%ymm8,%ymm0
7628 196,98,125,88,0, //vpbroadcastd (%rax),%ymm8
7629 196,65,53,118,201, //vpcmpeqd %ymm9,%ymm9,%ymm9
7630 196,65,61,254,193, //vpaddd %ymm9,%ymm8,%ymm8
7631 196,193,124,93,192, //vminps %ymm8,%ymm0,%ymm0
7632 72,173, //lods %ds:(%rsi),%rax
7633 255,224, //jmpq *%rax
7634};
7635
// sk_clamp_y_hsw: raw x86-64 machine code; the trailing comments are its AT&T-syntax disassembly.
// Same sequence as the ymm0 variant but applied to ymm1: ymm1 = min(max(0, ymm1), L'), where L'
// is the dword at the address taken from the (%rsi) stream with 1 subtracted from its integer
// bit pattern (all-ones from vpcmpeqd is added as -1).
// NOTE(review): presumably the dword is a positive float limit, so L' is the largest float below
// it -- confirm against the stage source.
7636CODE const uint8_t sk_clamp_y_hsw[] = {
7637 72,173, //lods %ds:(%rsi),%rax
7638 196,65,60,87,192, //vxorps %ymm8,%ymm8,%ymm8
7639 197,188,95,201, //vmaxps %ymm1,%ymm8,%ymm1
7640 196,98,125,88,0, //vpbroadcastd (%rax),%ymm8
7641 196,65,53,118,201, //vpcmpeqd %ymm9,%ymm9,%ymm9
7642 196,65,61,254,193, //vpaddd %ymm9,%ymm8,%ymm8
7643 196,193,116,93,200, //vminps %ymm8,%ymm1,%ymm1
7644 72,173, //lods %ds:(%rsi),%rax
7645 255,224, //jmpq *%rax
7646};
7647
// sk_repeat_x_hsw: raw x86-64 machine code; the trailing comments are its AT&T-syntax disassembly.
// With L = the float at the address taken from the (%rsi) stream:
//   ymm0 = min(ymm0 - L * floor(ymm0 / L), L')
// vroundps with immediate 1 rounds toward negative infinity, and the vfnmadd213ps computes
// -(L * floor) + ymm0. L' is L with 1 subtracted from its integer bit pattern.
// NOTE(review): presumably L is positive, making L' the largest float below it -- confirm.
7648CODE const uint8_t sk_repeat_x_hsw[] = {
7649 72,173, //lods %ds:(%rsi),%rax
7650 196,98,125,24,0, //vbroadcastss (%rax),%ymm8
7651 196,65,124,94,200, //vdivps %ymm8,%ymm0,%ymm9
7652 196,67,125,8,201,1, //vroundps $0x1,%ymm9,%ymm9
7653 196,98,61,172,200, //vfnmadd213ps %ymm0,%ymm8,%ymm9
7654 197,253,118,192, //vpcmpeqd %ymm0,%ymm0,%ymm0
7655 197,189,254,192, //vpaddd %ymm0,%ymm8,%ymm0
7656 197,180,93,192, //vminps %ymm0,%ymm9,%ymm0
7657 72,173, //lods %ds:(%rsi),%rax
7658 255,224, //jmpq *%rax
7659};
7660
// sk_repeat_y_hsw: raw x86-64 machine code; the trailing comments are its AT&T-syntax disassembly.
// Same sequence as the ymm0 variant but applied to ymm1. With L = the float at the address taken
// from the (%rsi) stream:
//   ymm1 = min(ymm1 - L * floor(ymm1 / L), L')
// vroundps with immediate 1 rounds toward negative infinity. L' is L with 1 subtracted from its
// integer bit pattern.
// NOTE(review): presumably L is positive, making L' the largest float below it -- confirm.
7661CODE const uint8_t sk_repeat_y_hsw[] = {
7662 72,173, //lods %ds:(%rsi),%rax
7663 196,98,125,24,0, //vbroadcastss (%rax),%ymm8
7664 196,65,116,94,200, //vdivps %ymm8,%ymm1,%ymm9
7665 196,67,125,8,201,1, //vroundps $0x1,%ymm9,%ymm9
7666 196,98,61,172,201, //vfnmadd213ps %ymm1,%ymm8,%ymm9
7667 197,245,118,201, //vpcmpeqd %ymm1,%ymm1,%ymm1
7668 197,189,254,201, //vpaddd %ymm1,%ymm8,%ymm1
7669 197,180,93,201, //vminps %ymm1,%ymm9,%ymm1
7670 72,173, //lods %ds:(%rsi),%rax
7671 255,224, //jmpq *%rax
7672};
7673
// sk_mirror_x_hsw: raw x86-64 machine code; the trailing comments are its AT&T-syntax disassembly.
// With L = the float at the address taken from the (%rsi) stream and t = ymm0 - L:
//   ymm0 = min(| (t - 2L * floor(t / 2L)) - L |, L')
// The absolute value is formed by ANDing the value with its negation (0 - v), which clears the
// sign bit. L' is L with 1 subtracted from its integer bit pattern.
// NOTE(review): presumably L is positive, making L' the largest float below it -- confirm.
7674CODE const uint8_t sk_mirror_x_hsw[] = {
7675 72,173, //lods %ds:(%rsi),%rax
7676 197,122,16,0, //vmovss (%rax),%xmm8
7677 196,66,125,24,200, //vbroadcastss %xmm8,%ymm9
7678 196,65,124,92,209, //vsubps %ymm9,%ymm0,%ymm10
7679 196,193,58,88,192, //vaddss %xmm8,%xmm8,%xmm0
7680 196,226,125,24,192, //vbroadcastss %xmm0,%ymm0
7681 197,44,94,192, //vdivps %ymm0,%ymm10,%ymm8
7682 196,67,125,8,192,1, //vroundps $0x1,%ymm8,%ymm8
7683 196,66,125,172,194, //vfnmadd213ps %ymm10,%ymm0,%ymm8
7684 196,193,60,92,193, //vsubps %ymm9,%ymm8,%ymm0
7685 196,65,60,87,192, //vxorps %ymm8,%ymm8,%ymm8
7686 197,60,92,192, //vsubps %ymm0,%ymm8,%ymm8
7687 197,188,84,192, //vandps %ymm0,%ymm8,%ymm0
7688 196,65,61,118,192, //vpcmpeqd %ymm8,%ymm8,%ymm8
7689 196,65,53,254,192, //vpaddd %ymm8,%ymm9,%ymm8
7690 196,193,124,93,192, //vminps %ymm8,%ymm0,%ymm0
7691 72,173, //lods %ds:(%rsi),%rax
7692 255,224, //jmpq *%rax
7693};
7694
// sk_mirror_y_hsw: raw x86-64 machine code; the trailing comments are its AT&T-syntax disassembly.
// Same sequence as the ymm0 variant but applied to ymm1. With L = the float at the address taken
// from the (%rsi) stream and t = ymm1 - L:
//   ymm1 = min(| (t - 2L * floor(t / 2L)) - L |, L')
// The absolute value is formed by ANDing the value with its negation (0 - v), which clears the
// sign bit. L' is L with 1 subtracted from its integer bit pattern.
// NOTE(review): presumably L is positive, making L' the largest float below it -- confirm.
7695CODE const uint8_t sk_mirror_y_hsw[] = {
7696 72,173, //lods %ds:(%rsi),%rax
7697 197,122,16,0, //vmovss (%rax),%xmm8
7698 196,66,125,24,200, //vbroadcastss %xmm8,%ymm9
7699 196,65,116,92,209, //vsubps %ymm9,%ymm1,%ymm10
7700 196,193,58,88,200, //vaddss %xmm8,%xmm8,%xmm1
7701 196,226,125,24,201, //vbroadcastss %xmm1,%ymm1
7702 197,44,94,193, //vdivps %ymm1,%ymm10,%ymm8
7703 196,67,125,8,192,1, //vroundps $0x1,%ymm8,%ymm8
7704 196,66,117,172,194, //vfnmadd213ps %ymm10,%ymm1,%ymm8
7705 196,193,60,92,201, //vsubps %ymm9,%ymm8,%ymm1
7706 196,65,60,87,192, //vxorps %ymm8,%ymm8,%ymm8
7707 197,60,92,193, //vsubps %ymm1,%ymm8,%ymm8
7708 197,188,84,201, //vandps %ymm1,%ymm8,%ymm1
7709 196,65,61,118,192, //vpcmpeqd %ymm8,%ymm8,%ymm8
7710 196,65,53,254,192, //vpaddd %ymm8,%ymm9,%ymm8
7711 196,193,116,93,200, //vminps %ymm8,%ymm1,%ymm1
7712 72,173, //lods %ds:(%rsi),%rax
7713 255,224, //jmpq *%rax
7714};
7715
// sk_matrix_2x3_hsw: raw x86-64 machine code; the trailing comments are its AT&T-syntax disassembly.
// With m = the float array at the address taken from the (%rsi) stream (byte offsets 0..0x14):
//   new ymm0 = m[0]*ymm0 + m[2]*ymm1 + m[4]
//   new ymm1 = m[1]*ymm0 + m[3]*ymm1 + m[5]
// Both results are accumulated in ymm8/ymm9 with fused multiply-adds and copied back only at the
// end, so the second row still sees the original ymm0.
7716CODE const uint8_t sk_matrix_2x3_hsw[] = {
7717 72,173, //lods %ds:(%rsi),%rax
7718 196,98,125,24,8, //vbroadcastss (%rax),%ymm9
7719 196,98,125,24,80,8, //vbroadcastss 0x8(%rax),%ymm10
7720 196,98,125,24,64,16, //vbroadcastss 0x10(%rax),%ymm8
7721 196,66,117,184,194, //vfmadd231ps %ymm10,%ymm1,%ymm8
7722 196,66,125,184,193, //vfmadd231ps %ymm9,%ymm0,%ymm8
7723 196,98,125,24,80,4, //vbroadcastss 0x4(%rax),%ymm10
7724 196,98,125,24,88,12, //vbroadcastss 0xc(%rax),%ymm11
7725 196,98,125,24,72,20, //vbroadcastss 0x14(%rax),%ymm9
7726 196,66,117,184,203, //vfmadd231ps %ymm11,%ymm1,%ymm9
7727 196,66,125,184,202, //vfmadd231ps %ymm10,%ymm0,%ymm9
7728 72,173, //lods %ds:(%rsi),%rax
7729 197,124,41,192, //vmovaps %ymm8,%ymm0
7730 197,124,41,201, //vmovaps %ymm9,%ymm1
7731 255,224, //jmpq *%rax
7732};
7733
// sk_matrix_3x4_hsw: raw x86-64 machine code; the trailing comments are its AT&T-syntax disassembly.
// With m = the float array at the address taken from the (%rsi) stream (byte offsets 0..0x2c):
//   new ymm0 = m[0]*ymm0 + m[3]*ymm1 + m[6]*ymm2 + m[9]
//   new ymm1 = m[1]*ymm0 + m[4]*ymm1 + m[7]*ymm2 + m[10]
//   new ymm2 = m[2]*ymm0 + m[5]*ymm1 + m[8]*ymm2 + m[11]
// Results are accumulated in ymm8/ymm9/ymm10 with fused multiply-adds and copied back only at
// the end, so every row sees the original inputs.
7734CODE const uint8_t sk_matrix_3x4_hsw[] = {
7735 72,173, //lods %ds:(%rsi),%rax
7736 196,98,125,24,8, //vbroadcastss (%rax),%ymm9
7737 196,98,125,24,80,12, //vbroadcastss 0xc(%rax),%ymm10
7738 196,98,125,24,88,24, //vbroadcastss 0x18(%rax),%ymm11
7739 196,98,125,24,64,36, //vbroadcastss 0x24(%rax),%ymm8
7740 196,66,109,184,195, //vfmadd231ps %ymm11,%ymm2,%ymm8
7741 196,66,117,184,194, //vfmadd231ps %ymm10,%ymm1,%ymm8
7742 196,66,125,184,193, //vfmadd231ps %ymm9,%ymm0,%ymm8
7743 196,98,125,24,80,4, //vbroadcastss 0x4(%rax),%ymm10
7744 196,98,125,24,88,16, //vbroadcastss 0x10(%rax),%ymm11
7745 196,98,125,24,96,28, //vbroadcastss 0x1c(%rax),%ymm12
7746 196,98,125,24,72,40, //vbroadcastss 0x28(%rax),%ymm9
7747 196,66,109,184,204, //vfmadd231ps %ymm12,%ymm2,%ymm9
7748 196,66,117,184,203, //vfmadd231ps %ymm11,%ymm1,%ymm9
7749 196,66,125,184,202, //vfmadd231ps %ymm10,%ymm0,%ymm9
7750 196,98,125,24,88,8, //vbroadcastss 0x8(%rax),%ymm11
7751 196,98,125,24,96,20, //vbroadcastss 0x14(%rax),%ymm12
7752 196,98,125,24,104,32, //vbroadcastss 0x20(%rax),%ymm13
7753 196,98,125,24,80,44, //vbroadcastss 0x2c(%rax),%ymm10
7754 196,66,109,184,213, //vfmadd231ps %ymm13,%ymm2,%ymm10
7755 196,66,117,184,212, //vfmadd231ps %ymm12,%ymm1,%ymm10
7756 196,66,125,184,211, //vfmadd231ps %ymm11,%ymm0,%ymm10
7757 72,173, //lods %ds:(%rsi),%rax
7758 197,124,41,192, //vmovaps %ymm8,%ymm0
7759 197,124,41,201, //vmovaps %ymm9,%ymm1
7760 197,124,41,210, //vmovaps %ymm10,%ymm2
7761 255,224, //jmpq *%rax
7762};
7763
// sk_matrix_perspective_hsw: raw x86-64 machine code; the trailing comments are its AT&T-syntax disassembly.
// With m = the float array at the address taken from the (%rsi) stream (byte offsets 0..0x20):
//   X = m[0]*ymm0 + m[1]*ymm1 + m[2]
//   Y = m[3]*ymm0 + m[4]*ymm1 + m[5]
//   Z = m[6]*ymm0 + m[7]*ymm1 + m[8]
//   new ymm0 = X * rcp(Z), new ymm1 = Y * rcp(Z)
// rcp is vrcpps, an approximate reciprocal, not a full-precision divide.
7764CODE const uint8_t sk_matrix_perspective_hsw[] = {
7765 72,173, //lods %ds:(%rsi),%rax
7766 196,98,125,24,0, //vbroadcastss (%rax),%ymm8
7767 196,98,125,24,72,4, //vbroadcastss 0x4(%rax),%ymm9
7768 196,98,125,24,80,8, //vbroadcastss 0x8(%rax),%ymm10
7769 196,66,117,184,209, //vfmadd231ps %ymm9,%ymm1,%ymm10
7770 196,66,125,184,208, //vfmadd231ps %ymm8,%ymm0,%ymm10
7771 196,98,125,24,64,12, //vbroadcastss 0xc(%rax),%ymm8
7772 196,98,125,24,72,16, //vbroadcastss 0x10(%rax),%ymm9
7773 196,98,125,24,88,20, //vbroadcastss 0x14(%rax),%ymm11
7774 196,66,117,184,217, //vfmadd231ps %ymm9,%ymm1,%ymm11
7775 196,66,125,184,216, //vfmadd231ps %ymm8,%ymm0,%ymm11
7776 196,98,125,24,64,24, //vbroadcastss 0x18(%rax),%ymm8
7777 196,98,125,24,72,28, //vbroadcastss 0x1c(%rax),%ymm9
7778 196,98,125,24,96,32, //vbroadcastss 0x20(%rax),%ymm12
7779 196,66,117,184,225, //vfmadd231ps %ymm9,%ymm1,%ymm12
7780 196,66,125,184,224, //vfmadd231ps %ymm8,%ymm0,%ymm12
7781 196,193,124,83,204, //vrcpps %ymm12,%ymm1
7782 197,172,89,193, //vmulps %ymm1,%ymm10,%ymm0
7783 197,164,89,201, //vmulps %ymm1,%ymm11,%ymm1
7784 72,173, //lods %ds:(%rsi),%rax
7785 255,224, //jmpq *%rax
7786};
7787
// sk_linear_gradient_2stops_hsw: raw x86-64 machine code; the trailing comments are its AT&T-syntax disassembly.
// With c = the float array at the address taken from the (%rsi) stream and t = the incoming ymm0:
//   new ymm0 = c[0] + t*c[4]
//   new ymm1 = c[1] + t*c[5]
//   new ymm2 = c[2] + t*c[6]
//   new ymm3 = c[3] + t*c[7]
// The first result is built in ymm8 and moved into ymm0 last, because ymm0 (t) is still needed
// as an input to the other three fused multiply-adds.
7788CODE const uint8_t sk_linear_gradient_2stops_hsw[] = {
7789 72,173, //lods %ds:(%rsi),%rax
7790 196,226,125,24,72,16, //vbroadcastss 0x10(%rax),%ymm1
7791 196,98,125,24,0, //vbroadcastss (%rax),%ymm8
7792 196,98,125,184,193, //vfmadd231ps %ymm1,%ymm0,%ymm8
7793 196,226,125,24,80,20, //vbroadcastss 0x14(%rax),%ymm2
7794 196,226,125,24,72,4, //vbroadcastss 0x4(%rax),%ymm1
7795 196,226,125,184,202, //vfmadd231ps %ymm2,%ymm0,%ymm1
7796 196,226,125,24,88,24, //vbroadcastss 0x18(%rax),%ymm3
7797 196,226,125,24,80,8, //vbroadcastss 0x8(%rax),%ymm2
7798 196,226,125,184,211, //vfmadd231ps %ymm3,%ymm0,%ymm2
7799 196,98,125,24,72,28, //vbroadcastss 0x1c(%rax),%ymm9
7800 196,226,125,24,88,12, //vbroadcastss 0xc(%rax),%ymm3
7801 196,194,125,184,217, //vfmadd231ps %ymm9,%ymm0,%ymm3
7802 72,173, //lods %ds:(%rsi),%rax
7803 197,124,41,192, //vmovaps %ymm8,%ymm0
7804 255,224, //jmpq *%rax
7805};
7806
// sk_start_pipeline_avx: raw x86-64 machine code; the trailing comments are its AT&T-syntax disassembly.
// Entry point that repeatedly calls the first function pointer found in a stream of 8-byte entries.
// Arguments arrive in rcx, rdx, r8, r9, and rsi, rdi, rbx, r12-r15 and xmm6-xmm15 are saved and
// restored, which matches the Microsoft x64 calling convention.
//   rbx = current index (from rcx)      r13 = limit (from r9)
//   r14 = value handed on in rdx (r8)   r15 = first stream entry (from rdx)
//   r12 = stream position just past that first entry
// While index + 8 <= limit it calls *r15 with rdi = index, rsi = r12, rdx = r14, rcx = 0 and
// ymm0-ymm7 zeroed, advancing index by 8. If limit - index is then non-zero it makes one more
// call with rcx set to that remainder. Returns the limit (r13) in rax after vzeroupper.
7807CODE const uint8_t sk_start_pipeline_avx[] = {
7808 65,87, //push %r15
7809 65,86, //push %r14
7810 65,85, //push %r13
7811 65,84, //push %r12
7812 86, //push %rsi
7813 87, //push %rdi
7814 83, //push %rbx
// Seven pushes plus the return address total 64 bytes, so after subtracting 0xa0 rsp is 16-byte
// aligned (given the usual rsp % 16 == 8 at entry), as the aligned vmovaps spills require.
7815 72,129,236,160,0,0,0, //sub $0xa0,%rsp
7816 197,120,41,188,36,144,0,0,0, //vmovaps %xmm15,0x90(%rsp)
7817 197,120,41,180,36,128,0,0,0, //vmovaps %xmm14,0x80(%rsp)
7818 197,120,41,108,36,112, //vmovaps %xmm13,0x70(%rsp)
7819 197,120,41,100,36,96, //vmovaps %xmm12,0x60(%rsp)
7820 197,120,41,92,36,80, //vmovaps %xmm11,0x50(%rsp)
7821 197,120,41,84,36,64, //vmovaps %xmm10,0x40(%rsp)
7822 197,120,41,76,36,48, //vmovaps %xmm9,0x30(%rsp)
7823 197,120,41,68,36,32, //vmovaps %xmm8,0x20(%rsp)
7824 197,248,41,124,36,16, //vmovaps %xmm7,0x10(%rsp)
7825 197,248,41,52,36, //vmovaps %xmm6,(%rsp)
7826 77,137,205, //mov %r9,%r13
7827 77,137,198, //mov %r8,%r14
7828 72,137,203, //mov %rcx,%rbx
7829 72,137,214, //mov %rdx,%rsi
7830 72,173, //lods %ds:(%rsi),%rax
7831 73,137,199, //mov %rax,%r15
7832 73,137,244, //mov %rsi,%r12
// Skip the main loop entirely when not even one full step of 8 fits.
7833 72,141,67,8, //lea 0x8(%rbx),%rax
7834 76,57,232, //cmp %r13,%rax
7835 118,5, //jbe 75 <_sk_start_pipeline_avx+0x75>
7836 72,137,223, //mov %rbx,%rdi
7837 235,65, //jmp b6 <_sk_start_pipeline_avx+0xb6>
// +0x75: main loop body, one call per step of 8 with rcx = 0.
7838 185,0,0,0,0, //mov $0x0,%ecx
7839 197,252,87,192, //vxorps %ymm0,%ymm0,%ymm0
7840 197,244,87,201, //vxorps %ymm1,%ymm1,%ymm1
7841 197,236,87,210, //vxorps %ymm2,%ymm2,%ymm2
7842 197,228,87,219, //vxorps %ymm3,%ymm3,%ymm3
7843 197,220,87,228, //vxorps %ymm4,%ymm4,%ymm4
7844 197,212,87,237, //vxorps %ymm5,%ymm5,%ymm5
7845 197,204,87,246, //vxorps %ymm6,%ymm6,%ymm6
7846 197,196,87,255, //vxorps %ymm7,%ymm7,%ymm7
7847 72,137,223, //mov %rbx,%rdi
7848 76,137,230, //mov %r12,%rsi
7849 76,137,242, //mov %r14,%rdx
7850 65,255,215, //callq *%r15
// rdi = next index; loop again only if another full step of 8 fits (index + 16 <= limit as
// measured from the old index). The mov between cmp and jbe leaves the flags untouched.
7851 72,141,123,8, //lea 0x8(%rbx),%rdi
7852 72,131,195,16, //add $0x10,%rbx
7853 76,57,235, //cmp %r13,%rbx
7854 72,137,251, //mov %rdi,%rbx
7855 118,191, //jbe 75 <_sk_start_pipeline_avx+0x75>
// +0xb6: remainder = limit - index; make the final call only if it is non-zero.
7856 76,137,233, //mov %r13,%rcx
7857 72,41,249, //sub %rdi,%rcx
7858 116,41, //je e7 <_sk_start_pipeline_avx+0xe7>
7859 197,252,87,192, //vxorps %ymm0,%ymm0,%ymm0
7860 197,244,87,201, //vxorps %ymm1,%ymm1,%ymm1
7861 197,236,87,210, //vxorps %ymm2,%ymm2,%ymm2
7862 197,228,87,219, //vxorps %ymm3,%ymm3,%ymm3
7863 197,220,87,228, //vxorps %ymm4,%ymm4,%ymm4
7864 197,212,87,237, //vxorps %ymm5,%ymm5,%ymm5
7865 197,204,87,246, //vxorps %ymm6,%ymm6,%ymm6
7866 197,196,87,255, //vxorps %ymm7,%ymm7,%ymm7
7867 76,137,230, //mov %r12,%rsi
7868 76,137,242, //mov %r14,%rdx
7869 65,255,215, //callq *%r15
// +0xe7: epilogue -- return the limit and restore every saved register.
7870 76,137,232, //mov %r13,%rax
7871 197,248,40,52,36, //vmovaps (%rsp),%xmm6
7872 197,248,40,124,36,16, //vmovaps 0x10(%rsp),%xmm7
7873 197,120,40,68,36,32, //vmovaps 0x20(%rsp),%xmm8
7874 197,120,40,76,36,48, //vmovaps 0x30(%rsp),%xmm9
7875 197,120,40,84,36,64, //vmovaps 0x40(%rsp),%xmm10
7876 197,120,40,92,36,80, //vmovaps 0x50(%rsp),%xmm11
7877 197,120,40,100,36,96, //vmovaps 0x60(%rsp),%xmm12
7878 197,120,40,108,36,112, //vmovaps 0x70(%rsp),%xmm13
7879 197,120,40,180,36,128,0,0,0, //vmovaps 0x80(%rsp),%xmm14
7880 197,120,40,188,36,144,0,0,0, //vmovaps 0x90(%rsp),%xmm15
7881 72,129,196,160,0,0,0, //add $0xa0,%rsp
7882 91, //pop %rbx
7883 95, //pop %rdi
7884 94, //pop %rsi
7885 65,92, //pop %r12
7886 65,93, //pop %r13
7887 65,94, //pop %r14
7888 65,95, //pop %r15
7889 197,248,119, //vzeroupper
7890 195, //retq
7891};
7892
// sk_just_return_avx: a single retq. Unlike the other stages it does not fetch or jump to a
// further stream entry; it simply returns to whoever called into the chain.
7893CODE const uint8_t sk_just_return_avx[] = {
7894 195, //retq
7895};
7896
// sk_seed_shader_avx: raw x86-64 machine code; the trailing comments are its AT&T-syntax disassembly.
// Initialises the working registers:
//   ymm0 = float(edi) + k[1] + (the eight floats stored at 0x14(%rdx))
//   ymm1 = float(int32 at the address taken from the (%rsi) stream) + k[1]
//   ymm2 = k[0]
//   ymm3..ymm7 = 0
// where k[0] and k[1] are the floats at (%rdx) and 0x4(%rdx).
// NOTE(review): presumably edi is the x index, the stream int is y, and the 8-float block is a
// per-lane offset ramp -- confirm against the stage source.
7897CODE const uint8_t sk_seed_shader_avx[] = {
7898 72,173, //lods %ds:(%rsi),%rax
7899 197,249,110,199, //vmovd %edi,%xmm0
7900 197,249,112,192,0, //vpshufd $0x0,%xmm0,%xmm0
7901 196,227,125,24,192,1, //vinsertf128 $0x1,%xmm0,%ymm0,%ymm0
7902 197,252,91,192, //vcvtdq2ps %ymm0,%ymm0
7903 196,226,125,24,74,4, //vbroadcastss 0x4(%rdx),%ymm1
7904 197,252,88,193, //vaddps %ymm1,%ymm0,%ymm0
7905 197,252,88,66,20, //vaddps 0x14(%rdx),%ymm0,%ymm0
7906 196,226,125,24,16, //vbroadcastss (%rax),%ymm2
7907 197,252,91,210, //vcvtdq2ps %ymm2,%ymm2
7908 197,236,88,201, //vaddps %ymm1,%ymm2,%ymm1
7909 196,226,125,24,18, //vbroadcastss (%rdx),%ymm2
7910 72,173, //lods %ds:(%rsi),%rax
7911 197,228,87,219, //vxorps %ymm3,%ymm3,%ymm3
7912 197,220,87,228, //vxorps %ymm4,%ymm4,%ymm4
7913 197,212,87,237, //vxorps %ymm5,%ymm5,%ymm5
7914 197,204,87,246, //vxorps %ymm6,%ymm6,%ymm6
7915 197,196,87,255, //vxorps %ymm7,%ymm7,%ymm7
7916 255,224, //jmpq *%rax
7917};
7918
// sk_constant_color_avx: raw x86-64 machine code; the trailing comments are its AT&T-syntax disassembly.
// Broadcasts the four consecutive floats at the address taken from the (%rsi) stream into
// ymm0, ymm1, ymm2 and ymm3 respectively, then jumps to the next stream entry.
7919CODE const uint8_t sk_constant_color_avx[] = {
7920 72,173, //lods %ds:(%rsi),%rax
7921 196,226,125,24,0, //vbroadcastss (%rax),%ymm0
7922 196,226,125,24,72,4, //vbroadcastss 0x4(%rax),%ymm1
7923 196,226,125,24,80,8, //vbroadcastss 0x8(%rax),%ymm2
7924 196,226,125,24,88,12, //vbroadcastss 0xc(%rax),%ymm3
7925 72,173, //lods %ds:(%rsi),%rax
7926 255,224, //jmpq *%rax
7927};
7928
// sk_clear_avx: raw x86-64 machine code; the trailing comments are its AT&T-syntax disassembly.
// Zeroes ymm0..ymm3 and jumps to the next stream entry. It reads only one entry from the
// (%rsi) stream, the jump target itself.
7929CODE const uint8_t sk_clear_avx[] = {
7930 72,173, //lods %ds:(%rsi),%rax
7931 197,252,87,192, //vxorps %ymm0,%ymm0,%ymm0
7932 197,244,87,201, //vxorps %ymm1,%ymm1,%ymm1
7933 197,236,87,210, //vxorps %ymm2,%ymm2,%ymm2
7934 197,228,87,219, //vxorps %ymm3,%ymm3,%ymm3
7935 255,224, //jmpq *%rax
7936};
7937
// sk_plus__avx: raw x86-64 machine code; the trailing comments are its AT&T-syntax disassembly.
// Lane-wise add: ymm0 += ymm4, ymm1 += ymm5, ymm2 += ymm6, ymm3 += ymm7, then jump to the next
// stream entry.
7938CODE const uint8_t sk_plus__avx[] = {
7939 197,252,88,196, //vaddps %ymm4,%ymm0,%ymm0
7940 197,244,88,205, //vaddps %ymm5,%ymm1,%ymm1
7941 197,236,88,214, //vaddps %ymm6,%ymm2,%ymm2
7942 197,228,88,223, //vaddps %ymm7,%ymm3,%ymm3
7943 72,173, //lods %ds:(%rsi),%rax
7944 255,224, //jmpq *%rax
7945};
7946
// sk_srcover_avx: raw x86-64 machine code; the trailing comments are its AT&T-syntax disassembly.
// With f = (float at (%rdx)) - ymm3, computed once from the incoming ymm3:
//   ymm0 += ymm4*f, ymm1 += ymm5*f, ymm2 += ymm6*f, ymm3 += ymm7*f
// NOTE(review): presumably (%rdx) holds 1.0f, ymm0..3 are source r,g,b,a and ymm4..7 the
// destination -- confirm against the stage source.
7947CODE const uint8_t sk_srcover_avx[] = {
7948 196,98,125,24,2, //vbroadcastss (%rdx),%ymm8
7949 197,60,92,195, //vsubps %ymm3,%ymm8,%ymm8
7950 197,60,89,204, //vmulps %ymm4,%ymm8,%ymm9
7951 197,180,88,192, //vaddps %ymm0,%ymm9,%ymm0
7952 197,60,89,205, //vmulps %ymm5,%ymm8,%ymm9
7953 197,180,88,201, //vaddps %ymm1,%ymm9,%ymm1
7954 197,60,89,206, //vmulps %ymm6,%ymm8,%ymm9
7955 197,180,88,210, //vaddps %ymm2,%ymm9,%ymm2
7956 197,60,89,199, //vmulps %ymm7,%ymm8,%ymm8
7957 197,188,88,219, //vaddps %ymm3,%ymm8,%ymm3
7958 72,173, //lods %ds:(%rsi),%rax
7959 255,224, //jmpq *%rax
7960};
7961
// sk_dstover_avx: raw x86-64 machine code; the trailing comments are its AT&T-syntax disassembly.
// With f = (float at (%rdx)) - ymm7:
//   ymm0 = ymm0*f + ymm4, ymm1 = ymm1*f + ymm5, ymm2 = ymm2*f + ymm6, ymm3 = ymm3*f + ymm7
// NOTE(review): presumably (%rdx) holds 1.0f, ymm0..3 are source r,g,b,a and ymm4..7 the
// destination -- confirm against the stage source.
7962CODE const uint8_t sk_dstover_avx[] = {
7963 196,98,125,24,2, //vbroadcastss (%rdx),%ymm8
7964 197,60,92,199, //vsubps %ymm7,%ymm8,%ymm8
7965 197,188,89,192, //vmulps %ymm0,%ymm8,%ymm0
7966 197,252,88,196, //vaddps %ymm4,%ymm0,%ymm0
7967 197,188,89,201, //vmulps %ymm1,%ymm8,%ymm1
7968 197,244,88,205, //vaddps %ymm5,%ymm1,%ymm1
7969 197,188,89,210, //vmulps %ymm2,%ymm8,%ymm2
7970 197,236,88,214, //vaddps %ymm6,%ymm2,%ymm2
7971 197,188,89,219, //vmulps %ymm3,%ymm8,%ymm3
7972 197,228,88,223, //vaddps %ymm7,%ymm3,%ymm3
7973 72,173, //lods %ds:(%rsi),%rax
7974 255,224, //jmpq *%rax
7975};
7976
// sk_clamp_0_avx: raw x86-64 machine code; the trailing comments are its AT&T-syntax disassembly.
// Clamps from below: ymmN = max(ymmN, 0) for N = 0..3, using a zeroed ymm8, then jumps to the
// next stream entry.
7977CODE const uint8_t sk_clamp_0_avx[] = {
7978 196,65,60,87,192, //vxorps %ymm8,%ymm8,%ymm8
7979 196,193,124,95,192, //vmaxps %ymm8,%ymm0,%ymm0
7980 196,193,116,95,200, //vmaxps %ymm8,%ymm1,%ymm1
7981 196,193,108,95,208, //vmaxps %ymm8,%ymm2,%ymm2
7982 196,193,100,95,216, //vmaxps %ymm8,%ymm3,%ymm3
7983 72,173, //lods %ds:(%rsi),%rax
7984 255,224, //jmpq *%rax
7985};
7986
// sk_clamp_1_avx: raw x86-64 machine code; the trailing comments are its AT&T-syntax disassembly.
// Clamps from above: ymmN = min(ymmN, c) for N = 0..3, where c is the float broadcast from
// (%rdx), then jumps to the next stream entry.
// NOTE(review): the name suggests c is 1.0f -- confirm against the constants layout.
7987CODE const uint8_t sk_clamp_1_avx[] = {
7988 196,98,125,24,2, //vbroadcastss (%rdx),%ymm8
7989 196,193,124,93,192, //vminps %ymm8,%ymm0,%ymm0
7990 196,193,116,93,200, //vminps %ymm8,%ymm1,%ymm1
7991 196,193,108,93,208, //vminps %ymm8,%ymm2,%ymm2
7992 196,193,100,93,216, //vminps %ymm8,%ymm3,%ymm3
7993 72,173, //lods %ds:(%rsi),%rax
7994 255,224, //jmpq *%rax
7995};
7996
// sk_clamp_a_avx: raw x86-64 machine code; the trailing comments are its AT&T-syntax disassembly.
// First ymm3 = min(ymm3, c) with c the float broadcast from (%rdx); then ymm0, ymm1 and ymm2 are
// each clamped to at most the new ymm3.
// NOTE(review): presumably c is 1.0f and ymm3 is alpha -- confirm against the stage source.
7997CODE const uint8_t sk_clamp_a_avx[] = {
7998 196,98,125,24,2, //vbroadcastss (%rdx),%ymm8
7999 196,193,100,93,216, //vminps %ymm8,%ymm3,%ymm3
8000 197,252,93,195, //vminps %ymm3,%ymm0,%ymm0
8001 197,244,93,203, //vminps %ymm3,%ymm1,%ymm1
8002 197,236,93,211, //vminps %ymm3,%ymm2,%ymm2
8003 72,173, //lods %ds:(%rsi),%rax
8004 255,224, //jmpq *%rax
8005};
8006
// sk_set_rgb_avx: raw x86-64 machine code; the trailing comments are its AT&T-syntax disassembly.
// Broadcasts the three consecutive floats at the address taken from the (%rsi) stream into
// ymm0, ymm1 and ymm2. ymm3 is left unchanged.
8007CODE const uint8_t sk_set_rgb_avx[] = {
8008 72,173, //lods %ds:(%rsi),%rax
8009 196,226,125,24,0, //vbroadcastss (%rax),%ymm0
8010 196,226,125,24,72,4, //vbroadcastss 0x4(%rax),%ymm1
8011 196,226,125,24,80,8, //vbroadcastss 0x8(%rax),%ymm2
8012 72,173, //lods %ds:(%rsi),%rax
8013 255,224, //jmpq *%rax
8014};
8015
// sk_swap_rb_avx: raw x86-64 machine code; the trailing comments are its AT&T-syntax disassembly.
// Exchanges ymm0 and ymm2, using ymm8 as the temporary, then jumps to the next stream entry.
8016CODE const uint8_t sk_swap_rb_avx[] = {
8017 197,124,40,192, //vmovaps %ymm0,%ymm8
8018 72,173, //lods %ds:(%rsi),%rax
8019 197,252,40,194, //vmovaps %ymm2,%ymm0
8020 197,124,41,194, //vmovaps %ymm8,%ymm2
8021 255,224, //jmpq *%rax
8022};
8023
// sk_swap_avx: raw x86-64 machine code; the trailing comments are its AT&T-syntax disassembly.
// Exchanges ymm0..ymm3 with ymm4..ymm7 pairwise (0<->4, 1<->5, 2<->6, 3<->7), staging the
// original ymm0..ymm3 in ymm11..ymm8, then jumps to the next stream entry.
8024CODE const uint8_t sk_swap_avx[] = {
8025 197,124,40,195, //vmovaps %ymm3,%ymm8
8026 197,124,40,202, //vmovaps %ymm2,%ymm9
8027 197,124,40,209, //vmovaps %ymm1,%ymm10
8028 197,124,40,216, //vmovaps %ymm0,%ymm11
8029 72,173, //lods %ds:(%rsi),%rax
8030 197,252,40,196, //vmovaps %ymm4,%ymm0
8031 197,252,40,205, //vmovaps %ymm5,%ymm1
8032 197,252,40,214, //vmovaps %ymm6,%ymm2
8033 197,252,40,223, //vmovaps %ymm7,%ymm3
8034 197,124,41,220, //vmovaps %ymm11,%ymm4
8035 197,124,41,213, //vmovaps %ymm10,%ymm5
8036 197,124,41,206, //vmovaps %ymm9,%ymm6
8037 197,124,41,199, //vmovaps %ymm8,%ymm7
8038 255,224, //jmpq *%rax
8039};
8040
// sk_move_src_dst_avx: raw x86-64 machine code; the trailing comments are its AT&T-syntax disassembly.
// Copies ymm0..ymm3 into ymm4..ymm7 (ymm0..ymm3 keep their values), then jumps to the next
// stream entry.
8041CODE const uint8_t sk_move_src_dst_avx[] = {
8042 72,173, //lods %ds:(%rsi),%rax
8043 197,252,40,224, //vmovaps %ymm0,%ymm4
8044 197,252,40,233, //vmovaps %ymm1,%ymm5
8045 197,252,40,242, //vmovaps %ymm2,%ymm6
8046 197,252,40,251, //vmovaps %ymm3,%ymm7
8047 255,224, //jmpq *%rax
8048};
8049
// sk_move_dst_src_avx: raw x86-64 machine code; the trailing comments are its AT&T-syntax disassembly.
// Copies ymm4..ymm7 into ymm0..ymm3 (ymm4..ymm7 keep their values), then jumps to the next
// stream entry.
8050CODE const uint8_t sk_move_dst_src_avx[] = {
8051 72,173, //lods %ds:(%rsi),%rax
8052 197,252,40,196, //vmovaps %ymm4,%ymm0
8053 197,252,40,205, //vmovaps %ymm5,%ymm1
8054 197,252,40,214, //vmovaps %ymm6,%ymm2
8055 197,252,40,223, //vmovaps %ymm7,%ymm3
8056 255,224, //jmpq *%rax
8057};
8058
// sk_premul_avx: raw x86-64 machine code; the trailing comments are its AT&T-syntax disassembly.
// Multiplies ymm0, ymm1 and ymm2 by ymm3 (ymm3 itself is unchanged), then jumps to the next
// stream entry.
8059CODE const uint8_t sk_premul_avx[] = {
8060 197,252,89,195, //vmulps %ymm3,%ymm0,%ymm0
8061 197,244,89,203, //vmulps %ymm3,%ymm1,%ymm1
8062 197,236,89,211, //vmulps %ymm3,%ymm2,%ymm2
8063 72,173, //lods %ds:(%rsi),%rax
8064 255,224, //jmpq *%rax
8065};
8066
// sk_unpremul_avx: raw x86-64 machine code; the trailing comments are its AT&T-syntax disassembly.
// Computes a per-lane scale s = (float at (%rdx)) / ymm3, forced to 0 in lanes where ymm3 == 0
// (the compare mask in ymm9 selects the zeroed ymm8 in the blend), and multiplies ymm0, ymm1 and
// ymm2 by s. ymm3 is unchanged.
// NOTE(review): presumably (%rdx) holds 1.0f, making s the reciprocal of alpha -- confirm.
8067CODE const uint8_t sk_unpremul_avx[] = {
8068 196,65,60,87,192, //vxorps %ymm8,%ymm8,%ymm8
8069 196,65,100,194,200,0, //vcmpeqps %ymm8,%ymm3,%ymm9
8070 196,98,125,24,18, //vbroadcastss (%rdx),%ymm10
8071 197,44,94,211, //vdivps %ymm3,%ymm10,%ymm10
8072 196,67,45,74,192,144, //vblendvps %ymm9,%ymm8,%ymm10,%ymm8
8073 197,188,89,192, //vmulps %ymm0,%ymm8,%ymm0
8074 197,188,89,201, //vmulps %ymm1,%ymm8,%ymm1
8075 197,188,89,210, //vmulps %ymm2,%ymm8,%ymm2
8076 72,173, //lods %ds:(%rsi),%rax
8077 255,224, //jmpq *%rax
8078};
8079
// sk_from_srgb_avx: raw x86-64 machine code; the trailing comments are its AT&T-syntax disassembly.
// Applies the same piecewise function to each of ymm0, ymm1 and ymm2 (ymm3 is untouched).
// With constants read from rdx at byte offsets 0x34..0x44 and c the input lane:
//   lo = k[0x40] * c
//   hi = k[0x34] + c*c * (k[0x3c]*c + k[0x38])
//   result = (c < k[0x44]) ? lo : hi
// The compare result overwrites the channel register and then serves as the blend mask.
// NOTE(review): by its name this is the sRGB-to-linear transfer function with a polynomial
// approximation for the upper segment -- confirm the constant values.
8080CODE const uint8_t sk_from_srgb_avx[] = {
8081 196,98,125,24,66,64, //vbroadcastss 0x40(%rdx),%ymm8
8082 197,60,89,200, //vmulps %ymm0,%ymm8,%ymm9
8083 197,124,89,208, //vmulps %ymm0,%ymm0,%ymm10
8084 196,98,125,24,90,60, //vbroadcastss 0x3c(%rdx),%ymm11
8085 196,98,125,24,98,56, //vbroadcastss 0x38(%rdx),%ymm12
8086 197,36,89,232, //vmulps %ymm0,%ymm11,%ymm13
8087 196,65,20,88,236, //vaddps %ymm12,%ymm13,%ymm13
8088 196,98,125,24,114,52, //vbroadcastss 0x34(%rdx),%ymm14
8089 196,65,44,89,213, //vmulps %ymm13,%ymm10,%ymm10
8090 196,65,12,88,210, //vaddps %ymm10,%ymm14,%ymm10
8091 196,98,125,24,106,68, //vbroadcastss 0x44(%rdx),%ymm13
8092 196,193,124,194,197,1, //vcmpltps %ymm13,%ymm0,%ymm0
8093 196,195,45,74,193,0, //vblendvps %ymm0,%ymm9,%ymm10,%ymm0
// Second channel (ymm1), reusing the constants already held in ymm8 and ymm11..ymm14.
8094 197,60,89,201, //vmulps %ymm1,%ymm8,%ymm9
8095 197,116,89,209, //vmulps %ymm1,%ymm1,%ymm10
8096 197,36,89,249, //vmulps %ymm1,%ymm11,%ymm15
8097 196,65,4,88,252, //vaddps %ymm12,%ymm15,%ymm15
8098 196,65,44,89,215, //vmulps %ymm15,%ymm10,%ymm10
8099 196,65,12,88,210, //vaddps %ymm10,%ymm14,%ymm10
8100 196,193,116,194,205,1, //vcmpltps %ymm13,%ymm1,%ymm1
8101 196,195,45,74,201,16, //vblendvps %ymm1,%ymm9,%ymm10,%ymm1
// Third channel (ymm2); ymm8 is overwritten here because the scale constant is no longer needed.
8102 197,60,89,194, //vmulps %ymm2,%ymm8,%ymm8
8103 197,108,89,202, //vmulps %ymm2,%ymm2,%ymm9
8104 197,36,89,210, //vmulps %ymm2,%ymm11,%ymm10
8105 196,65,44,88,212, //vaddps %ymm12,%ymm10,%ymm10
8106 196,65,52,89,202, //vmulps %ymm10,%ymm9,%ymm9
8107 196,65,12,88,201, //vaddps %ymm9,%ymm14,%ymm9
8108 196,193,108,194,213,1, //vcmpltps %ymm13,%ymm2,%ymm2
8109 196,195,53,74,208,32, //vblendvps %ymm2,%ymm8,%ymm9,%ymm2
8110 72,173, //lods %ds:(%rsi),%rax
8111 255,224, //jmpq *%rax
8112};
8113
// sk_to_srgb_avx: applies the same piecewise function to ymm0, ymm1 and ymm2
// (ymm3 is not touched). With K[off] meaning the float at off(%rdx), for
// each channel value c:
//   rs   = vrsqrtps(c)                    (approximate 1/sqrt(c))
//   s    = vrcpps(rs)                     (approximate sqrt(c))
//   q    = vrsqrtps(rs)                   (approximate c^(1/4))
//   hi   = min(K[0x00], K[0x50]*s + K[0x54] + K[0x4c]*q)
//   lo   = K[0x48] * c
//   c    = (c < K[0x58]) ? lo : hi
// The three channels run the same operations in slightly different
// instruction order.
8114CODE const uint8_t sk_to_srgb_avx[] = {
8115 197,124,82,192, //vrsqrtps %ymm0,%ymm8
8116 196,65,124,83,200, //vrcpps %ymm8,%ymm9
8117 196,65,124,82,208, //vrsqrtps %ymm8,%ymm10
8118 196,98,125,24,66,72, //vbroadcastss 0x48(%rdx),%ymm8
8119 197,60,89,216, //vmulps %ymm0,%ymm8,%ymm11
8120 196,98,125,24,34, //vbroadcastss (%rdx),%ymm12
8121 196,98,125,24,106,76, //vbroadcastss 0x4c(%rdx),%ymm13
8122 196,98,125,24,114,80, //vbroadcastss 0x50(%rdx),%ymm14
8123 196,98,125,24,122,84, //vbroadcastss 0x54(%rdx),%ymm15
8124 196,65,52,89,206, //vmulps %ymm14,%ymm9,%ymm9
8125 196,65,52,88,207, //vaddps %ymm15,%ymm9,%ymm9
8126 196,65,44,89,213, //vmulps %ymm13,%ymm10,%ymm10
8127 196,65,44,88,201, //vaddps %ymm9,%ymm10,%ymm9
8128 196,65,28,93,201, //vminps %ymm9,%ymm12,%ymm9
8129 196,98,125,24,82,88, //vbroadcastss 0x58(%rdx),%ymm10
8130 196,193,124,194,194,1, //vcmpltps %ymm10,%ymm0,%ymm0
8131 196,195,53,74,195,0, //vblendvps %ymm0,%ymm11,%ymm9,%ymm0
8132 197,124,82,201, //vrsqrtps %ymm1,%ymm9
8133 196,65,124,83,217, //vrcpps %ymm9,%ymm11
8134 196,65,124,82,201, //vrsqrtps %ymm9,%ymm9
8135 196,65,12,89,219, //vmulps %ymm11,%ymm14,%ymm11
8136 196,65,4,88,219, //vaddps %ymm11,%ymm15,%ymm11
8137 196,65,20,89,201, //vmulps %ymm9,%ymm13,%ymm9
8138 196,65,52,88,203, //vaddps %ymm11,%ymm9,%ymm9
8139 197,60,89,217, //vmulps %ymm1,%ymm8,%ymm11
8140 196,65,28,93,201, //vminps %ymm9,%ymm12,%ymm9
8141 196,193,116,194,202,1, //vcmpltps %ymm10,%ymm1,%ymm1
8142 196,195,53,74,203,16, //vblendvps %ymm1,%ymm11,%ymm9,%ymm1
8143 197,124,82,202, //vrsqrtps %ymm2,%ymm9
8144 196,65,124,83,217, //vrcpps %ymm9,%ymm11
8145 196,65,12,89,219, //vmulps %ymm11,%ymm14,%ymm11
8146 196,65,4,88,219, //vaddps %ymm11,%ymm15,%ymm11
8147 196,65,124,82,201, //vrsqrtps %ymm9,%ymm9
8148 196,65,20,89,201, //vmulps %ymm9,%ymm13,%ymm9
8149 196,65,52,88,203, //vaddps %ymm11,%ymm9,%ymm9
8150 196,65,28,93,201, //vminps %ymm9,%ymm12,%ymm9
8151 197,60,89,194, //vmulps %ymm2,%ymm8,%ymm8
8152 196,193,108,194,210,1, //vcmpltps %ymm10,%ymm2,%ymm2
8153 196,195,53,74,208,32, //vblendvps %ymm2,%ymm8,%ymm9,%ymm2
8154 72,173, //lods %ds:(%rsi),%rax
8155 255,224, //jmpq *%rax
8156};
8157
// sk_scale_1_float_avx: the first lods fetches a pointer from (%rsi); the
// float it points to is broadcast to ymm8 and ymm0..ymm3 are each multiplied
// by it. The second lods fetches the next jump target.
8158CODE const uint8_t sk_scale_1_float_avx[] = {
8159 72,173, //lods %ds:(%rsi),%rax
8160 196,98,125,24,0, //vbroadcastss (%rax),%ymm8
8161 197,188,89,192, //vmulps %ymm0,%ymm8,%ymm0
8162 197,188,89,201, //vmulps %ymm1,%ymm8,%ymm1
8163 197,188,89,210, //vmulps %ymm2,%ymm8,%ymm2
8164 197,188,89,219, //vmulps %ymm3,%ymm8,%ymm3
8165 72,173, //lods %ds:(%rsi),%rax
8166 255,224, //jmpq *%rax
8167};
8168
// sk_scale_u8_avx: multiplies ymm0..ymm3 by an 8-lane factor built from
// bytes in memory. The byte pointer is *(pointer fetched by lods) + %rdi.
// Each byte is zero-extended to 32 bits, converted to float and multiplied
// by the constant at 0xc(%rdx) (presumably 1/255 -- confirm).
//
// %rcx is copied to %r8 on entry and restored before the final jump because
// the tail loop needs %cl as its shift count.
//   %rcx == 0: fast path, loads 8 bytes at once with vmovsd.
//   %rcx != 0: tail loop below, reads exactly %rcx bytes one at a time.
8169CODE const uint8_t sk_scale_u8_avx[] = {
8170 73,137,200, //mov %rcx,%r8
8171 72,173, //lods %ds:(%rsi),%rax
8172 72,139,0, //mov (%rax),%rax
8173 72,1,248, //add %rdi,%rax
8174 77,133,192, //test %r8,%r8
8175 117,65, //jne 50f <_sk_scale_u8_avx+0x51>
8176 197,123,16,0, //vmovsd (%rax),%xmm8
// +0x14: the tail loop re-enters here with the gathered bytes already in xmm8.
8177 196,66,121,49,200, //vpmovzxbd %xmm8,%xmm9
8178 196,67,121,4,192,229, //vpermilps $0xe5,%xmm8,%xmm8
8179 196,66,121,49,192, //vpmovzxbd %xmm8,%xmm8
8180 196,67,53,24,192,1, //vinsertf128 $0x1,%xmm8,%ymm9,%ymm8
8181 196,65,124,91,192, //vcvtdq2ps %ymm8,%ymm8
8182 196,98,125,24,74,12, //vbroadcastss 0xc(%rdx),%ymm9
8183 196,65,60,89,193, //vmulps %ymm9,%ymm8,%ymm8
8184 197,188,89,192, //vmulps %ymm0,%ymm8,%ymm0
8185 197,188,89,201, //vmulps %ymm1,%ymm8,%ymm1
8186 197,188,89,210, //vmulps %ymm2,%ymm8,%ymm2
8187 197,188,89,219, //vmulps %ymm3,%ymm8,%ymm3
8188 72,173, //lods %ds:(%rsi),%rax
8189 76,137,193, //mov %r8,%rcx
8190 255,224, //jmpq *%rax
// +0x51: tail loop. %r10 counts down from the tail length; byte i is shifted
// left by 8*i and OR-ed into %r9, which is then moved into xmm8.
8191 49,201, //xor %ecx,%ecx
8192 77,137,194, //mov %r8,%r10
8193 69,49,201, //xor %r9d,%r9d
8194 68,15,182,24, //movzbl (%rax),%r11d
8195 72,255,192, //inc %rax
8196 73,211,227, //shl %cl,%r11
8197 77,9,217, //or %r11,%r9
8198 72,131,193,8, //add $0x8,%rcx
8199 73,255,202, //dec %r10
8200 117,234, //jne 517 <_sk_scale_u8_avx+0x59>
8201 196,65,249,110,193, //vmovq %r9,%xmm8
8202 235,158, //jmp 4d2 <_sk_scale_u8_avx+0x14>
8203};
8204
// sk_lerp_1_float_avx: the first lods fetches a pointer from (%rsi); the
// float t it points to is broadcast to ymm8. Then for i = 0..3:
//   ymm[i] = (ymm[i] - ymm[4+i]) * t + ymm[4+i]
// i.e. a linear interpolation from ymm4..ymm7 toward ymm0..ymm3 by t
// (ymm4..ymm7 presumably hold the destination colour -- confirm).
8205CODE const uint8_t sk_lerp_1_float_avx[] = {
8206 72,173, //lods %ds:(%rsi),%rax
8207 196,98,125,24,0, //vbroadcastss (%rax),%ymm8
8208 197,252,92,196, //vsubps %ymm4,%ymm0,%ymm0
8209 196,193,124,89,192, //vmulps %ymm8,%ymm0,%ymm0
8210 197,252,88,196, //vaddps %ymm4,%ymm0,%ymm0
8211 197,244,92,205, //vsubps %ymm5,%ymm1,%ymm1
8212 196,193,116,89,200, //vmulps %ymm8,%ymm1,%ymm1
8213 197,244,88,205, //vaddps %ymm5,%ymm1,%ymm1
8214 197,236,92,214, //vsubps %ymm6,%ymm2,%ymm2
8215 196,193,108,89,208, //vmulps %ymm8,%ymm2,%ymm2
8216 197,236,88,214, //vaddps %ymm6,%ymm2,%ymm2
8217 197,228,92,223, //vsubps %ymm7,%ymm3,%ymm3
8218 196,193,100,89,216, //vmulps %ymm8,%ymm3,%ymm3
8219 197,228,88,223, //vaddps %ymm7,%ymm3,%ymm3
8220 72,173, //lods %ds:(%rsi),%rax
8221 255,224, //jmpq *%rax
8222};
8223
// sk_lerp_u8_avx: like sk_lerp_1_float_avx, but the interpolation factor is
// per lane and comes from bytes in memory. The byte pointer is
// *(pointer fetched by lods) + %rdi. Each byte is zero-extended, converted to
// float and multiplied by the constant at 0xc(%rdx) to form t in ymm8, then
// for i = 0..3:  ymm[i] = (ymm[i] - ymm[4+i]) * t + ymm[4+i].
//
// %rcx is copied to %r8 on entry and restored before the final jump because
// the tail loop needs %cl as its shift count.
//   %rcx == 0: fast path, loads 8 bytes at once with vmovsd.
//   %rcx != 0: tail loop below, reads exactly %rcx bytes one at a time.
8224CODE const uint8_t sk_lerp_u8_avx[] = {
8225 73,137,200, //mov %rcx,%r8
8226 72,173, //lods %ds:(%rsi),%rax
8227 72,139,0, //mov (%rax),%rax
8228 72,1,248, //add %rdi,%rax
8229 77,133,192, //test %r8,%r8
8230 117,101, //jne 5e8 <_sk_lerp_u8_avx+0x75>
8231 197,123,16,0, //vmovsd (%rax),%xmm8
// +0x14: the tail loop re-enters here with the gathered bytes already in xmm8.
8232 196,66,121,49,200, //vpmovzxbd %xmm8,%xmm9
8233 196,67,121,4,192,229, //vpermilps $0xe5,%xmm8,%xmm8
8234 196,66,121,49,192, //vpmovzxbd %xmm8,%xmm8
8235 196,67,53,24,192,1, //vinsertf128 $0x1,%xmm8,%ymm9,%ymm8
8236 196,65,124,91,192, //vcvtdq2ps %ymm8,%ymm8
8237 196,98,125,24,74,12, //vbroadcastss 0xc(%rdx),%ymm9
8238 196,65,60,89,193, //vmulps %ymm9,%ymm8,%ymm8
8239 197,252,92,196, //vsubps %ymm4,%ymm0,%ymm0
8240 196,193,124,89,192, //vmulps %ymm8,%ymm0,%ymm0
8241 197,252,88,196, //vaddps %ymm4,%ymm0,%ymm0
8242 197,244,92,205, //vsubps %ymm5,%ymm1,%ymm1
8243 196,193,116,89,200, //vmulps %ymm8,%ymm1,%ymm1
8244 197,244,88,205, //vaddps %ymm5,%ymm1,%ymm1
8245 197,236,92,214, //vsubps %ymm6,%ymm2,%ymm2
8246 196,193,108,89,208, //vmulps %ymm8,%ymm2,%ymm2
8247 197,236,88,214, //vaddps %ymm6,%ymm2,%ymm2
8248 197,228,92,223, //vsubps %ymm7,%ymm3,%ymm3
8249 196,193,100,89,216, //vmulps %ymm8,%ymm3,%ymm3
8250 197,228,88,223, //vaddps %ymm7,%ymm3,%ymm3
8251 72,173, //lods %ds:(%rsi),%rax
8252 76,137,193, //mov %r8,%rcx
8253 255,224, //jmpq *%rax
// +0x75: tail loop. %r10 counts down from the tail length; byte i is shifted
// left by 8*i and OR-ed into %r9, which is then moved into xmm8.
8254 49,201, //xor %ecx,%ecx
8255 77,137,194, //mov %r8,%r10
8256 69,49,201, //xor %r9d,%r9d
8257 68,15,182,24, //movzbl (%rax),%r11d
8258 72,255,192, //inc %rax
8259 73,211,227, //shl %cl,%r11
8260 77,9,217, //or %r11,%r9
8261 72,131,193,8, //add $0x8,%rcx
8262 73,255,202, //dec %r10
8263 117,234, //jne 5f0 <_sk_lerp_u8_avx+0x7d>
8264 196,65,249,110,193, //vmovq %r9,%xmm8
8265 233,119,255,255,255, //jmpq 587 <_sk_lerp_u8_avx+0x14>
8266};
8267
// sk_lerp_565_avx: interpolates ymm0..ymm2 toward ymm4..ymm6 using three
// per-lane factors unpacked from 16-bit values in memory. The base pointer is
// loaded from *(pointer fetched by lods) and indexed as (%r10,%rdi,2).
//
// With K[off] meaning the 32-bit constant at off(%rdx), and px the
// zero-extended 16-bit value of each lane:
//   t0 = float(px & K[0x68]) * K[0x74]
//   t1 = float(px & K[0x6c]) * K[0x78]
//   t2 = float(px & K[0x70]) * K[0x7c]
//   ymm[i] = (ymm[i] - ymm[4+i]) * t[i] + ymm[4+i]   for i = 0..2
//   ymm3   = broadcast of the float at (%rdx)
// NOTE(review): the masks/scales presumably select the 5-6-5 colour fields;
// their values are not visible here.
//
// %rcx == 0 loads all 8 values with one vmovdqu. Otherwise the tail code
// dispatches through a jump table into a chain of vpinsrw loads so that only
// (%rcx & 7) values are read; the remaining lanes stay zero.
8268CODE const uint8_t sk_lerp_565_avx[] = {
8269 72,173, //lods %ds:(%rsi),%rax
8270 76,139,16, //mov (%rax),%r10
8271 72,133,201, //test %rcx,%rcx
8272 15,133,148,0,0,0, //jne 6b2 <_sk_lerp_565_avx+0xa2>
8273 196,65,122,111,4,122, //vmovdqu (%r10,%rdi,2),%xmm8
// +0x14: the tail path re-enters here with the loaded values in xmm8.
8274 197,225,239,219, //vpxor %xmm3,%xmm3,%xmm3
8275 197,185,105,219, //vpunpckhwd %xmm3,%xmm8,%xmm3
8276 196,66,121,51,192, //vpmovzxwd %xmm8,%xmm8
8277 196,227,61,24,219,1, //vinsertf128 $0x1,%xmm3,%ymm8,%ymm3
8278 196,98,125,24,66,104, //vbroadcastss 0x68(%rdx),%ymm8
8279 197,60,84,195, //vandps %ymm3,%ymm8,%ymm8
8280 196,65,124,91,192, //vcvtdq2ps %ymm8,%ymm8
8281 196,98,125,24,74,116, //vbroadcastss 0x74(%rdx),%ymm9
8282 196,65,52,89,192, //vmulps %ymm8,%ymm9,%ymm8
8283 196,98,125,24,74,108, //vbroadcastss 0x6c(%rdx),%ymm9
8284 197,52,84,203, //vandps %ymm3,%ymm9,%ymm9
8285 196,65,124,91,201, //vcvtdq2ps %ymm9,%ymm9
8286 196,98,125,24,82,120, //vbroadcastss 0x78(%rdx),%ymm10
8287 196,65,44,89,201, //vmulps %ymm9,%ymm10,%ymm9
8288 196,98,125,24,82,112, //vbroadcastss 0x70(%rdx),%ymm10
8289 197,172,84,219, //vandps %ymm3,%ymm10,%ymm3
8290 197,252,91,219, //vcvtdq2ps %ymm3,%ymm3
8291 196,98,125,24,82,124, //vbroadcastss 0x7c(%rdx),%ymm10
8292 197,172,89,219, //vmulps %ymm3,%ymm10,%ymm3
8293 197,252,92,196, //vsubps %ymm4,%ymm0,%ymm0
8294 196,193,124,89,192, //vmulps %ymm8,%ymm0,%ymm0
8295 197,252,88,196, //vaddps %ymm4,%ymm0,%ymm0
8296 197,244,92,205, //vsubps %ymm5,%ymm1,%ymm1
8297 196,193,116,89,201, //vmulps %ymm9,%ymm1,%ymm1
8298 197,244,88,205, //vaddps %ymm5,%ymm1,%ymm1
8299 197,236,92,214, //vsubps %ymm6,%ymm2,%ymm2
8300 197,236,89,211, //vmulps %ymm3,%ymm2,%ymm2
8301 197,236,88,214, //vaddps %ymm6,%ymm2,%ymm2
8302 196,226,125,24,26, //vbroadcastss (%rdx),%ymm3
8303 72,173, //lods %ds:(%rsi),%rax
8304 255,224, //jmpq *%rax
// +0xa2: tail path. %r8 = (%rcx & 7) - 1 indexes the jump table at +0x110;
// if that index is above 6 (i.e. %rcx & 7 == 0) nothing is loaded and the
// zeroed xmm8 is used.
8305 65,137,200, //mov %ecx,%r8d
8306 65,128,224,7, //and $0x7,%r8b
8307 196,65,57,239,192, //vpxor %xmm8,%xmm8,%xmm8
8308 65,254,200, //dec %r8b
8309 69,15,182,192, //movzbl %r8b,%r8d
8310 65,128,248,6, //cmp $0x6,%r8b
8311 15,135,85,255,255,255, //ja 624 <_sk_lerp_565_avx+0x14>
8312 76,141,13,74,0,0,0, //lea 0x4a(%rip),%r9 # 720 <_sk_lerp_565_avx+0x110>
8313 75,99,4,129, //movslq (%r9,%r8,4),%rax
8314 76,1,200, //add %r9,%rax
8315 255,224, //jmpq *%rax
// Fall-through chain: each table entry enters at a different vpinsrw so that
// exactly the requested number of 16-bit values is read, highest lane first.
8316 197,225,239,219, //vpxor %xmm3,%xmm3,%xmm3
8317 196,65,97,196,68,122,12,6, //vpinsrw $0x6,0xc(%r10,%rdi,2),%xmm3,%xmm8
8318 196,65,57,196,68,122,10,5, //vpinsrw $0x5,0xa(%r10,%rdi,2),%xmm8,%xmm8
8319 196,65,57,196,68,122,8,4, //vpinsrw $0x4,0x8(%r10,%rdi,2),%xmm8,%xmm8
8320 196,65,57,196,68,122,6,3, //vpinsrw $0x3,0x6(%r10,%rdi,2),%xmm8,%xmm8
8321 196,65,57,196,68,122,4,2, //vpinsrw $0x2,0x4(%r10,%rdi,2),%xmm8,%xmm8
8322 196,65,57,196,68,122,2,1, //vpinsrw $0x1,0x2(%r10,%rdi,2),%xmm8,%xmm8
8323 196,65,57,196,4,122,0, //vpinsrw $0x0,(%r10,%rdi,2),%xmm8,%xmm8
8324 233,5,255,255,255, //jmpq 624 <_sk_lerp_565_avx+0x14>
// One byte of padding, then the jump table at +0x110: seven little-endian
// signed 32-bit offsets relative to the table's own address. These bytes are
// data, not code; the mnemonics the disassembler printed for them are
// meaningless.
8325 144, //nop
8326 243,255, //repz (bad)
8327 255, //(bad)
8328 255, //(bad)
8329 235,255, //jmp 725 <_sk_lerp_565_avx+0x115>
8330 255, //(bad)
8331 255,227, //jmpq *%rbx
8332 255, //(bad)
8333 255, //(bad)
8334 255, //(bad)
8335 219,255, //(bad)
8336 255, //(bad)
8337 255,211, //callq *%rbx
8338 255, //(bad)
8339 255, //(bad)
8340 255,203, //dec %ebx
8341 255, //(bad)
8342 255, //(bad)
8343 255, //(bad)
8344 191, //.byte 0xbf
8345 255, //(bad)
8346 255, //(bad)
8347 255, //.byte 0xff
8348};
8349
// sk_load_tables_avx: loads eight 32-bit values from (%r8,%rdi,4), where %r8
// is read from offset 0 of the structure whose address lods fetched, and
// turns each into four floats:
//   ymm0[i] = table1[ px        & M]   table1 = pointer at 0x08 of that struct
//   ymm1[i] = table2[(px >>  8) & M]   table2 = pointer at 0x10
//   ymm2[i] = table3[(px >> 16) & M]   table3 = pointer at 0x18
//   ymm3[i] = float(px >> 24) * K      K = float at 0xc(%rdx)
// M is the 32-bit mask broadcast from 0x10(%rdx) (presumably 0xff -- confirm).
//
// The table lookups are done lane by lane: the indices are moved out to
// general-purpose registers, split into 32-bit halves, and the floats are
// reassembled with vmovss/vinsertps/vinsertf128. Shifts are done on each
// 128-bit half separately and recombined with vinsertf128.
//
// The stage uses %rbx, %rbp and %r12-%r15 as scratch, so it pushes them on
// entry and pops them before jumping to the next stage.
//
// %rcx == 0 loads all 8 values with one vmovups. Otherwise the tail code
// dispatches through a jump table so that only (%rcx & 7) values are read;
// the remaining lanes of ymm8 stay zero.
8350CODE const uint8_t sk_load_tables_avx[] = {
8351 85, //push %rbp
8352 65,87, //push %r15
8353 65,86, //push %r14
8354 65,85, //push %r13
8355 65,84, //push %r12
8356 83, //push %rbx
8357 72,173, //lods %ds:(%rsi),%rax
8358 76,139,0, //mov (%rax),%r8
8359 72,133,201, //test %rcx,%rcx
8360 15,133,18,2,0,0, //jne 966 <_sk_load_tables_avx+0x22a>
8361 196,65,124,16,4,184, //vmovups (%r8,%rdi,4),%ymm8
// +0x1e: the tail path re-enters here with the loaded values in ymm8.
// First lookup: indices are px & M, table pointer is at 0x8(%rax).
8362 196,98,125,24,74,16, //vbroadcastss 0x10(%rdx),%ymm9
8363 196,193,52,84,192, //vandps %ymm8,%ymm9,%ymm0
8364 196,193,249,126,193, //vmovq %xmm0,%r9
8365 69,137,203, //mov %r9d,%r11d
8366 196,195,249,22,194,1, //vpextrq $0x1,%xmm0,%r10
8367 69,137,214, //mov %r10d,%r14d
8368 73,193,234,32, //shr $0x20,%r10
8369 73,193,233,32, //shr $0x20,%r9
8370 196,227,125,25,192,1, //vextractf128 $0x1,%ymm0,%xmm0
8371 196,193,249,126,196, //vmovq %xmm0,%r12
8372 69,137,231, //mov %r12d,%r15d
8373 196,227,249,22,195,1, //vpextrq $0x1,%xmm0,%rbx
8374 65,137,221, //mov %ebx,%r13d
8375 72,193,235,32, //shr $0x20,%rbx
8376 73,193,236,32, //shr $0x20,%r12
8377 72,139,104,8, //mov 0x8(%rax),%rbp
8378 76,139,64,16, //mov 0x10(%rax),%r8
8379 196,161,122,16,68,189,0, //vmovss 0x0(%rbp,%r15,4),%xmm0
8380 196,163,121,33,68,165,0,16, //vinsertps $0x10,0x0(%rbp,%r12,4),%xmm0,%xmm0
8381 196,163,121,33,68,173,0,32, //vinsertps $0x20,0x0(%rbp,%r13,4),%xmm0,%xmm0
8382 197,250,16,76,157,0, //vmovss 0x0(%rbp,%rbx,4),%xmm1
8383 196,227,121,33,193,48, //vinsertps $0x30,%xmm1,%xmm0,%xmm0
8384 196,161,122,16,76,157,0, //vmovss 0x0(%rbp,%r11,4),%xmm1
8385 196,163,113,33,76,141,0,16, //vinsertps $0x10,0x0(%rbp,%r9,4),%xmm1,%xmm1
8386 196,163,113,33,76,181,0,32, //vinsertps $0x20,0x0(%rbp,%r14,4),%xmm1,%xmm1
8387 196,161,122,16,92,149,0, //vmovss 0x0(%rbp,%r10,4),%xmm3
8388 196,227,113,33,203,48, //vinsertps $0x30,%xmm3,%xmm1,%xmm1
8389 196,227,117,24,192,1, //vinsertf128 $0x1,%xmm0,%ymm1,%ymm0
// Second lookup: indices are (px >> 8) & M, table pointer is in %r8
// (loaded above from 0x10(%rax)).
8390 196,193,113,114,208,8, //vpsrld $0x8,%xmm8,%xmm1
8391 196,67,125,25,194,1, //vextractf128 $0x1,%ymm8,%xmm10
8392 196,193,105,114,210,8, //vpsrld $0x8,%xmm10,%xmm2
8393 196,227,117,24,202,1, //vinsertf128 $0x1,%xmm2,%ymm1,%ymm1
8394 197,180,84,201, //vandps %ymm1,%ymm9,%ymm1
8395 196,193,249,126,201, //vmovq %xmm1,%r9
8396 69,137,203, //mov %r9d,%r11d
8397 196,195,249,22,202,1, //vpextrq $0x1,%xmm1,%r10
8398 69,137,214, //mov %r10d,%r14d
8399 73,193,234,32, //shr $0x20,%r10
8400 73,193,233,32, //shr $0x20,%r9
8401 196,227,125,25,201,1, //vextractf128 $0x1,%ymm1,%xmm1
8402 196,225,249,126,205, //vmovq %xmm1,%rbp
8403 65,137,239, //mov %ebp,%r15d
8404 196,227,249,22,203,1, //vpextrq $0x1,%xmm1,%rbx
8405 65,137,220, //mov %ebx,%r12d
8406 72,193,235,32, //shr $0x20,%rbx
8407 72,193,237,32, //shr $0x20,%rbp
8408 196,129,122,16,12,184, //vmovss (%r8,%r15,4),%xmm1
8409 196,195,113,33,12,168,16, //vinsertps $0x10,(%r8,%rbp,4),%xmm1,%xmm1
8410 196,129,122,16,20,160, //vmovss (%r8,%r12,4),%xmm2
8411 196,227,113,33,202,32, //vinsertps $0x20,%xmm2,%xmm1,%xmm1
8412 196,193,122,16,20,152, //vmovss (%r8,%rbx,4),%xmm2
8413 196,227,113,33,202,48, //vinsertps $0x30,%xmm2,%xmm1,%xmm1
8414 196,129,122,16,20,152, //vmovss (%r8,%r11,4),%xmm2
8415 196,131,105,33,20,136,16, //vinsertps $0x10,(%r8,%r9,4),%xmm2,%xmm2
8416 196,129,122,16,28,176, //vmovss (%r8,%r14,4),%xmm3
8417 196,227,105,33,211,32, //vinsertps $0x20,%xmm3,%xmm2,%xmm2
8418 196,129,122,16,28,144, //vmovss (%r8,%r10,4),%xmm3
8419 196,227,105,33,211,48, //vinsertps $0x30,%xmm3,%xmm2,%xmm2
8420 196,227,109,24,201,1, //vinsertf128 $0x1,%xmm1,%ymm2,%ymm1
// Third lookup: indices are (px >> 16) & M, table pointer is at 0x18(%rax).
8421 72,139,64,24, //mov 0x18(%rax),%rax
8422 196,193,105,114,208,16, //vpsrld $0x10,%xmm8,%xmm2
8423 196,193,97,114,210,16, //vpsrld $0x10,%xmm10,%xmm3
8424 196,227,109,24,211,1, //vinsertf128 $0x1,%xmm3,%ymm2,%ymm2
8425 197,180,84,210, //vandps %ymm2,%ymm9,%ymm2
8426 196,193,249,126,208, //vmovq %xmm2,%r8
8427 69,137,194, //mov %r8d,%r10d
8428 196,195,249,22,209,1, //vpextrq $0x1,%xmm2,%r9
8429 69,137,203, //mov %r9d,%r11d
8430 73,193,233,32, //shr $0x20,%r9
8431 73,193,232,32, //shr $0x20,%r8
8432 196,227,125,25,210,1, //vextractf128 $0x1,%ymm2,%xmm2
8433 196,225,249,126,213, //vmovq %xmm2,%rbp
8434 65,137,238, //mov %ebp,%r14d
8435 196,227,249,22,211,1, //vpextrq $0x1,%xmm2,%rbx
8436 65,137,223, //mov %ebx,%r15d
8437 72,193,235,32, //shr $0x20,%rbx
8438 72,193,237,32, //shr $0x20,%rbp
8439 196,161,122,16,20,176, //vmovss (%rax,%r14,4),%xmm2
8440 196,227,105,33,20,168,16, //vinsertps $0x10,(%rax,%rbp,4),%xmm2,%xmm2
8441 196,161,122,16,28,184, //vmovss (%rax,%r15,4),%xmm3
8442 196,227,105,33,211,32, //vinsertps $0x20,%xmm3,%xmm2,%xmm2
8443 197,250,16,28,152, //vmovss (%rax,%rbx,4),%xmm3
8444 196,99,105,33,203,48, //vinsertps $0x30,%xmm3,%xmm2,%xmm9
8445 196,161,122,16,28,144, //vmovss (%rax,%r10,4),%xmm3
8446 196,163,97,33,28,128,16, //vinsertps $0x10,(%rax,%r8,4),%xmm3,%xmm3
8447 196,161,122,16,20,152, //vmovss (%rax,%r11,4),%xmm2
8448 196,227,97,33,210,32, //vinsertps $0x20,%xmm2,%xmm3,%xmm2
8449 196,161,122,16,28,136, //vmovss (%rax,%r9,4),%xmm3
8450 196,227,105,33,211,48, //vinsertps $0x30,%xmm3,%xmm2,%xmm2
8451 196,195,109,24,209,1, //vinsertf128 $0x1,%xmm9,%ymm2,%ymm2
// Fourth channel: px >> 24 converted to float and scaled, no table lookup.
8452 196,193,57,114,208,24, //vpsrld $0x18,%xmm8,%xmm8
8453 196,193,97,114,210,24, //vpsrld $0x18,%xmm10,%xmm3
8454 196,227,61,24,219,1, //vinsertf128 $0x1,%xmm3,%ymm8,%ymm3
8455 197,252,91,219, //vcvtdq2ps %ymm3,%ymm3
8456 196,98,125,24,66,12, //vbroadcastss 0xc(%rdx),%ymm8
8457 196,193,100,89,216, //vmulps %ymm8,%ymm3,%ymm3
8458 72,173, //lods %ds:(%rsi),%rax
8459 91, //pop %rbx
8460 65,92, //pop %r12
8461 65,93, //pop %r13
8462 65,94, //pop %r14
8463 65,95, //pop %r15
8464 93, //pop %rbp
8465 255,224, //jmpq *%rax
// +0x22a: tail path. %r9 = (%rcx & 7) - 1 indexes the jump table at +0x2d8;
// if that index is above 6 nothing is loaded and the zeroed ymm8 is used.
8466 65,137,201, //mov %ecx,%r9d
8467 65,128,225,7, //and $0x7,%r9b
8468 196,65,60,87,192, //vxorps %ymm8,%ymm8,%ymm8
8469 65,254,201, //dec %r9b
8470 69,15,182,201, //movzbl %r9b,%r9d
8471 65,128,249,6, //cmp $0x6,%r9b
8472 15,135,215,253,255,255, //ja 75a <_sk_load_tables_avx+0x1e>
8473 76,141,21,138,0,0,0, //lea 0x8a(%rip),%r10 # a14 <_sk_load_tables_avx+0x2d8>
8474 79,99,12,138, //movslq (%r10,%r9,4),%r9
8475 77,1,209, //add %r10,%r9
8476 65,255,225, //jmpq *%r9
// Fall-through chain: loads one 32-bit value at a time, from offset 0x18
// down to offset 0, inserting each into its lane of ymm8.
8477 196,193,121,110,68,184,24, //vmovd 0x18(%r8,%rdi,4),%xmm0
8478 197,249,112,192,68, //vpshufd $0x44,%xmm0,%xmm0
8479 196,227,125,24,192,1, //vinsertf128 $0x1,%xmm0,%ymm0,%ymm0
8480 197,244,87,201, //vxorps %ymm1,%ymm1,%ymm1
8481 196,99,117,12,192,64, //vblendps $0x40,%ymm0,%ymm1,%ymm8
8482 196,99,125,25,192,1, //vextractf128 $0x1,%ymm8,%xmm0
8483 196,195,121,34,68,184,20,1, //vpinsrd $0x1,0x14(%r8,%rdi,4),%xmm0,%xmm0
8484 196,99,61,24,192,1, //vinsertf128 $0x1,%xmm0,%ymm8,%ymm8
8485 196,99,125,25,192,1, //vextractf128 $0x1,%ymm8,%xmm0
8486 196,195,121,34,68,184,16,0, //vpinsrd $0x0,0x10(%r8,%rdi,4),%xmm0,%xmm0
8487 196,99,61,24,192,1, //vinsertf128 $0x1,%xmm0,%ymm8,%ymm8
8488 196,195,57,34,68,184,12,3, //vpinsrd $0x3,0xc(%r8,%rdi,4),%xmm8,%xmm0
8489 196,99,61,12,192,15, //vblendps $0xf,%ymm0,%ymm8,%ymm8
8490 196,195,57,34,68,184,8,2, //vpinsrd $0x2,0x8(%r8,%rdi,4),%xmm8,%xmm0
8491 196,99,61,12,192,15, //vblendps $0xf,%ymm0,%ymm8,%ymm8
8492 196,195,57,34,68,184,4,1, //vpinsrd $0x1,0x4(%r8,%rdi,4),%xmm8,%xmm0
8493 196,99,61,12,192,15, //vblendps $0xf,%ymm0,%ymm8,%ymm8
8494 196,195,57,34,4,184,0, //vpinsrd $0x0,(%r8,%rdi,4),%xmm8,%xmm0
8495 196,99,61,12,192,15, //vblendps $0xf,%ymm0,%ymm8,%ymm8
8496 233,70,253,255,255, //jmpq 75a <_sk_load_tables_avx+0x1e>
// Jump table at +0x2d8: seven little-endian signed 32-bit offsets relative
// to the table's own address. These bytes are data, not code; the mnemonics
// the disassembler printed for them are meaningless.
8497 238, //out %al,(%dx)
8498 255, //(bad)
8499 255, //(bad)
8500 255,224, //jmpq *%rax
8501 255, //(bad)
8502 255, //(bad)
8503 255,210, //callq *%rdx
8504 255, //(bad)
8505 255, //(bad)
8506 255,196, //inc %esp
8507 255, //(bad)
8508 255, //(bad)
8509 255,176,255,255,255,156, //pushq -0x63000001(%rax)
8510 255, //(bad)
8511 255, //(bad)
8512 255, //.byte 0xff
8513 128,255,255, //cmp $0xff,%bh
8514 255, //.byte 0xff
8515};
8516
// sk_load_a8_avx: loads bytes from *(pointer fetched by lods) + %rdi,
// zero-extends each to 32 bits, converts to float and multiplies by the
// constant at 0xc(%rdx); the result goes to ymm3. ymm0, ymm1 and ymm2 are
// set to zero.
//
// %rcx is copied to %r8 on entry and restored before the final jump because
// the tail loop needs %cl as its shift count.
//   %rcx == 0: fast path, loads 8 bytes at once with vmovsd.
//   %rcx != 0: tail loop below, reads exactly %rcx bytes one at a time.
8517CODE const uint8_t sk_load_a8_avx[] = {
8518 73,137,200, //mov %rcx,%r8
8519 72,173, //lods %ds:(%rsi),%rax
8520 72,139,0, //mov (%rax),%rax
8521 72,1,248, //add %rdi,%rax
8522 77,133,192, //test %r8,%r8
8523 117,59, //jne a7b <_sk_load_a8_avx+0x4b>
8524 197,251,16,0, //vmovsd (%rax),%xmm0
// +0x14: the tail loop re-enters here with the gathered bytes already in xmm0.
8525 196,226,121,49,200, //vpmovzxbd %xmm0,%xmm1
8526 196,227,121,4,192,229, //vpermilps $0xe5,%xmm0,%xmm0
8527 196,226,121,49,192, //vpmovzxbd %xmm0,%xmm0
8528 196,227,117,24,192,1, //vinsertf128 $0x1,%xmm0,%ymm1,%ymm0
8529 197,252,91,192, //vcvtdq2ps %ymm0,%ymm0
8530 196,226,125,24,74,12, //vbroadcastss 0xc(%rdx),%ymm1
8531 197,252,89,217, //vmulps %ymm1,%ymm0,%ymm3
8532 72,173, //lods %ds:(%rsi),%rax
8533 197,252,87,192, //vxorps %ymm0,%ymm0,%ymm0
8534 197,244,87,201, //vxorps %ymm1,%ymm1,%ymm1
8535 197,236,87,210, //vxorps %ymm2,%ymm2,%ymm2
8536 76,137,193, //mov %r8,%rcx
8537 255,224, //jmpq *%rax
// +0x4b: tail loop. %r10 counts down from the tail length; byte i is shifted
// left by 8*i and OR-ed into %r9, which is then moved into xmm0.
8538 49,201, //xor %ecx,%ecx
8539 77,137,194, //mov %r8,%r10
8540 69,49,201, //xor %r9d,%r9d
8541 68,15,182,24, //movzbl (%rax),%r11d
8542 72,255,192, //inc %rax
8543 73,211,227, //shl %cl,%r11
8544 77,9,217, //or %r11,%r9
8545 72,131,193,8, //add $0x8,%rcx
8546 73,255,202, //dec %r10
8547 117,234, //jne a83 <_sk_load_a8_avx+0x53>
8548 196,193,249,110,193, //vmovq %r9,%xmm0
8549 235,164, //jmp a44 <_sk_load_a8_avx+0x14>
8550};
8551
// sk_store_a8_avx: multiplies ymm3 by the constant at 0x8(%rdx), converts to
// 32-bit integers, packs them to 16 and then 8 bits with unsigned
// saturation, and stores the bytes at (%r9,%rdi,1), where %r9 is loaded from
// *(pointer fetched by lods). ymm0..ymm3 are left unchanged.
//
// %rcx == 0 stores all 8 bytes with one vmovsd. Otherwise the tail code
// widens the bytes to words (so byte i sits at even byte position 2*i) and
// dispatches through a jump table into a chain of vpextrb stores so that
// only (%rcx & 7) bytes are written.
8552CODE const uint8_t sk_store_a8_avx[] = {
8553 72,173, //lods %ds:(%rsi),%rax
8554 76,139,8, //mov (%rax),%r9
8555 196,98,125,24,66,8, //vbroadcastss 0x8(%rdx),%ymm8
8556 197,60,89,195, //vmulps %ymm3,%ymm8,%ymm8
8557 196,65,125,91,192, //vcvtps2dq %ymm8,%ymm8
8558 196,67,125,25,193,1, //vextractf128 $0x1,%ymm8,%xmm9
8559 196,66,57,43,193, //vpackusdw %xmm9,%xmm8,%xmm8
8560 196,65,57,103,192, //vpackuswb %xmm8,%xmm8,%xmm8
8561 72,133,201, //test %rcx,%rcx
8562 117,10, //jne ad3 <_sk_store_a8_avx+0x33>
8563 196,65,123,17,4,57, //vmovsd %xmm8,(%r9,%rdi,1)
// +0x2f: common exit; the tail path jumps back here after its stores.
8564 72,173, //lods %ds:(%rsi),%rax
8565 255,224, //jmpq *%rax
// +0x33: tail path. %r8 = (%rcx & 7) - 1 indexes the jump table at +0x94;
// if that index is above 6 nothing is stored.
8566 137,200, //mov %ecx,%eax
8567 36,7, //and $0x7,%al
8568 254,200, //dec %al
8569 68,15,182,192, //movzbl %al,%r8d
8570 65,128,248,6, //cmp $0x6,%r8b
8571 119,236, //ja acf <_sk_store_a8_avx+0x2f>
8572 196,66,121,48,192, //vpmovzxbw %xmm8,%xmm8
8573 76,141,21,69,0,0,0, //lea 0x45(%rip),%r10 # b34 <_sk_store_a8_avx+0x94>
8574 75,99,4,130, //movslq (%r10,%r8,4),%rax
8575 76,1,208, //add %r10,%rax
8576 255,224, //jmpq *%rax
// Fall-through chain: each table entry enters at a different vpextrb so that
// exactly the requested number of bytes is written, highest offset first.
8577 196,67,121,20,68,57,6,12, //vpextrb $0xc,%xmm8,0x6(%r9,%rdi,1)
8578 196,67,121,20,68,57,5,10, //vpextrb $0xa,%xmm8,0x5(%r9,%rdi,1)
8579 196,67,121,20,68,57,4,8, //vpextrb $0x8,%xmm8,0x4(%r9,%rdi,1)
8580 196,67,121,20,68,57,3,6, //vpextrb $0x6,%xmm8,0x3(%r9,%rdi,1)
8581 196,67,121,20,68,57,2,4, //vpextrb $0x4,%xmm8,0x2(%r9,%rdi,1)
8582 196,67,121,20,68,57,1,2, //vpextrb $0x2,%xmm8,0x1(%r9,%rdi,1)
8583 196,67,121,20,4,57,0, //vpextrb $0x0,%xmm8,(%r9,%rdi,1)
8584 235,158, //jmp acf <_sk_store_a8_avx+0x2f>
// Three bytes of padding (the nopl), then the jump table at +0x94: seven
// little-endian signed 32-bit offsets relative to the table's own address.
// The table bytes are data, not code; the mnemonics the disassembler printed
// for them are meaningless.
8585 15,31,0, //nopl (%rax)
8586 244, //hlt
8587 255, //(bad)
8588 255, //(bad)
8589 255, //(bad)
8590 236, //in (%dx),%al
8591 255, //(bad)
8592 255, //(bad)
8593 255,228, //jmpq *%rsp
8594 255, //(bad)
8595 255, //(bad)
8596 255, //(bad)
8597 220,255, //fdivr %st,%st(7)
8598 255, //(bad)
8599 255,212, //callq *%rsp
8600 255, //(bad)
8601 255, //(bad)
8602 255,204, //dec %esp
8603 255, //(bad)
8604 255, //(bad)
8605 255,196, //inc %esp
8606 255, //(bad)
8607 255, //(bad)
8608 255, //.byte 0xff
8609};
8610
// sk_load_565_avx: loads eight 16-bit values from (%r10,%rdi,2), where %r10
// is loaded from *(pointer fetched by lods), zero-extends them to 32 bits
// and unpacks three fields. With K[off] meaning the 32-bit constant at
// off(%rdx) and px the zero-extended value of each lane:
//   ymm0 = float(px & K[0x68]) * K[0x74]
//   ymm1 = float(px & K[0x6c]) * K[0x78]
//   ymm2 = float(px & K[0x70]) * K[0x7c]
//   ymm3 = broadcast of the float at (%rdx)
// NOTE(review): the masks/scales presumably select the 5-6-5 colour fields;
// their values are not visible here.
//
// %rcx == 0 loads all 8 values with one vmovdqu. Otherwise the tail code
// dispatches through a jump table into a chain of vpinsrw loads so that only
// (%rcx & 7) values are read; the remaining lanes stay zero.
8611CODE const uint8_t sk_load_565_avx[] = {
8612 72,173, //lods %ds:(%rsi),%rax
8613 76,139,16, //mov (%rax),%r10
8614 72,133,201, //test %rcx,%rcx
8615 117,106, //jne bc4 <_sk_load_565_avx+0x74>
8616 196,193,122,111,4,122, //vmovdqu (%r10,%rdi,2),%xmm0
// +0x10: the tail path re-enters here with the loaded values in xmm0.
8617 197,241,239,201, //vpxor %xmm1,%xmm1,%xmm1
8618 197,249,105,201, //vpunpckhwd %xmm1,%xmm0,%xmm1
8619 196,226,121,51,192, //vpmovzxwd %xmm0,%xmm0
8620 196,227,125,24,209,1, //vinsertf128 $0x1,%xmm1,%ymm0,%ymm2
8621 196,226,125,24,66,104, //vbroadcastss 0x68(%rdx),%ymm0
8622 197,252,84,194, //vandps %ymm2,%ymm0,%ymm0
8623 197,252,91,192, //vcvtdq2ps %ymm0,%ymm0
8624 196,226,125,24,74,116, //vbroadcastss 0x74(%rdx),%ymm1
8625 197,244,89,192, //vmulps %ymm0,%ymm1,%ymm0
8626 196,226,125,24,74,108, //vbroadcastss 0x6c(%rdx),%ymm1
8627 197,244,84,202, //vandps %ymm2,%ymm1,%ymm1
8628 197,252,91,201, //vcvtdq2ps %ymm1,%ymm1
8629 196,226,125,24,90,120, //vbroadcastss 0x78(%rdx),%ymm3
8630 197,228,89,201, //vmulps %ymm1,%ymm3,%ymm1
8631 196,226,125,24,90,112, //vbroadcastss 0x70(%rdx),%ymm3
8632 197,228,84,210, //vandps %ymm2,%ymm3,%ymm2
8633 197,252,91,210, //vcvtdq2ps %ymm2,%ymm2
8634 196,226,125,24,90,124, //vbroadcastss 0x7c(%rdx),%ymm3
8635 197,228,89,210, //vmulps %ymm2,%ymm3,%ymm2
8636 196,226,125,24,26, //vbroadcastss (%rdx),%ymm3
8637 72,173, //lods %ds:(%rsi),%rax
8638 255,224, //jmpq *%rax
// +0x74: tail path. %r8 = (%rcx & 7) - 1 indexes the jump table at +0xdc;
// if that index is above 6 nothing is loaded and the zeroed xmm0 is used.
8639 65,137,200, //mov %ecx,%r8d
8640 65,128,224,7, //and $0x7,%r8b
8641 197,249,239,192, //vpxor %xmm0,%xmm0,%xmm0
8642 65,254,200, //dec %r8b
8643 69,15,182,192, //movzbl %r8b,%r8d
8644 65,128,248,6, //cmp $0x6,%r8b
8645 119,132, //ja b60 <_sk_load_565_avx+0x10>
8646 76,141,13,73,0,0,0, //lea 0x49(%rip),%r9 # c2c <_sk_load_565_avx+0xdc>
8647 75,99,4,129, //movslq (%r9,%r8,4),%rax
8648 76,1,200, //add %r9,%rax
8649 255,224, //jmpq *%rax
// Fall-through chain: each table entry enters at a different vpinsrw so that
// exactly the requested number of 16-bit values is read, highest lane first.
8650 197,249,239,192, //vpxor %xmm0,%xmm0,%xmm0
8651 196,193,121,196,68,122,12,6, //vpinsrw $0x6,0xc(%r10,%rdi,2),%xmm0,%xmm0
8652 196,193,121,196,68,122,10,5, //vpinsrw $0x5,0xa(%r10,%rdi,2),%xmm0,%xmm0
8653 196,193,121,196,68,122,8,4, //vpinsrw $0x4,0x8(%r10,%rdi,2),%xmm0,%xmm0
8654 196,193,121,196,68,122,6,3, //vpinsrw $0x3,0x6(%r10,%rdi,2),%xmm0,%xmm0
8655 196,193,121,196,68,122,4,2, //vpinsrw $0x2,0x4(%r10,%rdi,2),%xmm0,%xmm0
8656 196,193,121,196,68,122,2,1, //vpinsrw $0x1,0x2(%r10,%rdi,2),%xmm0,%xmm0
8657 196,193,121,196,4,122,0, //vpinsrw $0x0,(%r10,%rdi,2),%xmm0,%xmm0
8658 233,52,255,255,255, //jmpq b60 <_sk_load_565_avx+0x10>
// Jump table at +0xdc: seven little-endian signed 32-bit offsets relative to
// the table's own address. These bytes are data, not code; the mnemonics the
// disassembler printed for them are meaningless.
8659 244, //hlt
8660 255, //(bad)
8661 255, //(bad)
8662 255, //(bad)
8663 236, //in (%dx),%al
8664 255, //(bad)
8665 255, //(bad)
8666 255,228, //jmpq *%rsp
8667 255, //(bad)
8668 255, //(bad)
8669 255, //(bad)
8670 220,255, //fdivr %st,%st(7)
8671 255, //(bad)
8672 255,212, //callq *%rsp
8673 255, //(bad)
8674 255, //(bad)
8675 255,204, //dec %esp
8676 255, //(bad)
8677 255, //(bad)
8678 255,192, //inc %eax
8679 255, //(bad)
8680 255, //(bad)
8681 255, //.byte 0xff
8682};
8683
// sk_store_565_avx: packs ymm0, ymm1 and ymm2 into eight 16-bit values and
// stores them at (%r9,%rdi,2), where %r9 is loaded from *(pointer fetched by
// lods). With K[off] meaning the float at off(%rdx):
//   px = (int(ymm0 * K[0x80]) << 11)
//      | (int(ymm1 * K[0x84]) <<  5)
//      |  int(ymm2 * K[0x80])
// The 32-bit results are narrowed to 16 bits with vpackusdw (unsigned
// saturation). Shifts are done on each 128-bit half separately and recombined
// with vinsertf128. ymm0..ymm3 are left unchanged.
//
// %rcx == 0 stores all 8 values with one vmovdqu. Otherwise the tail code
// dispatches through a jump table into a chain of 16-bit stores so that only
// (%rcx & 7) values are written.
8684CODE const uint8_t sk_store_565_avx[] = {
8685 72,173, //lods %ds:(%rsi),%rax
8686 76,139,8, //mov (%rax),%r9
8687 196,98,125,24,130,128,0,0,0, //vbroadcastss 0x80(%rdx),%ymm8
8688 197,60,89,200, //vmulps %ymm0,%ymm8,%ymm9
8689 196,65,125,91,201, //vcvtps2dq %ymm9,%ymm9
8690 196,193,41,114,241,11, //vpslld $0xb,%xmm9,%xmm10
8691 196,67,125,25,201,1, //vextractf128 $0x1,%ymm9,%xmm9
8692 196,193,49,114,241,11, //vpslld $0xb,%xmm9,%xmm9
8693 196,67,45,24,201,1, //vinsertf128 $0x1,%xmm9,%ymm10,%ymm9
8694 196,98,125,24,146,132,0,0,0, //vbroadcastss 0x84(%rdx),%ymm10
8695 197,44,89,209, //vmulps %ymm1,%ymm10,%ymm10
8696 196,65,125,91,210, //vcvtps2dq %ymm10,%ymm10
8697 196,193,33,114,242,5, //vpslld $0x5,%xmm10,%xmm11
8698 196,67,125,25,210,1, //vextractf128 $0x1,%ymm10,%xmm10
8699 196,193,41,114,242,5, //vpslld $0x5,%xmm10,%xmm10
8700 196,67,37,24,210,1, //vinsertf128 $0x1,%xmm10,%ymm11,%ymm10
8701 196,65,45,86,201, //vorpd %ymm9,%ymm10,%ymm9
8702 197,60,89,194, //vmulps %ymm2,%ymm8,%ymm8
8703 196,65,125,91,192, //vcvtps2dq %ymm8,%ymm8
8704 196,65,53,86,192, //vorpd %ymm8,%ymm9,%ymm8
8705 196,67,125,25,193,1, //vextractf128 $0x1,%ymm8,%xmm9
8706 196,66,57,43,193, //vpackusdw %xmm9,%xmm8,%xmm8
8707 72,133,201, //test %rcx,%rcx
8708 117,10, //jne cce <_sk_store_565_avx+0x86>
8709 196,65,122,127,4,121, //vmovdqu %xmm8,(%r9,%rdi,2)
// +0x82: common exit; the tail path jumps back here after its stores.
8710 72,173, //lods %ds:(%rsi),%rax
8711 255,224, //jmpq *%rax
// +0x86: tail path. %r8 = (%rcx & 7) - 1 indexes the jump table at +0xe4;
// if that index is above 6 nothing is stored.
8712 137,200, //mov %ecx,%eax
8713 36,7, //and $0x7,%al
8714 254,200, //dec %al
8715 68,15,182,192, //movzbl %al,%r8d
8716 65,128,248,6, //cmp $0x6,%r8b
8717 119,236, //ja cca <_sk_store_565_avx+0x82>
8718 76,141,21,71,0,0,0, //lea 0x47(%rip),%r10 # d2c <_sk_store_565_avx+0xe4>
8719 75,99,4,130, //movslq (%r10,%r8,4),%rax
8720 76,1,208, //add %r10,%rax
8721 255,224, //jmpq *%rax
// Fall-through chain: each table entry enters at a different store so that
// exactly the requested number of 16-bit values is written, highest first.
// Lane 0 is written via vmovd + a 16-bit mov instead of vpextrw.
8722 196,67,121,21,68,121,12,6, //vpextrw $0x6,%xmm8,0xc(%r9,%rdi,2)
8723 196,67,121,21,68,121,10,5, //vpextrw $0x5,%xmm8,0xa(%r9,%rdi,2)
8724 196,67,121,21,68,121,8,4, //vpextrw $0x4,%xmm8,0x8(%r9,%rdi,2)
8725 196,67,121,21,68,121,6,3, //vpextrw $0x3,%xmm8,0x6(%r9,%rdi,2)
8726 196,67,121,21,68,121,4,2, //vpextrw $0x2,%xmm8,0x4(%r9,%rdi,2)
8727 196,67,121,21,68,121,2,1, //vpextrw $0x1,%xmm8,0x2(%r9,%rdi,2)
8728 197,121,126,192, //vmovd %xmm8,%eax
8729 102,65,137,4,121, //mov %ax,(%r9,%rdi,2)
8730 235,161, //jmp cca <_sk_store_565_avx+0x82>
// Three bytes of padding (the nopl), then the jump table at +0xe4: seven
// little-endian signed 32-bit offsets relative to the table's own address.
// The table bytes are data, not code; the mnemonics the disassembler printed
// for them are meaningless.
8731 15,31,0, //nopl (%rax)
8732 242,255, //repnz (bad)
8733 255, //(bad)
8734 255, //(bad)
8735 234, //(bad)
8736 255, //(bad)
8737 255, //(bad)
8738 255,226, //jmpq *%rdx
8739 255, //(bad)
8740 255, //(bad)
8741 255, //(bad)
8742 218,255, //(bad)
8743 255, //(bad)
8744 255,210, //callq *%rdx
8745 255, //(bad)
8746 255, //(bad)
8747 255,202, //dec %edx
8748 255, //(bad)
8749 255, //(bad)
8750 255,194, //inc %edx
8751 255, //(bad)
8752 255, //(bad)
8753 255, //.byte 0xff
8754};
8755
// sk_load_8888_avx: loads eight 32-bit values from (%r10,%rdi,4), where %r10
// is loaded from *(pointer fetched by lods), and unpacks four 8-bit fields
// into floats. With M the 32-bit mask broadcast from 0x10(%rdx) and K the
// float at 0xc(%rdx):
//   ymm0 = float( px        & M) * K
//   ymm1 = float((px >>  8) & M) * K
//   ymm2 = float((px >> 16) & M) * K
//   ymm3 = float( px >> 24     ) * K
// Shifts are done on each 128-bit half separately and recombined with
// vinsertf128.
//
// %rcx == 0 loads all 8 values with one vmovups. Otherwise the tail code
// dispatches through a jump table so that only (%rcx & 7) values are read;
// the remaining lanes of ymm9 stay zero.
8756CODE const uint8_t sk_load_8888_avx[] = {
8757 72,173, //lods %ds:(%rsi),%rax
8758 76,139,16, //mov (%rax),%r10
8759 72,133,201, //test %rcx,%rcx
8760 117,125, //jne dcf <_sk_load_8888_avx+0x87>
8761 196,65,124,16,12,186, //vmovups (%r10,%rdi,4),%ymm9
// +0x10: the tail path re-enters here with the loaded values in ymm9.
8762 196,98,125,24,90,16, //vbroadcastss 0x10(%rdx),%ymm11
8763 196,193,36,84,193, //vandps %ymm9,%ymm11,%ymm0
8764 197,252,91,192, //vcvtdq2ps %ymm0,%ymm0
8765 196,98,125,24,66,12, //vbroadcastss 0xc(%rdx),%ymm8
8766 197,188,89,192, //vmulps %ymm0,%ymm8,%ymm0
8767 196,193,41,114,209,8, //vpsrld $0x8,%xmm9,%xmm10
8768 196,99,125,25,203,1, //vextractf128 $0x1,%ymm9,%xmm3
8769 197,241,114,211,8, //vpsrld $0x8,%xmm3,%xmm1
8770 196,227,45,24,201,1, //vinsertf128 $0x1,%xmm1,%ymm10,%ymm1
8771 197,164,84,201, //vandps %ymm1,%ymm11,%ymm1
8772 197,252,91,201, //vcvtdq2ps %ymm1,%ymm1
8773 197,188,89,201, //vmulps %ymm1,%ymm8,%ymm1
8774 196,193,41,114,209,16, //vpsrld $0x10,%xmm9,%xmm10
8775 197,233,114,211,16, //vpsrld $0x10,%xmm3,%xmm2
8776 196,227,45,24,210,1, //vinsertf128 $0x1,%xmm2,%ymm10,%ymm2
8777 197,164,84,210, //vandps %ymm2,%ymm11,%ymm2
8778 197,252,91,210, //vcvtdq2ps %ymm2,%ymm2
8779 197,188,89,210, //vmulps %ymm2,%ymm8,%ymm2
8780 196,193,49,114,209,24, //vpsrld $0x18,%xmm9,%xmm9
8781 197,225,114,211,24, //vpsrld $0x18,%xmm3,%xmm3
8782 196,227,53,24,219,1, //vinsertf128 $0x1,%xmm3,%ymm9,%ymm3
8783 197,252,91,219, //vcvtdq2ps %ymm3,%ymm3
8784 196,193,100,89,216, //vmulps %ymm8,%ymm3,%ymm3
8785 72,173, //lods %ds:(%rsi),%rax
8786 255,224, //jmpq *%rax
// +0x87: tail path. %r8 = (%rcx & 7) - 1 indexes the jump table at +0x134;
// if that index is above 6 nothing is loaded and the zeroed ymm9 is used.
8787 65,137,200, //mov %ecx,%r8d
8788 65,128,224,7, //and $0x7,%r8b
8789 196,65,52,87,201, //vxorps %ymm9,%ymm9,%ymm9
8790 65,254,200, //dec %r8b
8791 69,15,182,192, //movzbl %r8b,%r8d
8792 65,128,248,6, //cmp $0x6,%r8b
8793 15,135,108,255,255,255, //ja d58 <_sk_load_8888_avx+0x10>
8794 76,141,13,137,0,0,0, //lea 0x89(%rip),%r9 # e7c <_sk_load_8888_avx+0x134>
8795 75,99,4,129, //movslq (%r9,%r8,4),%rax
8796 76,1,200, //add %r9,%rax
8797 255,224, //jmpq *%rax
// Fall-through chain: loads one 32-bit value at a time, from offset 0x18
// down to offset 0, inserting each into its lane of ymm9.
8798 196,193,121,110,68,186,24, //vmovd 0x18(%r10,%rdi,4),%xmm0
8799 197,249,112,192,68, //vpshufd $0x44,%xmm0,%xmm0
8800 196,227,125,24,192,1, //vinsertf128 $0x1,%xmm0,%ymm0,%ymm0
8801 197,244,87,201, //vxorps %ymm1,%ymm1,%ymm1
8802 196,99,117,12,200,64, //vblendps $0x40,%ymm0,%ymm1,%ymm9
8803 196,99,125,25,200,1, //vextractf128 $0x1,%ymm9,%xmm0
8804 196,195,121,34,68,186,20,1, //vpinsrd $0x1,0x14(%r10,%rdi,4),%xmm0,%xmm0
8805 196,99,53,24,200,1, //vinsertf128 $0x1,%xmm0,%ymm9,%ymm9
8806 196,99,125,25,200,1, //vextractf128 $0x1,%ymm9,%xmm0
8807 196,195,121,34,68,186,16,0, //vpinsrd $0x0,0x10(%r10,%rdi,4),%xmm0,%xmm0
8808 196,99,53,24,200,1, //vinsertf128 $0x1,%xmm0,%ymm9,%ymm9
8809 196,195,49,34,68,186,12,3, //vpinsrd $0x3,0xc(%r10,%rdi,4),%xmm9,%xmm0
8810 196,99,53,12,200,15, //vblendps $0xf,%ymm0,%ymm9,%ymm9
8811 196,195,49,34,68,186,8,2, //vpinsrd $0x2,0x8(%r10,%rdi,4),%xmm9,%xmm0
8812 196,99,53,12,200,15, //vblendps $0xf,%ymm0,%ymm9,%ymm9
8813 196,195,49,34,68,186,4,1, //vpinsrd $0x1,0x4(%r10,%rdi,4),%xmm9,%xmm0
8814 196,99,53,12,200,15, //vblendps $0xf,%ymm0,%ymm9,%ymm9
8815 196,195,49,34,4,186,0, //vpinsrd $0x0,(%r10,%rdi,4),%xmm9,%xmm0
8816 196,99,53,12,200,15, //vblendps $0xf,%ymm0,%ymm9,%ymm9
8817 233,220,254,255,255, //jmpq d58 <_sk_load_8888_avx+0x10>
// Jump table at +0x134: seven little-endian signed 32-bit offsets relative
// to the table's own address. These bytes are data, not code; the mnemonics
// the disassembler printed for them are meaningless.
8818 238, //out %al,(%dx)
8819 255, //(bad)
8820 255, //(bad)
8821 255,224, //jmpq *%rax
8822 255, //(bad)
8823 255, //(bad)
8824 255,210, //callq *%rdx
8825 255, //(bad)
8826 255, //(bad)
8827 255,196, //inc %esp
8828 255, //(bad)
8829 255, //(bad)
8830 255,176,255,255,255,156, //pushq -0x63000001(%rax)
8831 255, //(bad)
8832 255, //(bad)
8833 255, //.byte 0xff
8834 128,255,255, //cmp $0xff,%bh
8835 255, //.byte 0xff
8836};
8837
8838CODE const uint8_t sk_store_8888_avx[] = {
8839 72,173, //lods %ds:(%rsi),%rax
8840 76,139,8, //mov (%rax),%r9
8841 196,98,125,24,66,8, //vbroadcastss 0x8(%rdx),%ymm8
8842 197,60,89,200, //vmulps %ymm0,%ymm8,%ymm9
8843 196,65,125,91,201, //vcvtps2dq %ymm9,%ymm9
8844 197,60,89,209, //vmulps %ymm1,%ymm8,%ymm10
8845 196,65,125,91,210, //vcvtps2dq %ymm10,%ymm10
8846 196,193,33,114,242,8, //vpslld $0x8,%xmm10,%xmm11
8847 196,67,125,25,210,1, //vextractf128 $0x1,%ymm10,%xmm10
8848 196,193,41,114,242,8, //vpslld $0x8,%xmm10,%xmm10
8849 196,67,37,24,210,1, //vinsertf128 $0x1,%xmm10,%ymm11,%ymm10
8850 196,65,45,86,201, //vorpd %ymm9,%ymm10,%ymm9
8851 197,60,89,210, //vmulps %ymm2,%ymm8,%ymm10
8852 196,65,125,91,210, //vcvtps2dq %ymm10,%ymm10
8853 196,193,33,114,242,16, //vpslld $0x10,%xmm10,%xmm11
8854 196,67,125,25,210,1, //vextractf128 $0x1,%ymm10,%xmm10
8855 196,193,41,114,242,16, //vpslld $0x10,%xmm10,%xmm10
8856 196,67,37,24,210,1, //vinsertf128 $0x1,%xmm10,%ymm11,%ymm10
8857 197,60,89,195, //vmulps %ymm3,%ymm8,%ymm8
8858 196,65,125,91,192, //vcvtps2dq %ymm8,%ymm8
8859 196,193,33,114,240,24, //vpslld $0x18,%xmm8,%xmm11
8860 196,67,125,25,192,1, //vextractf128 $0x1,%ymm8,%xmm8
8861 196,193,57,114,240,24, //vpslld $0x18,%xmm8,%xmm8
8862 196,67,37,24,192,1, //vinsertf128 $0x1,%xmm8,%ymm11,%ymm8
8863 196,65,45,86,192, //vorpd %ymm8,%ymm10,%ymm8
8864 196,65,53,86,192, //vorpd %ymm8,%ymm9,%ymm8
8865 72,133,201, //test %rcx,%rcx
8866 117,10, //jne f2d <_sk_store_8888_avx+0x95>
8867 196,65,124,17,4,185, //vmovups %ymm8,(%r9,%rdi,4)
8868 72,173, //lods %ds:(%rsi),%rax
8869 255,224, //jmpq *%rax
8870 137,200, //mov %ecx,%eax
8871 36,7, //and $0x7,%al
8872 254,200, //dec %al
8873 68,15,182,192, //movzbl %al,%r8d
8874 65,128,248,6, //cmp $0x6,%r8b
8875 119,236, //ja f29 <_sk_store_8888_avx+0x91>
8876 76,141,21,84,0,0,0, //lea 0x54(%rip),%r10 # f98 <_sk_store_8888_avx+0x100>
8877 75,99,4,130, //movslq (%r10,%r8,4),%rax
8878 76,1,208, //add %r10,%rax
8879 255,224, //jmpq *%rax
8880 196,67,125,25,193,1, //vextractf128 $0x1,%ymm8,%xmm9
8881 196,67,121,22,76,185,24,2, //vpextrd $0x2,%xmm9,0x18(%r9,%rdi,4)
8882 196,67,125,25,193,1, //vextractf128 $0x1,%ymm8,%xmm9
8883 196,67,121,22,76,185,20,1, //vpextrd $0x1,%xmm9,0x14(%r9,%rdi,4)
8884 196,67,125,25,193,1, //vextractf128 $0x1,%ymm8,%xmm9
8885 196,65,121,126,76,185,16, //vmovd %xmm9,0x10(%r9,%rdi,4)
8886 196,67,121,22,68,185,12,3, //vpextrd $0x3,%xmm8,0xc(%r9,%rdi,4)
8887 196,67,121,22,68,185,8,2, //vpextrd $0x2,%xmm8,0x8(%r9,%rdi,4)
8888 196,67,121,22,68,185,4,1, //vpextrd $0x1,%xmm8,0x4(%r9,%rdi,4)
8889 196,65,121,126,4,185, //vmovd %xmm8,(%r9,%rdi,4)
8890 235,147, //jmp f29 <_sk_store_8888_avx+0x91>
8891 102,144, //xchg %ax,%ax
8892 246,255, //idiv %bh
8893 255, //(bad)
8894 255, //(bad)
8895 238, //out %al,(%dx)
8896 255, //(bad)
8897 255, //(bad)
8898 255,230, //jmpq *%rsi
8899 255, //(bad)
8900 255, //(bad)
8901 255, //(bad)
8902 222,255, //fdivrp %st,%st(7)
8903 255, //(bad)
8904 255,209, //callq *%rcx
8905 255, //(bad)
8906 255, //(bad)
8907 255,195, //inc %ebx
8908 255, //(bad)
8909 255, //(bad)
8910 255, //.byte 0xff
8911 181,255, //mov $0xff,%ch
8912 255, //(bad)
8913 255, //.byte 0xff
8914};
8915
8916CODE const uint8_t sk_load_f16_avx[] = {
8917 72,173, //lods %ds:(%rsi),%rax
8918 72,139,0, //mov (%rax),%rax
8919 72,133,201, //test %rcx,%rcx
8920 15,133,240,0,0,0, //jne 10b2 <_sk_load_f16_avx+0xfe>
8921 197,249,16,12,248, //vmovupd (%rax,%rdi,8),%xmm1
8922 197,249,16,84,248,16, //vmovupd 0x10(%rax,%rdi,8),%xmm2
8923 197,249,16,92,248,32, //vmovupd 0x20(%rax,%rdi,8),%xmm3
8924 197,121,16,68,248,48, //vmovupd 0x30(%rax,%rdi,8),%xmm8
8925 197,241,97,194, //vpunpcklwd %xmm2,%xmm1,%xmm0
8926 197,241,105,202, //vpunpckhwd %xmm2,%xmm1,%xmm1
8927 196,193,97,97,208, //vpunpcklwd %xmm8,%xmm3,%xmm2
8928 196,193,97,105,216, //vpunpckhwd %xmm8,%xmm3,%xmm3
8929 197,121,97,193, //vpunpcklwd %xmm1,%xmm0,%xmm8
8930 197,249,105,193, //vpunpckhwd %xmm1,%xmm0,%xmm0
8931 197,233,97,203, //vpunpcklwd %xmm3,%xmm2,%xmm1
8932 197,105,105,203, //vpunpckhwd %xmm3,%xmm2,%xmm9
8933 197,249,110,90,100, //vmovd 0x64(%rdx),%xmm3
8934 197,249,112,219,0, //vpshufd $0x0,%xmm3,%xmm3
8935 196,193,97,101,208, //vpcmpgtw %xmm8,%xmm3,%xmm2
8936 196,65,105,223,192, //vpandn %xmm8,%xmm2,%xmm8
8937 197,225,101,208, //vpcmpgtw %xmm0,%xmm3,%xmm2
8938 197,233,223,192, //vpandn %xmm0,%xmm2,%xmm0
8939 197,225,101,209, //vpcmpgtw %xmm1,%xmm3,%xmm2
8940 197,233,223,201, //vpandn %xmm1,%xmm2,%xmm1
8941 196,193,97,101,209, //vpcmpgtw %xmm9,%xmm3,%xmm2
8942 196,193,105,223,209, //vpandn %xmm9,%xmm2,%xmm2
8943 196,66,121,51,208, //vpmovzxwd %xmm8,%xmm10
8944 196,98,121,51,201, //vpmovzxwd %xmm1,%xmm9
8945 197,225,239,219, //vpxor %xmm3,%xmm3,%xmm3
8946 197,57,105,195, //vpunpckhwd %xmm3,%xmm8,%xmm8
8947 197,241,105,203, //vpunpckhwd %xmm3,%xmm1,%xmm1
8948 196,98,121,51,216, //vpmovzxwd %xmm0,%xmm11
8949 196,98,121,51,226, //vpmovzxwd %xmm2,%xmm12
8950 197,121,105,235, //vpunpckhwd %xmm3,%xmm0,%xmm13
8951 197,105,105,243, //vpunpckhwd %xmm3,%xmm2,%xmm14
8952 196,193,121,114,242,13, //vpslld $0xd,%xmm10,%xmm0
8953 196,193,105,114,241,13, //vpslld $0xd,%xmm9,%xmm2
8954 196,227,125,24,194,1, //vinsertf128 $0x1,%xmm2,%ymm0,%ymm0
8955 196,98,125,24,74,92, //vbroadcastss 0x5c(%rdx),%ymm9
8956 197,180,89,192, //vmulps %ymm0,%ymm9,%ymm0
8957 196,193,105,114,240,13, //vpslld $0xd,%xmm8,%xmm2
8958 197,241,114,241,13, //vpslld $0xd,%xmm1,%xmm1
8959 196,227,109,24,201,1, //vinsertf128 $0x1,%xmm1,%ymm2,%ymm1
8960 197,180,89,201, //vmulps %ymm1,%ymm9,%ymm1
8961 196,193,105,114,243,13, //vpslld $0xd,%xmm11,%xmm2
8962 196,193,97,114,244,13, //vpslld $0xd,%xmm12,%xmm3
8963 196,227,109,24,211,1, //vinsertf128 $0x1,%xmm3,%ymm2,%ymm2
8964 197,180,89,210, //vmulps %ymm2,%ymm9,%ymm2
8965 196,193,57,114,245,13, //vpslld $0xd,%xmm13,%xmm8
8966 196,193,97,114,246,13, //vpslld $0xd,%xmm14,%xmm3
8967 196,227,61,24,219,1, //vinsertf128 $0x1,%xmm3,%ymm8,%ymm3
8968 197,180,89,219, //vmulps %ymm3,%ymm9,%ymm3
8969 72,173, //lods %ds:(%rsi),%rax
8970 255,224, //jmpq *%rax
8971 197,251,16,12,248, //vmovsd (%rax,%rdi,8),%xmm1
8972 196,65,57,87,192, //vxorpd %xmm8,%xmm8,%xmm8
8973 72,131,249,1, //cmp $0x1,%rcx
8974 117,6, //jne 10c8 <_sk_load_f16_avx+0x114>
8975 197,250,126,201, //vmovq %xmm1,%xmm1
8976 235,30, //jmp 10e6 <_sk_load_f16_avx+0x132>
8977 197,241,22,76,248,8, //vmovhpd 0x8(%rax,%rdi,8),%xmm1,%xmm1
8978 72,131,249,3, //cmp $0x3,%rcx
8979 114,18, //jb 10e6 <_sk_load_f16_avx+0x132>
8980 197,251,16,84,248,16, //vmovsd 0x10(%rax,%rdi,8),%xmm2
8981 72,131,249,3, //cmp $0x3,%rcx
8982 117,19, //jne 10f3 <_sk_load_f16_avx+0x13f>
8983 197,250,126,210, //vmovq %xmm2,%xmm2
8984 235,46, //jmp 1114 <_sk_load_f16_avx+0x160>
8985 197,225,87,219, //vxorpd %xmm3,%xmm3,%xmm3
8986 197,233,87,210, //vxorpd %xmm2,%xmm2,%xmm2
8987 233,230,254,255,255, //jmpq fd9 <_sk_load_f16_avx+0x25>
8988 197,233,22,84,248,24, //vmovhpd 0x18(%rax,%rdi,8),%xmm2,%xmm2
8989 72,131,249,5, //cmp $0x5,%rcx
8990 114,21, //jb 1114 <_sk_load_f16_avx+0x160>
8991 197,251,16,92,248,32, //vmovsd 0x20(%rax,%rdi,8),%xmm3
8992 72,131,249,5, //cmp $0x5,%rcx
8993 117,18, //jne 111d <_sk_load_f16_avx+0x169>
8994 197,250,126,219, //vmovq %xmm3,%xmm3
8995 233,197,254,255,255, //jmpq fd9 <_sk_load_f16_avx+0x25>
8996 197,225,87,219, //vxorpd %xmm3,%xmm3,%xmm3
8997 233,188,254,255,255, //jmpq fd9 <_sk_load_f16_avx+0x25>
8998 197,225,22,92,248,40, //vmovhpd 0x28(%rax,%rdi,8),%xmm3,%xmm3
8999 72,131,249,7, //cmp $0x7,%rcx
9000 15,130,172,254,255,255, //jb fd9 <_sk_load_f16_avx+0x25>
9001 197,123,16,68,248,48, //vmovsd 0x30(%rax,%rdi,8),%xmm8
9002 233,161,254,255,255, //jmpq fd9 <_sk_load_f16_avx+0x25>
9003};
9004
9005CODE const uint8_t sk_store_f16_avx[] = {
9006 72,173, //lods %ds:(%rsi),%rax
9007 72,139,0, //mov (%rax),%rax
9008 196,98,125,24,66,96, //vbroadcastss 0x60(%rdx),%ymm8
9009 197,60,89,200, //vmulps %ymm0,%ymm8,%ymm9
9010 196,67,125,25,202,1, //vextractf128 $0x1,%ymm9,%xmm10
9011 196,193,41,114,210,13, //vpsrld $0xd,%xmm10,%xmm10
9012 196,193,49,114,209,13, //vpsrld $0xd,%xmm9,%xmm9
9013 197,60,89,217, //vmulps %ymm1,%ymm8,%ymm11
9014 196,67,125,25,220,1, //vextractf128 $0x1,%ymm11,%xmm12
9015 196,193,25,114,212,13, //vpsrld $0xd,%xmm12,%xmm12
9016 196,193,33,114,211,13, //vpsrld $0xd,%xmm11,%xmm11
9017 197,60,89,234, //vmulps %ymm2,%ymm8,%ymm13
9018 196,67,125,25,238,1, //vextractf128 $0x1,%ymm13,%xmm14
9019 196,193,9,114,214,13, //vpsrld $0xd,%xmm14,%xmm14
9020 196,193,17,114,213,13, //vpsrld $0xd,%xmm13,%xmm13
9021 197,60,89,195, //vmulps %ymm3,%ymm8,%ymm8
9022 196,67,125,25,199,1, //vextractf128 $0x1,%ymm8,%xmm15
9023 196,193,1,114,215,13, //vpsrld $0xd,%xmm15,%xmm15
9024 196,193,57,114,208,13, //vpsrld $0xd,%xmm8,%xmm8
9025 196,193,33,115,251,2, //vpslldq $0x2,%xmm11,%xmm11
9026 196,65,33,235,201, //vpor %xmm9,%xmm11,%xmm9
9027 196,193,33,115,252,2, //vpslldq $0x2,%xmm12,%xmm11
9028 196,65,33,235,226, //vpor %xmm10,%xmm11,%xmm12
9029 196,193,57,115,248,2, //vpslldq $0x2,%xmm8,%xmm8
9030 196,65,57,235,197, //vpor %xmm13,%xmm8,%xmm8
9031 196,193,41,115,255,2, //vpslldq $0x2,%xmm15,%xmm10
9032 196,65,41,235,238, //vpor %xmm14,%xmm10,%xmm13
9033 196,65,49,98,216, //vpunpckldq %xmm8,%xmm9,%xmm11
9034 196,65,49,106,208, //vpunpckhdq %xmm8,%xmm9,%xmm10
9035 196,65,25,98,205, //vpunpckldq %xmm13,%xmm12,%xmm9
9036 196,65,25,106,197, //vpunpckhdq %xmm13,%xmm12,%xmm8
9037 72,133,201, //test %rcx,%rcx
9038 117,27, //jne 11fb <_sk_store_f16_avx+0xc3>
9039 197,120,17,28,248, //vmovups %xmm11,(%rax,%rdi,8)
9040 197,120,17,84,248,16, //vmovups %xmm10,0x10(%rax,%rdi,8)
9041 197,120,17,76,248,32, //vmovups %xmm9,0x20(%rax,%rdi,8)
9042 197,122,127,68,248,48, //vmovdqu %xmm8,0x30(%rax,%rdi,8)
9043 72,173, //lods %ds:(%rsi),%rax
9044 255,224, //jmpq *%rax
9045 197,121,214,28,248, //vmovq %xmm11,(%rax,%rdi,8)
9046 72,131,249,1, //cmp $0x1,%rcx
9047 116,241, //je 11f7 <_sk_store_f16_avx+0xbf>
9048 197,121,23,92,248,8, //vmovhpd %xmm11,0x8(%rax,%rdi,8)
9049 72,131,249,3, //cmp $0x3,%rcx
9050 114,229, //jb 11f7 <_sk_store_f16_avx+0xbf>
9051 197,121,214,84,248,16, //vmovq %xmm10,0x10(%rax,%rdi,8)
9052 116,221, //je 11f7 <_sk_store_f16_avx+0xbf>
9053 197,121,23,84,248,24, //vmovhpd %xmm10,0x18(%rax,%rdi,8)
9054 72,131,249,5, //cmp $0x5,%rcx
9055 114,209, //jb 11f7 <_sk_store_f16_avx+0xbf>
9056 197,121,214,76,248,32, //vmovq %xmm9,0x20(%rax,%rdi,8)
9057 116,201, //je 11f7 <_sk_store_f16_avx+0xbf>
9058 197,121,23,76,248,40, //vmovhpd %xmm9,0x28(%rax,%rdi,8)
9059 72,131,249,7, //cmp $0x7,%rcx
9060 114,189, //jb 11f7 <_sk_store_f16_avx+0xbf>
9061 197,121,214,68,248,48, //vmovq %xmm8,0x30(%rax,%rdi,8)
9062 235,181, //jmp 11f7 <_sk_store_f16_avx+0xbf>
9063};
9064
9065CODE const uint8_t sk_store_f32_avx[] = {
9066 72,173, //lods %ds:(%rsi),%rax
9067 76,139,0, //mov (%rax),%r8
9068 72,141,4,189,0,0,0,0, //lea 0x0(,%rdi,4),%rax
9069 197,124,20,193, //vunpcklps %ymm1,%ymm0,%ymm8
9070 197,124,21,217, //vunpckhps %ymm1,%ymm0,%ymm11
9071 197,108,20,203, //vunpcklps %ymm3,%ymm2,%ymm9
9072 197,108,21,227, //vunpckhps %ymm3,%ymm2,%ymm12
9073 196,65,61,20,209, //vunpcklpd %ymm9,%ymm8,%ymm10
9074 196,65,61,21,201, //vunpckhpd %ymm9,%ymm8,%ymm9
9075 196,65,37,20,196, //vunpcklpd %ymm12,%ymm11,%ymm8
9076 196,65,37,21,220, //vunpckhpd %ymm12,%ymm11,%ymm11
9077 72,133,201, //test %rcx,%rcx
9078 117,55, //jne 12af <_sk_store_f32_avx+0x6d>
9079 196,67,45,24,225,1, //vinsertf128 $0x1,%xmm9,%ymm10,%ymm12
9080 196,67,61,24,235,1, //vinsertf128 $0x1,%xmm11,%ymm8,%ymm13
9081 196,67,45,6,201,49, //vperm2f128 $0x31,%ymm9,%ymm10,%ymm9
9082 196,67,61,6,195,49, //vperm2f128 $0x31,%ymm11,%ymm8,%ymm8
9083 196,65,125,17,36,128, //vmovupd %ymm12,(%r8,%rax,4)
9084 196,65,125,17,108,128,32, //vmovupd %ymm13,0x20(%r8,%rax,4)
9085 196,65,125,17,76,128,64, //vmovupd %ymm9,0x40(%r8,%rax,4)
9086 196,65,125,17,68,128,96, //vmovupd %ymm8,0x60(%r8,%rax,4)
9087 72,173, //lods %ds:(%rsi),%rax
9088 255,224, //jmpq *%rax
9089 196,65,121,17,20,128, //vmovupd %xmm10,(%r8,%rax,4)
9090 72,131,249,1, //cmp $0x1,%rcx
9091 116,240, //je 12ab <_sk_store_f32_avx+0x69>
9092 196,65,121,17,76,128,16, //vmovupd %xmm9,0x10(%r8,%rax,4)
9093 72,131,249,3, //cmp $0x3,%rcx
9094 114,227, //jb 12ab <_sk_store_f32_avx+0x69>
9095 196,65,121,17,68,128,32, //vmovupd %xmm8,0x20(%r8,%rax,4)
9096 116,218, //je 12ab <_sk_store_f32_avx+0x69>
9097 196,65,121,17,92,128,48, //vmovupd %xmm11,0x30(%r8,%rax,4)
9098 72,131,249,5, //cmp $0x5,%rcx
9099 114,205, //jb 12ab <_sk_store_f32_avx+0x69>
9100 196,67,125,25,84,128,64,1, //vextractf128 $0x1,%ymm10,0x40(%r8,%rax,4)
9101 116,195, //je 12ab <_sk_store_f32_avx+0x69>
9102 196,67,125,25,76,128,80,1, //vextractf128 $0x1,%ymm9,0x50(%r8,%rax,4)
9103 72,131,249,7, //cmp $0x7,%rcx
9104 114,181, //jb 12ab <_sk_store_f32_avx+0x69>
9105 196,67,125,25,68,128,96,1, //vextractf128 $0x1,%ymm8,0x60(%r8,%rax,4)
9106 235,171, //jmp 12ab <_sk_store_f32_avx+0x69>
9107};
9108
9109CODE const uint8_t sk_clamp_x_avx[] = {
9110 72,173, //lods %ds:(%rsi),%rax
9111 196,65,60,87,192, //vxorps %ymm8,%ymm8,%ymm8
9112 197,60,95,200, //vmaxps %ymm0,%ymm8,%ymm9
9113 196,98,125,24,0, //vbroadcastss (%rax),%ymm8
9114 196,99,125,25,192,1, //vextractf128 $0x1,%ymm8,%xmm0
9115 196,65,41,118,210, //vpcmpeqd %xmm10,%xmm10,%xmm10
9116 196,193,121,254,194, //vpaddd %xmm10,%xmm0,%xmm0
9117 196,65,57,254,194, //vpaddd %xmm10,%xmm8,%xmm8
9118 196,227,61,24,192,1, //vinsertf128 $0x1,%xmm0,%ymm8,%ymm0
9119 197,180,93,192, //vminps %ymm0,%ymm9,%ymm0
9120 72,173, //lods %ds:(%rsi),%rax
9121 255,224, //jmpq *%rax
9122};
9123
9124CODE const uint8_t sk_clamp_y_avx[] = {
9125 72,173, //lods %ds:(%rsi),%rax
9126 196,65,60,87,192, //vxorps %ymm8,%ymm8,%ymm8
9127 197,60,95,201, //vmaxps %ymm1,%ymm8,%ymm9
9128 196,98,125,24,0, //vbroadcastss (%rax),%ymm8
9129 196,99,125,25,193,1, //vextractf128 $0x1,%ymm8,%xmm1
9130 196,65,41,118,210, //vpcmpeqd %xmm10,%xmm10,%xmm10
9131 196,193,113,254,202, //vpaddd %xmm10,%xmm1,%xmm1
9132 196,65,57,254,194, //vpaddd %xmm10,%xmm8,%xmm8
9133 196,227,61,24,201,1, //vinsertf128 $0x1,%xmm1,%ymm8,%ymm1
9134 197,180,93,201, //vminps %ymm1,%ymm9,%ymm1
9135 72,173, //lods %ds:(%rsi),%rax
9136 255,224, //jmpq *%rax
9137};
9138
9139CODE const uint8_t sk_repeat_x_avx[] = {
9140 72,173, //lods %ds:(%rsi),%rax
9141 196,98,125,24,0, //vbroadcastss (%rax),%ymm8
9142 196,65,124,94,200, //vdivps %ymm8,%ymm0,%ymm9
9143 196,67,125,8,201,1, //vroundps $0x1,%ymm9,%ymm9
9144 196,65,52,89,200, //vmulps %ymm8,%ymm9,%ymm9
9145 196,65,124,92,201, //vsubps %ymm9,%ymm0,%ymm9
9146 196,99,125,25,192,1, //vextractf128 $0x1,%ymm8,%xmm0
9147 196,65,41,118,210, //vpcmpeqd %xmm10,%xmm10,%xmm10
9148 196,193,121,254,194, //vpaddd %xmm10,%xmm0,%xmm0
9149 196,65,57,254,194, //vpaddd %xmm10,%xmm8,%xmm8
9150 196,227,61,24,192,1, //vinsertf128 $0x1,%xmm0,%ymm8,%ymm0
9151 197,180,93,192, //vminps %ymm0,%ymm9,%ymm0
9152 72,173, //lods %ds:(%rsi),%rax
9153 255,224, //jmpq *%rax
9154};
9155
9156CODE const uint8_t sk_repeat_y_avx[] = {
9157 72,173, //lods %ds:(%rsi),%rax
9158 196,98,125,24,0, //vbroadcastss (%rax),%ymm8
9159 196,65,116,94,200, //vdivps %ymm8,%ymm1,%ymm9
9160 196,67,125,8,201,1, //vroundps $0x1,%ymm9,%ymm9
9161 196,65,52,89,200, //vmulps %ymm8,%ymm9,%ymm9
9162 196,65,116,92,201, //vsubps %ymm9,%ymm1,%ymm9
9163 196,99,125,25,193,1, //vextractf128 $0x1,%ymm8,%xmm1
9164 196,65,41,118,210, //vpcmpeqd %xmm10,%xmm10,%xmm10
9165 196,193,113,254,202, //vpaddd %xmm10,%xmm1,%xmm1
9166 196,65,57,254,194, //vpaddd %xmm10,%xmm8,%xmm8
9167 196,227,61,24,201,1, //vinsertf128 $0x1,%xmm1,%ymm8,%ymm1
9168 197,180,93,201, //vminps %ymm1,%ymm9,%ymm1
9169 72,173, //lods %ds:(%rsi),%rax
9170 255,224, //jmpq *%rax
9171};
9172
9173CODE const uint8_t sk_mirror_x_avx[] = {
9174 72,173, //lods %ds:(%rsi),%rax
9175 197,122,16,0, //vmovss (%rax),%xmm8
9176 196,65,121,112,200,0, //vpshufd $0x0,%xmm8,%xmm9
9177 196,67,53,24,201,1, //vinsertf128 $0x1,%xmm9,%ymm9,%ymm9
9178 196,65,124,92,209, //vsubps %ymm9,%ymm0,%ymm10
9179 196,193,58,88,192, //vaddss %xmm8,%xmm8,%xmm0
9180 196,227,121,4,192,0, //vpermilps $0x0,%xmm0,%xmm0
9181 196,227,125,24,192,1, //vinsertf128 $0x1,%xmm0,%ymm0,%ymm0
9182 197,44,94,192, //vdivps %ymm0,%ymm10,%ymm8
9183 196,67,125,8,192,1, //vroundps $0x1,%ymm8,%ymm8
9184 197,188,89,192, //vmulps %ymm0,%ymm8,%ymm0
9185 197,172,92,192, //vsubps %ymm0,%ymm10,%ymm0
9186 196,193,124,92,193, //vsubps %ymm9,%ymm0,%ymm0
9187 196,65,60,87,192, //vxorps %ymm8,%ymm8,%ymm8
9188 197,60,92,192, //vsubps %ymm0,%ymm8,%ymm8
9189 197,60,84,192, //vandps %ymm0,%ymm8,%ymm8
9190 196,99,125,25,200,1, //vextractf128 $0x1,%ymm9,%xmm0
9191 196,65,41,118,210, //vpcmpeqd %xmm10,%xmm10,%xmm10
9192 196,193,121,254,194, //vpaddd %xmm10,%xmm0,%xmm0
9193 196,65,49,254,202, //vpaddd %xmm10,%xmm9,%xmm9
9194 196,227,53,24,192,1, //vinsertf128 $0x1,%xmm0,%ymm9,%ymm0
9195 197,188,93,192, //vminps %ymm0,%ymm8,%ymm0
9196 72,173, //lods %ds:(%rsi),%rax
9197 255,224, //jmpq *%rax
9198};
9199
9200CODE const uint8_t sk_mirror_y_avx[] = {
9201 72,173, //lods %ds:(%rsi),%rax
9202 197,122,16,0, //vmovss (%rax),%xmm8
9203 196,65,121,112,200,0, //vpshufd $0x0,%xmm8,%xmm9
9204 196,67,53,24,201,1, //vinsertf128 $0x1,%xmm9,%ymm9,%ymm9
9205 196,65,116,92,209, //vsubps %ymm9,%ymm1,%ymm10
9206 196,193,58,88,200, //vaddss %xmm8,%xmm8,%xmm1
9207 196,227,121,4,201,0, //vpermilps $0x0,%xmm1,%xmm1
9208 196,227,117,24,201,1, //vinsertf128 $0x1,%xmm1,%ymm1,%ymm1
9209 197,44,94,193, //vdivps %ymm1,%ymm10,%ymm8
9210 196,67,125,8,192,1, //vroundps $0x1,%ymm8,%ymm8
9211 197,188,89,201, //vmulps %ymm1,%ymm8,%ymm1
9212 197,172,92,201, //vsubps %ymm1,%ymm10,%ymm1
9213 196,193,116,92,201, //vsubps %ymm9,%ymm1,%ymm1
9214 196,65,60,87,192, //vxorps %ymm8,%ymm8,%ymm8
9215 197,60,92,193, //vsubps %ymm1,%ymm8,%ymm8
9216 197,60,84,193, //vandps %ymm1,%ymm8,%ymm8
9217 196,99,125,25,201,1, //vextractf128 $0x1,%ymm9,%xmm1
9218 196,65,41,118,210, //vpcmpeqd %xmm10,%xmm10,%xmm10
9219 196,193,113,254,202, //vpaddd %xmm10,%xmm1,%xmm1
9220 196,65,49,254,202, //vpaddd %xmm10,%xmm9,%xmm9
9221 196,227,53,24,201,1, //vinsertf128 $0x1,%xmm1,%ymm9,%ymm1
9222 197,188,93,201, //vminps %ymm1,%ymm8,%ymm1
9223 72,173, //lods %ds:(%rsi),%rax
9224 255,224, //jmpq *%rax
9225};
9226
9227CODE const uint8_t sk_matrix_2x3_avx[] = {
9228 72,173, //lods %ds:(%rsi),%rax
9229 196,98,125,24,0, //vbroadcastss (%rax),%ymm8
9230 196,98,125,24,72,8, //vbroadcastss 0x8(%rax),%ymm9
9231 196,98,125,24,80,16, //vbroadcastss 0x10(%rax),%ymm10
9232 197,52,89,201, //vmulps %ymm1,%ymm9,%ymm9
9233 196,65,52,88,202, //vaddps %ymm10,%ymm9,%ymm9
9234 197,60,89,192, //vmulps %ymm0,%ymm8,%ymm8
9235 196,65,60,88,193, //vaddps %ymm9,%ymm8,%ymm8
9236 196,98,125,24,72,4, //vbroadcastss 0x4(%rax),%ymm9
9237 196,98,125,24,80,12, //vbroadcastss 0xc(%rax),%ymm10
9238 196,98,125,24,88,20, //vbroadcastss 0x14(%rax),%ymm11
9239 197,172,89,201, //vmulps %ymm1,%ymm10,%ymm1
9240 196,193,116,88,203, //vaddps %ymm11,%ymm1,%ymm1
9241 197,180,89,192, //vmulps %ymm0,%ymm9,%ymm0
9242 197,252,88,201, //vaddps %ymm1,%ymm0,%ymm1
9243 72,173, //lods %ds:(%rsi),%rax
9244 197,124,41,192, //vmovaps %ymm8,%ymm0
9245 255,224, //jmpq *%rax
9246};
9247
9248CODE const uint8_t sk_matrix_3x4_avx[] = {
9249 72,173, //lods %ds:(%rsi),%rax
9250 196,98,125,24,0, //vbroadcastss (%rax),%ymm8
9251 196,98,125,24,72,12, //vbroadcastss 0xc(%rax),%ymm9
9252 196,98,125,24,80,24, //vbroadcastss 0x18(%rax),%ymm10
9253 196,98,125,24,88,36, //vbroadcastss 0x24(%rax),%ymm11
9254 197,44,89,210, //vmulps %ymm2,%ymm10,%ymm10
9255 196,65,44,88,211, //vaddps %ymm11,%ymm10,%ymm10
9256 197,52,89,201, //vmulps %ymm1,%ymm9,%ymm9
9257 196,65,52,88,202, //vaddps %ymm10,%ymm9,%ymm9
9258 197,60,89,192, //vmulps %ymm0,%ymm8,%ymm8
9259 196,65,60,88,193, //vaddps %ymm9,%ymm8,%ymm8
9260 196,98,125,24,72,4, //vbroadcastss 0x4(%rax),%ymm9
9261 196,98,125,24,80,16, //vbroadcastss 0x10(%rax),%ymm10
9262 196,98,125,24,88,28, //vbroadcastss 0x1c(%rax),%ymm11
9263 196,98,125,24,96,40, //vbroadcastss 0x28(%rax),%ymm12
9264 197,36,89,218, //vmulps %ymm2,%ymm11,%ymm11
9265 196,65,36,88,220, //vaddps %ymm12,%ymm11,%ymm11
9266 197,44,89,209, //vmulps %ymm1,%ymm10,%ymm10
9267 196,65,44,88,211, //vaddps %ymm11,%ymm10,%ymm10
9268 197,52,89,200, //vmulps %ymm0,%ymm9,%ymm9
9269 196,65,52,88,202, //vaddps %ymm10,%ymm9,%ymm9
9270 196,98,125,24,80,8, //vbroadcastss 0x8(%rax),%ymm10
9271 196,98,125,24,88,20, //vbroadcastss 0x14(%rax),%ymm11
9272 196,98,125,24,96,32, //vbroadcastss 0x20(%rax),%ymm12
9273 196,98,125,24,104,44, //vbroadcastss 0x2c(%rax),%ymm13
9274 197,156,89,210, //vmulps %ymm2,%ymm12,%ymm2
9275 196,193,108,88,213, //vaddps %ymm13,%ymm2,%ymm2
9276 197,164,89,201, //vmulps %ymm1,%ymm11,%ymm1
9277 197,244,88,202, //vaddps %ymm2,%ymm1,%ymm1
9278 197,172,89,192, //vmulps %ymm0,%ymm10,%ymm0
9279 197,252,88,209, //vaddps %ymm1,%ymm0,%ymm2
9280 72,173, //lods %ds:(%rsi),%rax
9281 197,124,41,192, //vmovaps %ymm8,%ymm0
9282 197,124,41,201, //vmovaps %ymm9,%ymm1
9283 255,224, //jmpq *%rax
9284};
9285
9286CODE const uint8_t sk_matrix_perspective_avx[] = {
9287 72,173, //lods %ds:(%rsi),%rax
9288 196,98,125,24,0, //vbroadcastss (%rax),%ymm8
9289 196,98,125,24,72,4, //vbroadcastss 0x4(%rax),%ymm9
9290 196,98,125,24,80,8, //vbroadcastss 0x8(%rax),%ymm10
9291 197,52,89,201, //vmulps %ymm1,%ymm9,%ymm9
9292 196,65,52,88,202, //vaddps %ymm10,%ymm9,%ymm9
9293 197,60,89,192, //vmulps %ymm0,%ymm8,%ymm8
9294 196,65,60,88,193, //vaddps %ymm9,%ymm8,%ymm8
9295 196,98,125,24,72,12, //vbroadcastss 0xc(%rax),%ymm9
9296 196,98,125,24,80,16, //vbroadcastss 0x10(%rax),%ymm10
9297 196,98,125,24,88,20, //vbroadcastss 0x14(%rax),%ymm11
9298 197,44,89,209, //vmulps %ymm1,%ymm10,%ymm10
9299 196,65,44,88,211, //vaddps %ymm11,%ymm10,%ymm10
9300 197,52,89,200, //vmulps %ymm0,%ymm9,%ymm9
9301 196,65,52,88,202, //vaddps %ymm10,%ymm9,%ymm9
9302 196,98,125,24,80,24, //vbroadcastss 0x18(%rax),%ymm10
9303 196,98,125,24,88,28, //vbroadcastss 0x1c(%rax),%ymm11
9304 196,98,125,24,96,32, //vbroadcastss 0x20(%rax),%ymm12
9305 197,164,89,201, //vmulps %ymm1,%ymm11,%ymm1
9306 196,193,116,88,204, //vaddps %ymm12,%ymm1,%ymm1
9307 197,172,89,192, //vmulps %ymm0,%ymm10,%ymm0
9308 197,252,88,193, //vaddps %ymm1,%ymm0,%ymm0
9309 197,252,83,200, //vrcpps %ymm0,%ymm1
9310 197,188,89,193, //vmulps %ymm1,%ymm8,%ymm0
9311 197,180,89,201, //vmulps %ymm1,%ymm9,%ymm1
9312 72,173, //lods %ds:(%rsi),%rax
9313 255,224, //jmpq *%rax
9314};
9315
9316CODE const uint8_t sk_linear_gradient_2stops_avx[] = {
9317 72,173, //lods %ds:(%rsi),%rax
9318 196,226,125,24,72,16, //vbroadcastss 0x10(%rax),%ymm1
9319 196,226,125,24,16, //vbroadcastss (%rax),%ymm2
9320 197,244,89,200, //vmulps %ymm0,%ymm1,%ymm1
9321 197,108,88,193, //vaddps %ymm1,%ymm2,%ymm8
9322 196,226,125,24,72,20, //vbroadcastss 0x14(%rax),%ymm1
9323 196,226,125,24,80,4, //vbroadcastss 0x4(%rax),%ymm2
9324 197,244,89,200, //vmulps %ymm0,%ymm1,%ymm1
9325 197,236,88,201, //vaddps %ymm1,%ymm2,%ymm1
9326 196,226,125,24,80,24, //vbroadcastss 0x18(%rax),%ymm2
9327 196,226,125,24,88,8, //vbroadcastss 0x8(%rax),%ymm3
9328 197,236,89,208, //vmulps %ymm0,%ymm2,%ymm2
9329 197,228,88,210, //vaddps %ymm2,%ymm3,%ymm2
9330 196,226,125,24,88,28, //vbroadcastss 0x1c(%rax),%ymm3
9331 196,98,125,24,72,12, //vbroadcastss 0xc(%rax),%ymm9
9332 197,228,89,192, //vmulps %ymm0,%ymm3,%ymm0
9333 197,180,88,216, //vaddps %ymm0,%ymm9,%ymm3
9334 72,173, //lods %ds:(%rsi),%rax
9335 197,124,41,192, //vmovaps %ymm8,%ymm0
9336 255,224, //jmpq *%rax
9337};
9338
9339CODE const uint8_t sk_start_pipeline_sse41[] = {
9340 65,87, //push %r15
9341 65,86, //push %r14
9342 65,85, //push %r13
9343 65,84, //push %r12
9344 86, //push %rsi
9345 87, //push %rdi
9346 83, //push %rbx
9347 72,129,236,160,0,0,0, //sub $0xa0,%rsp
9348 68,15,41,188,36,144,0,0,0, //movaps %xmm15,0x90(%rsp)
9349 68,15,41,180,36,128,0,0,0, //movaps %xmm14,0x80(%rsp)
9350 68,15,41,108,36,112, //movaps %xmm13,0x70(%rsp)
9351 68,15,41,100,36,96, //movaps %xmm12,0x60(%rsp)
9352 68,15,41,92,36,80, //movaps %xmm11,0x50(%rsp)
9353 68,15,41,84,36,64, //movaps %xmm10,0x40(%rsp)
9354 68,15,41,76,36,48, //movaps %xmm9,0x30(%rsp)
9355 68,15,41,68,36,32, //movaps %xmm8,0x20(%rsp)
9356 15,41,124,36,16, //movaps %xmm7,0x10(%rsp)
9357 15,41,52,36, //movaps %xmm6,(%rsp)
9358 77,137,207, //mov %r9,%r15
9359 77,137,198, //mov %r8,%r14
9360 72,137,203, //mov %rcx,%rbx
9361 72,137,214, //mov %rdx,%rsi
9362 72,173, //lods %ds:(%rsi),%rax
9363 73,137,196, //mov %rax,%r12
9364 73,137,245, //mov %rsi,%r13
9365 72,141,67,4, //lea 0x4(%rbx),%rax
9366 76,57,248, //cmp %r15,%rax
9367 118,5, //jbe 73 <_sk_start_pipeline_sse41+0x73>
9368 72,137,216, //mov %rbx,%rax
9369 235,52, //jmp a7 <_sk_start_pipeline_sse41+0xa7>
9370 15,87,192, //xorps %xmm0,%xmm0
9371 15,87,201, //xorps %xmm1,%xmm1
9372 15,87,210, //xorps %xmm2,%xmm2
9373 15,87,219, //xorps %xmm3,%xmm3
9374 15,87,228, //xorps %xmm4,%xmm4
9375 15,87,237, //xorps %xmm5,%xmm5
9376 15,87,246, //xorps %xmm6,%xmm6
9377 15,87,255, //xorps %xmm7,%xmm7
9378 72,137,223, //mov %rbx,%rdi
9379 76,137,238, //mov %r13,%rsi
9380 76,137,242, //mov %r14,%rdx
9381 65,255,212, //callq *%r12
9382 72,141,67,4, //lea 0x4(%rbx),%rax
9383 72,131,195,8, //add $0x8,%rbx
9384 76,57,251, //cmp %r15,%rbx
9385 72,137,195, //mov %rax,%rbx
9386 118,204, //jbe 73 <_sk_start_pipeline_sse41+0x73>
9387 15,40,52,36, //movaps (%rsp),%xmm6
9388 15,40,124,36,16, //movaps 0x10(%rsp),%xmm7
9389 68,15,40,68,36,32, //movaps 0x20(%rsp),%xmm8
9390 68,15,40,76,36,48, //movaps 0x30(%rsp),%xmm9
9391 68,15,40,84,36,64, //movaps 0x40(%rsp),%xmm10
9392 68,15,40,92,36,80, //movaps 0x50(%rsp),%xmm11
9393 68,15,40,100,36,96, //movaps 0x60(%rsp),%xmm12
9394 68,15,40,108,36,112, //movaps 0x70(%rsp),%xmm13
9395 68,15,40,180,36,128,0,0,0, //movaps 0x80(%rsp),%xmm14
9396 68,15,40,188,36,144,0,0,0, //movaps 0x90(%rsp),%xmm15
9397 72,129,196,160,0,0,0, //add $0xa0,%rsp
9398 91, //pop %rbx
9399 95, //pop %rdi
9400 94, //pop %rsi
9401 65,92, //pop %r12
9402 65,93, //pop %r13
9403 65,94, //pop %r14
9404 65,95, //pop %r15
9405 195, //retq
9406};
9407
9408CODE const uint8_t sk_just_return_sse41[] = {
9409 195, //retq
9410};
9411
9412CODE const uint8_t sk_seed_shader_sse41[] = {
9413 72,173, //lods %ds:(%rsi),%rax
9414 102,15,110,199, //movd %edi,%xmm0
9415 102,15,112,192,0, //pshufd $0x0,%xmm0,%xmm0
9416 15,91,200, //cvtdq2ps %xmm0,%xmm1
9417 243,15,16,18, //movss (%rdx),%xmm2
9418 243,15,16,90,4, //movss 0x4(%rdx),%xmm3
9419 15,198,219,0, //shufps $0x0,%xmm3,%xmm3
9420 15,88,203, //addps %xmm3,%xmm1
9421 15,16,66,20, //movups 0x14(%rdx),%xmm0
9422 15,88,193, //addps %xmm1,%xmm0
9423 102,15,110,8, //movd (%rax),%xmm1
9424 102,15,112,201,0, //pshufd $0x0,%xmm1,%xmm1
9425 15,91,201, //cvtdq2ps %xmm1,%xmm1
9426 15,88,203, //addps %xmm3,%xmm1
9427 15,198,210,0, //shufps $0x0,%xmm2,%xmm2
9428 72,173, //lods %ds:(%rsi),%rax
9429 15,87,219, //xorps %xmm3,%xmm3
9430 15,87,228, //xorps %xmm4,%xmm4
9431 15,87,237, //xorps %xmm5,%xmm5
9432 15,87,246, //xorps %xmm6,%xmm6
9433 15,87,255, //xorps %xmm7,%xmm7
9434 255,224, //jmpq *%rax
9435};
9436
9437CODE const uint8_t sk_constant_color_sse41[] = {
9438 72,173, //lods %ds:(%rsi),%rax
9439 15,16,24, //movups (%rax),%xmm3
9440 15,40,195, //movaps %xmm3,%xmm0
9441 15,198,192,0, //shufps $0x0,%xmm0,%xmm0
9442 15,40,203, //movaps %xmm3,%xmm1
9443 15,198,201,85, //shufps $0x55,%xmm1,%xmm1
9444 15,40,211, //movaps %xmm3,%xmm2
9445 15,198,210,170, //shufps $0xaa,%xmm2,%xmm2
9446 15,198,219,255, //shufps $0xff,%xmm3,%xmm3
9447 72,173, //lods %ds:(%rsi),%rax
9448 255,224, //jmpq *%rax
9449};
9450
9451CODE const uint8_t sk_clear_sse41[] = {
9452 72,173, //lods %ds:(%rsi),%rax
9453 15,87,192, //xorps %xmm0,%xmm0
9454 15,87,201, //xorps %xmm1,%xmm1
9455 15,87,210, //xorps %xmm2,%xmm2
9456 15,87,219, //xorps %xmm3,%xmm3
9457 255,224, //jmpq *%rax
9458};
9459
9460CODE const uint8_t sk_plus__sse41[] = {
9461 15,88,196, //addps %xmm4,%xmm0
9462 15,88,205, //addps %xmm5,%xmm1
9463 15,88,214, //addps %xmm6,%xmm2
9464 15,88,223, //addps %xmm7,%xmm3
9465 72,173, //lods %ds:(%rsi),%rax
9466 255,224, //jmpq *%rax
9467};
9468
9469CODE const uint8_t sk_srcover_sse41[] = {
9470 243,68,15,16,2, //movss (%rdx),%xmm8
9471 69,15,198,192,0, //shufps $0x0,%xmm8,%xmm8
9472 68,15,92,195, //subps %xmm3,%xmm8
9473 69,15,40,200, //movaps %xmm8,%xmm9
9474 68,15,89,204, //mulps %xmm4,%xmm9
9475 65,15,88,193, //addps %xmm9,%xmm0
9476 69,15,40,200, //movaps %xmm8,%xmm9
9477 68,15,89,205, //mulps %xmm5,%xmm9
9478 65,15,88,201, //addps %xmm9,%xmm1
9479 69,15,40,200, //movaps %xmm8,%xmm9
9480 68,15,89,206, //mulps %xmm6,%xmm9
9481 65,15,88,209, //addps %xmm9,%xmm2
9482 68,15,89,199, //mulps %xmm7,%xmm8
9483 65,15,88,216, //addps %xmm8,%xmm3
9484 72,173, //lods %ds:(%rsi),%rax
9485 255,224, //jmpq *%rax
9486};
9487
9488CODE const uint8_t sk_dstover_sse41[] = {
9489 243,68,15,16,2, //movss (%rdx),%xmm8
9490 69,15,198,192,0, //shufps $0x0,%xmm8,%xmm8
9491 68,15,92,199, //subps %xmm7,%xmm8
9492 65,15,89,192, //mulps %xmm8,%xmm0
9493 15,88,196, //addps %xmm4,%xmm0
9494 65,15,89,200, //mulps %xmm8,%xmm1
9495 15,88,205, //addps %xmm5,%xmm1
9496 65,15,89,208, //mulps %xmm8,%xmm2
9497 15,88,214, //addps %xmm6,%xmm2
9498 65,15,89,216, //mulps %xmm8,%xmm3
9499 15,88,223, //addps %xmm7,%xmm3
9500 72,173, //lods %ds:(%rsi),%rax
9501 255,224, //jmpq *%rax
9502};
9503
9504CODE const uint8_t sk_clamp_0_sse41[] = {
9505 69,15,87,192, //xorps %xmm8,%xmm8
9506 65,15,95,192, //maxps %xmm8,%xmm0
9507 65,15,95,200, //maxps %xmm8,%xmm1
9508 65,15,95,208, //maxps %xmm8,%xmm2
9509 65,15,95,216, //maxps %xmm8,%xmm3
9510 72,173, //lods %ds:(%rsi),%rax
9511 255,224, //jmpq *%rax
9512};
9513
9514CODE const uint8_t sk_clamp_1_sse41[] = {
9515 243,68,15,16,2, //movss (%rdx),%xmm8
9516 69,15,198,192,0, //shufps $0x0,%xmm8,%xmm8
9517 65,15,93,192, //minps %xmm8,%xmm0
9518 65,15,93,200, //minps %xmm8,%xmm1
9519 65,15,93,208, //minps %xmm8,%xmm2
9520 65,15,93,216, //minps %xmm8,%xmm3
9521 72,173, //lods %ds:(%rsi),%rax
9522 255,224, //jmpq *%rax
9523};
9524
9525CODE const uint8_t sk_clamp_a_sse41[] = {
9526 243,68,15,16,2, //movss (%rdx),%xmm8
9527 69,15,198,192,0, //shufps $0x0,%xmm8,%xmm8
9528 65,15,93,216, //minps %xmm8,%xmm3
9529 15,93,195, //minps %xmm3,%xmm0
9530 15,93,203, //minps %xmm3,%xmm1
9531 15,93,211, //minps %xmm3,%xmm2
9532 72,173, //lods %ds:(%rsi),%rax
9533 255,224, //jmpq *%rax
9534};
9535
9536CODE const uint8_t sk_set_rgb_sse41[] = {
9537 72,173, //lods %ds:(%rsi),%rax
9538 243,15,16,0, //movss (%rax),%xmm0
9539 243,15,16,72,4, //movss 0x4(%rax),%xmm1
9540 15,198,192,0, //shufps $0x0,%xmm0,%xmm0
9541 15,198,201,0, //shufps $0x0,%xmm1,%xmm1
9542 243,15,16,80,8, //movss 0x8(%rax),%xmm2
9543 15,198,210,0, //shufps $0x0,%xmm2,%xmm2
9544 72,173, //lods %ds:(%rsi),%rax
9545 255,224, //jmpq *%rax
9546};
9547
9548CODE const uint8_t sk_swap_rb_sse41[] = {
9549 68,15,40,192, //movaps %xmm0,%xmm8
9550 72,173, //lods %ds:(%rsi),%rax
9551 15,40,194, //movaps %xmm2,%xmm0
9552 65,15,40,208, //movaps %xmm8,%xmm2
9553 255,224, //jmpq *%rax
9554};
9555
9556CODE const uint8_t sk_swap_sse41[] = {
9557 68,15,40,195, //movaps %xmm3,%xmm8
9558 68,15,40,202, //movaps %xmm2,%xmm9
9559 68,15,40,209, //movaps %xmm1,%xmm10
9560 68,15,40,216, //movaps %xmm0,%xmm11
9561 72,173, //lods %ds:(%rsi),%rax
9562 15,40,196, //movaps %xmm4,%xmm0
9563 15,40,205, //movaps %xmm5,%xmm1
9564 15,40,214, //movaps %xmm6,%xmm2
9565 15,40,223, //movaps %xmm7,%xmm3
9566 65,15,40,227, //movaps %xmm11,%xmm4
9567 65,15,40,234, //movaps %xmm10,%xmm5
9568 65,15,40,241, //movaps %xmm9,%xmm6
9569 65,15,40,248, //movaps %xmm8,%xmm7
9570 255,224, //jmpq *%rax
9571};
9572
9573CODE const uint8_t sk_move_src_dst_sse41[] = {
9574 72,173, //lods %ds:(%rsi),%rax
9575 15,40,224, //movaps %xmm0,%xmm4
9576 15,40,233, //movaps %xmm1,%xmm5
9577 15,40,242, //movaps %xmm2,%xmm6
9578 15,40,251, //movaps %xmm3,%xmm7
9579 255,224, //jmpq *%rax
9580};
9581
9582CODE const uint8_t sk_move_dst_src_sse41[] = {
9583 72,173, //lods %ds:(%rsi),%rax
9584 15,40,196, //movaps %xmm4,%xmm0
9585 15,40,205, //movaps %xmm5,%xmm1
9586 15,40,214, //movaps %xmm6,%xmm2
9587 15,40,223, //movaps %xmm7,%xmm3
9588 255,224, //jmpq *%rax
9589};
9590
9591CODE const uint8_t sk_premul_sse41[] = {
9592 15,89,195, //mulps %xmm3,%xmm0
9593 15,89,203, //mulps %xmm3,%xmm1
9594 15,89,211, //mulps %xmm3,%xmm2
9595 72,173, //lods %ds:(%rsi),%rax
9596 255,224, //jmpq *%rax
9597};
9598
// sk_unpremul_sse41: scales xmm0, xmm1 and xmm2 by k / xmm3, where k is the
// float at 0(%rdx) broadcast to all lanes. In lanes where xmm3 == 0 the scale
// is forced to 0 (blendvps picks the zeroed xmm9), so those lanes come out as
// 0 rather than inf/NaN. xmm3 is unchanged; xmm8-xmm10 are scratch.
// NOTE(review): the constant at 0(%rdx) is presumably 1.0f - confirm.
9599CODE const uint8_t sk_unpremul_sse41[] = {
9600 68,15,40,192, //movaps %xmm0,%xmm8
9601 69,15,87,201, //xorps %xmm9,%xmm9
9602 243,68,15,16,18, //movss (%rdx),%xmm10
9603 69,15,198,210,0, //shufps $0x0,%xmm10,%xmm10
9604 68,15,94,211, //divps %xmm3,%xmm10
9605 15,40,195, //movaps %xmm3,%xmm0
9606 65,15,194,193,0, //cmpeqps %xmm9,%xmm0
9607 102,69,15,56,20,209, //blendvps %xmm0,%xmm9,%xmm10
9608 69,15,89,194, //mulps %xmm10,%xmm8
9609 65,15,89,202, //mulps %xmm10,%xmm1
9610 65,15,89,210, //mulps %xmm10,%xmm2
9611 72,173, //lods %ds:(%rsi),%rax
9612 65,15,40,192, //movaps %xmm8,%xmm0
9613 255,224, //jmpq *%rax
9614};
9615
// sk_from_srgb_sse41: applies the same piecewise function independently to
// xmm0, xmm1 and xmm2 (xmm3 is untouched). With kNN = float at 0xNN(%rdx),
// broadcast to all lanes, each lane x becomes:
//     x < k44 ?  k40 * x  :  (k3c * x + k38) * (x * x) + k34
// blendvps uses xmm0 as its implicit mask, which is why each compare result
// is moved into xmm0 first. xmm8-xmm15 are scratch.
// NOTE(review): the constants are presumably the sRGB->linear approximation
// coefficients and threshold - their values are not visible here.
9616CODE const uint8_t sk_from_srgb_sse41[] = {
9617 68,15,40,194, //movaps %xmm2,%xmm8
9618 243,68,15,16,90,64, //movss 0x40(%rdx),%xmm11
9619 69,15,198,219,0, //shufps $0x0,%xmm11,%xmm11
9620 69,15,40,211, //movaps %xmm11,%xmm10
9621 68,15,89,208, //mulps %xmm0,%xmm10
9622 68,15,40,240, //movaps %xmm0,%xmm14
9623 69,15,89,246, //mulps %xmm14,%xmm14
9624 243,15,16,82,60, //movss 0x3c(%rdx),%xmm2
9625 15,198,210,0, //shufps $0x0,%xmm2,%xmm2
9626 243,68,15,16,98,52, //movss 0x34(%rdx),%xmm12
9627 243,68,15,16,106,56, //movss 0x38(%rdx),%xmm13
9628 69,15,198,237,0, //shufps $0x0,%xmm13,%xmm13
9629 68,15,40,202, //movaps %xmm2,%xmm9
9630 68,15,89,200, //mulps %xmm0,%xmm9
9631 69,15,88,205, //addps %xmm13,%xmm9
9632 69,15,198,228,0, //shufps $0x0,%xmm12,%xmm12
9633 69,15,89,206, //mulps %xmm14,%xmm9
9634 69,15,88,204, //addps %xmm12,%xmm9
9635 243,68,15,16,114,68, //movss 0x44(%rdx),%xmm14
9636 69,15,198,246,0, //shufps $0x0,%xmm14,%xmm14
9637 65,15,194,198,1, //cmpltps %xmm14,%xmm0
9638 102,69,15,56,20,202, //blendvps %xmm0,%xmm10,%xmm9
9639 69,15,40,251, //movaps %xmm11,%xmm15
9640 68,15,89,249, //mulps %xmm1,%xmm15
9641 15,40,193, //movaps %xmm1,%xmm0
9642 15,89,192, //mulps %xmm0,%xmm0
9643 68,15,40,210, //movaps %xmm2,%xmm10
9644 68,15,89,209, //mulps %xmm1,%xmm10
9645 69,15,88,213, //addps %xmm13,%xmm10
9646 68,15,89,208, //mulps %xmm0,%xmm10
9647 69,15,88,212, //addps %xmm12,%xmm10
9648 65,15,194,206,1, //cmpltps %xmm14,%xmm1
9649 15,40,193, //movaps %xmm1,%xmm0
9650 102,69,15,56,20,215, //blendvps %xmm0,%xmm15,%xmm10
9651 69,15,89,216, //mulps %xmm8,%xmm11
9652 65,15,40,192, //movaps %xmm8,%xmm0
9653 15,89,192, //mulps %xmm0,%xmm0
9654 65,15,89,208, //mulps %xmm8,%xmm2
9655 65,15,88,213, //addps %xmm13,%xmm2
9656 15,89,208, //mulps %xmm0,%xmm2
9657 65,15,88,212, //addps %xmm12,%xmm2
9658 69,15,194,198,1, //cmpltps %xmm14,%xmm8
9659 65,15,40,192, //movaps %xmm8,%xmm0
9660 102,65,15,56,20,211, //blendvps %xmm0,%xmm11,%xmm2
9661 72,173, //lods %ds:(%rsi),%rax
9662 65,15,40,193, //movaps %xmm9,%xmm0
9663 65,15,40,202, //movaps %xmm10,%xmm1
9664 255,224, //jmpq *%rax
9665};
9666
// sk_to_srgb_sse41: applies the same piecewise function independently to
// xmm0, xmm1 and xmm2. With kNN = float at 0xNN(%rdx), broadcast, and
//     rs = rsqrtps(x),  s = rcpps(rs)  (approx. sqrt x),
//     q  = rsqrtps(rs)                 (approx. x^(1/4)),
// each lane x becomes:
//     x < k58 ?  k48 * x  :  min(k00, k50 * s + k54 + k4c * q)
// Register pressure: 0x18 bytes of stack are reserved and xmm7 is spilled to
// (%rsp); xmm3..xmm6 are shifted up one register (xmm3->xmm4 ... xmm6->xmm7)
// to free xmm1-xmm3 as temporaries, and everything is shifted back / reloaded
// before the tail jump, so xmm3-xmm7 leave with their entry values.
// NOTE(review): k00 is presumably 1.0f (upper clamp) and the other constants
// the linear->sRGB approximation coefficients - values are not visible here.
9667CODE const uint8_t sk_to_srgb_sse41[] = {
9668 72,131,236,24, //sub $0x18,%rsp
9669 15,41,60,36, //movaps %xmm7,(%rsp)
9670 15,40,254, //movaps %xmm6,%xmm7
9671 15,40,245, //movaps %xmm5,%xmm6
9672 15,40,236, //movaps %xmm4,%xmm5
9673 15,40,227, //movaps %xmm3,%xmm4
9674 68,15,40,194, //movaps %xmm2,%xmm8
9675 15,40,217, //movaps %xmm1,%xmm3
9676 15,82,208, //rsqrtps %xmm0,%xmm2
9677 68,15,83,202, //rcpps %xmm2,%xmm9
9678 68,15,82,210, //rsqrtps %xmm2,%xmm10
9679 243,15,16,18, //movss (%rdx),%xmm2
9680 243,68,15,16,90,72, //movss 0x48(%rdx),%xmm11
9681 69,15,198,219,0, //shufps $0x0,%xmm11,%xmm11
9682 65,15,40,203, //movaps %xmm11,%xmm1
9683 15,89,200, //mulps %xmm0,%xmm1
9684 15,198,210,0, //shufps $0x0,%xmm2,%xmm2
9685 243,68,15,16,98,76, //movss 0x4c(%rdx),%xmm12
9686 69,15,198,228,0, //shufps $0x0,%xmm12,%xmm12
9687 243,68,15,16,106,80, //movss 0x50(%rdx),%xmm13
9688 69,15,198,237,0, //shufps $0x0,%xmm13,%xmm13
9689 243,68,15,16,114,84, //movss 0x54(%rdx),%xmm14
9690 69,15,198,246,0, //shufps $0x0,%xmm14,%xmm14
9691 69,15,89,205, //mulps %xmm13,%xmm9
9692 69,15,88,206, //addps %xmm14,%xmm9
9693 69,15,89,212, //mulps %xmm12,%xmm10
9694 69,15,88,209, //addps %xmm9,%xmm10
9695 68,15,40,202, //movaps %xmm2,%xmm9
9696 69,15,93,202, //minps %xmm10,%xmm9
9697 243,68,15,16,122,88, //movss 0x58(%rdx),%xmm15
9698 69,15,198,255,0, //shufps $0x0,%xmm15,%xmm15
9699 65,15,194,199,1, //cmpltps %xmm15,%xmm0
9700 102,68,15,56,20,201, //blendvps %xmm0,%xmm1,%xmm9
9701 15,82,195, //rsqrtps %xmm3,%xmm0
9702 15,83,200, //rcpps %xmm0,%xmm1
9703 15,82,192, //rsqrtps %xmm0,%xmm0
9704 65,15,89,205, //mulps %xmm13,%xmm1
9705 65,15,88,206, //addps %xmm14,%xmm1
9706 65,15,89,196, //mulps %xmm12,%xmm0
9707 15,88,193, //addps %xmm1,%xmm0
9708 68,15,40,210, //movaps %xmm2,%xmm10
9709 68,15,93,208, //minps %xmm0,%xmm10
9710 65,15,40,203, //movaps %xmm11,%xmm1
9711 15,89,203, //mulps %xmm3,%xmm1
9712 65,15,194,223,1, //cmpltps %xmm15,%xmm3
9713 15,40,195, //movaps %xmm3,%xmm0
9714 102,68,15,56,20,209, //blendvps %xmm0,%xmm1,%xmm10
9715 65,15,82,192, //rsqrtps %xmm8,%xmm0
9716 15,83,200, //rcpps %xmm0,%xmm1
9717 65,15,89,205, //mulps %xmm13,%xmm1
9718 65,15,88,206, //addps %xmm14,%xmm1
9719 15,82,192, //rsqrtps %xmm0,%xmm0
9720 65,15,89,196, //mulps %xmm12,%xmm0
9721 15,88,193, //addps %xmm1,%xmm0
9722 15,93,208, //minps %xmm0,%xmm2
9723 69,15,89,216, //mulps %xmm8,%xmm11
9724 69,15,194,199,1, //cmpltps %xmm15,%xmm8
9725 65,15,40,192, //movaps %xmm8,%xmm0
9726 102,65,15,56,20,211, //blendvps %xmm0,%xmm11,%xmm2
9727 72,173, //lods %ds:(%rsi),%rax
9728 65,15,40,193, //movaps %xmm9,%xmm0
9729 65,15,40,202, //movaps %xmm10,%xmm1
9730 15,40,220, //movaps %xmm4,%xmm3
9731 15,40,229, //movaps %xmm5,%xmm4
9732 15,40,238, //movaps %xmm6,%xmm5
9733 15,40,247, //movaps %xmm7,%xmm6
9734 15,40,60,36, //movaps (%rsp),%xmm7
9735 72,131,196,24, //add $0x18,%rsp
9736 255,224, //jmpq *%rax
9737};
9738
// sk_scale_1_float_sse41: the first lods reads this stage's argument from the
// stream at %rsi - a pointer to a single float. That float is broadcast to all
// lanes (xmm8) and multiplied into xmm0, xmm1, xmm2 and xmm3. The second lods
// fetches the next entry, which is jumped to.
9739CODE const uint8_t sk_scale_1_float_sse41[] = {
9740 72,173, //lods %ds:(%rsi),%rax
9741 243,68,15,16,0, //movss (%rax),%xmm8
9742 69,15,198,192,0, //shufps $0x0,%xmm8,%xmm8
9743 65,15,89,192, //mulps %xmm8,%xmm0
9744 65,15,89,200, //mulps %xmm8,%xmm1
9745 65,15,89,208, //mulps %xmm8,%xmm2
9746 65,15,89,216, //mulps %xmm8,%xmm3
9747 72,173, //lods %ds:(%rsi),%rax
9748 255,224, //jmpq *%rax
9749};
9750
// sk_scale_u8_sse41: the stage argument (first lods) points to a pointer to a
// byte buffer. Four bytes at buffer[%rdi .. %rdi+3] are zero-extended to
// 32-bit lanes, converted to float and multiplied by the broadcast float at
// 0xc(%rdx); the resulting per-lane factor (xmm9) is multiplied into xmm0-xmm3.
// NOTE(review): 0xc(%rdx) is presumably 1/255.0f, mapping bytes to [0,1] - confirm.
9751CODE const uint8_t sk_scale_u8_sse41[] = {
9752 72,173, //lods %ds:(%rsi),%rax
9753 72,139,0, //mov (%rax),%rax
9754 102,68,15,56,49,4,56, //pmovzxbd (%rax,%rdi,1),%xmm8
9755 69,15,91,192, //cvtdq2ps %xmm8,%xmm8
9756 243,68,15,16,74,12, //movss 0xc(%rdx),%xmm9
9757 69,15,198,201,0, //shufps $0x0,%xmm9,%xmm9
9758 69,15,89,200, //mulps %xmm8,%xmm9
9759 65,15,89,193, //mulps %xmm9,%xmm0
9760 65,15,89,201, //mulps %xmm9,%xmm1
9761 65,15,89,209, //mulps %xmm9,%xmm2
9762 65,15,89,217, //mulps %xmm9,%xmm3
9763 72,173, //lods %ds:(%rsi),%rax
9764 255,224, //jmpq *%rax
9765};
9766
// sk_lerp_1_float_sse41: the stage argument (first lods) points to a single
// float t, broadcast to all lanes. For N = 0..3:
//     xmmN = (xmmN - xmm(N+4)) * t + xmm(N+4)
// i.e. a linear interpolation from xmm4-xmm7 (t = 0) to xmm0-xmm3 (t = 1).
// xmm4-xmm7 are unchanged.
9767CODE const uint8_t sk_lerp_1_float_sse41[] = {
9768 72,173, //lods %ds:(%rsi),%rax
9769 243,68,15,16,0, //movss (%rax),%xmm8
9770 69,15,198,192,0, //shufps $0x0,%xmm8,%xmm8
9771 15,92,196, //subps %xmm4,%xmm0
9772 65,15,89,192, //mulps %xmm8,%xmm0
9773 15,88,196, //addps %xmm4,%xmm0
9774 15,92,205, //subps %xmm5,%xmm1
9775 65,15,89,200, //mulps %xmm8,%xmm1
9776 15,88,205, //addps %xmm5,%xmm1
9777 15,92,214, //subps %xmm6,%xmm2
9778 65,15,89,208, //mulps %xmm8,%xmm2
9779 15,88,214, //addps %xmm6,%xmm2
9780 15,92,223, //subps %xmm7,%xmm3
9781 65,15,89,216, //mulps %xmm8,%xmm3
9782 15,88,223, //addps %xmm7,%xmm3
9783 72,173, //lods %ds:(%rsi),%rax
9784 255,224, //jmpq *%rax
9785};
9786
// sk_lerp_u8_sse41: like a per-lane lerp with a coverage byte per lane. The
// stage argument points to a pointer to a byte buffer; four bytes at
// buffer[%rdi .. %rdi+3] are zero-extended, converted to float and multiplied
// by the broadcast float at 0xc(%rdx) to form t (xmm9). Then for N = 0..3:
//     xmmN = (xmmN - xmm(N+4)) * t + xmm(N+4)
// NOTE(review): 0xc(%rdx) is presumably 1/255.0f - confirm.
9787CODE const uint8_t sk_lerp_u8_sse41[] = {
9788 72,173, //lods %ds:(%rsi),%rax
9789 72,139,0, //mov (%rax),%rax
9790 102,68,15,56,49,4,56, //pmovzxbd (%rax,%rdi,1),%xmm8
9791 69,15,91,192, //cvtdq2ps %xmm8,%xmm8
9792 243,68,15,16,74,12, //movss 0xc(%rdx),%xmm9
9793 69,15,198,201,0, //shufps $0x0,%xmm9,%xmm9
9794 69,15,89,200, //mulps %xmm8,%xmm9
9795 15,92,196, //subps %xmm4,%xmm0
9796 65,15,89,193, //mulps %xmm9,%xmm0
9797 15,88,196, //addps %xmm4,%xmm0
9798 15,92,205, //subps %xmm5,%xmm1
9799 65,15,89,201, //mulps %xmm9,%xmm1
9800 15,88,205, //addps %xmm5,%xmm1
9801 15,92,214, //subps %xmm6,%xmm2
9802 65,15,89,209, //mulps %xmm9,%xmm2
9803 15,88,214, //addps %xmm6,%xmm2
9804 15,92,223, //subps %xmm7,%xmm3
9805 65,15,89,217, //mulps %xmm9,%xmm3
9806 15,88,223, //addps %xmm7,%xmm3
9807 72,173, //lods %ds:(%rsi),%rax
9808 255,224, //jmpq *%rax
9809};
9810
// sk_lerp_565_sse41: lerp with a separate factor per channel, taken from
// 16-bit values. The stage argument points to a pointer to a 16-bit buffer;
// four values at index %rdi are zero-extended to 32-bit lanes (xmm8). Three
// factors are derived as float(value & mask) * scale, with the masks at
// 0x68/0x6c/0x70(%rdx) and the scales at 0x74/0x78/0x7c(%rdx):
//     xmm0 = (xmm0 - xmm4) * t0 + xmm4
//     xmm1 = (xmm1 - xmm5) * t1 + xmm5
//     xmm2 = (xmm2 - xmm6) * t2 + xmm6
// xmm3 is used as a temporary early on and leaves holding the float at
// 0(%rdx) broadcast to all lanes (its incoming value is discarded).
// NOTE(review): masks/scales are presumably the RGB565 field masks and their
// reciprocals, and 0(%rdx) presumably 1.0f - confirm.
9811CODE const uint8_t sk_lerp_565_sse41[] = {
9812 72,173, //lods %ds:(%rsi),%rax
9813 72,139,0, //mov (%rax),%rax
9814 102,68,15,56,51,4,120, //pmovzxwd (%rax,%rdi,2),%xmm8
9815 102,15,110,90,104, //movd 0x68(%rdx),%xmm3
9816 102,15,112,219,0, //pshufd $0x0,%xmm3,%xmm3
9817 102,65,15,219,216, //pand %xmm8,%xmm3
9818 68,15,91,203, //cvtdq2ps %xmm3,%xmm9
9819 243,15,16,26, //movss (%rdx),%xmm3
9820 243,68,15,16,82,116, //movss 0x74(%rdx),%xmm10
9821 69,15,198,210,0, //shufps $0x0,%xmm10,%xmm10
9822 69,15,89,209, //mulps %xmm9,%xmm10
9823 102,68,15,110,74,108, //movd 0x6c(%rdx),%xmm9
9824 102,69,15,112,201,0, //pshufd $0x0,%xmm9,%xmm9
9825 102,69,15,219,200, //pand %xmm8,%xmm9
9826 69,15,91,201, //cvtdq2ps %xmm9,%xmm9
9827 243,68,15,16,90,120, //movss 0x78(%rdx),%xmm11
9828 69,15,198,219,0, //shufps $0x0,%xmm11,%xmm11
9829 69,15,89,217, //mulps %xmm9,%xmm11
9830 102,68,15,110,74,112, //movd 0x70(%rdx),%xmm9
9831 102,69,15,112,201,0, //pshufd $0x0,%xmm9,%xmm9
9832 102,69,15,219,200, //pand %xmm8,%xmm9
9833 69,15,91,193, //cvtdq2ps %xmm9,%xmm8
9834 243,68,15,16,74,124, //movss 0x7c(%rdx),%xmm9
9835 69,15,198,201,0, //shufps $0x0,%xmm9,%xmm9
9836 69,15,89,200, //mulps %xmm8,%xmm9
9837 15,92,196, //subps %xmm4,%xmm0
9838 65,15,89,194, //mulps %xmm10,%xmm0
9839 15,88,196, //addps %xmm4,%xmm0
9840 15,92,205, //subps %xmm5,%xmm1
9841 65,15,89,203, //mulps %xmm11,%xmm1
9842 15,88,205, //addps %xmm5,%xmm1
9843 15,92,214, //subps %xmm6,%xmm2
9844 65,15,89,209, //mulps %xmm9,%xmm2
9845 15,88,214, //addps %xmm6,%xmm2
9846 15,198,219,0, //shufps $0x0,%xmm3,%xmm3
9847 72,173, //lods %ds:(%rsi),%rax
9848 255,224, //jmpq *%rax
9849};
9850
// sk_load_tables_sse41: loads four 32-bit pixels and maps three of their bytes
// through float lookup tables. The stage argument (first lods) points to a
// record of four pointers:
//     +0x00  pixel buffer (read at index %rdi, 4 bytes per pixel)
//     +0x08  table indexed by  pixel        & mask  -> xmm0
//     +0x10  table indexed by (pixel >>  8) & mask  -> xmm1
//     +0x18  table indexed by (pixel >> 16) & mask  -> xmm2
// where mask is the dword at 0x10(%rdx), broadcast. The four indices of each
// channel are pulled out to general registers (pextrq/movq plus 32-bit
// split) and the floats gathered with movss/insertps.
// xmm3 = float(pixel >> 24) * broadcast float at 0xc(%rdx) (no table).
// Clobbers %rcx, %r8-%r11 and xmm8.
// NOTE(review): mask is presumably 0xff and 0xc(%rdx) presumably 1/255.0f - confirm.
9851CODE const uint8_t sk_load_tables_sse41[] = {
9852 72,173, //lods %ds:(%rsi),%rax
9853 72,139,8, //mov (%rax),%rcx
9854 76,139,64,8, //mov 0x8(%rax),%r8
9855 243,68,15,111,4,185, //movdqu (%rcx,%rdi,4),%xmm8
9856 102,15,110,66,16, //movd 0x10(%rdx),%xmm0
9857 102,15,112,192,0, //pshufd $0x0,%xmm0,%xmm0
9858 102,65,15,111,200, //movdqa %xmm8,%xmm1
9859 102,15,114,209,8, //psrld $0x8,%xmm1
9860 102,15,219,200, //pand %xmm0,%xmm1
9861 102,65,15,111,208, //movdqa %xmm8,%xmm2
9862 102,15,114,210,16, //psrld $0x10,%xmm2
9863 102,15,219,208, //pand %xmm0,%xmm2
9864 102,65,15,219,192, //pand %xmm8,%xmm0
9865 102,72,15,58,22,193,1, //pextrq $0x1,%xmm0,%rcx
9866 65,137,201, //mov %ecx,%r9d
9867 72,193,233,32, //shr $0x20,%rcx
9868 102,73,15,126,194, //movq %xmm0,%r10
9869 69,137,211, //mov %r10d,%r11d
9870 73,193,234,32, //shr $0x20,%r10
9871 243,67,15,16,4,152, //movss (%r8,%r11,4),%xmm0
9872 102,67,15,58,33,4,144,16, //insertps $0x10,(%r8,%r10,4),%xmm0
9873 102,67,15,58,33,4,136,32, //insertps $0x20,(%r8,%r9,4),%xmm0
9874 102,65,15,58,33,4,136,48, //insertps $0x30,(%r8,%rcx,4),%xmm0
9875 72,139,72,16, //mov 0x10(%rax),%rcx
9876 102,73,15,58,22,200,1, //pextrq $0x1,%xmm1,%r8
9877 69,137,193, //mov %r8d,%r9d
9878 73,193,232,32, //shr $0x20,%r8
9879 102,73,15,126,202, //movq %xmm1,%r10
9880 69,137,211, //mov %r10d,%r11d
9881 73,193,234,32, //shr $0x20,%r10
9882 243,66,15,16,12,153, //movss (%rcx,%r11,4),%xmm1
9883 102,66,15,58,33,12,145,16, //insertps $0x10,(%rcx,%r10,4),%xmm1
9884 243,66,15,16,28,137, //movss (%rcx,%r9,4),%xmm3
9885 102,15,58,33,203,32, //insertps $0x20,%xmm3,%xmm1
9886 243,66,15,16,28,129, //movss (%rcx,%r8,4),%xmm3
9887 102,15,58,33,203,48, //insertps $0x30,%xmm3,%xmm1
9888 72,139,64,24, //mov 0x18(%rax),%rax
9889 102,72,15,58,22,209,1, //pextrq $0x1,%xmm2,%rcx
9890 65,137,200, //mov %ecx,%r8d
9891 72,193,233,32, //shr $0x20,%rcx
9892 102,73,15,126,209, //movq %xmm2,%r9
9893 69,137,202, //mov %r9d,%r10d
9894 73,193,233,32, //shr $0x20,%r9
9895 243,66,15,16,20,144, //movss (%rax,%r10,4),%xmm2
9896 102,66,15,58,33,20,136,16, //insertps $0x10,(%rax,%r9,4),%xmm2
9897 243,66,15,16,28,128, //movss (%rax,%r8,4),%xmm3
9898 102,15,58,33,211,32, //insertps $0x20,%xmm3,%xmm2
9899 243,15,16,28,136, //movss (%rax,%rcx,4),%xmm3
9900 102,15,58,33,211,48, //insertps $0x30,%xmm3,%xmm2
9901 102,65,15,114,208,24, //psrld $0x18,%xmm8
9902 69,15,91,192, //cvtdq2ps %xmm8,%xmm8
9903 243,15,16,90,12, //movss 0xc(%rdx),%xmm3
9904 15,198,219,0, //shufps $0x0,%xmm3,%xmm3
9905 65,15,89,216, //mulps %xmm8,%xmm3
9906 72,173, //lods %ds:(%rsi),%rax
9907 255,224, //jmpq *%rax
9908};
9909
// sk_load_a8_sse41: the stage argument points to a pointer to a byte buffer.
// Four bytes at buffer[%rdi .. %rdi+3] are zero-extended, converted to float
// and multiplied by the broadcast float at 0xc(%rdx); the result goes to xmm3.
// xmm0, xmm1 and xmm2 are zeroed.
// NOTE(review): 0xc(%rdx) is presumably 1/255.0f - confirm.
9910CODE const uint8_t sk_load_a8_sse41[] = {
9911 72,173, //lods %ds:(%rsi),%rax
9912 72,139,0, //mov (%rax),%rax
9913 102,15,56,49,4,56, //pmovzxbd (%rax,%rdi,1),%xmm0
9914 15,91,192, //cvtdq2ps %xmm0,%xmm0
9915 243,15,16,90,12, //movss 0xc(%rdx),%xmm3
9916 15,198,219,0, //shufps $0x0,%xmm3,%xmm3
9917 15,89,216, //mulps %xmm0,%xmm3
9918 72,173, //lods %ds:(%rsi),%rax
9919 15,87,192, //xorps %xmm0,%xmm0
9920 15,87,201, //xorps %xmm1,%xmm1
9921 15,87,210, //xorps %xmm2,%xmm2
9922 255,224, //jmpq *%rax
9923};
9924
// sk_store_a8_sse41: the stage argument points to a pointer to a byte buffer.
// xmm3 is multiplied by the broadcast float at 0x8(%rdx), converted to 32-bit
// integers, narrowed with unsigned saturation to 16 bits (packusdw) and then
// to 8 bits (packuswb); the low four bytes are written to
// buffer[%rdi .. %rdi+3] with a single movd. xmm0-xmm3 are not modified.
// NOTE(review): 0x8(%rdx) is presumably 255.0f - confirm.
9925CODE const uint8_t sk_store_a8_sse41[] = {
9926 72,173, //lods %ds:(%rsi),%rax
9927 72,139,0, //mov (%rax),%rax
9928 243,68,15,16,66,8, //movss 0x8(%rdx),%xmm8
9929 69,15,198,192,0, //shufps $0x0,%xmm8,%xmm8
9930 68,15,89,195, //mulps %xmm3,%xmm8
9931 102,69,15,91,192, //cvtps2dq %xmm8,%xmm8
9932 102,69,15,56,43,192, //packusdw %xmm8,%xmm8
9933 102,69,15,103,192, //packuswb %xmm8,%xmm8
9934 102,68,15,126,4,56, //movd %xmm8,(%rax,%rdi,1)
9935 72,173, //lods %ds:(%rsi),%rax
9936 255,224, //jmpq *%rax
9937};
9938
// sk_load_565_sse41: the stage argument points to a pointer to a 16-bit
// buffer. Four values at index %rdi are zero-extended to 32-bit lanes (xmm9)
// and split into three channels, each computed as float(value & mask) * scale:
//     xmm0: mask 0x68(%rdx), scale 0x74(%rdx)
//     xmm1: mask 0x6c(%rdx), scale 0x78(%rdx)
//     xmm2: mask 0x70(%rdx), scale 0x7c(%rdx)
// xmm3 is set to the float at 0(%rdx) broadcast to all lanes.
// No shift is applied after masking, so each scale constant must account for
// its field's bit position.
// NOTE(review): presumably RGB565 field masks and 0(%rdx) == 1.0f (opaque
// alpha) - confirm against the constants struct.
9939CODE const uint8_t sk_load_565_sse41[] = {
9940 72,173, //lods %ds:(%rsi),%rax
9941 72,139,0, //mov (%rax),%rax
9942 102,68,15,56,51,12,120, //pmovzxwd (%rax,%rdi,2),%xmm9
9943 102,15,110,66,104, //movd 0x68(%rdx),%xmm0
9944 102,15,112,192,0, //pshufd $0x0,%xmm0,%xmm0
9945 102,65,15,219,193, //pand %xmm9,%xmm0
9946 15,91,200, //cvtdq2ps %xmm0,%xmm1
9947 243,15,16,26, //movss (%rdx),%xmm3
9948 243,15,16,66,116, //movss 0x74(%rdx),%xmm0
9949 15,198,192,0, //shufps $0x0,%xmm0,%xmm0
9950 15,89,193, //mulps %xmm1,%xmm0
9951 102,15,110,74,108, //movd 0x6c(%rdx),%xmm1
9952 102,15,112,201,0, //pshufd $0x0,%xmm1,%xmm1
9953 102,65,15,219,201, //pand %xmm9,%xmm1
9954 68,15,91,193, //cvtdq2ps %xmm1,%xmm8
9955 243,15,16,74,120, //movss 0x78(%rdx),%xmm1
9956 15,198,201,0, //shufps $0x0,%xmm1,%xmm1
9957 65,15,89,200, //mulps %xmm8,%xmm1
9958 102,15,110,82,112, //movd 0x70(%rdx),%xmm2
9959 102,15,112,210,0, //pshufd $0x0,%xmm2,%xmm2
9960 102,65,15,219,209, //pand %xmm9,%xmm2
9961 68,15,91,194, //cvtdq2ps %xmm2,%xmm8
9962 243,15,16,82,124, //movss 0x7c(%rdx),%xmm2
9963 15,198,210,0, //shufps $0x0,%xmm2,%xmm2
9964 65,15,89,208, //mulps %xmm8,%xmm2
9965 15,198,219,0, //shufps $0x0,%xmm3,%xmm3
9966 72,173, //lods %ds:(%rsi),%rax
9967 255,224, //jmpq *%rax
9968};
9969
// sk_store_565_sse41: packs xmm0, xmm1, xmm2 into four 16-bit values and
// stores them. With A = float at 0x80(%rdx) and B = float at 0x84(%rdx):
//     packed = (int(xmm0 * A) << 11) | (int(xmm1 * B) << 5) | int(xmm2 * A)
// The 32-bit lanes are narrowed to 16 bits with unsigned saturation
// (packusdw) and the low 8 bytes are written with movq to the 16-bit buffer
// (pointer-to-pointer stage argument) at index %rdi. xmm0-xmm3 are not modified.
// NOTE(review): A and B are presumably 31.0f and 63.0f - confirm.
9970CODE const uint8_t sk_store_565_sse41[] = {
9971 72,173, //lods %ds:(%rsi),%rax
9972 72,139,0, //mov (%rax),%rax
9973 243,68,15,16,130,128,0,0,0, //movss 0x80(%rdx),%xmm8
9974 243,68,15,16,138,132,0,0,0, //movss 0x84(%rdx),%xmm9
9975 69,15,198,192,0, //shufps $0x0,%xmm8,%xmm8
9976 69,15,40,208, //movaps %xmm8,%xmm10
9977 68,15,89,208, //mulps %xmm0,%xmm10
9978 102,69,15,91,210, //cvtps2dq %xmm10,%xmm10
9979 102,65,15,114,242,11, //pslld $0xb,%xmm10
9980 69,15,198,201,0, //shufps $0x0,%xmm9,%xmm9
9981 68,15,89,201, //mulps %xmm1,%xmm9
9982 102,69,15,91,201, //cvtps2dq %xmm9,%xmm9
9983 102,65,15,114,241,5, //pslld $0x5,%xmm9
9984 102,69,15,235,202, //por %xmm10,%xmm9
9985 68,15,89,194, //mulps %xmm2,%xmm8
9986 102,69,15,91,192, //cvtps2dq %xmm8,%xmm8
9987 102,69,15,86,193, //orpd %xmm9,%xmm8
9988 102,69,15,56,43,192, //packusdw %xmm8,%xmm8
9989 102,68,15,214,4,120, //movq %xmm8,(%rax,%rdi,2)
9990 72,173, //lods %ds:(%rsi),%rax
9991 255,224, //jmpq *%rax
9992};
9993
// sk_load_8888_sse41: loads four 32-bit pixels (unaligned, 16 bytes) from the
// buffer named by the pointer-to-pointer stage argument, at index %rdi, and
// splits each pixel into four float channels:
//     xmm0 = float( pixel        & mask) * s
//     xmm1 = float((pixel >>  8) & mask) * s
//     xmm2 = float((pixel >> 16) & mask) * s
//     xmm3 = float( pixel >> 24        ) * s
// where mask is the dword at 0x10(%rdx) and s the float at 0xc(%rdx), both
// broadcast to all lanes.
// NOTE(review): mask is presumably 0xff and s presumably 1/255.0f - confirm.
9994CODE const uint8_t sk_load_8888_sse41[] = {
9995 72,173, //lods %ds:(%rsi),%rax
9996 72,139,0, //mov (%rax),%rax
9997 243,15,111,28,184, //movdqu (%rax,%rdi,4),%xmm3
9998 102,15,110,66,16, //movd 0x10(%rdx),%xmm0
9999 102,15,112,192,0, //pshufd $0x0,%xmm0,%xmm0
10000 102,15,111,203, //movdqa %xmm3,%xmm1
10001 102,15,114,209,8, //psrld $0x8,%xmm1
10002 102,15,219,200, //pand %xmm0,%xmm1
10003 102,15,111,211, //movdqa %xmm3,%xmm2
10004 102,15,114,210,16, //psrld $0x10,%xmm2
10005 102,15,219,208, //pand %xmm0,%xmm2
10006 102,15,219,195, //pand %xmm3,%xmm0
10007 15,91,192, //cvtdq2ps %xmm0,%xmm0
10008 243,68,15,16,66,12, //movss 0xc(%rdx),%xmm8
10009 69,15,198,192,0, //shufps $0x0,%xmm8,%xmm8
10010 65,15,89,192, //mulps %xmm8,%xmm0
10011 15,91,201, //cvtdq2ps %xmm1,%xmm1
10012 65,15,89,200, //mulps %xmm8,%xmm1
10013 15,91,210, //cvtdq2ps %xmm2,%xmm2
10014 65,15,89,208, //mulps %xmm8,%xmm2
10015 102,15,114,211,24, //psrld $0x18,%xmm3
10016 15,91,219, //cvtdq2ps %xmm3,%xmm3
10017 65,15,89,216, //mulps %xmm8,%xmm3
10018 72,173, //lods %ds:(%rsi),%rax
10019 255,224, //jmpq *%rax
10020};
10021
// sk_store_8888_sse41: packs xmm0-xmm3 into four 32-bit pixels and stores
// them (unaligned, 16 bytes) to the buffer named by the pointer-to-pointer
// stage argument, at index %rdi. With s = float at 0x8(%rdx), broadcast:
//     pixel = int(xmm0*s) | int(xmm1*s) << 8 | int(xmm2*s) << 16 | int(xmm3*s) << 24
// No clamping or masking is done here, so inputs are expected to already be
// in range for the result to be well-formed. xmm0-xmm3 are not modified.
// NOTE(review): s is presumably 255.0f - confirm.
10022CODE const uint8_t sk_store_8888_sse41[] = {
10023 72,173, //lods %ds:(%rsi),%rax
10024 72,139,0, //mov (%rax),%rax
10025 243,68,15,16,66,8, //movss 0x8(%rdx),%xmm8
10026 69,15,198,192,0, //shufps $0x0,%xmm8,%xmm8
10027 69,15,40,200, //movaps %xmm8,%xmm9
10028 68,15,89,200, //mulps %xmm0,%xmm9
10029 102,69,15,91,201, //cvtps2dq %xmm9,%xmm9
10030 69,15,40,208, //movaps %xmm8,%xmm10
10031 68,15,89,209, //mulps %xmm1,%xmm10
10032 102,69,15,91,210, //cvtps2dq %xmm10,%xmm10
10033 102,65,15,114,242,8, //pslld $0x8,%xmm10
10034 102,69,15,235,209, //por %xmm9,%xmm10
10035 69,15,40,200, //movaps %xmm8,%xmm9
10036 68,15,89,202, //mulps %xmm2,%xmm9
10037 102,69,15,91,201, //cvtps2dq %xmm9,%xmm9
10038 102,65,15,114,241,16, //pslld $0x10,%xmm9
10039 68,15,89,195, //mulps %xmm3,%xmm8
10040 102,69,15,91,192, //cvtps2dq %xmm8,%xmm8
10041 102,65,15,114,240,24, //pslld $0x18,%xmm8
10042 102,69,15,235,193, //por %xmm9,%xmm8
10043 102,69,15,235,194, //por %xmm10,%xmm8
10044 243,68,15,127,4,184, //movdqu %xmm8,(%rax,%rdi,4)
10045 72,173, //lods %ds:(%rsi),%rax
10046 255,224, //jmpq *%rax
10047};
10048
// sk_load_f16_sse41: loads four pixels of four 16-bit components each
// (32 bytes, unaligned) from the buffer named by the pointer-to-pointer stage
// argument, at index %rdi (8 bytes per pixel), and widens them to floats.
// Steps:
//   1. Two rounds of punpcklwd/punpckhwd transpose the 16-bit lanes so that
//      xmm8 = components 0 and 1 of all four pixels, xmm2 = components 2 and 3.
//   2. The dword at 0x64(%rdx) is broadcast and compared per 16-bit lane
//      (signed pcmpgtw); lanes where the constant is greater are zeroed via
//      pandn.
//   3. Each group of four 16-bit values is zero-extended to 32 bits, shifted
//      left by 13 and multiplied (as float) by the broadcast dword at
//      0x5c(%rdx).
// Results: component 0 -> xmm0, 1 -> xmm1, 2 -> xmm2, 3 -> xmm3.
// NOTE(review): step 2 presumably flushes half-float denormals (and, since
// the compare is signed, values with the sign bit set) to zero, and 0x5c(%rdx)
// is presumably the exponent-rebias multiplier - confirm.
10049CODE const uint8_t sk_load_f16_sse41[] = {
10050 72,173, //lods %ds:(%rsi),%rax
10051 72,139,0, //mov (%rax),%rax
10052 243,15,111,4,248, //movdqu (%rax,%rdi,8),%xmm0
10053 243,15,111,76,248,16, //movdqu 0x10(%rax,%rdi,8),%xmm1
10054 102,15,111,208, //movdqa %xmm0,%xmm2
10055 102,15,97,209, //punpcklwd %xmm1,%xmm2
10056 102,15,105,193, //punpckhwd %xmm1,%xmm0
10057 102,68,15,111,194, //movdqa %xmm2,%xmm8
10058 102,68,15,97,192, //punpcklwd %xmm0,%xmm8
10059 102,15,105,208, //punpckhwd %xmm0,%xmm2
10060 102,15,110,66,100, //movd 0x64(%rdx),%xmm0
10061 102,15,112,216,0, //pshufd $0x0,%xmm0,%xmm3
10062 102,15,111,203, //movdqa %xmm3,%xmm1
10063 102,65,15,101,200, //pcmpgtw %xmm8,%xmm1
10064 102,65,15,223,200, //pandn %xmm8,%xmm1
10065 102,15,101,218, //pcmpgtw %xmm2,%xmm3
10066 102,15,223,218, //pandn %xmm2,%xmm3
10067 102,15,56,51,193, //pmovzxwd %xmm1,%xmm0
10068 102,15,114,240,13, //pslld $0xd,%xmm0
10069 102,15,110,82,92, //movd 0x5c(%rdx),%xmm2
10070 102,68,15,112,194,0, //pshufd $0x0,%xmm2,%xmm8
10071 65,15,89,192, //mulps %xmm8,%xmm0
10072 102,69,15,239,201, //pxor %xmm9,%xmm9
10073 102,65,15,105,201, //punpckhwd %xmm9,%xmm1
10074 102,15,114,241,13, //pslld $0xd,%xmm1
10075 65,15,89,200, //mulps %xmm8,%xmm1
10076 102,15,56,51,211, //pmovzxwd %xmm3,%xmm2
10077 102,15,114,242,13, //pslld $0xd,%xmm2
10078 65,15,89,208, //mulps %xmm8,%xmm2
10079 102,65,15,105,217, //punpckhwd %xmm9,%xmm3
10080 102,15,114,243,13, //pslld $0xd,%xmm3
10081 65,15,89,216, //mulps %xmm8,%xmm3
10082 72,173, //lods %ds:(%rsi),%rax
10083 255,224, //jmpq *%rax
10084};
10085
// sk_store_f16_sse41: narrows xmm0-xmm3 to 16-bit values and stores four
// interleaved pixels (32 bytes, unaligned) to the buffer named by the
// pointer-to-pointer stage argument, at index %rdi (8 bytes per pixel).
// Steps:
//   1. Each channel is multiplied (as float) by the broadcast dword at
//      0x60(%rdx) and its bit pattern shifted right by 13.
//   2. pslldq $2 + por merge channel pairs into dwords: (xmm0 | xmm1 << 16)
//      and (xmm2 | xmm3 << 16). The byte shift is over the whole register,
//      so this relies on step 1 leaving the upper 16 bits of each lane zero.
//   3. punpckldq / punpckhdq interleave the pairs into pixels 0-1 and 2-3.
// xmm0-xmm3 are not modified; xmm8-xmm11 are scratch.
// NOTE(review): 0x60(%rdx) is presumably the float->half exponent-rebias
// multiplier; no rounding, clamping or sign handling is visible here - confirm
// the expected input range.
10086CODE const uint8_t sk_store_f16_sse41[] = {
10087 72,173, //lods %ds:(%rsi),%rax
10088 72,139,0, //mov (%rax),%rax
10089 102,68,15,110,66,96, //movd 0x60(%rdx),%xmm8
10090 102,69,15,112,192,0, //pshufd $0x0,%xmm8,%xmm8
10091 102,69,15,111,200, //movdqa %xmm8,%xmm9
10092 68,15,89,200, //mulps %xmm0,%xmm9
10093 102,65,15,114,209,13, //psrld $0xd,%xmm9
10094 102,69,15,111,208, //movdqa %xmm8,%xmm10
10095 68,15,89,209, //mulps %xmm1,%xmm10
10096 102,65,15,114,210,13, //psrld $0xd,%xmm10
10097 102,69,15,111,216, //movdqa %xmm8,%xmm11
10098 68,15,89,218, //mulps %xmm2,%xmm11
10099 102,65,15,114,211,13, //psrld $0xd,%xmm11
10100 68,15,89,195, //mulps %xmm3,%xmm8
10101 102,65,15,114,208,13, //psrld $0xd,%xmm8
10102 102,65,15,115,250,2, //pslldq $0x2,%xmm10
10103 102,69,15,235,209, //por %xmm9,%xmm10
10104 102,65,15,115,248,2, //pslldq $0x2,%xmm8
10105 102,69,15,235,195, //por %xmm11,%xmm8
10106 102,69,15,111,202, //movdqa %xmm10,%xmm9
10107 102,69,15,98,200, //punpckldq %xmm8,%xmm9
10108 243,68,15,127,12,248, //movdqu %xmm9,(%rax,%rdi,8)
10109 102,69,15,106,208, //punpckhdq %xmm8,%xmm10
10110 243,68,15,127,84,248,16, //movdqu %xmm10,0x10(%rax,%rdi,8)
10111 72,173, //lods %ds:(%rsi),%rax
10112 255,224, //jmpq *%rax
10113};
10114
// sk_store_f32_sse41: stores xmm0-xmm3 as four interleaved 4-float pixels
// (64 bytes, unaligned). The byte offset is %rdi << 4 (16 bytes per pixel)
// into the buffer named by the pointer-to-pointer stage argument.
// The unpcklps/unpckhps + unpcklpd/unpckhpd sequence is a 4x4 transpose:
// lane i of xmm0, xmm1, xmm2, xmm3 becomes the i-th stored pixel, in that
// component order. xmm0-xmm3 are not modified; xmm8-xmm12 and %rcx are scratch.
10115CODE const uint8_t sk_store_f32_sse41[] = {
10116 72,173, //lods %ds:(%rsi),%rax
10117 72,139,0, //mov (%rax),%rax
10118 72,137,249, //mov %rdi,%rcx
10119 72,193,225,4, //shl $0x4,%rcx
10120 68,15,40,192, //movaps %xmm0,%xmm8
10121 68,15,40,200, //movaps %xmm0,%xmm9
10122 68,15,20,201, //unpcklps %xmm1,%xmm9
10123 68,15,40,210, //movaps %xmm2,%xmm10
10124 68,15,40,218, //movaps %xmm2,%xmm11
10125 68,15,20,219, //unpcklps %xmm3,%xmm11
10126 68,15,21,193, //unpckhps %xmm1,%xmm8
10127 68,15,21,211, //unpckhps %xmm3,%xmm10
10128 69,15,40,225, //movaps %xmm9,%xmm12
10129 102,69,15,20,227, //unpcklpd %xmm11,%xmm12
10130 102,69,15,21,203, //unpckhpd %xmm11,%xmm9
10131 69,15,40,216, //movaps %xmm8,%xmm11
10132 102,69,15,20,218, //unpcklpd %xmm10,%xmm11
10133 102,69,15,21,194, //unpckhpd %xmm10,%xmm8
10134 102,68,15,17,36,8, //movupd %xmm12,(%rax,%rcx,1)
10135 102,68,15,17,76,8,16, //movupd %xmm9,0x10(%rax,%rcx,1)
10136 102,68,15,17,92,8,32, //movupd %xmm11,0x20(%rax,%rcx,1)
10137 102,68,15,17,68,8,48, //movupd %xmm8,0x30(%rax,%rcx,1)
10138 72,173, //lods %ds:(%rsi),%rax
10139 255,224, //jmpq *%rax
10140};
10141
// sk_clamp_x_sse41: clamps xmm0 per lane. The stage argument (first lods)
// points directly to a float `limit`:
//     xmm0 = min(max(0, xmm0), limit')
// where limit' is limit with its bit pattern decremented by one as an integer
// (pcmpeqd yields all-ones = -1, then paddd). For a positive finite float
// that is the next representable value below limit, making the upper bound
// exclusive.
// NOTE(review): assumes limit is a positive, finite, non-zero float - confirm
// with the caller.
10142CODE const uint8_t sk_clamp_x_sse41[] = {
10143 72,173, //lods %ds:(%rsi),%rax
10144 69,15,87,192, //xorps %xmm8,%xmm8
10145 68,15,95,192, //maxps %xmm0,%xmm8
10146 243,68,15,16,8, //movss (%rax),%xmm9
10147 69,15,198,201,0, //shufps $0x0,%xmm9,%xmm9
10148 102,15,118,192, //pcmpeqd %xmm0,%xmm0
10149 102,65,15,254,193, //paddd %xmm9,%xmm0
10150 68,15,93,192, //minps %xmm0,%xmm8
10151 72,173, //lods %ds:(%rsi),%rax
10152 65,15,40,192, //movaps %xmm8,%xmm0
10153 255,224, //jmpq *%rax
10154};
10155
// sk_clamp_y_sse41: same operation as the clamp_x stage, applied to xmm1.
// The stage argument points directly to a float `limit`:
//     xmm1 = min(max(0, xmm1), limit')
// where limit' is limit with its bit pattern decremented by one as an integer
// (all-ones from pcmpeqd added with paddd), i.e. the next float below limit
// for a positive finite value.
// NOTE(review): assumes limit is a positive, finite, non-zero float - confirm.
10156CODE const uint8_t sk_clamp_y_sse41[] = {
10157 72,173, //lods %ds:(%rsi),%rax
10158 69,15,87,192, //xorps %xmm8,%xmm8
10159 68,15,95,193, //maxps %xmm1,%xmm8
10160 243,68,15,16,8, //movss (%rax),%xmm9
10161 69,15,198,201,0, //shufps $0x0,%xmm9,%xmm9
10162 102,15,118,201, //pcmpeqd %xmm1,%xmm1
10163 102,65,15,254,201, //paddd %xmm9,%xmm1
10164 68,15,93,193, //minps %xmm1,%xmm8
10165 72,173, //lods %ds:(%rsi),%rax
10166 65,15,40,200, //movaps %xmm8,%xmm1
10167 255,224, //jmpq *%rax
10168};
10169
// sk_repeat_x_sse41: wraps xmm0 into a period. The stage argument points
// directly to a float `v`, broadcast to all lanes:
//     xmm0 = xmm0 - floor(xmm0 / v) * v      (roundps $1 = round toward -inf)
//     xmm0 = min(xmm0, v')
// where v' is v with its bit pattern decremented by one as an integer
// (the next float below v for a positive finite value), which guards against
// the subtraction rounding up to exactly v.
// NOTE(review): assumes v is positive, finite and non-zero - confirm.
10170CODE const uint8_t sk_repeat_x_sse41[] = {
10171 72,173, //lods %ds:(%rsi),%rax
10172 243,68,15,16,0, //movss (%rax),%xmm8
10173 69,15,198,192,0, //shufps $0x0,%xmm8,%xmm8
10174 68,15,40,200, //movaps %xmm0,%xmm9
10175 69,15,94,200, //divps %xmm8,%xmm9
10176 102,69,15,58,8,201,1, //roundps $0x1,%xmm9,%xmm9
10177 69,15,89,200, //mulps %xmm8,%xmm9
10178 65,15,92,193, //subps %xmm9,%xmm0
10179 102,69,15,118,201, //pcmpeqd %xmm9,%xmm9
10180 102,69,15,254,200, //paddd %xmm8,%xmm9
10181 65,15,93,193, //minps %xmm9,%xmm0
10182 72,173, //lods %ds:(%rsi),%rax
10183 255,224, //jmpq *%rax
10184};
10185
// sk_repeat_y_sse41: same operation as the repeat_x stage, applied to xmm1.
// The stage argument points directly to a float `v`, broadcast to all lanes:
//     xmm1 = xmm1 - floor(xmm1 / v) * v      (roundps $1 = round toward -inf)
//     xmm1 = min(xmm1, v')
// where v' is v with its bit pattern decremented by one as an integer.
// NOTE(review): assumes v is positive, finite and non-zero - confirm.
10186CODE const uint8_t sk_repeat_y_sse41[] = {
10187 72,173, //lods %ds:(%rsi),%rax
10188 243,68,15,16,0, //movss (%rax),%xmm8
10189 69,15,198,192,0, //shufps $0x0,%xmm8,%xmm8
10190 68,15,40,201, //movaps %xmm1,%xmm9
10191 69,15,94,200, //divps %xmm8,%xmm9
10192 102,69,15,58,8,201,1, //roundps $0x1,%xmm9,%xmm9
10193 69,15,89,200, //mulps %xmm8,%xmm9
10194 65,15,92,201, //subps %xmm9,%xmm1
10195 102,69,15,118,201, //pcmpeqd %xmm9,%xmm9
10196 102,69,15,254,200, //paddd %xmm8,%xmm9
10197 65,15,93,201, //minps %xmm9,%xmm1
10198 72,173, //lods %ds:(%rsi),%rax
10199 255,224, //jmpq *%rax
10200};
10201
// sk_mirror_x_sse41: reflects xmm0 back and forth over a period of 2*v. The
// stage argument points directly to a float `v`. Per lane, with x = xmm0:
//     t    = x - v
//     t    = t - floor(t / (2*v)) * (2*v)    (roundps $1 = round toward -inf)
//     t    = t - v
//     xmm0 = min(|t|, v')
// |t| is formed as t & (0 - t): the two values differ only in the sign bit,
// so the AND clears it. v' is v with its bit pattern decremented by one as an
// integer (the next float below v for a positive finite value).
// 2*v is computed once in the scalar lane (addss) before being broadcast.
// NOTE(review): assumes v is positive, finite and non-zero - confirm.
10202CODE const uint8_t sk_mirror_x_sse41[] = {
10203 72,173, //lods %ds:(%rsi),%rax
10204 243,68,15,16,0, //movss (%rax),%xmm8
10205 69,15,40,200, //movaps %xmm8,%xmm9
10206 69,15,198,201,0, //shufps $0x0,%xmm9,%xmm9
10207 65,15,92,193, //subps %xmm9,%xmm0
10208 243,69,15,88,192, //addss %xmm8,%xmm8
10209 69,15,198,192,0, //shufps $0x0,%xmm8,%xmm8
10210 68,15,40,208, //movaps %xmm0,%xmm10
10211 69,15,94,208, //divps %xmm8,%xmm10
10212 102,69,15,58,8,210,1, //roundps $0x1,%xmm10,%xmm10
10213 69,15,89,208, //mulps %xmm8,%xmm10
10214 65,15,92,194, //subps %xmm10,%xmm0
10215 65,15,92,193, //subps %xmm9,%xmm0
10216 69,15,87,192, //xorps %xmm8,%xmm8
10217 68,15,92,192, //subps %xmm0,%xmm8
10218 65,15,84,192, //andps %xmm8,%xmm0
10219 102,69,15,118,192, //pcmpeqd %xmm8,%xmm8
10220 102,69,15,254,193, //paddd %xmm9,%xmm8
10221 65,15,93,192, //minps %xmm8,%xmm0
10222 72,173, //lods %ds:(%rsi),%rax
10223 255,224, //jmpq *%rax
10224};
10225
// sk_mirror_y_sse41: same operation as the mirror_x stage, applied to xmm1.
// The stage argument points directly to a float `v`. Per lane, with y = xmm1:
//     t    = y - v
//     t    = t - floor(t / (2*v)) * (2*v)    (roundps $1 = round toward -inf)
//     t    = t - v
//     xmm1 = min(|t|, v')
// |t| is formed as t & (0 - t) (clears the sign bit); v' is v with its bit
// pattern decremented by one as an integer.
// NOTE(review): assumes v is positive, finite and non-zero - confirm.
10226CODE const uint8_t sk_mirror_y_sse41[] = {
10227 72,173, //lods %ds:(%rsi),%rax
10228 243,68,15,16,0, //movss (%rax),%xmm8
10229 69,15,40,200, //movaps %xmm8,%xmm9
10230 69,15,198,201,0, //shufps $0x0,%xmm9,%xmm9
10231 65,15,92,201, //subps %xmm9,%xmm1
10232 243,69,15,88,192, //addss %xmm8,%xmm8
10233 69,15,198,192,0, //shufps $0x0,%xmm8,%xmm8
10234 68,15,40,209, //movaps %xmm1,%xmm10
10235 69,15,94,208, //divps %xmm8,%xmm10
10236 102,69,15,58,8,210,1, //roundps $0x1,%xmm10,%xmm10
10237 69,15,89,208, //mulps %xmm8,%xmm10
10238 65,15,92,202, //subps %xmm10,%xmm1
10239 65,15,92,201, //subps %xmm9,%xmm1
10240 69,15,87,192, //xorps %xmm8,%xmm8
10241 68,15,92,193, //subps %xmm1,%xmm8
10242 65,15,84,200, //andps %xmm8,%xmm1
10243 102,69,15,118,192, //pcmpeqd %xmm8,%xmm8
10244 102,69,15,254,193, //paddd %xmm9,%xmm8
10245 65,15,93,200, //minps %xmm8,%xmm1
10246 72,173, //lods %ds:(%rsi),%rax
10247 255,224, //jmpq *%rax
10248};
10249
// sk_matrix_2x3_sse41: affine transform of (xmm0, xmm1). The stage argument
// points directly to six floats m[0..5]; with x = xmm0 and y = xmm1 on entry:
//     xmm0 = m[0]*x + m[2]*y + m[4]
//     xmm1 = m[1]*x + m[3]*y + m[5]
// (the matrix is read column by column). The inputs are saved in xmm8/xmm9
// first because xmm0/xmm1 are reused for the coefficients.
// xmm2 and xmm3 are not modified; xmm8-xmm11 are scratch.
10250CODE const uint8_t sk_matrix_2x3_sse41[] = {
10251 68,15,40,201, //movaps %xmm1,%xmm9
10252 68,15,40,192, //movaps %xmm0,%xmm8
10253 72,173, //lods %ds:(%rsi),%rax
10254 243,15,16,0, //movss (%rax),%xmm0
10255 243,15,16,72,4, //movss 0x4(%rax),%xmm1
10256 15,198,192,0, //shufps $0x0,%xmm0,%xmm0
10257 243,68,15,16,80,8, //movss 0x8(%rax),%xmm10
10258 69,15,198,210,0, //shufps $0x0,%xmm10,%xmm10
10259 243,68,15,16,88,16, //movss 0x10(%rax),%xmm11
10260 69,15,198,219,0, //shufps $0x0,%xmm11,%xmm11
10261 69,15,89,209, //mulps %xmm9,%xmm10
10262 69,15,88,211, //addps %xmm11,%xmm10
10263 65,15,89,192, //mulps %xmm8,%xmm0
10264 65,15,88,194, //addps %xmm10,%xmm0
10265 15,198,201,0, //shufps $0x0,%xmm1,%xmm1
10266 243,68,15,16,80,12, //movss 0xc(%rax),%xmm10
10267 69,15,198,210,0, //shufps $0x0,%xmm10,%xmm10
10268 243,68,15,16,88,20, //movss 0x14(%rax),%xmm11
10269 69,15,198,219,0, //shufps $0x0,%xmm11,%xmm11
10270 69,15,89,209, //mulps %xmm9,%xmm10
10271 69,15,88,211, //addps %xmm11,%xmm10
10272 65,15,89,200, //mulps %xmm8,%xmm1
10273 65,15,88,202, //addps %xmm10,%xmm1
10274 72,173, //lods %ds:(%rsi),%rax
10275 255,224, //jmpq *%rax
10276};
10277
// sk_matrix_3x4_sse41: affine transform of (xmm0, xmm1, xmm2). The stage
// argument points directly to twelve floats m[0..11]; with a = xmm0,
// b = xmm1, c = xmm2 on entry:
//     xmm0 = m[0]*a + m[3]*b + m[6]*c + m[9]
//     xmm1 = m[1]*a + m[4]*b + m[7]*c + m[10]
//     xmm2 = m[2]*a + m[5]*b + m[8]*c + m[11]
// (the matrix is read column by column). xmm2 is read as an input for all
// three rows and only overwritten by the final movaps. xmm3 is not modified;
// xmm8-xmm13 are scratch.
10278CODE const uint8_t sk_matrix_3x4_sse41[] = {
10279 68,15,40,201, //movaps %xmm1,%xmm9
10280 68,15,40,192, //movaps %xmm0,%xmm8
10281 72,173, //lods %ds:(%rsi),%rax
10282 243,15,16,0, //movss (%rax),%xmm0
10283 243,15,16,72,4, //movss 0x4(%rax),%xmm1
10284 15,198,192,0, //shufps $0x0,%xmm0,%xmm0
10285 243,68,15,16,80,12, //movss 0xc(%rax),%xmm10
10286 69,15,198,210,0, //shufps $0x0,%xmm10,%xmm10
10287 243,68,15,16,88,24, //movss 0x18(%rax),%xmm11
10288 69,15,198,219,0, //shufps $0x0,%xmm11,%xmm11
10289 243,68,15,16,96,36, //movss 0x24(%rax),%xmm12
10290 69,15,198,228,0, //shufps $0x0,%xmm12,%xmm12
10291 68,15,89,218, //mulps %xmm2,%xmm11
10292 69,15,88,220, //addps %xmm12,%xmm11
10293 69,15,89,209, //mulps %xmm9,%xmm10
10294 69,15,88,211, //addps %xmm11,%xmm10
10295 65,15,89,192, //mulps %xmm8,%xmm0
10296 65,15,88,194, //addps %xmm10,%xmm0
10297 15,198,201,0, //shufps $0x0,%xmm1,%xmm1
10298 243,68,15,16,80,16, //movss 0x10(%rax),%xmm10
10299 69,15,198,210,0, //shufps $0x0,%xmm10,%xmm10
10300 243,68,15,16,88,28, //movss 0x1c(%rax),%xmm11
10301 69,15,198,219,0, //shufps $0x0,%xmm11,%xmm11
10302 243,68,15,16,96,40, //movss 0x28(%rax),%xmm12
10303 69,15,198,228,0, //shufps $0x0,%xmm12,%xmm12
10304 68,15,89,218, //mulps %xmm2,%xmm11
10305 69,15,88,220, //addps %xmm12,%xmm11
10306 69,15,89,209, //mulps %xmm9,%xmm10
10307 69,15,88,211, //addps %xmm11,%xmm10
10308 65,15,89,200, //mulps %xmm8,%xmm1
10309 65,15,88,202, //addps %xmm10,%xmm1
10310 243,68,15,16,80,8, //movss 0x8(%rax),%xmm10
10311 69,15,198,210,0, //shufps $0x0,%xmm10,%xmm10
10312 243,68,15,16,88,20, //movss 0x14(%rax),%xmm11
10313 69,15,198,219,0, //shufps $0x0,%xmm11,%xmm11
10314 243,68,15,16,96,32, //movss 0x20(%rax),%xmm12
10315 69,15,198,228,0, //shufps $0x0,%xmm12,%xmm12
10316 243,68,15,16,104,44, //movss 0x2c(%rax),%xmm13
10317 69,15,198,237,0, //shufps $0x0,%xmm13,%xmm13
10318 68,15,89,226, //mulps %xmm2,%xmm12
10319 69,15,88,229, //addps %xmm13,%xmm12
10320 69,15,89,217, //mulps %xmm9,%xmm11
10321 69,15,88,220, //addps %xmm12,%xmm11
10322 69,15,89,208, //mulps %xmm8,%xmm10
10323 69,15,88,211, //addps %xmm11,%xmm10
10324 72,173, //lods %ds:(%rsi),%rax
10325 65,15,40,210, //movaps %xmm10,%xmm2
10326 255,224, //jmpq *%rax
10327};
10328
// sk_matrix_perspective_sse41: projective transform of (xmm0, xmm1). The
// stage argument points directly to nine floats m[0..8]; with x = xmm0 and
// y = xmm1 on entry:
//     X = m[0]*x + m[1]*y + m[2]
//     Y = m[3]*x + m[4]*y + m[5]
//     Z = m[6]*x + m[7]*y + m[8]
//     xmm0 = X * rcpps(Z)
//     xmm1 = Y * rcpps(Z)
// (the matrix is read row by row, unlike the 2x3 / 3x4 stages). rcpps is the
// hardware approximate reciprocal, so the divide is not exact, and no check
// for Z == 0 is made. xmm2 and xmm3 are not modified; xmm8-xmm12 are scratch.
10329CODE const uint8_t sk_matrix_perspective_sse41[] = {
10330 68,15,40,192, //movaps %xmm0,%xmm8
10331 72,173, //lods %ds:(%rsi),%rax
10332 243,15,16,0, //movss (%rax),%xmm0
10333 243,68,15,16,72,4, //movss 0x4(%rax),%xmm9
10334 15,198,192,0, //shufps $0x0,%xmm0,%xmm0
10335 69,15,198,201,0, //shufps $0x0,%xmm9,%xmm9
10336 243,68,15,16,80,8, //movss 0x8(%rax),%xmm10
10337 69,15,198,210,0, //shufps $0x0,%xmm10,%xmm10
10338 68,15,89,201, //mulps %xmm1,%xmm9
10339 69,15,88,202, //addps %xmm10,%xmm9
10340 65,15,89,192, //mulps %xmm8,%xmm0
10341 65,15,88,193, //addps %xmm9,%xmm0
10342 243,68,15,16,72,12, //movss 0xc(%rax),%xmm9
10343 69,15,198,201,0, //shufps $0x0,%xmm9,%xmm9
10344 243,68,15,16,80,16, //movss 0x10(%rax),%xmm10
10345 69,15,198,210,0, //shufps $0x0,%xmm10,%xmm10
10346 243,68,15,16,88,20, //movss 0x14(%rax),%xmm11
10347 69,15,198,219,0, //shufps $0x0,%xmm11,%xmm11
10348 68,15,89,209, //mulps %xmm1,%xmm10
10349 69,15,88,211, //addps %xmm11,%xmm10
10350 69,15,89,200, //mulps %xmm8,%xmm9
10351 69,15,88,202, //addps %xmm10,%xmm9
10352 243,68,15,16,80,24, //movss 0x18(%rax),%xmm10
10353 69,15,198,210,0, //shufps $0x0,%xmm10,%xmm10
10354 243,68,15,16,88,28, //movss 0x1c(%rax),%xmm11
10355 69,15,198,219,0, //shufps $0x0,%xmm11,%xmm11
10356 243,68,15,16,96,32, //movss 0x20(%rax),%xmm12
10357 69,15,198,228,0, //shufps $0x0,%xmm12,%xmm12
10358 68,15,89,217, //mulps %xmm1,%xmm11
10359 69,15,88,220, //addps %xmm12,%xmm11
10360 69,15,89,208, //mulps %xmm8,%xmm10
10361 69,15,88,211, //addps %xmm11,%xmm10
10362 65,15,83,202, //rcpps %xmm10,%xmm1
10363 15,89,193, //mulps %xmm1,%xmm0
10364 68,15,89,201, //mulps %xmm1,%xmm9
10365 72,173, //lods %ds:(%rsi),%rax
10366 65,15,40,201, //movaps %xmm9,%xmm1
10367 255,224, //jmpq *%rax
10368};
10369
// sk_linear_gradient_2stops_sse41: evaluates four linear functions of
// t = xmm0. The stage argument points directly to eight floats, loaded
// unaligned as two 4-float vectors: c0 = [0..3] and c1 = [4..7]. For
// i = 0..3:
//     xmm<i> = c1[i] * t + c0[i]
// Each coefficient is broadcast with shufps ($0x00, $0x55, $0xaa, $0xff
// select element 0, 1, 2, 3). xmm0 is read as t for all four results and
// only overwritten by the final movaps. xmm8-xmm10 are scratch.
// NOTE(review): c0 is presumably the start colour and c1 the (end - start)
// delta of a two-stop gradient - confirm against the stage that builds it.
10370CODE const uint8_t sk_linear_gradient_2stops_sse41[] = {
10371 72,173, //lods %ds:(%rsi),%rax
10372 68,15,16,8, //movups (%rax),%xmm9
10373 15,16,88,16, //movups 0x10(%rax),%xmm3
10374 68,15,40,195, //movaps %xmm3,%xmm8
10375 69,15,198,192,0, //shufps $0x0,%xmm8,%xmm8
10376 65,15,40,201, //movaps %xmm9,%xmm1
10377 15,198,201,0, //shufps $0x0,%xmm1,%xmm1
10378 68,15,89,192, //mulps %xmm0,%xmm8
10379 68,15,88,193, //addps %xmm1,%xmm8
10380 15,40,203, //movaps %xmm3,%xmm1
10381 15,198,201,85, //shufps $0x55,%xmm1,%xmm1
10382 65,15,40,209, //movaps %xmm9,%xmm2
10383 15,198,210,85, //shufps $0x55,%xmm2,%xmm2
10384 15,89,200, //mulps %xmm0,%xmm1
10385 15,88,202, //addps %xmm2,%xmm1
10386 15,40,211, //movaps %xmm3,%xmm2
10387 15,198,210,170, //shufps $0xaa,%xmm2,%xmm2
10388 69,15,40,209, //movaps %xmm9,%xmm10
10389 69,15,198,210,170, //shufps $0xaa,%xmm10,%xmm10
10390 15,89,208, //mulps %xmm0,%xmm2
10391 65,15,88,210, //addps %xmm10,%xmm2
10392 15,198,219,255, //shufps $0xff,%xmm3,%xmm3
10393 69,15,198,201,255, //shufps $0xff,%xmm9,%xmm9
10394 15,89,216, //mulps %xmm0,%xmm3
10395 65,15,88,217, //addps %xmm9,%xmm3
10396 72,173, //lods %ds:(%rsi),%rax
10397 65,15,40,192, //movaps %xmm8,%xmm0
10398 255,224, //jmpq *%rax
10399};
10400
// Pipeline entry point (SSE2 build).
// Per the disassembly:
//   - saves r12-r15, rsi, rdi, rbx and xmm6-xmm15;
//   - copies the incoming rcx/rdx/r8/r9 into rbx/rsi/r14/r15;
//   - loads the first program word from (%rsi) into r12 and keeps the advanced
//     rsi in r13;
//   - while rbx+4 <= r15 (unsigned compare): zeroes xmm0-7 and calls *r12 with
//     rdi=rbx, rsi=r13, rdx=r14, then advances rbx by 4;
//   - returns in rax the final rbx, i.e. the first index that was not processed.
// NOTE(review): the argument registers and the saved-register set match the
// Microsoft x64 convention, while the called stage receives rdi/rsi/rdx --
// presumably this variant is the Windows build; confirm with the generator.
10401CODE const uint8_t sk_start_pipeline_sse2[] = {
// Prologue: save integer registers, then xmm6-15 into a 0xa0-byte stack area.
10402 65,87, //push %r15
10403 65,86, //push %r14
10404 65,85, //push %r13
10405 65,84, //push %r12
10406 86, //push %rsi
10407 87, //push %rdi
10408 83, //push %rbx
10409 72,129,236,160,0,0,0, //sub $0xa0,%rsp
10410 68,15,41,188,36,144,0,0,0, //movaps %xmm15,0x90(%rsp)
10411 68,15,41,180,36,128,0,0,0, //movaps %xmm14,0x80(%rsp)
10412 68,15,41,108,36,112, //movaps %xmm13,0x70(%rsp)
10413 68,15,41,100,36,96, //movaps %xmm12,0x60(%rsp)
10414 68,15,41,92,36,80, //movaps %xmm11,0x50(%rsp)
10415 68,15,41,84,36,64, //movaps %xmm10,0x40(%rsp)
10416 68,15,41,76,36,48, //movaps %xmm9,0x30(%rsp)
10417 68,15,41,68,36,32, //movaps %xmm8,0x20(%rsp)
10418 15,41,124,36,16, //movaps %xmm7,0x10(%rsp)
10419 15,41,52,36, //movaps %xmm6,(%rsp)
// Stash the four incoming arguments and fetch the first stage pointer.
10420 77,137,207, //mov %r9,%r15
10421 77,137,198, //mov %r8,%r14
10422 72,137,203, //mov %rcx,%rbx
10423 72,137,214, //mov %rdx,%rsi
10424 72,173, //lods %ds:(%rsi),%rax
10425 73,137,196, //mov %rax,%r12
10426 73,137,245, //mov %rsi,%r13
// Skip the loop entirely when fewer than 4 items remain (rbx+4 > r15).
10427 72,141,67,4, //lea 0x4(%rbx),%rax
10428 76,57,248, //cmp %r15,%rax
10429 118,5, //jbe 73 <_sk_start_pipeline_sse2+0x73>
10430 72,137,216, //mov %rbx,%rax
10431 235,52, //jmp a7 <_sk_start_pipeline_sse2+0xa7>
// Loop body (offset 0x73): clear xmm0-7, then call the first stage.
10432 15,87,192, //xorps %xmm0,%xmm0
10433 15,87,201, //xorps %xmm1,%xmm1
10434 15,87,210, //xorps %xmm2,%xmm2
10435 15,87,219, //xorps %xmm3,%xmm3
10436 15,87,228, //xorps %xmm4,%xmm4
10437 15,87,237, //xorps %xmm5,%xmm5
10438 15,87,246, //xorps %xmm6,%xmm6
10439 15,87,255, //xorps %xmm7,%xmm7
10440 72,137,223, //mov %rbx,%rdi
10441 76,137,238, //mov %r13,%rsi
10442 76,137,242, //mov %r14,%rdx
10443 65,255,212, //callq *%r12
// rax = rbx+4 (next start); loop again while rbx+8 <= r15.
10444 72,141,67,4, //lea 0x4(%rbx),%rax
10445 72,131,195,8, //add $0x8,%rbx
10446 76,57,251, //cmp %r15,%rbx
10447 72,137,195, //mov %rax,%rbx
10448 118,204, //jbe 73 <_sk_start_pipeline_sse2+0x73>
// Epilogue (offset 0xa7): restore xmm6-15 and integer registers; rax is the result.
10449 15,40,52,36, //movaps (%rsp),%xmm6
10450 15,40,124,36,16, //movaps 0x10(%rsp),%xmm7
10451 68,15,40,68,36,32, //movaps 0x20(%rsp),%xmm8
10452 68,15,40,76,36,48, //movaps 0x30(%rsp),%xmm9
10453 68,15,40,84,36,64, //movaps 0x40(%rsp),%xmm10
10454 68,15,40,92,36,80, //movaps 0x50(%rsp),%xmm11
10455 68,15,40,100,36,96, //movaps 0x60(%rsp),%xmm12
10456 68,15,40,108,36,112, //movaps 0x70(%rsp),%xmm13
10457 68,15,40,180,36,128,0,0,0, //movaps 0x80(%rsp),%xmm14
10458 68,15,40,188,36,144,0,0,0, //movaps 0x90(%rsp),%xmm15
10459 72,129,196,160,0,0,0, //add $0xa0,%rsp
10460 91, //pop %rbx
10461 95, //pop %rdi
10462 94, //pop %rsi
10463 65,92, //pop %r12
10464 65,93, //pop %r13
10465 65,94, //pop %r14
10466 65,95, //pop %r15
10467 195, //retq
10468};
10469
// Stage consisting of a single `ret`: it does no work and does not load or
// jump to a following stage.
// NOTE(review): presumably placed last in a program so control returns to the
// start_pipeline loop -- confirm with the caller.
10470CODE const uint8_t sk_just_return_sse2[] = {
10471 195, //retq
10472};
10473
// Stage: seed the registers for a shader (SSE2 build).
// Per the disassembly, with ctx = pointer fetched by the first lods and
// K = the constants block addressed by rdx:
//   xmm0 = float(edi) + broadcast(float at K+4) + 4 floats at K+0x14
//   xmm1 = float(int32 at ctx) + broadcast(float at K+4)
//   xmm2 = broadcast(float at K+0)
//   xmm3..xmm7 = 0
// then tail-jumps to the next stage.
// NOTE(review): presumably K+4 is 0.5, K+0x14 is {0,1,2,3} and K+0 is 1.0, giving
// pixel-center x/y coordinates -- the values are not visible here; confirm.
10474CODE const uint8_t sk_seed_shader_sse2[] = {
10475 72,173, //lods %ds:(%rsi),%rax
10476 102,15,110,199, //movd %edi,%xmm0
10477 102,15,112,192,0, //pshufd $0x0,%xmm0,%xmm0
10478 15,91,200, //cvtdq2ps %xmm0,%xmm1
10479 243,15,16,18, //movss (%rdx),%xmm2
10480 243,15,16,90,4, //movss 0x4(%rdx),%xmm3
10481 15,198,219,0, //shufps $0x0,%xmm3,%xmm3
10482 15,88,203, //addps %xmm3,%xmm1
10483 15,16,66,20, //movups 0x14(%rdx),%xmm0
10484 15,88,193, //addps %xmm1,%xmm0
10485 102,15,110,8, //movd (%rax),%xmm1
10486 102,15,112,201,0, //pshufd $0x0,%xmm1,%xmm1
10487 15,91,201, //cvtdq2ps %xmm1,%xmm1
10488 15,88,203, //addps %xmm3,%xmm1
10489 15,198,210,0, //shufps $0x0,%xmm2,%xmm2
10490 72,173, //lods %ds:(%rsi),%rax
10491 15,87,219, //xorps %xmm3,%xmm3
10492 15,87,228, //xorps %xmm4,%xmm4
10493 15,87,237, //xorps %xmm5,%xmm5
10494 15,87,246, //xorps %xmm6,%xmm6
10495 15,87,255, //xorps %xmm7,%xmm7
10496 255,224, //jmpq *%rax
10497};
10498
// Stage: load a constant color.
// Per the disassembly: reads 4 floats from the context pointer and broadcasts
// float i across xmm<i> for i = 0..3, then tail-jumps to the next stage.
10499CODE const uint8_t sk_constant_color_sse2[] = {
10500 72,173, //lods %ds:(%rsi),%rax
10501 15,16,24, //movups (%rax),%xmm3
10502 15,40,195, //movaps %xmm3,%xmm0
10503 15,198,192,0, //shufps $0x0,%xmm0,%xmm0
10504 15,40,203, //movaps %xmm3,%xmm1
10505 15,198,201,85, //shufps $0x55,%xmm1,%xmm1
10506 15,40,211, //movaps %xmm3,%xmm2
10507 15,198,210,170, //shufps $0xaa,%xmm2,%xmm2
10508 15,198,219,255, //shufps $0xff,%xmm3,%xmm3
10509 72,173, //lods %ds:(%rsi),%rax
10510 255,224, //jmpq *%rax
10511};
10512
// Stage: zero xmm0..xmm3, then tail-jump to the next stage (fetched up front
// by the single lods; this stage reads no context).
10513CODE const uint8_t sk_clear_sse2[] = {
10514 72,173, //lods %ds:(%rsi),%rax
10515 15,87,192, //xorps %xmm0,%xmm0
10516 15,87,201, //xorps %xmm1,%xmm1
10517 15,87,210, //xorps %xmm2,%xmm2
10518 15,87,219, //xorps %xmm3,%xmm3
10519 255,224, //jmpq *%rax
10520};
10521
// Stage: lane-wise add, xmm<i> += xmm<i+4> for i = 0..3, then tail-jump to the
// next stage.
// NOTE(review): presumably xmm0-3 hold the source and xmm4-7 the destination
// color channels -- confirm with the generator source.
10522CODE const uint8_t sk_plus__sse2[] = {
10523 15,88,196, //addps %xmm4,%xmm0
10524 15,88,205, //addps %xmm5,%xmm1
10525 15,88,214, //addps %xmm6,%xmm2
10526 15,88,223, //addps %xmm7,%xmm3
10527 72,173, //lods %ds:(%rsi),%rax
10528 255,224, //jmpq *%rax
10529};
10530
// Stage: source-over blend.
// Per the disassembly: s = broadcast(float at (%rdx)) - xmm3, then
// xmm<i> += xmm<i+4> * s for i = 0..3 (xmm9 is scratch; the i = 3 product
// is formed in xmm8 itself). Tail-jumps to the next stage.
// NOTE(review): presumably the float at (%rdx) is 1.0, making s = 1 - src
// alpha -- the constant's value is not visible here; confirm.
10531CODE const uint8_t sk_srcover_sse2[] = {
10532 243,68,15,16,2, //movss (%rdx),%xmm8
10533 69,15,198,192,0, //shufps $0x0,%xmm8,%xmm8
10534 68,15,92,195, //subps %xmm3,%xmm8
10535 69,15,40,200, //movaps %xmm8,%xmm9
10536 68,15,89,204, //mulps %xmm4,%xmm9
10537 65,15,88,193, //addps %xmm9,%xmm0
10538 69,15,40,200, //movaps %xmm8,%xmm9
10539 68,15,89,205, //mulps %xmm5,%xmm9
10540 65,15,88,201, //addps %xmm9,%xmm1
10541 69,15,40,200, //movaps %xmm8,%xmm9
10542 68,15,89,206, //mulps %xmm6,%xmm9
10543 65,15,88,209, //addps %xmm9,%xmm2
10544 68,15,89,199, //mulps %xmm7,%xmm8
10545 65,15,88,216, //addps %xmm8,%xmm3
10546 72,173, //lods %ds:(%rsi),%rax
10547 255,224, //jmpq *%rax
10548};
10549
// Stage: destination-over blend.
// Per the disassembly: s = broadcast(float at (%rdx)) - xmm7, then
// xmm<i> = xmm<i> * s + xmm<i+4> for i = 0..3. Tail-jumps to the next stage.
// NOTE(review): presumably the float at (%rdx) is 1.0, making s = 1 - dst
// alpha -- confirm.
10550CODE const uint8_t sk_dstover_sse2[] = {
10551 243,68,15,16,2, //movss (%rdx),%xmm8
10552 69,15,198,192,0, //shufps $0x0,%xmm8,%xmm8
10553 68,15,92,199, //subps %xmm7,%xmm8
10554 65,15,89,192, //mulps %xmm8,%xmm0
10555 15,88,196, //addps %xmm4,%xmm0
10556 65,15,89,200, //mulps %xmm8,%xmm1
10557 15,88,205, //addps %xmm5,%xmm1
10558 65,15,89,208, //mulps %xmm8,%xmm2
10559 15,88,214, //addps %xmm6,%xmm2
10560 65,15,89,216, //mulps %xmm8,%xmm3
10561 15,88,223, //addps %xmm7,%xmm3
10562 72,173, //lods %ds:(%rsi),%rax
10563 255,224, //jmpq *%rax
10564};
10565
// Stage: clamp below at zero, xmm<i> = max(xmm<i>, 0) for i = 0..3 (xmm8 holds
// the zero vector), then tail-jump to the next stage.
10566CODE const uint8_t sk_clamp_0_sse2[] = {
10567 69,15,87,192, //xorps %xmm8,%xmm8
10568 65,15,95,192, //maxps %xmm8,%xmm0
10569 65,15,95,200, //maxps %xmm8,%xmm1
10570 65,15,95,208, //maxps %xmm8,%xmm2
10571 65,15,95,216, //maxps %xmm8,%xmm3
10572 72,173, //lods %ds:(%rsi),%rax
10573 255,224, //jmpq *%rax
10574};
10575
// Stage: clamp above, xmm<i> = min(xmm<i>, broadcast(float at (%rdx))) for
// i = 0..3, then tail-jump to the next stage.
// NOTE(review): presumably the float at (%rdx) is 1.0 -- confirm.
10576CODE const uint8_t sk_clamp_1_sse2[] = {
10577 243,68,15,16,2, //movss (%rdx),%xmm8
10578 69,15,198,192,0, //shufps $0x0,%xmm8,%xmm8
10579 65,15,93,192, //minps %xmm8,%xmm0
10580 65,15,93,200, //minps %xmm8,%xmm1
10581 65,15,93,208, //minps %xmm8,%xmm2
10582 65,15,93,216, //minps %xmm8,%xmm3
10583 72,173, //lods %ds:(%rsi),%rax
10584 255,224, //jmpq *%rax
10585};
10586
// Stage: clamp against xmm3.
// Per the disassembly: xmm3 = min(xmm3, broadcast(float at (%rdx))), then
// xmm0..xmm2 are each clamped to at most the new xmm3. Tail-jumps to the next
// stage.
// NOTE(review): presumably xmm3 is alpha and the constant is 1.0 (keeping
// premultiplied color <= alpha) -- confirm.
10587CODE const uint8_t sk_clamp_a_sse2[] = {
10588 243,68,15,16,2, //movss (%rdx),%xmm8
10589 69,15,198,192,0, //shufps $0x0,%xmm8,%xmm8
10590 65,15,93,216, //minps %xmm8,%xmm3
10591 15,93,195, //minps %xmm3,%xmm0
10592 15,93,203, //minps %xmm3,%xmm1
10593 15,93,211, //minps %xmm3,%xmm2
10594 72,173, //lods %ds:(%rsi),%rax
10595 255,224, //jmpq *%rax
10596};
10597
// Stage: set xmm0, xmm1, xmm2 to the broadcast of the floats at context
// offsets 0, 4 and 8 respectively. xmm3 is not touched. Tail-jumps to the
// next stage.
10598CODE const uint8_t sk_set_rgb_sse2[] = {
10599 72,173, //lods %ds:(%rsi),%rax
10600 243,15,16,0, //movss (%rax),%xmm0
10601 243,15,16,72,4, //movss 0x4(%rax),%xmm1
10602 15,198,192,0, //shufps $0x0,%xmm0,%xmm0
10603 15,198,201,0, //shufps $0x0,%xmm1,%xmm1
10604 243,15,16,80,8, //movss 0x8(%rax),%xmm2
10605 15,198,210,0, //shufps $0x0,%xmm2,%xmm2
10606 72,173, //lods %ds:(%rsi),%rax
10607 255,224, //jmpq *%rax
10608};
10609
// Stage: exchange xmm0 and xmm2 (xmm8 is the temporary), then tail-jump to the
// next stage. xmm1 and xmm3 are not touched.
10610CODE const uint8_t sk_swap_rb_sse2[] = {
10611 68,15,40,192, //movaps %xmm0,%xmm8
10612 72,173, //lods %ds:(%rsi),%rax
10613 15,40,194, //movaps %xmm2,%xmm0
10614 65,15,40,208, //movaps %xmm8,%xmm2
10615 255,224, //jmpq *%rax
10616};
10617
// Stage: exchange xmm0..xmm3 with xmm4..xmm7.
// The old xmm0..xmm3 are parked in xmm11..xmm8 while xmm4..xmm7 are copied
// down, then written into xmm4..xmm7. Tail-jumps to the next stage.
10618CODE const uint8_t sk_swap_sse2[] = {
10619 68,15,40,195, //movaps %xmm3,%xmm8
10620 68,15,40,202, //movaps %xmm2,%xmm9
10621 68,15,40,209, //movaps %xmm1,%xmm10
10622 68,15,40,216, //movaps %xmm0,%xmm11
10623 72,173, //lods %ds:(%rsi),%rax
10624 15,40,196, //movaps %xmm4,%xmm0
10625 15,40,205, //movaps %xmm5,%xmm1
10626 15,40,214, //movaps %xmm6,%xmm2
10627 15,40,223, //movaps %xmm7,%xmm3
10628 65,15,40,227, //movaps %xmm11,%xmm4
10629 65,15,40,234, //movaps %xmm10,%xmm5
10630 65,15,40,241, //movaps %xmm9,%xmm6
10631 65,15,40,248, //movaps %xmm8,%xmm7
10632 255,224, //jmpq *%rax
10633};
10634
// Stage: copy xmm0..xmm3 into xmm4..xmm7 (xmm0..xmm3 keep their values), then
// tail-jump to the next stage.
10635CODE const uint8_t sk_move_src_dst_sse2[] = {
10636 72,173, //lods %ds:(%rsi),%rax
10637 15,40,224, //movaps %xmm0,%xmm4
10638 15,40,233, //movaps %xmm1,%xmm5
10639 15,40,242, //movaps %xmm2,%xmm6
10640 15,40,251, //movaps %xmm3,%xmm7
10641 255,224, //jmpq *%rax
10642};
10643
// Stage: copy xmm4..xmm7 into xmm0..xmm3 (xmm4..xmm7 keep their values), then
// tail-jump to the next stage.
10644CODE const uint8_t sk_move_dst_src_sse2[] = {
10645 72,173, //lods %ds:(%rsi),%rax
10646 15,40,196, //movaps %xmm4,%xmm0
10647 15,40,205, //movaps %xmm5,%xmm1
10648 15,40,214, //movaps %xmm6,%xmm2
10649 15,40,223, //movaps %xmm7,%xmm3
10650 255,224, //jmpq *%rax
10651};
10652
// Stage: multiply xmm0, xmm1 and xmm2 by xmm3 (xmm3 itself is unchanged), then
// tail-jump to the next stage.
// NOTE(review): presumably xmm3 is alpha, i.e. this premultiplies r,g,b --
// confirm with the generator source.
10653CODE const uint8_t sk_premul_sse2[] = {
10654 15,89,195, //mulps %xmm3,%xmm0
10655 15,89,203, //mulps %xmm3,%xmm1
10656 15,89,211, //mulps %xmm3,%xmm2
10657 72,173, //lods %ds:(%rsi),%rax
10658 255,224, //jmpq *%rax
10659};
10660
// Stage: divide xmm0..xmm2 by xmm3, guarding against zero.
// Per the disassembly: mask = (xmm3 == 0); scale = broadcast(float at (%rdx))
// / xmm3; andnps clears scale in lanes where the mask is set, so those lanes
// multiply by 0 instead of by the division result. xmm0..xmm2 *= scale.
// Tail-jumps to the next stage.
// NOTE(review): presumably the float at (%rdx) is 1.0 -- confirm.
10661CODE const uint8_t sk_unpremul_sse2[] = {
10662 69,15,87,192, //xorps %xmm8,%xmm8
10663 68,15,194,195,0, //cmpeqps %xmm3,%xmm8
10664 243,68,15,16,10, //movss (%rdx),%xmm9
10665 69,15,198,201,0, //shufps $0x0,%xmm9,%xmm9
10666 68,15,94,203, //divps %xmm3,%xmm9
10667 69,15,85,193, //andnps %xmm9,%xmm8
10668 65,15,89,192, //mulps %xmm8,%xmm0
10669 65,15,89,200, //mulps %xmm8,%xmm1
10670 65,15,89,208, //mulps %xmm8,%xmm2
10671 72,173, //lods %ds:(%rsi),%rax
10672 255,224, //jmpq *%rax
10673};
10674
// Stage: piecewise transfer function applied to xmm0, xmm1 and xmm2
// (xmm3 is not touched).
// Per the disassembly, with K = constants block at rdx, for each channel x:
//   lo = K[0x40] * x
//   hi = (K[0x3c] * x + K[0x38]) * (x * x) + K[0x34]
//   x  = (x < K[0x44]) ? lo : hi      (select via cmpltps/andps/andnps/orps)
// xmm8..xmm15 are used as scratch. Tail-jumps to the next stage.
// NOTE(review): presumably an sRGB-to-linear approximation; the constant
// values are not visible here -- confirm against the generator source.
10675CODE const uint8_t sk_from_srgb_sse2[] = {
// Channel in xmm0 (also loads and broadcasts the shared constants).
10676 243,68,15,16,66,64, //movss 0x40(%rdx),%xmm8
10677 69,15,198,192,0, //shufps $0x0,%xmm8,%xmm8
10678 69,15,40,232, //movaps %xmm8,%xmm13
10679 68,15,89,232, //mulps %xmm0,%xmm13
10680 68,15,40,224, //movaps %xmm0,%xmm12
10681 69,15,89,228, //mulps %xmm12,%xmm12
10682 243,68,15,16,74,60, //movss 0x3c(%rdx),%xmm9
10683 69,15,198,201,0, //shufps $0x0,%xmm9,%xmm9
10684 243,68,15,16,82,52, //movss 0x34(%rdx),%xmm10
10685 243,68,15,16,90,56, //movss 0x38(%rdx),%xmm11
10686 69,15,198,219,0, //shufps $0x0,%xmm11,%xmm11
10687 69,15,40,241, //movaps %xmm9,%xmm14
10688 68,15,89,240, //mulps %xmm0,%xmm14
10689 69,15,88,243, //addps %xmm11,%xmm14
10690 69,15,198,210,0, //shufps $0x0,%xmm10,%xmm10
10691 69,15,89,244, //mulps %xmm12,%xmm14
10692 69,15,88,242, //addps %xmm10,%xmm14
10693 243,68,15,16,98,68, //movss 0x44(%rdx),%xmm12
10694 69,15,198,228,0, //shufps $0x0,%xmm12,%xmm12
10695 65,15,194,196,1, //cmpltps %xmm12,%xmm0
10696 68,15,84,232, //andps %xmm0,%xmm13
10697 65,15,85,198, //andnps %xmm14,%xmm0
10698 65,15,86,197, //orps %xmm13,%xmm0
// Channel in xmm1.
10699 69,15,40,232, //movaps %xmm8,%xmm13
10700 68,15,89,233, //mulps %xmm1,%xmm13
10701 68,15,40,241, //movaps %xmm1,%xmm14
10702 69,15,89,246, //mulps %xmm14,%xmm14
10703 69,15,40,249, //movaps %xmm9,%xmm15
10704 68,15,89,249, //mulps %xmm1,%xmm15
10705 69,15,88,251, //addps %xmm11,%xmm15
10706 69,15,89,254, //mulps %xmm14,%xmm15
10707 69,15,88,250, //addps %xmm10,%xmm15
10708 65,15,194,204,1, //cmpltps %xmm12,%xmm1
10709 68,15,84,233, //andps %xmm1,%xmm13
10710 65,15,85,207, //andnps %xmm15,%xmm1
10711 65,15,86,205, //orps %xmm13,%xmm1
// Channel in xmm2 (consumes the constant registers xmm8/xmm9 in place).
10712 68,15,89,194, //mulps %xmm2,%xmm8
10713 68,15,40,234, //movaps %xmm2,%xmm13
10714 69,15,89,237, //mulps %xmm13,%xmm13
10715 68,15,89,202, //mulps %xmm2,%xmm9
10716 69,15,88,203, //addps %xmm11,%xmm9
10717 69,15,89,205, //mulps %xmm13,%xmm9
10718 69,15,88,202, //addps %xmm10,%xmm9
10719 65,15,194,212,1, //cmpltps %xmm12,%xmm2
10720 68,15,84,194, //andps %xmm2,%xmm8
10721 65,15,85,209, //andnps %xmm9,%xmm2
10722 65,15,86,208, //orps %xmm8,%xmm2
10723 72,173, //lods %ds:(%rsi),%rax
10724 255,224, //jmpq *%rax
10725};
10726
// Stage: piecewise transfer function applied to xmm0, xmm1 and xmm2.
// Per the disassembly, with K = constants block at rdx, for each channel x:
//   rs   = rsqrtps(x)
//   sq   = rcpps(rs)            (approximately sqrt(x))
//   ft   = rsqrtps(rs)          (approximately x^(1/4))
//   hi   = min(K[0], K[0x4c] * ft + (K[0x50] * sq + K[0x54]))
//   lo   = K[0x48] * x
//   x    = (x < K[0x58]) ? lo : hi
// Register pressure: xmm6/xmm7 are spilled to a 0x28-byte stack area and
// xmm3/xmm4/xmm5 are shifted up into xmm4/xmm5/xmm6 so that xmm3 and xmm7 can
// serve as scratch; all five are restored before the tail jump, so
// xmm3..xmm7 leave with their entry values.
// NOTE(review): presumably a linear-to-sRGB approximation with K[0] = 1.0;
// the constant values are not visible here -- confirm.
10727CODE const uint8_t sk_to_srgb_sse2[] = {
// Free up xmm3 and xmm7 for scratch use.
10728 72,131,236,40, //sub $0x28,%rsp
10729 15,41,124,36,16, //movaps %xmm7,0x10(%rsp)
10730 15,41,52,36, //movaps %xmm6,(%rsp)
10731 15,40,245, //movaps %xmm5,%xmm6
10732 15,40,236, //movaps %xmm4,%xmm5
10733 15,40,227, //movaps %xmm3,%xmm4
// Channel in xmm0 (also loads and broadcasts the shared constants).
10734 68,15,82,192, //rsqrtps %xmm0,%xmm8
10735 69,15,83,232, //rcpps %xmm8,%xmm13
10736 69,15,82,248, //rsqrtps %xmm8,%xmm15
10737 243,15,16,26, //movss (%rdx),%xmm3
10738 243,68,15,16,66,72, //movss 0x48(%rdx),%xmm8
10739 69,15,198,192,0, //shufps $0x0,%xmm8,%xmm8
10740 69,15,40,240, //movaps %xmm8,%xmm14
10741 68,15,89,240, //mulps %xmm0,%xmm14
10742 15,198,219,0, //shufps $0x0,%xmm3,%xmm3
10743 243,68,15,16,82,76, //movss 0x4c(%rdx),%xmm10
10744 69,15,198,210,0, //shufps $0x0,%xmm10,%xmm10
10745 243,68,15,16,90,80, //movss 0x50(%rdx),%xmm11
10746 69,15,198,219,0, //shufps $0x0,%xmm11,%xmm11
10747 243,68,15,16,98,84, //movss 0x54(%rdx),%xmm12
10748 69,15,198,228,0, //shufps $0x0,%xmm12,%xmm12
10749 69,15,89,235, //mulps %xmm11,%xmm13
10750 69,15,88,236, //addps %xmm12,%xmm13
10751 69,15,89,250, //mulps %xmm10,%xmm15
10752 69,15,88,253, //addps %xmm13,%xmm15
10753 68,15,40,203, //movaps %xmm3,%xmm9
10754 69,15,93,207, //minps %xmm15,%xmm9
10755 243,68,15,16,106,88, //movss 0x58(%rdx),%xmm13
10756 69,15,198,237,0, //shufps $0x0,%xmm13,%xmm13
10757 65,15,194,197,1, //cmpltps %xmm13,%xmm0
10758 68,15,84,240, //andps %xmm0,%xmm14
10759 65,15,85,193, //andnps %xmm9,%xmm0
10760 65,15,86,198, //orps %xmm14,%xmm0
// Channel in xmm1.
10761 68,15,82,201, //rsqrtps %xmm1,%xmm9
10762 69,15,83,241, //rcpps %xmm9,%xmm14
10763 69,15,82,201, //rsqrtps %xmm9,%xmm9
10764 69,15,89,243, //mulps %xmm11,%xmm14
10765 69,15,88,244, //addps %xmm12,%xmm14
10766 69,15,89,202, //mulps %xmm10,%xmm9
10767 69,15,88,206, //addps %xmm14,%xmm9
10768 68,15,40,243, //movaps %xmm3,%xmm14
10769 69,15,93,241, //minps %xmm9,%xmm14
10770 69,15,40,200, //movaps %xmm8,%xmm9
10771 68,15,89,201, //mulps %xmm1,%xmm9
10772 65,15,194,205,1, //cmpltps %xmm13,%xmm1
10773 68,15,84,201, //andps %xmm1,%xmm9
10774 65,15,85,206, //andnps %xmm14,%xmm1
10775 65,15,86,201, //orps %xmm9,%xmm1
// Channel in xmm2 (uses xmm7 as scratch and consumes xmm3/xmm8 in place).
10776 68,15,82,202, //rsqrtps %xmm2,%xmm9
10777 69,15,83,241, //rcpps %xmm9,%xmm14
10778 69,15,89,243, //mulps %xmm11,%xmm14
10779 69,15,88,244, //addps %xmm12,%xmm14
10780 65,15,82,249, //rsqrtps %xmm9,%xmm7
10781 65,15,89,250, //mulps %xmm10,%xmm7
10782 65,15,88,254, //addps %xmm14,%xmm7
10783 15,93,223, //minps %xmm7,%xmm3
10784 68,15,89,194, //mulps %xmm2,%xmm8
10785 65,15,194,213,1, //cmpltps %xmm13,%xmm2
10786 68,15,84,194, //andps %xmm2,%xmm8
10787 15,85,211, //andnps %xmm3,%xmm2
10788 65,15,86,208, //orps %xmm8,%xmm2
// Restore xmm3..xmm7 and the stack, then continue with the next stage.
10789 72,173, //lods %ds:(%rsi),%rax
10790 15,40,220, //movaps %xmm4,%xmm3
10791 15,40,229, //movaps %xmm5,%xmm4
10792 15,40,238, //movaps %xmm6,%xmm5
10793 15,40,52,36, //movaps (%rsp),%xmm6
10794 15,40,124,36,16, //movaps 0x10(%rsp),%xmm7
10795 72,131,196,40, //add $0x28,%rsp
10796 255,224, //jmpq *%rax
10797};
10798
// Stage: multiply xmm0..xmm3 by a single float read from the context pointer
// (broadcast to all lanes), then tail-jump to the next stage.
10799CODE const uint8_t sk_scale_1_float_sse2[] = {
10800 72,173, //lods %ds:(%rsi),%rax
10801 243,68,15,16,0, //movss (%rax),%xmm8
10802 69,15,198,192,0, //shufps $0x0,%xmm8,%xmm8
10803 65,15,89,192, //mulps %xmm8,%xmm0
10804 65,15,89,200, //mulps %xmm8,%xmm1
10805 65,15,89,208, //mulps %xmm8,%xmm2
10806 65,15,89,216, //mulps %xmm8,%xmm3
10807 72,173, //lods %ds:(%rsi),%rax
10808 255,224, //jmpq *%rax
10809};
10810
// Stage: scale xmm0..xmm3 by per-lane 8-bit values.
// Per the disassembly: the context holds a pointer; 4 bytes are loaded from
// that pointer + rdi, zero-extended to 32 bits (punpcklbw/punpcklwd against a
// zero register), converted to float and multiplied by
// broadcast(float at 0xc(%rdx)). xmm0..xmm3 are each multiplied by the result.
// Tail-jumps to the next stage.
// NOTE(review): presumably the constant at 0xc(%rdx) is 1/255 and rdi is the
// x coordinate -- confirm.
10811CODE const uint8_t sk_scale_u8_sse2[] = {
10812 72,173, //lods %ds:(%rsi),%rax
10813 72,139,0, //mov (%rax),%rax
10814 102,68,15,110,4,56, //movd (%rax,%rdi,1),%xmm8
10815 102,69,15,239,201, //pxor %xmm9,%xmm9
10816 102,69,15,96,193, //punpcklbw %xmm9,%xmm8
10817 102,69,15,97,193, //punpcklwd %xmm9,%xmm8
10818 69,15,91,192, //cvtdq2ps %xmm8,%xmm8
10819 243,68,15,16,74,12, //movss 0xc(%rdx),%xmm9
10820 69,15,198,201,0, //shufps $0x0,%xmm9,%xmm9
10821 69,15,89,200, //mulps %xmm8,%xmm9
10822 65,15,89,193, //mulps %xmm9,%xmm0
10823 65,15,89,201, //mulps %xmm9,%xmm1
10824 65,15,89,209, //mulps %xmm9,%xmm2
10825 65,15,89,217, //mulps %xmm9,%xmm3
10826 72,173, //lods %ds:(%rsi),%rax
10827 255,224, //jmpq *%rax
10828};
10829
// Stage: linear interpolation with a single float factor.
// Per the disassembly: c = broadcast(float at the context pointer), then
// xmm<i> = (xmm<i> - xmm<i+4>) * c + xmm<i+4> for i = 0..3.
// Tail-jumps to the next stage.
10830CODE const uint8_t sk_lerp_1_float_sse2[] = {
10831 72,173, //lods %ds:(%rsi),%rax
10832 243,68,15,16,0, //movss (%rax),%xmm8
10833 69,15,198,192,0, //shufps $0x0,%xmm8,%xmm8
10834 15,92,196, //subps %xmm4,%xmm0
10835 65,15,89,192, //mulps %xmm8,%xmm0
10836 15,88,196, //addps %xmm4,%xmm0
10837 15,92,205, //subps %xmm5,%xmm1
10838 65,15,89,200, //mulps %xmm8,%xmm1
10839 15,88,205, //addps %xmm5,%xmm1
10840 15,92,214, //subps %xmm6,%xmm2
10841 65,15,89,208, //mulps %xmm8,%xmm2
10842 15,88,214, //addps %xmm6,%xmm2
10843 15,92,223, //subps %xmm7,%xmm3
10844 65,15,89,216, //mulps %xmm8,%xmm3
10845 15,88,223, //addps %xmm7,%xmm3
10846 72,173, //lods %ds:(%rsi),%rax
10847 255,224, //jmpq *%rax
10848};
10849
// Stage: linear interpolation with per-lane 8-bit factors.
// Per the disassembly: the context holds a pointer; 4 bytes are loaded from
// that pointer + rdi, zero-extended to 32 bits, converted to float and
// multiplied by broadcast(float at 0xc(%rdx)) to form c. Then
// xmm<i> = (xmm<i> - xmm<i+4>) * c + xmm<i+4> for i = 0..3.
// Tail-jumps to the next stage.
// NOTE(review): presumably the constant at 0xc(%rdx) is 1/255 -- confirm.
10850CODE const uint8_t sk_lerp_u8_sse2[] = {
10851 72,173, //lods %ds:(%rsi),%rax
10852 72,139,0, //mov (%rax),%rax
10853 102,68,15,110,4,56, //movd (%rax,%rdi,1),%xmm8
10854 102,69,15,239,201, //pxor %xmm9,%xmm9
10855 102,69,15,96,193, //punpcklbw %xmm9,%xmm8
10856 102,69,15,97,193, //punpcklwd %xmm9,%xmm8
10857 69,15,91,192, //cvtdq2ps %xmm8,%xmm8
10858 243,68,15,16,74,12, //movss 0xc(%rdx),%xmm9
10859 69,15,198,201,0, //shufps $0x0,%xmm9,%xmm9
10860 69,15,89,200, //mulps %xmm8,%xmm9
10861 15,92,196, //subps %xmm4,%xmm0
10862 65,15,89,193, //mulps %xmm9,%xmm0
10863 15,88,196, //addps %xmm4,%xmm0
10864 15,92,205, //subps %xmm5,%xmm1
10865 65,15,89,201, //mulps %xmm9,%xmm1
10866 15,88,205, //addps %xmm5,%xmm1
10867 15,92,214, //subps %xmm6,%xmm2
10868 65,15,89,209, //mulps %xmm9,%xmm2
10869 15,88,214, //addps %xmm6,%xmm2
10870 15,92,223, //subps %xmm7,%xmm3
10871 65,15,89,217, //mulps %xmm9,%xmm3
10872 15,88,223, //addps %xmm7,%xmm3
10873 72,173, //lods %ds:(%rsi),%rax
10874 255,224, //jmpq *%rax
10875};
10876
// Stage: linear interpolation with per-channel factors unpacked from 16-bit
// values.
// Per the disassembly, with K = constants block at rdx: 4 16-bit values are
// loaded from (context pointer) + rdi*2 and zero-extended to 32 bits. Three
// factors are formed as
//   c0 = float(v & K[0x68]) * K[0x74]
//   c1 = float(v & K[0x6c]) * K[0x78]
//   c2 = float(v & K[0x70]) * K[0x7c]
// and xmm<i> = (xmm<i> - xmm<i+4>) * c<i> + xmm<i+4> for i = 0..2.
// xmm3 is used as scratch and leaves as broadcast(float at K[0]).
// Tail-jumps to the next stage.
// NOTE(review): presumably the masks/scales select the 5-6-5 fields and K[0]
// is 1.0 -- the constant values are not visible here; confirm.
10877CODE const uint8_t sk_lerp_565_sse2[] = {
10878 72,173, //lods %ds:(%rsi),%rax
10879 72,139,0, //mov (%rax),%rax
10880 243,68,15,126,4,120, //movq (%rax,%rdi,2),%xmm8
10881 102,15,239,219, //pxor %xmm3,%xmm3
10882 102,68,15,97,195, //punpcklwd %xmm3,%xmm8
10883 102,15,110,90,104, //movd 0x68(%rdx),%xmm3
10884 102,15,112,219,0, //pshufd $0x0,%xmm3,%xmm3
10885 102,65,15,219,216, //pand %xmm8,%xmm3
10886 68,15,91,203, //cvtdq2ps %xmm3,%xmm9
10887 243,15,16,26, //movss (%rdx),%xmm3
10888 243,68,15,16,82,116, //movss 0x74(%rdx),%xmm10
10889 69,15,198,210,0, //shufps $0x0,%xmm10,%xmm10
10890 69,15,89,209, //mulps %xmm9,%xmm10
10891 102,68,15,110,74,108, //movd 0x6c(%rdx),%xmm9
10892 102,69,15,112,201,0, //pshufd $0x0,%xmm9,%xmm9
10893 102,69,15,219,200, //pand %xmm8,%xmm9
10894 69,15,91,201, //cvtdq2ps %xmm9,%xmm9
10895 243,68,15,16,90,120, //movss 0x78(%rdx),%xmm11
10896 69,15,198,219,0, //shufps $0x0,%xmm11,%xmm11
10897 69,15,89,217, //mulps %xmm9,%xmm11
10898 102,68,15,110,74,112, //movd 0x70(%rdx),%xmm9
10899 102,69,15,112,201,0, //pshufd $0x0,%xmm9,%xmm9
10900 102,69,15,219,200, //pand %xmm8,%xmm9
10901 69,15,91,193, //cvtdq2ps %xmm9,%xmm8
10902 243,68,15,16,74,124, //movss 0x7c(%rdx),%xmm9
10903 69,15,198,201,0, //shufps $0x0,%xmm9,%xmm9
10904 69,15,89,200, //mulps %xmm8,%xmm9
10905 15,92,196, //subps %xmm4,%xmm0
10906 65,15,89,194, //mulps %xmm10,%xmm0
10907 15,88,196, //addps %xmm4,%xmm0
10908 15,92,205, //subps %xmm5,%xmm1
10909 65,15,89,203, //mulps %xmm11,%xmm1
10910 15,88,205, //addps %xmm5,%xmm1
10911 15,92,214, //subps %xmm6,%xmm2
10912 65,15,89,209, //mulps %xmm9,%xmm2
10913 15,88,214, //addps %xmm6,%xmm2
10914 15,198,219,0, //shufps $0x0,%xmm3,%xmm3
10915 72,173, //lods %ds:(%rsi),%rax
10916 255,224, //jmpq *%rax
10917};
10918
// Stage: load 4 packed 32-bit pixels and map three of their byte fields
// through lookup tables.
// Per the disassembly, the context holds four pointers: [0] the pixel source
// and [8], [0x10], [0x18] three float tables. With px = 4 dwords at
// src + rdi*4 and m = broadcast(32-bit value at 0x10(%rdx)):
//   xmm0[i] = table8 [  px[i]        & m ]
//   xmm1[i] = table10[ (px[i] >> 8)  & m ]
//   xmm2[i] = table18[ (px[i] >> 16) & m ]
//   xmm3    = float(px >> 24) * broadcast(float at 0xc(%rdx))
// SSE2 has no gather, so each table is read with four scalar movss loads whose
// indices are extracted through general-purpose registers, then recombined
// with unpcklps. Tail-jumps to the next stage.
// NOTE(review): presumably m is 0xff and the constant at 0xc(%rdx) is 1/255 --
// confirm.
10919CODE const uint8_t sk_load_tables_sse2[] = {
10920 72,173, //lods %ds:(%rsi),%rax
10921 72,139,8, //mov (%rax),%rcx
10922 76,139,64,8, //mov 0x8(%rax),%r8
10923 243,68,15,111,4,185, //movdqu (%rcx,%rdi,4),%xmm8
10924 102,15,110,66,16, //movd 0x10(%rdx),%xmm0
10925 102,15,112,192,0, //pshufd $0x0,%xmm0,%xmm0
// Split into three index vectors: xmm0 (bits 0-7), xmm9 (8-15), xmm10 (16-23).
10926 102,69,15,111,200, //movdqa %xmm8,%xmm9
10927 102,65,15,114,209,8, //psrld $0x8,%xmm9
10928 102,68,15,219,200, //pand %xmm0,%xmm9
10929 102,69,15,111,208, //movdqa %xmm8,%xmm10
10930 102,65,15,114,210,16, //psrld $0x10,%xmm10
10931 102,68,15,219,208, //pand %xmm0,%xmm10
10932 102,65,15,219,192, //pand %xmm8,%xmm0
// Lookup through the table at ctx+8 -> xmm0.
10933 102,15,112,216,78, //pshufd $0x4e,%xmm0,%xmm3
10934 102,72,15,126,217, //movq %xmm3,%rcx
10935 65,137,201, //mov %ecx,%r9d
10936 72,193,233,32, //shr $0x20,%rcx
10937 102,73,15,126,194, //movq %xmm0,%r10
10938 69,137,211, //mov %r10d,%r11d
10939 73,193,234,32, //shr $0x20,%r10
10940 243,67,15,16,28,144, //movss (%r8,%r10,4),%xmm3
10941 243,65,15,16,4,136, //movss (%r8,%rcx,4),%xmm0
10942 15,20,216, //unpcklps %xmm0,%xmm3
10943 243,67,15,16,4,152, //movss (%r8,%r11,4),%xmm0
10944 243,67,15,16,12,136, //movss (%r8,%r9,4),%xmm1
10945 15,20,193, //unpcklps %xmm1,%xmm0
10946 15,20,195, //unpcklps %xmm3,%xmm0
// Lookup through the table at ctx+0x10 -> xmm1.
10947 72,139,72,16, //mov 0x10(%rax),%rcx
10948 102,65,15,112,201,78, //pshufd $0x4e,%xmm9,%xmm1
10949 102,73,15,126,200, //movq %xmm1,%r8
10950 69,137,193, //mov %r8d,%r9d
10951 73,193,232,32, //shr $0x20,%r8
10952 102,77,15,126,202, //movq %xmm9,%r10
10953 69,137,211, //mov %r10d,%r11d
10954 73,193,234,32, //shr $0x20,%r10
10955 243,66,15,16,28,145, //movss (%rcx,%r10,4),%xmm3
10956 243,66,15,16,12,129, //movss (%rcx,%r8,4),%xmm1
10957 15,20,217, //unpcklps %xmm1,%xmm3
10958 243,66,15,16,12,153, //movss (%rcx,%r11,4),%xmm1
10959 243,66,15,16,20,137, //movss (%rcx,%r9,4),%xmm2
10960 15,20,202, //unpcklps %xmm2,%xmm1
10961 15,20,203, //unpcklps %xmm3,%xmm1
// Lookup through the table at ctx+0x18 -> xmm2.
10962 72,139,64,24, //mov 0x18(%rax),%rax
10963 102,65,15,112,210,78, //pshufd $0x4e,%xmm10,%xmm2
10964 102,72,15,126,209, //movq %xmm2,%rcx
10965 65,137,200, //mov %ecx,%r8d
10966 72,193,233,32, //shr $0x20,%rcx
10967 102,77,15,126,209, //movq %xmm10,%r9
10968 69,137,202, //mov %r9d,%r10d
10969 73,193,233,32, //shr $0x20,%r9
10970 243,70,15,16,12,136, //movss (%rax,%r9,4),%xmm9
10971 243,15,16,20,136, //movss (%rax,%rcx,4),%xmm2
10972 68,15,20,202, //unpcklps %xmm2,%xmm9
10973 243,66,15,16,20,144, //movss (%rax,%r10,4),%xmm2
10974 243,66,15,16,28,128, //movss (%rax,%r8,4),%xmm3
10975 15,20,211, //unpcklps %xmm3,%xmm2
10976 65,15,20,209, //unpcklps %xmm9,%xmm2
// Top byte is not table-mapped: convert and scale directly -> xmm3.
10977 102,65,15,114,208,24, //psrld $0x18,%xmm8
10978 69,15,91,192, //cvtdq2ps %xmm8,%xmm8
10979 243,15,16,90,12, //movss 0xc(%rdx),%xmm3
10980 15,198,219,0, //shufps $0x0,%xmm3,%xmm3
10981 65,15,89,216, //mulps %xmm8,%xmm3
10982 72,173, //lods %ds:(%rsi),%rax
10983 255,224, //jmpq *%rax
10984};
10985
// Stage: load 4 8-bit values into xmm3 and clear xmm0..xmm2.
// Per the disassembly: the context holds a pointer; 4 bytes are loaded from
// that pointer + rdi, zero-extended to 32 bits, converted to float and
// multiplied by broadcast(float at 0xc(%rdx)) into xmm3. xmm0, xmm1 and xmm2
// are then zeroed. Tail-jumps to the next stage.
// NOTE(review): presumably the constant at 0xc(%rdx) is 1/255 -- confirm.
10986CODE const uint8_t sk_load_a8_sse2[] = {
10987 72,173, //lods %ds:(%rsi),%rax
10988 72,139,0, //mov (%rax),%rax
10989 102,15,110,4,56, //movd (%rax,%rdi,1),%xmm0
10990 102,15,239,201, //pxor %xmm1,%xmm1
10991 102,15,96,193, //punpcklbw %xmm1,%xmm0
10992 102,15,97,193, //punpcklwd %xmm1,%xmm0
10993 15,91,192, //cvtdq2ps %xmm0,%xmm0
10994 243,15,16,90,12, //movss 0xc(%rdx),%xmm3
10995 15,198,219,0, //shufps $0x0,%xmm3,%xmm3
10996 15,89,216, //mulps %xmm0,%xmm3
10997 72,173, //lods %ds:(%rsi),%rax
10998 15,87,192, //xorps %xmm0,%xmm0
10999 102,15,239,201, //pxor %xmm1,%xmm1
11000 15,87,210, //xorps %xmm2,%xmm2
11001 255,224, //jmpq *%rax
11002};
11003
// Stage: store xmm3 as 4 bytes.
// Per the disassembly: xmm3 * broadcast(float at 0x8(%rdx)) is converted to
// 32-bit integers (cvtps2dq), the low 16 bits of each lane are sign-extended
// (pslld/psrad by 16) so packssdw narrows without saturating, packuswb narrows
// to bytes, and the low 4 bytes are written to (context pointer) + rdi.
// xmm0..xmm3 are not modified. Tail-jumps to the next stage.
// NOTE(review): presumably the constant at 0x8(%rdx) is 255 -- confirm.
11004CODE const uint8_t sk_store_a8_sse2[] = {
11005 72,173, //lods %ds:(%rsi),%rax
11006 72,139,0, //mov (%rax),%rax
11007 243,68,15,16,66,8, //movss 0x8(%rdx),%xmm8
11008 69,15,198,192,0, //shufps $0x0,%xmm8,%xmm8
11009 68,15,89,195, //mulps %xmm3,%xmm8
11010 102,69,15,91,192, //cvtps2dq %xmm8,%xmm8
11011 102,65,15,114,240,16, //pslld $0x10,%xmm8
11012 102,65,15,114,224,16, //psrad $0x10,%xmm8
11013 102,69,15,107,192, //packssdw %xmm8,%xmm8
11014 102,69,15,103,192, //packuswb %xmm8,%xmm8
11015 102,68,15,126,4,56, //movd %xmm8,(%rax,%rdi,1)
11016 72,173, //lods %ds:(%rsi),%rax
11017 255,224, //jmpq *%rax
11018};
11019
// Stage: load 4 16-bit pixels and unpack three masked fields to floats.
// Per the disassembly, with K = constants block at rdx and v = 4 16-bit values
// at (context pointer) + rdi*2, zero-extended to 32 bits:
//   xmm0 = float(v & K[0x68]) * K[0x74]
//   xmm1 = float(v & K[0x6c]) * K[0x78]
//   xmm2 = float(v & K[0x70]) * K[0x7c]
//   xmm3 = broadcast(float at K[0])
// Tail-jumps to the next stage.
// NOTE(review): presumably the masks/scales select the 5-6-5 r,g,b fields and
// K[0] is 1.0 (opaque alpha) -- the constant values are not visible; confirm.
11020CODE const uint8_t sk_load_565_sse2[] = {
11021 72,173, //lods %ds:(%rsi),%rax
11022 72,139,0, //mov (%rax),%rax
11023 243,68,15,126,12,120, //movq (%rax,%rdi,2),%xmm9
11024 102,15,239,192, //pxor %xmm0,%xmm0
11025 102,68,15,97,200, //punpcklwd %xmm0,%xmm9
11026 102,15,110,66,104, //movd 0x68(%rdx),%xmm0
11027 102,15,112,192,0, //pshufd $0x0,%xmm0,%xmm0
11028 102,65,15,219,193, //pand %xmm9,%xmm0
11029 15,91,200, //cvtdq2ps %xmm0,%xmm1
11030 243,15,16,26, //movss (%rdx),%xmm3
11031 243,15,16,66,116, //movss 0x74(%rdx),%xmm0
11032 15,198,192,0, //shufps $0x0,%xmm0,%xmm0
11033 15,89,193, //mulps %xmm1,%xmm0
11034 102,15,110,74,108, //movd 0x6c(%rdx),%xmm1
11035 102,15,112,201,0, //pshufd $0x0,%xmm1,%xmm1
11036 102,65,15,219,201, //pand %xmm9,%xmm1
11037 68,15,91,193, //cvtdq2ps %xmm1,%xmm8
11038 243,15,16,74,120, //movss 0x78(%rdx),%xmm1
11039 15,198,201,0, //shufps $0x0,%xmm1,%xmm1
11040 65,15,89,200, //mulps %xmm8,%xmm1
11041 102,15,110,82,112, //movd 0x70(%rdx),%xmm2
11042 102,15,112,210,0, //pshufd $0x0,%xmm2,%xmm2
11043 102,65,15,219,209, //pand %xmm9,%xmm2
11044 68,15,91,194, //cvtdq2ps %xmm2,%xmm8
11045 243,15,16,82,124, //movss 0x7c(%rdx),%xmm2
11046 15,198,210,0, //shufps $0x0,%xmm2,%xmm2
11047 65,15,89,208, //mulps %xmm8,%xmm2
11048 15,198,219,0, //shufps $0x0,%xmm3,%xmm3
11049 72,173, //lods %ds:(%rsi),%rax
11050 255,224, //jmpq *%rax
11051};
11052
// Stage: pack xmm0..xmm2 into 4 16-bit pixels and store them.
// Per the disassembly, with K = constants block at rdx:
//   v = (int(xmm0 * K[0x80]) << 11) | (int(xmm1 * K[0x84]) << 5)
//       | int(xmm2 * K[0x80])
// The low 16 bits of each lane are sign-extended (pslld/psrad by 16) so that
// packssdw narrows without saturating, and 8 bytes are written to
// (context pointer) + rdi*2. xmm0..xmm3 are not modified; xmm3 is not read.
// Tail-jumps to the next stage.
// NOTE(review): presumably K[0x80] = 31 and K[0x84] = 63 -- confirm.
11053CODE const uint8_t sk_store_565_sse2[] = {
11054 72,173, //lods %ds:(%rsi),%rax
11055 72,139,0, //mov (%rax),%rax
11056 243,68,15,16,130,128,0,0,0, //movss 0x80(%rdx),%xmm8
11057 243,68,15,16,138,132,0,0,0, //movss 0x84(%rdx),%xmm9
11058 69,15,198,192,0, //shufps $0x0,%xmm8,%xmm8
11059 69,15,40,208, //movaps %xmm8,%xmm10
11060 68,15,89,208, //mulps %xmm0,%xmm10
11061 102,69,15,91,210, //cvtps2dq %xmm10,%xmm10
11062 102,65,15,114,242,11, //pslld $0xb,%xmm10
11063 69,15,198,201,0, //shufps $0x0,%xmm9,%xmm9
11064 68,15,89,201, //mulps %xmm1,%xmm9
11065 102,69,15,91,201, //cvtps2dq %xmm9,%xmm9
11066 102,65,15,114,241,5, //pslld $0x5,%xmm9
11067 102,69,15,235,202, //por %xmm10,%xmm9
11068 68,15,89,194, //mulps %xmm2,%xmm8
11069 102,69,15,91,192, //cvtps2dq %xmm8,%xmm8
11070 102,69,15,86,193, //orpd %xmm9,%xmm8
11071 102,65,15,114,240,16, //pslld $0x10,%xmm8
11072 102,65,15,114,224,16, //psrad $0x10,%xmm8
11073 102,69,15,107,192, //packssdw %xmm8,%xmm8
11074 102,68,15,214,4,120, //movq %xmm8,(%rax,%rdi,2)
11075 72,173, //lods %ds:(%rsi),%rax
11076 255,224, //jmpq *%rax
11077};
11078
// Stage: load 4 packed 32-bit pixels and unpack their four bytes to floats.
// Per the disassembly, with px = 4 dwords at (context pointer) + rdi*4,
// m = broadcast(32-bit value at 0x10(%rdx)), s = broadcast(float at 0xc(%rdx)):
//   xmm0 = float( px        & m) * s
//   xmm1 = float((px >> 8)  & m) * s
//   xmm2 = float((px >> 16) & m) * s
//   xmm3 = float( px >> 24     ) * s
// Tail-jumps to the next stage.
// NOTE(review): presumably m is 0xff and s is 1/255 -- confirm.
11079CODE const uint8_t sk_load_8888_sse2[] = {
11080 72,173, //lods %ds:(%rsi),%rax
11081 72,139,0, //mov (%rax),%rax
11082 243,15,111,28,184, //movdqu (%rax,%rdi,4),%xmm3
11083 102,15,110,66,16, //movd 0x10(%rdx),%xmm0
11084 102,15,112,192,0, //pshufd $0x0,%xmm0,%xmm0
11085 102,15,111,203, //movdqa %xmm3,%xmm1
11086 102,15,114,209,8, //psrld $0x8,%xmm1
11087 102,15,219,200, //pand %xmm0,%xmm1
11088 102,15,111,211, //movdqa %xmm3,%xmm2
11089 102,15,114,210,16, //psrld $0x10,%xmm2
11090 102,15,219,208, //pand %xmm0,%xmm2
11091 102,15,219,195, //pand %xmm3,%xmm0
11092 15,91,192, //cvtdq2ps %xmm0,%xmm0
11093 243,68,15,16,66,12, //movss 0xc(%rdx),%xmm8
11094 69,15,198,192,0, //shufps $0x0,%xmm8,%xmm8
11095 65,15,89,192, //mulps %xmm8,%xmm0
11096 15,91,201, //cvtdq2ps %xmm1,%xmm1
11097 65,15,89,200, //mulps %xmm8,%xmm1
11098 15,91,210, //cvtdq2ps %xmm2,%xmm2
11099 65,15,89,208, //mulps %xmm8,%xmm2
11100 102,15,114,211,24, //psrld $0x18,%xmm3
11101 15,91,219, //cvtdq2ps %xmm3,%xmm3
11102 65,15,89,216, //mulps %xmm8,%xmm3
11103 72,173, //lods %ds:(%rsi),%rax
11104 255,224, //jmpq *%rax
11105};
11106
// Stage: pack xmm0..xmm3 into 4 32-bit pixels and store them.
// Per the disassembly, with s = broadcast(float at 0x8(%rdx)):
//   px = int(xmm0*s) | int(xmm1*s) << 8 | int(xmm2*s) << 16 | int(xmm3*s) << 24
// (cvtps2dq conversion), written as 16 bytes to (context pointer) + rdi*4.
// xmm0..xmm3 are not modified. Tail-jumps to the next stage.
// No masking or clamping is applied here, so out-of-range inputs would bleed
// into neighbouring byte fields.
// NOTE(review): presumably s is 255 and inputs are already clamped to [0,1]
// by an earlier stage -- confirm.
11107CODE const uint8_t sk_store_8888_sse2[] = {
11108 72,173, //lods %ds:(%rsi),%rax
11109 72,139,0, //mov (%rax),%rax
11110 243,68,15,16,66,8, //movss 0x8(%rdx),%xmm8
11111 69,15,198,192,0, //shufps $0x0,%xmm8,%xmm8
11112 69,15,40,200, //movaps %xmm8,%xmm9
11113 68,15,89,200, //mulps %xmm0,%xmm9
11114 102,69,15,91,201, //cvtps2dq %xmm9,%xmm9
11115 69,15,40,208, //movaps %xmm8,%xmm10
11116 68,15,89,209, //mulps %xmm1,%xmm10
11117 102,69,15,91,210, //cvtps2dq %xmm10,%xmm10
11118 102,65,15,114,242,8, //pslld $0x8,%xmm10
11119 102,69,15,235,209, //por %xmm9,%xmm10
11120 69,15,40,200, //movaps %xmm8,%xmm9
11121 68,15,89,202, //mulps %xmm2,%xmm9
11122 102,69,15,91,201, //cvtps2dq %xmm9,%xmm9
11123 102,65,15,114,241,16, //pslld $0x10,%xmm9
11124 68,15,89,195, //mulps %xmm3,%xmm8
11125 102,69,15,91,192, //cvtps2dq %xmm8,%xmm8
11126 102,65,15,114,240,24, //pslld $0x18,%xmm8
11127 102,69,15,235,193, //por %xmm9,%xmm8
11128 102,69,15,235,194, //por %xmm10,%xmm8
11129 243,68,15,127,4,184, //movdqu %xmm8,(%rax,%rdi,4)
11130 72,173, //lods %ds:(%rsi),%rax
11131 255,224, //jmpq *%rax
11132};
11133
// Stage: load 4 pixels of four 16-bit values each and widen them to floats.
// Per the disassembly:
//   - 32 bytes are loaded from (context pointer) + rdi*8;
//   - two rounds of punpcklwd/punpckhwd de-interleave the 16-bit values so
//     that xmm8 and xmm2 each hold two channels' worth of 4 values;
//   - pcmpgtw against broadcast(32-bit value at 0x64(%rdx)) followed by pandn
//     zeroes every 16-bit value that is (signed) less than that constant;
//   - each 16-bit value is zero-extended to 32 bits, shifted left by 13 and
//     multiplied, as a float, by broadcast(32-bit value at 0x5c(%rdx)),
//     yielding xmm0..xmm3.
// Tail-jumps to the next stage.
// NOTE(review): presumably this is a half-float to float conversion where the
// compare flushes denormals to zero and the multiply rebiases the exponent;
// the constant values are not visible here -- confirm.
11134CODE const uint8_t sk_load_f16_sse2[] = {
11135 72,173, //lods %ds:(%rsi),%rax
11136 72,139,0, //mov (%rax),%rax
11137 243,15,111,4,248, //movdqu (%rax,%rdi,8),%xmm0
11138 243,15,111,76,248,16, //movdqu 0x10(%rax,%rdi,8),%xmm1
11139 102,15,111,208, //movdqa %xmm0,%xmm2
11140 102,15,97,209, //punpcklwd %xmm1,%xmm2
11141 102,15,105,193, //punpckhwd %xmm1,%xmm0
11142 102,68,15,111,194, //movdqa %xmm2,%xmm8
11143 102,68,15,97,192, //punpcklwd %xmm0,%xmm8
11144 102,15,105,208, //punpckhwd %xmm0,%xmm2
11145 102,15,110,66,100, //movd 0x64(%rdx),%xmm0
11146 102,15,112,216,0, //pshufd $0x0,%xmm0,%xmm3
11147 102,15,111,203, //movdqa %xmm3,%xmm1
11148 102,65,15,101,200, //pcmpgtw %xmm8,%xmm1
11149 102,65,15,223,200, //pandn %xmm8,%xmm1
11150 102,15,101,218, //pcmpgtw %xmm2,%xmm3
11151 102,15,223,218, //pandn %xmm2,%xmm3
11152 102,69,15,239,192, //pxor %xmm8,%xmm8
11153 102,15,111,193, //movdqa %xmm1,%xmm0
11154 102,65,15,97,192, //punpcklwd %xmm8,%xmm0
11155 102,15,114,240,13, //pslld $0xd,%xmm0
11156 102,15,110,82,92, //movd 0x5c(%rdx),%xmm2
11157 102,68,15,112,202,0, //pshufd $0x0,%xmm2,%xmm9
11158 65,15,89,193, //mulps %xmm9,%xmm0
11159 102,65,15,105,200, //punpckhwd %xmm8,%xmm1
11160 102,15,114,241,13, //pslld $0xd,%xmm1
11161 65,15,89,201, //mulps %xmm9,%xmm1
11162 102,15,111,211, //movdqa %xmm3,%xmm2
11163 102,65,15,97,208, //punpcklwd %xmm8,%xmm2
11164 102,15,114,242,13, //pslld $0xd,%xmm2
11165 65,15,89,209, //mulps %xmm9,%xmm2
11166 102,65,15,105,216, //punpckhwd %xmm8,%xmm3
11167 102,15,114,243,13, //pslld $0xd,%xmm3
11168 65,15,89,217, //mulps %xmm9,%xmm3
11169 72,173, //lods %ds:(%rsi),%rax
11170 255,224, //jmpq *%rax
11171};
11172
// Stage: narrow xmm0..xmm3 to 16-bit values and store 4 interleaved pixels.
// Per the disassembly:
//   - each of xmm0..xmm3 is multiplied, as a float, by
//     broadcast(32-bit value at 0x60(%rdx)) and the result is shifted right
//     by 13 bits as an integer;
//   - the xmm1 result is byte-shifted left by 2 (pslldq) and OR-ed with the
//     xmm0 result, likewise xmm3 with xmm2, pairing two 16-bit values per
//     dword;
//   - punpckldq/punpckhdq interleave the two pairs and 32 bytes are written
//     to (context pointer) + rdi*8.
// xmm0..xmm3 are not modified. Tail-jumps to the next stage.
// NOTE(review): presumably a float to half-float conversion; the pslldq/por
// pairing relies on the upper 16 bits of every shifted lane being zero, which
// depends on the input range and the constant -- confirm.
11173CODE const uint8_t sk_store_f16_sse2[] = {
11174 72,173, //lods %ds:(%rsi),%rax
11175 72,139,0, //mov (%rax),%rax
11176 102,68,15,110,66,96, //movd 0x60(%rdx),%xmm8
11177 102,69,15,112,192,0, //pshufd $0x0,%xmm8,%xmm8
11178 102,69,15,111,200, //movdqa %xmm8,%xmm9
11179 68,15,89,200, //mulps %xmm0,%xmm9
11180 102,65,15,114,209,13, //psrld $0xd,%xmm9
11181 102,69,15,111,208, //movdqa %xmm8,%xmm10
11182 68,15,89,209, //mulps %xmm1,%xmm10
11183 102,65,15,114,210,13, //psrld $0xd,%xmm10
11184 102,69,15,111,216, //movdqa %xmm8,%xmm11
11185 68,15,89,218, //mulps %xmm2,%xmm11
11186 102,65,15,114,211,13, //psrld $0xd,%xmm11
11187 68,15,89,195, //mulps %xmm3,%xmm8
11188 102,65,15,114,208,13, //psrld $0xd,%xmm8
11189 102,65,15,115,250,2, //pslldq $0x2,%xmm10
11190 102,69,15,235,209, //por %xmm9,%xmm10
11191 102,65,15,115,248,2, //pslldq $0x2,%xmm8
11192 102,69,15,235,195, //por %xmm11,%xmm8
11193 102,69,15,111,202, //movdqa %xmm10,%xmm9
11194 102,69,15,98,200, //punpckldq %xmm8,%xmm9
11195 243,68,15,127,12,248, //movdqu %xmm9,(%rax,%rdi,8)
11196 102,69,15,106,208, //punpckhdq %xmm8,%xmm10
11197 243,68,15,127,84,248,16, //movdqu %xmm10,0x10(%rax,%rdi,8)
11198 72,173, //lods %ds:(%rsi),%rax
11199 255,224, //jmpq *%rax
11200};
11201
// Stage: store xmm0..xmm3 as 4 interleaved 4-float pixels.
// Per the disassembly: unpcklps/unpckhps followed by unpcklpd/unpckhpd
// perform a 4x4 transpose, so that lane j of xmm0, xmm1, xmm2, xmm3 become the
// four consecutive floats of pixel j. The four resulting vectors are written
// (unaligned) at offsets 0, 0x10, 0x20 and 0x30 from
// (context pointer) + rdi*16. xmm0..xmm3 are not modified.
// Tail-jumps to the next stage.
11202CODE const uint8_t sk_store_f32_sse2[] = {
11203 72,173, //lods %ds:(%rsi),%rax
11204 72,139,0, //mov (%rax),%rax
11205 72,137,249, //mov %rdi,%rcx
11206 72,193,225,4, //shl $0x4,%rcx
// First pass: interleave 32-bit lanes of (xmm0,xmm1) and (xmm2,xmm3).
11207 68,15,40,192, //movaps %xmm0,%xmm8
11208 68,15,40,200, //movaps %xmm0,%xmm9
11209 68,15,20,201, //unpcklps %xmm1,%xmm9
11210 68,15,40,210, //movaps %xmm2,%xmm10
11211 68,15,40,218, //movaps %xmm2,%xmm11
11212 68,15,20,219, //unpcklps %xmm3,%xmm11
11213 68,15,21,193, //unpckhps %xmm1,%xmm8
11214 68,15,21,211, //unpckhps %xmm3,%xmm10
// Second pass: combine 64-bit halves into whole pixels (xmm12, xmm9, xmm11, xmm8).
11215 69,15,40,225, //movaps %xmm9,%xmm12
11216 102,69,15,20,227, //unpcklpd %xmm11,%xmm12
11217 102,69,15,21,203, //unpckhpd %xmm11,%xmm9
11218 69,15,40,216, //movaps %xmm8,%xmm11
11219 102,69,15,20,218, //unpcklpd %xmm10,%xmm11
11220 102,69,15,21,194, //unpckhpd %xmm10,%xmm8
11221 102,68,15,17,36,8, //movupd %xmm12,(%rax,%rcx,1)
11222 102,68,15,17,76,8,16, //movupd %xmm9,0x10(%rax,%rcx,1)
11223 102,68,15,17,92,8,32, //movupd %xmm11,0x20(%rax,%rcx,1)
11224 102,68,15,17,68,8,48, //movupd %xmm8,0x30(%rax,%rcx,1)
11225 72,173, //lods %ds:(%rsi),%rax
11226 255,224, //jmpq *%rax
11227};
11228
// sk_clamp_x_sse2: raw x86-64 SSE2 machine code (AT&T disassembly in trailing comments).
// Per lane: xmm0 = min(max(xmm0, 0), L'), where L is the float read from the
// pointer fetched from the (%rsi) stream and L' is L with its 32-bit pattern
// decremented by one. Writes rax, xmm0, xmm8, xmm9; both lods advance rsi.
11229CODE const uint8_t sk_clamp_x_sse2[] = {
  // rax = next 8-byte word at (%rsi); here it is used directly as a float pointer.
11230 72,173, //lods %ds:(%rsi),%rax
  // xmm8 = max(0, xmm0) per lane.
11231 69,15,87,192, //xorps %xmm8,%xmm8
11232 68,15,95,192, //maxps %xmm0,%xmm8
  // Broadcast the float at (rax) to all four lanes of xmm9.
11233 243,68,15,16,8, //movss (%rax),%xmm9
11234 69,15,198,201,0, //shufps $0x0,%xmm9,%xmm9
  // xmm0 = all-ones (-1 per 32-bit lane), then integer-add the limit's bits:
  // the limit's bit pattern minus 1 (for a positive finite limit, the next float below it).
11235 102,15,118,192, //pcmpeqd %xmm0,%xmm0
11236 102,65,15,254,193, //paddd %xmm9,%xmm0
  // xmm8 = min(xmm8, decremented limit).
11237 68,15,93,192, //minps %xmm0,%xmm8
  // Fetch the next word from the stream, move the result into xmm0, and jump to it.
11238 72,173, //lods %ds:(%rsi),%rax
11239 65,15,40,192, //movaps %xmm8,%xmm0
11240 255,224, //jmpq *%rax
11241};
11242
// sk_clamp_y_sse2: raw x86-64 SSE2 machine code (AT&T disassembly in trailing comments).
// Same instruction sequence as sk_clamp_x_sse2 but operating on xmm1 instead of xmm0:
// per lane, xmm1 = min(max(xmm1, 0), L'), where L is the float read from the pointer
// fetched from the (%rsi) stream and L' is L with its 32-bit pattern decremented by one.
// Writes rax, xmm1, xmm8, xmm9; both lods advance rsi.
11243CODE const uint8_t sk_clamp_y_sse2[] = {
  // rax = next 8-byte word at (%rsi); used directly as a float pointer.
11244 72,173, //lods %ds:(%rsi),%rax
  // xmm8 = max(0, xmm1) per lane.
11245 69,15,87,192, //xorps %xmm8,%xmm8
11246 68,15,95,193, //maxps %xmm1,%xmm8
  // Broadcast the float at (rax) to all four lanes of xmm9.
11247 243,68,15,16,8, //movss (%rax),%xmm9
11248 69,15,198,201,0, //shufps $0x0,%xmm9,%xmm9
  // xmm1 = all-ones + limit bits = limit's bit pattern minus 1
  // (for a positive finite limit, the next float below it).
11249 102,15,118,201, //pcmpeqd %xmm1,%xmm1
11250 102,65,15,254,201, //paddd %xmm9,%xmm1
  // xmm8 = min(xmm8, decremented limit).
11251 68,15,93,193, //minps %xmm1,%xmm8
  // Fetch the next word from the stream, move the result into xmm1, and jump to it.
11252 72,173, //lods %ds:(%rsi),%rax
11253 65,15,40,200, //movaps %xmm8,%xmm1
11254 255,224, //jmpq *%rax
11255};
11256
// sk_repeat_x_sse2: raw x86-64 SSE2 machine code (AT&T disassembly in trailing comments).
// With L = the float at the pointer fetched from the (%rsi) stream and K = the float
// at (%rdx), computes per lane:
//   q = xmm0 / L;  t = (float)trunc_to_int(q);  f = t - (q < t ? K : 0)
//   xmm0 = min(xmm0 - f * L, L')   where L' is L's bit pattern decremented by one.
// NOTE(review): K is presumably 1.0f, which makes f == floor(q) — confirm what (%rdx) points at.
// Writes rax, xmm0, xmm8-xmm11; both lods advance rsi.
11257CODE const uint8_t sk_repeat_x_sse2[] = {
  // rax = next 8-byte word at (%rsi); used directly as a float pointer.
11258 72,173, //lods %ds:(%rsi),%rax
  // xmm8 = L broadcast to all lanes.
11259 243,68,15,16,0, //movss (%rax),%xmm8
11260 69,15,198,192,0, //shufps $0x0,%xmm8,%xmm8
  // xmm9 = q = xmm0 / L.
11261 68,15,40,200, //movaps %xmm0,%xmm9
11262 69,15,94,200, //divps %xmm8,%xmm9
  // xmm10 = t: truncate q to int32 and convert back to float.
11263 243,69,15,91,209, //cvttps2dq %xmm9,%xmm10
11264 69,15,91,210, //cvtdq2ps %xmm10,%xmm10
  // xmm9 = per-lane all-ones mask where q < t (truncation rounded upward, i.e. negative q).
11265 69,15,194,202,1, //cmpltps %xmm10,%xmm9
  // xmm11 = K broadcast, kept only in the masked lanes; xmm10 = t - that correction.
11266 243,68,15,16,26, //movss (%rdx),%xmm11
11267 69,15,198,219,0, //shufps $0x0,%xmm11,%xmm11
11268 69,15,84,217, //andps %xmm9,%xmm11
11269 69,15,92,211, //subps %xmm11,%xmm10
  // xmm0 -= f * L.
11270 69,15,89,208, //mulps %xmm8,%xmm10
11271 65,15,92,194, //subps %xmm10,%xmm0
  // xmm9 = all-ones + L bits = L's bit pattern minus 1; clamp xmm0 to at most that.
11272 102,69,15,118,201, //pcmpeqd %xmm9,%xmm9
11273 102,69,15,254,200, //paddd %xmm8,%xmm9
11274 65,15,93,193, //minps %xmm9,%xmm0
  // Fetch the next word from the stream and jump to it.
11275 72,173, //lods %ds:(%rsi),%rax
11276 255,224, //jmpq *%rax
11277};
11278
// sk_repeat_y_sse2: raw x86-64 SSE2 machine code (AT&T disassembly in trailing comments).
// Same instruction sequence as sk_repeat_x_sse2 but operating on xmm1 instead of xmm0.
// With L = the float at the pointer fetched from the (%rsi) stream and K = the float
// at (%rdx), computes per lane:
//   q = xmm1 / L;  t = (float)trunc_to_int(q);  f = t - (q < t ? K : 0)
//   xmm1 = min(xmm1 - f * L, L')   where L' is L's bit pattern decremented by one.
// NOTE(review): K is presumably 1.0f, which makes f == floor(q) — confirm what (%rdx) points at.
// Writes rax, xmm1, xmm8-xmm11; both lods advance rsi.
11279CODE const uint8_t sk_repeat_y_sse2[] = {
  // rax = next 8-byte word at (%rsi); used directly as a float pointer.
11280 72,173, //lods %ds:(%rsi),%rax
  // xmm8 = L broadcast to all lanes.
11281 243,68,15,16,0, //movss (%rax),%xmm8
11282 69,15,198,192,0, //shufps $0x0,%xmm8,%xmm8
  // xmm9 = q = xmm1 / L.
11283 68,15,40,201, //movaps %xmm1,%xmm9
11284 69,15,94,200, //divps %xmm8,%xmm9
  // xmm10 = t: truncate q to int32 and convert back to float.
11285 243,69,15,91,209, //cvttps2dq %xmm9,%xmm10
11286 69,15,91,210, //cvtdq2ps %xmm10,%xmm10
  // xmm9 = per-lane all-ones mask where q < t.
11287 69,15,194,202,1, //cmpltps %xmm10,%xmm9
  // xmm11 = K broadcast, kept only in the masked lanes; xmm10 = t - that correction.
11288 243,68,15,16,26, //movss (%rdx),%xmm11
11289 69,15,198,219,0, //shufps $0x0,%xmm11,%xmm11
11290 69,15,84,217, //andps %xmm9,%xmm11
11291 69,15,92,211, //subps %xmm11,%xmm10
  // xmm1 -= f * L.
11292 69,15,89,208, //mulps %xmm8,%xmm10
11293 65,15,92,202, //subps %xmm10,%xmm1
  // xmm9 = all-ones + L bits = L's bit pattern minus 1; clamp xmm1 to at most that.
11294 102,69,15,118,201, //pcmpeqd %xmm9,%xmm9
11295 102,69,15,254,200, //paddd %xmm8,%xmm9
11296 65,15,93,201, //minps %xmm9,%xmm1
  // Fetch the next word from the stream and jump to it.
11297 72,173, //lods %ds:(%rsi),%rax
11298 255,224, //jmpq *%rax
11299};
11300
// sk_mirror_x_sse2: raw x86-64 SSE2 machine code (AT&T disassembly in trailing comments).
// With L = the float at the pointer fetched from the (%rsi) stream and K = the float
// at (%rdx), computes per lane:
//   v = xmm0 - L
//   q = v / (2L);  t = (float)trunc_to_int(q);  f = t - (q < t ? K : 0)
//   w = v - f * 2L - L
//   xmm0 = min(|w|, L')   where L' is L's bit pattern decremented by one.
// NOTE(review): K is presumably 1.0f, which makes f == floor(q) — confirm what (%rdx) points at.
// Writes rax, xmm0, xmm8-xmm12; both lods advance rsi.
11301CODE const uint8_t sk_mirror_x_sse2[] = {
  // rax = next 8-byte word at (%rsi); used directly as a float pointer.
11302 72,173, //lods %ds:(%rsi),%rax
  // xmm9 = L in the low lane; xmm8 = L broadcast to all lanes.
11303 243,68,15,16,8, //movss (%rax),%xmm9
11304 69,15,40,193, //movaps %xmm9,%xmm8
11305 69,15,198,192,0, //shufps $0x0,%xmm8,%xmm8
  // v = xmm0 - L.
11306 65,15,92,192, //subps %xmm8,%xmm0
  // xmm9 = 2L (scalar add in the low lane), then broadcast.
11307 243,69,15,88,201, //addss %xmm9,%xmm9
11308 69,15,198,201,0, //shufps $0x0,%xmm9,%xmm9
  // xmm10 = q = v / 2L.
11309 68,15,40,208, //movaps %xmm0,%xmm10
11310 69,15,94,209, //divps %xmm9,%xmm10
  // xmm11 = t: truncate q to int32 and convert back to float.
11311 243,69,15,91,218, //cvttps2dq %xmm10,%xmm11
11312 69,15,91,219, //cvtdq2ps %xmm11,%xmm11
  // xmm10 = per-lane all-ones mask where q < t.
11313 69,15,194,211,1, //cmpltps %xmm11,%xmm10
  // xmm12 = K broadcast, kept only in the masked lanes.
11314 243,68,15,16,34, //movss (%rdx),%xmm12
11315 69,15,198,228,0, //shufps $0x0,%xmm12,%xmm12
11316 69,15,84,226, //andps %xmm10,%xmm12
  // The mask is no longer needed: zero xmm10 for the negation below.
11317 69,15,87,210, //xorps %xmm10,%xmm10
  // xmm11 = f = t - correction; xmm0 = v - f*2L - L.
11318 69,15,92,220, //subps %xmm12,%xmm11
11319 69,15,89,217, //mulps %xmm9,%xmm11
11320 65,15,92,195, //subps %xmm11,%xmm0
11321 65,15,92,192, //subps %xmm8,%xmm0
  // xmm10 = 0 - w; w & -w clears the sign bit, giving |w|.
11322 68,15,92,208, //subps %xmm0,%xmm10
11323 65,15,84,194, //andps %xmm10,%xmm0
  // xmm9 = all-ones + L bits = L's bit pattern minus 1; clamp xmm0 to at most that.
11324 102,69,15,118,201, //pcmpeqd %xmm9,%xmm9
11325 102,69,15,254,200, //paddd %xmm8,%xmm9
11326 65,15,93,193, //minps %xmm9,%xmm0
  // Fetch the next word from the stream and jump to it.
11327 72,173, //lods %ds:(%rsi),%rax
11328 255,224, //jmpq *%rax
11329};
11330
// sk_mirror_y_sse2: raw x86-64 SSE2 machine code (AT&T disassembly in trailing comments).
// Same instruction sequence as sk_mirror_x_sse2 but operating on xmm1 instead of xmm0.
// With L = the float at the pointer fetched from the (%rsi) stream and K = the float
// at (%rdx), computes per lane:
//   v = xmm1 - L
//   q = v / (2L);  t = (float)trunc_to_int(q);  f = t - (q < t ? K : 0)
//   w = v - f * 2L - L
//   xmm1 = min(|w|, L')   where L' is L's bit pattern decremented by one.
// NOTE(review): K is presumably 1.0f, which makes f == floor(q) — confirm what (%rdx) points at.
// Writes rax, xmm1, xmm8-xmm12; both lods advance rsi.
11331CODE const uint8_t sk_mirror_y_sse2[] = {
  // rax = next 8-byte word at (%rsi); used directly as a float pointer.
11332 72,173, //lods %ds:(%rsi),%rax
  // xmm9 = L in the low lane; xmm8 = L broadcast to all lanes.
11333 243,68,15,16,8, //movss (%rax),%xmm9
11334 69,15,40,193, //movaps %xmm9,%xmm8
11335 69,15,198,192,0, //shufps $0x0,%xmm8,%xmm8
  // v = xmm1 - L.
11336 65,15,92,200, //subps %xmm8,%xmm1
  // xmm9 = 2L (scalar add in the low lane), then broadcast.
11337 243,69,15,88,201, //addss %xmm9,%xmm9
11338 69,15,198,201,0, //shufps $0x0,%xmm9,%xmm9
  // xmm10 = q = v / 2L.
11339 68,15,40,209, //movaps %xmm1,%xmm10
11340 69,15,94,209, //divps %xmm9,%xmm10
  // xmm11 = t: truncate q to int32 and convert back to float.
11341 243,69,15,91,218, //cvttps2dq %xmm10,%xmm11
11342 69,15,91,219, //cvtdq2ps %xmm11,%xmm11
  // xmm10 = per-lane all-ones mask where q < t.
11343 69,15,194,211,1, //cmpltps %xmm11,%xmm10
  // xmm12 = K broadcast, kept only in the masked lanes.
11344 243,68,15,16,34, //movss (%rdx),%xmm12
11345 69,15,198,228,0, //shufps $0x0,%xmm12,%xmm12
11346 69,15,84,226, //andps %xmm10,%xmm12
  // The mask is no longer needed: zero xmm10 for the negation below.
11347 69,15,87,210, //xorps %xmm10,%xmm10
  // xmm11 = f = t - correction; xmm1 = v - f*2L - L.
11348 69,15,92,220, //subps %xmm12,%xmm11
11349 69,15,89,217, //mulps %xmm9,%xmm11
11350 65,15,92,203, //subps %xmm11,%xmm1
11351 65,15,92,200, //subps %xmm8,%xmm1
  // xmm10 = 0 - w; w & -w clears the sign bit, giving |w|.
11352 68,15,92,209, //subps %xmm1,%xmm10
11353 65,15,84,202, //andps %xmm10,%xmm1
  // xmm9 = all-ones + L bits = L's bit pattern minus 1; clamp xmm1 to at most that.
11354 102,69,15,118,201, //pcmpeqd %xmm9,%xmm9
11355 102,69,15,254,200, //paddd %xmm8,%xmm9
11356 65,15,93,201, //minps %xmm9,%xmm1
  // Fetch the next word from the stream and jump to it.
11357 72,173, //lods %ds:(%rsi),%rax
11358 255,224, //jmpq *%rax
11359};
11360
// sk_matrix_2x3_sse2: raw x86-64 SSE2 machine code (AT&T disassembly in trailing comments).
// With x = xmm0, y = xmm1 on entry and m = the six consecutive floats at the pointer
// fetched from the (%rsi) stream, computes per lane:
//   xmm0 = m[0]*x + m[2]*y + m[4]
//   xmm1 = m[1]*x + m[3]*y + m[5]
// Writes rax, xmm0, xmm1, xmm8-xmm11; both lods advance rsi.
11361CODE const uint8_t sk_matrix_2x3_sse2[] = {
  // Save the inputs: xmm9 = y, xmm8 = x (xmm0/xmm1 are reused for coefficients).
11362 68,15,40,201, //movaps %xmm1,%xmm9
11363 68,15,40,192, //movaps %xmm0,%xmm8
  // rax = next 8-byte word at (%rsi); used directly as the float array pointer.
11364 72,173, //lods %ds:(%rsi),%rax
  // Load m[0] and m[1]; broadcast m[0] now, m[1] is broadcast later.
11365 243,15,16,0, //movss (%rax),%xmm0
11366 243,15,16,72,4, //movss 0x4(%rax),%xmm1
11367 15,198,192,0, //shufps $0x0,%xmm0,%xmm0
  // xmm10 = m[2] broadcast, xmm11 = m[4] broadcast.
11368 243,68,15,16,80,8, //movss 0x8(%rax),%xmm10
11369 69,15,198,210,0, //shufps $0x0,%xmm10,%xmm10
11370 243,68,15,16,88,16, //movss 0x10(%rax),%xmm11
11371 69,15,198,219,0, //shufps $0x0,%xmm11,%xmm11
  // xmm0 = m[0]*x + (m[2]*y + m[4]).
11372 69,15,89,209, //mulps %xmm9,%xmm10
11373 69,15,88,211, //addps %xmm11,%xmm10
11374 65,15,89,192, //mulps %xmm8,%xmm0
11375 65,15,88,194, //addps %xmm10,%xmm0
  // Second row: broadcast m[1]; xmm10 = m[3] broadcast, xmm11 = m[5] broadcast.
11376 15,198,201,0, //shufps $0x0,%xmm1,%xmm1
11377 243,68,15,16,80,12, //movss 0xc(%rax),%xmm10
11378 69,15,198,210,0, //shufps $0x0,%xmm10,%xmm10
11379 243,68,15,16,88,20, //movss 0x14(%rax),%xmm11
11380 69,15,198,219,0, //shufps $0x0,%xmm11,%xmm11
  // xmm1 = m[1]*x + (m[3]*y + m[5]).
11381 69,15,89,209, //mulps %xmm9,%xmm10
11382 69,15,88,211, //addps %xmm11,%xmm10
11383 65,15,89,200, //mulps %xmm8,%xmm1
11384 65,15,88,202, //addps %xmm10,%xmm1
  // Fetch the next word from the stream and jump to it.
11385 72,173, //lods %ds:(%rsi),%rax
11386 255,224, //jmpq *%rax
11387};
11388
// sk_matrix_3x4_sse2: raw x86-64 SSE2 machine code (AT&T disassembly in trailing comments).
// With x = xmm0, y = xmm1, z = xmm2 on entry and m = the twelve consecutive floats at
// the pointer fetched from the (%rsi) stream, computes per lane:
//   xmm0 = m[0]*x + m[3]*y + m[6]*z + m[9]
//   xmm1 = m[1]*x + m[4]*y + m[7]*z + m[10]
//   xmm2 = m[2]*x + m[5]*y + m[8]*z + m[11]
// Writes rax, xmm0-xmm2, xmm8-xmm13; both lods advance rsi.
// NOTE(review): presumably applied to r,g,b colour channels — confirm against callers.
11389CODE const uint8_t sk_matrix_3x4_sse2[] = {
  // Save the inputs: xmm9 = y, xmm8 = x. z stays in xmm2 until the very end.
11390 68,15,40,201, //movaps %xmm1,%xmm9
11391 68,15,40,192, //movaps %xmm0,%xmm8
  // rax = next 8-byte word at (%rsi); used directly as the float array pointer.
11392 72,173, //lods %ds:(%rsi),%rax
  // Load m[0] and m[1]; broadcast m[0] now, m[1] is broadcast later.
11393 243,15,16,0, //movss (%rax),%xmm0
11394 243,15,16,72,4, //movss 0x4(%rax),%xmm1
11395 15,198,192,0, //shufps $0x0,%xmm0,%xmm0
  // First output: broadcast m[3], m[6], m[9].
11396 243,68,15,16,80,12, //movss 0xc(%rax),%xmm10
11397 69,15,198,210,0, //shufps $0x0,%xmm10,%xmm10
11398 243,68,15,16,88,24, //movss 0x18(%rax),%xmm11
11399 69,15,198,219,0, //shufps $0x0,%xmm11,%xmm11
11400 243,68,15,16,96,36, //movss 0x24(%rax),%xmm12
11401 69,15,198,228,0, //shufps $0x0,%xmm12,%xmm12
  // xmm0 = m[0]*x + (m[3]*y + (m[6]*z + m[9])).
11402 68,15,89,218, //mulps %xmm2,%xmm11
11403 69,15,88,220, //addps %xmm12,%xmm11
11404 69,15,89,209, //mulps %xmm9,%xmm10
11405 69,15,88,211, //addps %xmm11,%xmm10
11406 65,15,89,192, //mulps %xmm8,%xmm0
11407 65,15,88,194, //addps %xmm10,%xmm0
  // Second output: broadcast m[1], m[4], m[7], m[10].
11408 15,198,201,0, //shufps $0x0,%xmm1,%xmm1
11409 243,68,15,16,80,16, //movss 0x10(%rax),%xmm10
11410 69,15,198,210,0, //shufps $0x0,%xmm10,%xmm10
11411 243,68,15,16,88,28, //movss 0x1c(%rax),%xmm11
11412 69,15,198,219,0, //shufps $0x0,%xmm11,%xmm11
11413 243,68,15,16,96,40, //movss 0x28(%rax),%xmm12
11414 69,15,198,228,0, //shufps $0x0,%xmm12,%xmm12
  // xmm1 = m[1]*x + (m[4]*y + (m[7]*z + m[10])).
11415 68,15,89,218, //mulps %xmm2,%xmm11
11416 69,15,88,220, //addps %xmm12,%xmm11
11417 69,15,89,209, //mulps %xmm9,%xmm10
11418 69,15,88,211, //addps %xmm11,%xmm10
11419 65,15,89,200, //mulps %xmm8,%xmm1
11420 65,15,88,202, //addps %xmm10,%xmm1
  // Third output: broadcast m[2], m[5], m[8], m[11].
11421 243,68,15,16,80,8, //movss 0x8(%rax),%xmm10
11422 69,15,198,210,0, //shufps $0x0,%xmm10,%xmm10
11423 243,68,15,16,88,20, //movss 0x14(%rax),%xmm11
11424 69,15,198,219,0, //shufps $0x0,%xmm11,%xmm11
11425 243,68,15,16,96,32, //movss 0x20(%rax),%xmm12
11426 69,15,198,228,0, //shufps $0x0,%xmm12,%xmm12
11427 243,68,15,16,104,44, //movss 0x2c(%rax),%xmm13
11428 69,15,198,237,0, //shufps $0x0,%xmm13,%xmm13
  // xmm10 = m[2]*x + (m[5]*y + (m[8]*z + m[11])); z (xmm2) is still the original input here.
11429 68,15,89,226, //mulps %xmm2,%xmm12
11430 69,15,88,229, //addps %xmm13,%xmm12
11431 69,15,89,217, //mulps %xmm9,%xmm11
11432 69,15,88,220, //addps %xmm12,%xmm11
11433 69,15,89,208, //mulps %xmm8,%xmm10
11434 69,15,88,211, //addps %xmm11,%xmm10
  // Fetch the next word from the stream, commit the third output to xmm2, and jump.
11435 72,173, //lods %ds:(%rsi),%rax
11436 65,15,40,210, //movaps %xmm10,%xmm2
11437 255,224, //jmpq *%rax
11438};
11439
// sk_matrix_perspective_sse2: raw x86-64 SSE2 machine code (AT&T disassembly in trailing comments).
// With x = xmm0, y = xmm1 on entry and m = the nine consecutive floats at the pointer
// fetched from the (%rsi) stream, computes per lane:
//   X = m[0]*x + m[1]*y + m[2]
//   Y = m[3]*x + m[4]*y + m[5]
//   W = m[6]*x + m[7]*y + m[8]
//   xmm0 = X * rcp(W);  xmm1 = Y * rcp(W)
// rcp is the rcpps approximate reciprocal, so the divide is not exact.
// Writes rax, xmm0, xmm1, xmm8-xmm12; both lods advance rsi.
11440CODE const uint8_t sk_matrix_perspective_sse2[] = {
  // Save x in xmm8; y stays in xmm1 until the reciprocal overwrites it.
11441 68,15,40,192, //movaps %xmm0,%xmm8
  // rax = next 8-byte word at (%rsi); used directly as the float array pointer.
11442 72,173, //lods %ds:(%rsi),%rax
  // Broadcast m[0], m[1], m[2].
11443 243,15,16,0, //movss (%rax),%xmm0
11444 243,68,15,16,72,4, //movss 0x4(%rax),%xmm9
11445 15,198,192,0, //shufps $0x0,%xmm0,%xmm0
11446 69,15,198,201,0, //shufps $0x0,%xmm9,%xmm9
11447 243,68,15,16,80,8, //movss 0x8(%rax),%xmm10
11448 69,15,198,210,0, //shufps $0x0,%xmm10,%xmm10
  // xmm0 = X = m[0]*x + (m[1]*y + m[2]).
11449 68,15,89,201, //mulps %xmm1,%xmm9
11450 69,15,88,202, //addps %xmm10,%xmm9
11451 65,15,89,192, //mulps %xmm8,%xmm0
11452 65,15,88,193, //addps %xmm9,%xmm0
  // Broadcast m[3], m[4], m[5].
11453 243,68,15,16,72,12, //movss 0xc(%rax),%xmm9
11454 69,15,198,201,0, //shufps $0x0,%xmm9,%xmm9
11455 243,68,15,16,80,16, //movss 0x10(%rax),%xmm10
11456 69,15,198,210,0, //shufps $0x0,%xmm10,%xmm10
11457 243,68,15,16,88,20, //movss 0x14(%rax),%xmm11
11458 69,15,198,219,0, //shufps $0x0,%xmm11,%xmm11
  // xmm9 = Y = m[3]*x + (m[4]*y + m[5]).
11459 68,15,89,209, //mulps %xmm1,%xmm10
11460 69,15,88,211, //addps %xmm11,%xmm10
11461 69,15,89,200, //mulps %xmm8,%xmm9
11462 69,15,88,202, //addps %xmm10,%xmm9
  // Broadcast m[6], m[7], m[8].
11463 243,68,15,16,80,24, //movss 0x18(%rax),%xmm10
11464 69,15,198,210,0, //shufps $0x0,%xmm10,%xmm10
11465 243,68,15,16,88,28, //movss 0x1c(%rax),%xmm11
11466 69,15,198,219,0, //shufps $0x0,%xmm11,%xmm11
11467 243,68,15,16,96,32, //movss 0x20(%rax),%xmm12
11468 69,15,198,228,0, //shufps $0x0,%xmm12,%xmm12
  // xmm10 = W = m[6]*x + (m[7]*y + m[8]).
11469 68,15,89,217, //mulps %xmm1,%xmm11
11470 69,15,88,220, //addps %xmm12,%xmm11
11471 69,15,89,208, //mulps %xmm8,%xmm10
11472 69,15,88,211, //addps %xmm11,%xmm10
  // xmm1 = approximate 1/W (y is no longer needed); scale X and Y by it.
11473 65,15,83,202, //rcpps %xmm10,%xmm1
11474 15,89,193, //mulps %xmm1,%xmm0
11475 68,15,89,201, //mulps %xmm1,%xmm9
  // Fetch the next word from the stream, commit Y/W to xmm1, and jump.
11476 72,173, //lods %ds:(%rsi),%rax
11477 65,15,40,201, //movaps %xmm9,%xmm1
11478 255,224, //jmpq *%rax
11479};
11480
// sk_linear_gradient_2stops_sse2: raw x86-64 SSE2 machine code (AT&T disassembly in trailing comments).
// With t = xmm0 on entry, c0 = the four floats at the pointer fetched from the (%rsi)
// stream and c1 = the four floats 16 bytes after it, computes per lane:
//   xmm0 = c1[0]*t + c0[0]
//   xmm1 = c1[1]*t + c0[1]
//   xmm2 = c1[2]*t + c0[2]
//   xmm3 = c1[3]*t + c0[3]
// Writes rax, xmm0-xmm3, xmm8-xmm10; both lods advance rsi.
// NOTE(review): presumably c0 is the start colour and c1 the per-channel delta,
// with outputs being r,g,b,a — confirm against the code that fills the context.
11481CODE const uint8_t sk_linear_gradient_2stops_sse2[] = {
  // rax = next 8-byte word at (%rsi); used directly as the float array pointer.
11482 72,173, //lods %ds:(%rsi),%rax
  // Unaligned loads: xmm9 = c0 (four floats), xmm3 = c1 (four floats).
11483 68,15,16,8, //movups (%rax),%xmm9
11484 15,16,88,16, //movups 0x10(%rax),%xmm3
  // xmm8 = c1[0]*t + c0[0] (kept in xmm8 because t in xmm0 is still needed).
11485 68,15,40,195, //movaps %xmm3,%xmm8
11486 69,15,198,192,0, //shufps $0x0,%xmm8,%xmm8
11487 65,15,40,201, //movaps %xmm9,%xmm1
11488 15,198,201,0, //shufps $0x0,%xmm1,%xmm1
11489 68,15,89,192, //mulps %xmm0,%xmm8
11490 68,15,88,193, //addps %xmm1,%xmm8
  // xmm1 = c1[1]*t + c0[1] (shuffle immediate 0x55 broadcasts lane 1).
11491 15,40,203, //movaps %xmm3,%xmm1
11492 15,198,201,85, //shufps $0x55,%xmm1,%xmm1
11493 65,15,40,209, //movaps %xmm9,%xmm2
11494 15,198,210,85, //shufps $0x55,%xmm2,%xmm2
11495 15,89,200, //mulps %xmm0,%xmm1
11496 15,88,202, //addps %xmm2,%xmm1
  // xmm2 = c1[2]*t + c0[2] (0xaa broadcasts lane 2).
11497 15,40,211, //movaps %xmm3,%xmm2
11498 15,198,210,170, //shufps $0xaa,%xmm2,%xmm2
11499 69,15,40,209, //movaps %xmm9,%xmm10
11500 69,15,198,210,170, //shufps $0xaa,%xmm10,%xmm10
11501 15,89,208, //mulps %xmm0,%xmm2
11502 65,15,88,210, //addps %xmm10,%xmm2
  // xmm3 = c1[3]*t + c0[3] (0xff broadcasts lane 3); c0/c1 are consumed in place.
11503 15,198,219,255, //shufps $0xff,%xmm3,%xmm3
11504 69,15,198,201,255, //shufps $0xff,%xmm9,%xmm9
11505 15,89,216, //mulps %xmm0,%xmm3
11506 65,15,88,217, //addps %xmm9,%xmm3
  // Fetch the next word from the stream, commit the first output to xmm0, and jump.
11507 72,173, //lods %ds:(%rsi),%rax
11508 65,15,40,192, //movaps %xmm8,%xmm0
11509 255,224, //jmpq *%rax
11510};
11511#endif