Hangyu Kuang | f047e7c | 2016-07-06 14:21:45 -0700 | [diff] [blame] | 1 | /* |
| 2 | * Copyright 2013 The LibYuv Project Authors. All rights reserved. |
| 3 | * |
| 4 | * Use of this source code is governed by a BSD-style license |
| 5 | * that can be found in the LICENSE file in the root of the source |
| 6 | * tree. An additional intellectual property rights grant can be found |
| 7 | * in the file PATENTS. All contributing project authors may |
| 8 | * be found in the AUTHORS file in the root of the source tree. |
| 9 | */ |
| 10 | |
| 11 | #include "libyuv/row.h" |
| 12 | #include "libyuv/rotate_row.h" |
| 13 | |
| 14 | #ifdef __cplusplus |
| 15 | namespace libyuv { |
| 16 | extern "C" { |
| 17 | #endif |
| 18 | |
| 19 | // This module is for 32-bit x86 builds with Visual C or clang-cl (MSVC-style inline __asm). |
| 20 | #if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) |
| 21 | |
// TransposeWx8_SSSE3: transposes an 8-row strip of bytes. Each loop pass reads
// 8 bytes from each of 8 consecutive source rows and writes them back as 8
// destination rows of 8 bytes, so source column N becomes destination row N.
//   src, src_stride: first source row and the byte step between source rows.
//   dst, dst_stride: first destination row and the byte step between them.
//   width: number of source columns, consumed 8 per pass. The loop body runs
//          before the count is tested, so at least 8 columns are always
//          processed and a count that is not a multiple of 8 is rounded up.
//          NOTE(review): callers presumably pass a positive multiple of 8 -
//          confirm against the call sites.
// The function is declared naked, so the asm below saves and restores edi,
// esi and ebp itself and reads its arguments directly off esp.
// palignr is the instruction that makes this the SSSE3 variant.
Frank Barchard | b83bb38 | 2017-02-22 18:01:07 -0800 | [diff] [blame^] | 22 | __declspec(naked) void TransposeWx8_SSSE3(const uint8* src, |
| 23 | int src_stride, |
| 24 | uint8* dst, |
| 25 | int dst_stride, |
| 26 | int width) { |
Hangyu Kuang | f047e7c | 2016-07-06 14:21:45 -0700 | [diff] [blame] | 27 | __asm { |
| 28 | push edi |
| 29 | push esi |
| 30 | push ebp |
// Argument offsets: 12 bytes for the three pushes above, then 4 more to step
// over the return address.
Frank Barchard | b83bb38 | 2017-02-22 18:01:07 -0800 | [diff] [blame^] | 31 | mov eax, [esp + 12 + 4] // src |
| 32 | mov edi, [esp + 12 + 8] // src_stride |
Hangyu Kuang | f047e7c | 2016-07-06 14:21:45 -0700 | [diff] [blame] | 33 | mov edx, [esp + 12 + 12] // dst |
| 34 | mov esi, [esp + 12 + 16] // dst_stride |
| 35 | mov ecx, [esp + 12 + 20] // width |
| 36 | |
| 37 | // Read 8 bytes from each of the 8 source rows (eax walks down the rows). |
| 38 | // First round: interleave the bytes of adjacent row pairs (punpcklbw). |
| 39 | align 4 |
| 40 | convertloop: |
| 41 | movq xmm0, qword ptr [eax] |
// ebp remembers where the next group of 8 columns starts in the first row.
| 42 | lea ebp, [eax + 8] |
| 43 | movq xmm1, qword ptr [eax + edi] |
| 44 | lea eax, [eax + 2 * edi] |
| 45 | punpcklbw xmm0, xmm1 |
| 46 | movq xmm2, qword ptr [eax] |
// palignr x, x, 8 rotates a register by 8 bytes, so the copy in the odd
// numbered register exposes the upper half of the interleave in its low half.
| 47 | movdqa xmm1, xmm0 |
| 48 | palignr xmm1, xmm1, 8 |
| 49 | movq xmm3, qword ptr [eax + edi] |
| 50 | lea eax, [eax + 2 * edi] |
| 51 | punpcklbw xmm2, xmm3 |
| 52 | movdqa xmm3, xmm2 |
| 53 | movq xmm4, qword ptr [eax] |
| 54 | palignr xmm3, xmm3, 8 |
| 55 | movq xmm5, qword ptr [eax + edi] |
| 56 | punpcklbw xmm4, xmm5 |
| 57 | lea eax, [eax + 2 * edi] |
| 58 | movdqa xmm5, xmm4 |
| 59 | movq xmm6, qword ptr [eax] |
| 60 | palignr xmm5, xmm5, 8 |
| 61 | movq xmm7, qword ptr [eax + edi] |
| 62 | punpcklbw xmm6, xmm7 |
// Back to the first source row, 8 columns further right, for the next pass.
| 63 | mov eax, ebp |
| 64 | movdqa xmm7, xmm6 |
| 65 | palignr xmm7, xmm7, 8 |
| 66 | // Second round: interleave 16-bit words (punpcklwd). |
| 67 | punpcklwd xmm0, xmm2 |
| 68 | punpcklwd xmm1, xmm3 |
| 69 | movdqa xmm2, xmm0 |
| 70 | movdqa xmm3, xmm1 |
| 71 | palignr xmm2, xmm2, 8 |
| 72 | palignr xmm3, xmm3, 8 |
| 73 | punpcklwd xmm4, xmm6 |
| 74 | punpcklwd xmm5, xmm7 |
| 75 | movdqa xmm6, xmm4 |
| 76 | movdqa xmm7, xmm5 |
| 77 | palignr xmm6, xmm6, 8 |
| 78 | palignr xmm7, xmm7, 8 |
| 79 | // Third round: interleave 32-bit dwords (punpckldq). |
| 80 | // Each result holds two finished output rows; store them through edx. |
| 81 | punpckldq xmm0, xmm4 |
| 82 | movq qword ptr [edx], xmm0 |
| 83 | movdqa xmm4, xmm0 |
| 84 | palignr xmm4, xmm4, 8 |
| 85 | movq qword ptr [edx + esi], xmm4 |
| 86 | lea edx, [edx + 2 * esi] |
| 87 | punpckldq xmm2, xmm6 |
| 88 | movdqa xmm6, xmm2 |
| 89 | palignr xmm6, xmm6, 8 |
| 90 | movq qword ptr [edx], xmm2 |
| 91 | punpckldq xmm1, xmm5 |
| 92 | movq qword ptr [edx + esi], xmm6 |
| 93 | lea edx, [edx + 2 * esi] |
| 94 | movdqa xmm5, xmm1 |
| 95 | movq qword ptr [edx], xmm1 |
| 96 | palignr xmm5, xmm5, 8 |
| 97 | punpckldq xmm3, xmm7 |
| 98 | movq qword ptr [edx + esi], xmm5 |
| 99 | lea edx, [edx + 2 * esi] |
| 100 | movq qword ptr [edx], xmm3 |
| 101 | movdqa xmm7, xmm3 |
| 102 | palignr xmm7, xmm7, 8 |
// 8 columns done. The movq and lea that follow do not modify the flags, so
// the jg below still tests the result of this sub.
| 103 | sub ecx, 8 |
| 104 | movq qword ptr [edx + esi], xmm7 |
| 105 | lea edx, [edx + 2 * esi] |
| 106 | jg convertloop |
| 107 | |
// Restore the saved registers in reverse push order.
| 108 | pop ebp |
| 109 | pop esi |
| 110 | pop edi |
| 111 | ret |
| 112 | } |
| 113 | } |
| 114 | |
// TransposeUVWx8_SSE2: transposes an 8-row strip of 2-byte pairs and splits
// the two pair members into separate outputs. Each loop pass reads 16 bytes
// from each of 8 consecutive source rows; the even-indexed source bytes are
// written through dst_a and the odd-indexed source bytes through dst_b, each
// as 8 rows of 8 bytes. (Going by the name, the pairs are presumably
// interleaved U and V samples - confirm against the callers.)
//   src, src_stride: first source row and the byte step between source rows.
//   dst_a, dst_stride_a: first output row for even source bytes and its step.
//   dst_b, dst_stride_b: first output row for odd source bytes and its step.
//   w: number of source pairs per row, consumed 8 per pass. The loop body runs
//      before the count is tested, so at least one pass is always executed.
//      NOTE(review): callers presumably pass a positive multiple of 8 -
//      confirm against the call sites.
// The function is declared naked, so the asm below saves and restores ebx,
// esi, edi and ebp itself, and builds its own aligned scratch slot on the
// stack for spilling one xmm register.
Frank Barchard | b83bb38 | 2017-02-22 18:01:07 -0800 | [diff] [blame^] | 115 | __declspec(naked) void TransposeUVWx8_SSE2(const uint8* src, |
| 116 | int src_stride, |
| 117 | uint8* dst_a, |
| 118 | int dst_stride_a, |
| 119 | uint8* dst_b, |
| 120 | int dst_stride_b, |
| 121 | int w) { |
Hangyu Kuang | f047e7c | 2016-07-06 14:21:45 -0700 | [diff] [blame] | 122 | __asm { |
| 123 | push ebx |
| 124 | push esi |
| 125 | push edi |
| 126 | push ebp |
// Argument offsets: 16 bytes for the four pushes above, then 4 more to step
// over the return address.
Frank Barchard | b83bb38 | 2017-02-22 18:01:07 -0800 | [diff] [blame^] | 127 | mov eax, [esp + 16 + 4] // src |
| 128 | mov edi, [esp + 16 + 8] // src_stride |
Hangyu Kuang | f047e7c | 2016-07-06 14:21:45 -0700 | [diff] [blame] | 129 | mov edx, [esp + 16 + 12] // dst_a |
| 130 | mov esi, [esp + 16 + 16] // dst_stride_a |
| 131 | mov ebx, [esp + 16 + 20] // dst_b |
| 132 | mov ebp, [esp + 16 + 24] // dst_stride_b |
// Keep the incoming esp in ecx, reserve 20 bytes and round esp down to a
// 16-byte boundary: [esp] is a 16-byte spill slot and [esp + 16] holds the
// old esp for the epilogue. w is then fetched through ecx because esp no
// longer points at the saved registers.
| 133 | mov ecx, esp |
| 134 | sub esp, 4 + 16 |
| 135 | and esp, ~15 |
| 136 | mov [esp + 16], ecx |
| 137 | mov ecx, [ecx + 16 + 28] // w |
| 138 | |
| 139 | align 4 |
| 140 | convertloop: |
Frank Barchard | b83bb38 | 2017-02-22 18:01:07 -0800 | [diff] [blame^] | 141 | // Read 16 bytes from each of the 8 source rows (eax walks down the rows). |
| 142 | // First round: interleave bytes of adjacent row pairs; low and high halves are kept. |
Hangyu Kuang | f047e7c | 2016-07-06 14:21:45 -0700 | [diff] [blame] | 143 | movdqu xmm0, [eax] |
| 144 | movdqu xmm1, [eax + edi] |
| 145 | lea eax, [eax + 2 * edi] |
| 146 | movdqa xmm7, xmm0 // use xmm7 as temp register. |
| 147 | punpcklbw xmm0, xmm1 |
| 148 | punpckhbw xmm7, xmm1 |
| 149 | movdqa xmm1, xmm7 |
| 150 | movdqu xmm2, [eax] |
| 151 | movdqu xmm3, [eax + edi] |
| 152 | lea eax, [eax + 2 * edi] |
| 153 | movdqa xmm7, xmm2 |
| 154 | punpcklbw xmm2, xmm3 |
| 155 | punpckhbw xmm7, xmm3 |
| 156 | movdqa xmm3, xmm7 |
| 157 | movdqu xmm4, [eax] |
| 158 | movdqu xmm5, [eax + edi] |
| 159 | lea eax, [eax + 2 * edi] |
| 160 | movdqa xmm7, xmm4 |
| 161 | punpcklbw xmm4, xmm5 |
| 162 | punpckhbw xmm7, xmm5 |
| 163 | movdqa xmm5, xmm7 |
| 164 | movdqu xmm6, [eax] |
| 165 | movdqu xmm7, [eax + edi] |
| 166 | lea eax, [eax + 2 * edi] |
// All eight xmm registers are live now, so xmm5 is parked in the stack slot
// to free a temporary for the last row pair.
| 167 | movdqu [esp], xmm5 // backup xmm5 |
// neg / lea / neg: eax has advanced 8 rows, so step back 8 * src_stride and
// forward 16 bytes to the next group of columns in the first row.
| 168 | neg edi |
Frank Barchard | b83bb38 | 2017-02-22 18:01:07 -0800 | [diff] [blame^] | 169 | movdqa xmm5, xmm6 // use xmm5 as temp register. |
Hangyu Kuang | f047e7c | 2016-07-06 14:21:45 -0700 | [diff] [blame] | 170 | punpcklbw xmm6, xmm7 |
| 171 | punpckhbw xmm5, xmm7 |
| 172 | movdqa xmm7, xmm5 |
| 173 | lea eax, [eax + 8 * edi + 16] |
| 174 | neg edi |
| 175 | // Second round: interleave 16-bit words (punpcklwd / punpckhwd). |
| 176 | movdqa xmm5, xmm0 |
| 177 | punpcklwd xmm0, xmm2 |
| 178 | punpckhwd xmm5, xmm2 |
| 179 | movdqa xmm2, xmm5 |
| 180 | movdqa xmm5, xmm1 |
| 181 | punpcklwd xmm1, xmm3 |
| 182 | punpckhwd xmm5, xmm3 |
| 183 | movdqa xmm3, xmm5 |
| 184 | movdqa xmm5, xmm4 |
| 185 | punpcklwd xmm4, xmm6 |
| 186 | punpckhwd xmm5, xmm6 |
| 187 | movdqa xmm6, xmm5 |
// Swap the spilled value: the saved xmm5 comes back and xmm6 takes its place
// in the stack slot.
| 188 | movdqu xmm5, [esp] // restore xmm5 |
| 189 | movdqu [esp], xmm6 // backup xmm6 |
Frank Barchard | b83bb38 | 2017-02-22 18:01:07 -0800 | [diff] [blame^] | 190 | movdqa xmm6, xmm5 // use xmm6 as temp register. |
Hangyu Kuang | f047e7c | 2016-07-06 14:21:45 -0700 | [diff] [blame] | 191 | punpcklwd xmm5, xmm7 |
| 192 | punpckhwd xmm6, xmm7 |
| 193 | movdqa xmm7, xmm6 |
| 194 | // Third round: interleave 32-bit dwords (punpckldq / punpckhdq). |
| 195 | // Each result is stored in halves: movlpd low 8 bytes via edx (dst_a), movhpd high 8 bytes via ebx (dst_b). |
| 196 | movdqa xmm6, xmm0 |
| 197 | punpckldq xmm0, xmm4 |
| 198 | punpckhdq xmm6, xmm4 |
| 199 | movdqa xmm4, xmm6 |
| 200 | movdqu xmm6, [esp] // restore xmm6 |
| 201 | movlpd qword ptr [edx], xmm0 |
| 202 | movhpd qword ptr [ebx], xmm0 |
| 203 | movlpd qword ptr [edx + esi], xmm4 |
| 204 | lea edx, [edx + 2 * esi] |
| 205 | movhpd qword ptr [ebx + ebp], xmm4 |
| 206 | lea ebx, [ebx + 2 * ebp] |
Frank Barchard | b83bb38 | 2017-02-22 18:01:07 -0800 | [diff] [blame^] | 207 | movdqa xmm0, xmm2 // use xmm0 as the temp register. |
Hangyu Kuang | f047e7c | 2016-07-06 14:21:45 -0700 | [diff] [blame] | 208 | punpckldq xmm2, xmm6 |
| 209 | movlpd qword ptr [edx], xmm2 |
| 210 | movhpd qword ptr [ebx], xmm2 |
| 211 | punpckhdq xmm0, xmm6 |
| 212 | movlpd qword ptr [edx + esi], xmm0 |
| 213 | lea edx, [edx + 2 * esi] |
| 214 | movhpd qword ptr [ebx + ebp], xmm0 |
| 215 | lea ebx, [ebx + 2 * ebp] |
Frank Barchard | b83bb38 | 2017-02-22 18:01:07 -0800 | [diff] [blame^] | 216 | movdqa xmm0, xmm1 // use xmm0 as the temp register. |
Hangyu Kuang | f047e7c | 2016-07-06 14:21:45 -0700 | [diff] [blame] | 217 | punpckldq xmm1, xmm5 |
| 218 | movlpd qword ptr [edx], xmm1 |
| 219 | movhpd qword ptr [ebx], xmm1 |
| 220 | punpckhdq xmm0, xmm5 |
| 221 | movlpd qword ptr [edx + esi], xmm0 |
| 222 | lea edx, [edx + 2 * esi] |
| 223 | movhpd qword ptr [ebx + ebp], xmm0 |
| 224 | lea ebx, [ebx + 2 * ebp] |
Frank Barchard | b83bb38 | 2017-02-22 18:01:07 -0800 | [diff] [blame^] | 225 | movdqa xmm0, xmm3 // use xmm0 as the temp register. |
Hangyu Kuang | f047e7c | 2016-07-06 14:21:45 -0700 | [diff] [blame] | 226 | punpckldq xmm3, xmm7 |
| 227 | movlpd qword ptr [edx], xmm3 |
| 228 | movhpd qword ptr [ebx], xmm3 |
| 229 | punpckhdq xmm0, xmm7 |
// 8 pairs done. The stores and lea instructions that follow do not modify
// the flags, so the jg below still tests the result of this sub.
| 230 | sub ecx, 8 |
| 231 | movlpd qword ptr [edx + esi], xmm0 |
| 232 | lea edx, [edx + 2 * esi] |
| 233 | movhpd qword ptr [ebx + ebp], xmm0 |
| 234 | lea ebx, [ebx + 2 * ebp] |
| 235 | jg convertloop |
| 236 | |
// Drop the aligned scratch area by reloading the esp saved at entry, then
// restore the saved registers in reverse push order.
| 237 | mov esp, [esp + 16] |
| 238 | pop ebp |
| 239 | pop edi |
| 240 | pop esi |
| 241 | pop ebx |
| 242 | ret |
| 243 | } |
| 244 | } |
| 245 | |
| 246 | #endif // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) |
| 247 | |
| 248 | #ifdef __cplusplus |
| 249 | } // extern "C" |
| 250 | } // namespace libyuv |
| 251 | #endif |