;
; jsimdext.inc - common declarations
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright 2010 D. R. Commander
;
; Based on
; x86 SIMD extension for IJG JPEG library - version 1.02
;
; Copyright (C) 1999-2006, MIYASAKA Masaru.
;
; This software is provided 'as-is', without any express or implied
; warranty. In no event will the authors be held liable for any damages
; arising from the use of this software.
;
; Permission is granted to anyone to use this software for any purpose,
; including commercial applications, and to alter it and redistribute it
; freely, subject to the following restrictions:
;
; 1. The origin of this software must not be misrepresented; you must not
;    claim that you wrote the original software. If you use this software
;    in a product, an acknowledgment in the product documentation would be
;    appreciated but is not required.
; 2. Altered source versions must be plainly marked as such, and must not be
;    misrepresented as being the original software.
; 3. This notice may not be removed or altered from any source distribution.
;
; [TAB8]

; ==========================================================================
;  System-dependent configurations
;
;  The first branch whose object-format symbol is defined on the NASM
;  command line is taken.  Every branch defines SEG_TEXT and SEG_CONST;
;  some of them also define EXTN, GOT_SYMBOL and/or PIC.

%ifdef WIN32
; ---- nasm -fwin32 -DWIN32 ... ----
;   * Microsoft Visual C++
;   * MinGW (Minimalist GNU for Windows)
;   * CygWin
;   * LCC-Win32

; segment definition
%define SEG_CONST .rdata align=16 public use32 class=CONST
%define SEG_TEXT .text align=16 public use32 class=CODE

%elifdef WIN64
; ---- nasm -fwin64 -DWIN64 ... ----
;   * Microsoft Visual C++

; segment definition
%define SEG_CONST .rdata align=16 public use64 class=CONST
%define SEG_TEXT .text align=16 public use64 class=CODE

; With -DMSVC, external symbols are used undecorated: foo() -> foo
%ifdef MSVC
%define EXTN(name) name
%endif

%elifdef OBJ32
; ---- nasm -fobj -DOBJ32 ... ----
;   * Borland C++ (Win32)

; segment definition
%define SEG_CONST .data align=16 public use32 class=DATA
%define SEG_TEXT .text align=16 public use32 class=CODE

%elifdef ELF
; ---- nasm -felf[64] -DELF ... ----
;   * Linux
;   * *BSD family Unix using elf format
;   * Unix System V, including Solaris x86, UnixWare and SCO Unix

; mark stack as non-executable
section .note.GNU-stack noalloc noexec nowrite progbits

; segment definition (the 32-bit form spells out the section attributes)
%ifndef __x86_64__
%define SEG_TEXT .text progbits alloc exec nowrite align=16
%define SEG_CONST .rodata progbits alloc noexec nowrite align=16
%else
%define SEG_TEXT .text progbits align=16
%define SEG_CONST .rodata progbits align=16
%endif

; External symbols are used undecorated: foo() -> foo
%define EXTN(name) name

; ELF supports PIC.
; To make the code position-independent, append -DPIC to the commandline.
%define GOT_SYMBOL _GLOBAL_OFFSET_TABLE_

%elifdef AOUT
; ---- nasm -faoutb/aout -DAOUT ... ----
;   * Older Linux using a.out format (nasm -f aout -DAOUT ...)
;   * *BSD family Unix using a.out format (nasm -f aoutb -DAOUT ...)

; segment definition
%define SEG_CONST .data
%define SEG_TEXT .text

; BSD-style a.out supports PIC.
; To make the code position-independent, append -DPIC to the commandline.
%define GOT_SYMBOL __GLOBAL_OFFSET_TABLE_

%elifdef MACHO
; ---- nasm -fmacho -DMACHO ... ----
;   * NeXTstep/OpenStep/Rhapsody/Darwin/MacOS X (Mach-O format)

; segment definition
; ("align=16" is left off SEG_TEXT: nasm doesn't accept it here. why?)
%define SEG_CONST .rodata align=16
%define SEG_TEXT .text

; The generation of position-independent code (PIC) is the default on
; Darwin, so PIC is switched on unconditionally; the GOT_SYMBOL value
; selects the Mach-O style code-relative addressing macros.
%define PIC
%define GOT_SYMBOL _MACHO_PIC_

%else
; ---- other case ----

; segment definition
%define SEG_CONST .data
%define SEG_TEXT .text

%endif

; ==========================================================================

; --------------------------------------------------------------------------
;  Common types
;
;  Each type is described by up to three macros: the NASM size keyword
;  (e.g. POINTER), its size in bytes (SIZEOF_x) and its size in bits (x_BIT).
;  %define bodies are expanded where they are used, so the primitive
;  constants can be listed ahead of the types built on top of them.

; -- primitive sizes in bytes --
%define SIZEOF_BYTE 1                   ; sizeof(BYTE)
%define SIZEOF_WORD 2                   ; sizeof(WORD)
%define SIZEOF_DWORD 4                  ; sizeof(DWORD)
%define SIZEOF_QWORD 8                  ; sizeof(QWORD)
%define SIZEOF_OWORD 16                 ; sizeof(OWORD)

; -- primitive sizes in bits --
%define BYTE_BIT 8                      ; CHAR_BIT in C
%define WORD_BIT 16                     ; sizeof(WORD)*BYTE_BIT
%define DWORD_BIT 32                    ; sizeof(DWORD)*BYTE_BIT
%define QWORD_BIT 64                    ; sizeof(QWORD)*BYTE_BIT
%define OWORD_BIT 128                   ; sizeof(OWORD)*BYTE_BIT

; -- general pointer type: dword on 32-bit targets, qword on x86-64 --
%ifndef __x86_64__
%define POINTER dword
%define SIZEOF_POINTER SIZEOF_DWORD     ; sizeof(POINTER)
%define POINTER_BIT DWORD_BIT           ; sizeof(POINTER)*BYTE_BIT
%else
%define POINTER qword
%define SIZEOF_POINTER SIZEOF_QWORD     ; sizeof(POINTER)
%define POINTER_BIT QWORD_BIT           ; sizeof(POINTER)*BYTE_BIT
%endif

; -- signed integer type --
%define INT dword
%define SIZEOF_INT SIZEOF_DWORD         ; sizeof(INT)
%define INT_BIT DWORD_BIT               ; sizeof(INT)*BYTE_BIT

; -- IEEE754 single --
%define FP32 dword
%define SIZEOF_FP32 SIZEOF_DWORD        ; sizeof(FP32)
%define FP32_BIT DWORD_BIT              ; sizeof(FP32)*BYTE_BIT

; -- int64 (MMX register) --
%define MMWORD qword
%define SIZEOF_MMWORD SIZEOF_QWORD      ; sizeof(MMWORD)
%define MMWORD_BIT QWORD_BIT            ; sizeof(MMWORD)*BYTE_BIT

; -- int128 (SSE register) --
; NASM is buggy and doesn't properly handle operand sizes for SSE
; instructions, so for now XMMWORD has to expand to nothing.
%define XMMWORD
%define SIZEOF_XMMWORD SIZEOF_OWORD     ; sizeof(XMMWORD)
%define XMMWORD_BIT OWORD_BIT           ; sizeof(XMMWORD)*BYTE_BIT

; Similar hacks for when a dword or an MMWORD is loaded into an xmm#
; register: both size overrides are deliberately empty.
%define XMM_DWORD
%define XMM_MMWORD

; --------------------------------------------------------------------------
;  External Symbol Name
;
;  Default decoration for external symbols, applied only when nothing has
;  defined EXTN before this point: prepend one underscore.
;
%ifndef EXTN
%define EXTN(name) _ %+ name            ; foo() -> _foo
%endif

; --------------------------------------------------------------------------
;  Macros for position-independent code (PIC) support
;
;    GOTOFF(got,sym)  address expression for sym, relative to base reg got
;    get_GOT reg      load the base address into reg
;    pushpic/poppic   push/pop that are only emitted in PIC builds
;    movpic           mov that is only emitted in PIC builds
;
;  A PIC request is dropped when no GOT_SYMBOL has been defined.
;
%ifndef GOT_SYMBOL
%undef PIC
%endif

%ifdef PIC      ; ======================= PIC =======================

%ifidn GOT_SYMBOL,_MACHO_PIC_   ; ---------- Mach-O ----------

; At present, nasm doesn't seem to support PIC generation for Mach-O.
; The PIC support code below is a little tricky: addresses are formed
; relative to const_base, the label placed here in the constant section.

        SECTION SEG_CONST
const_base:

%define GOTOFF(got,sym) (got) + (sym) - const_base

; get_GOT reg
;   Leaves the run-time address of const_base in reg.
;   NOTE: this macro destroys the ecx register.
%imacro get_GOT 1
        call    %%geteip                ; ecx = address of the next insn
        add     ecx, byte (%%ref - $)   ; ecx = run-time address of %%ref
        jmp     short %%adjust
%%geteip:
        mov     ecx, POINTER [esp]      ; pick up the return address
        ret
%%adjust:
        push    ebp
        xor     ebp,ebp                 ; ebp = 0
%ifidni %1,ebx                          ; (%1 == ebx)
        ; The two bytes below plus the "jmp near" encoding form
        ;   lea ebx, [ecx+ebp*8+(const_base-%%ref)]  ; 8D,9C,E9,(offset32)
        ; i.e. the jmp is never executed as a jump; its opcode byte and
        ; 32-bit relative displacement are the tail of the lea.
        db      0x8D,0x9C               ; 8D,9C
        jmp     near const_base         ; E9,(const_base-%%ref)
%%ref:
%else                                   ; (%1 != ebx)
        ; Same trick with ecx as the destination:
        ;   lea ecx, [ecx+ebp*8+(const_base-%%ref)]  ; 8D,8C,E9,(offset32)
        db      0x8D,0x8C               ; 8D,8C
        jmp     near const_base         ; E9,(const_base-%%ref)
%%ref:
        mov     %1, ecx
%endif                                  ; (%1 == ebx)
        pop     ebp
%endmacro

%else           ; ---------- GOT_SYMBOL != _MACHO_PIC_ ----------

%define GOTOFF(got,sym) (got) + (sym) wrt ..gotoff

; get_GOT reg
;   Fetches the current instruction address into reg through a call to a
;   local stub, then adds the ..gotpc-relative offset of GOT_SYMBOL.
%imacro get_GOT 1
        extern  GOT_SYMBOL
        call    %%geteip
        add     %1, GOT_SYMBOL + $$ - $ wrt ..gotpc
        jmp     short %%done
%%geteip:
        mov     %1, POINTER [esp]       ; pick up the return address
        ret
%%done:
%endmacro

%endif          ; ---------- GOT_SYMBOL == _MACHO_PIC_ ----------

; In PIC builds these expand to the plain instruction.
%imacro pushpic 1.nolist
        push    %1
%endmacro
%imacro poppic 1.nolist
        pop     %1
%endmacro
%imacro movpic 2.nolist
        mov     %1,%2
%endmacro

%else           ; ====================== !PIC =======================

; Without PIC a symbol is addressed directly and the helpers emit nothing.
%define GOTOFF(got,sym) (sym)

%imacro get_GOT 1.nolist
%endmacro
%imacro pushpic 1.nolist
%endmacro
%imacro poppic 1.nolist
%endmacro
%imacro movpic 2.nolist
%endmacro

%endif          ; ======================= PIC =======================

; --------------------------------------------------------------------------
;  Align the next instruction on {2,4,8,16,..}-byte boundary.
;  ".balign n,,m" in GNU as
;
;    alignx n [,m]
;      n  alignment in bytes
;      m  optional limit (default 0xFFFF): padding is emitted only when the
;         number of bytes needed at the macro's start does not exceed m
;
;  Helper expressions:
;    MSKLE(x,y)  a mask with its low bits set when (x & 0xFFFF) <= (y & 0xFFFF),
;                otherwise 0
;    FILLB(b,n)  number of bytes from position b up to the next multiple
;                of n, measured from the section start ($$)
;
%define MSKLE(x,y) (~(((y) & 0xFFFF) - ((x) & 0xFFFF)) >> 16)
%define FILLB(b,n) (($$-(b)) & ((n)-1))

; A gap of 16 bytes or more is filled with single-byte nops only; shorter
; gaps are covered by the sequences below, longest encoding first, each
; "times" count being recomputed from the bytes that are still missing.
%imacro alignx 1-2.nolist 0xFFFF
%%bs:
        times MSKLE(FILLB(%%bs,%1),%2) & MSKLE(16,FILLB($,%1)) & FILLB($,%1) \
               db 0x90                                  ; nop
        times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/9 \
               db 0x8D,0x9C,0x23,0x00,0x00,0x00,0x00    ; lea ebx,[ebx+0x00000000]
        times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/7 \
               db 0x8D,0xAC,0x25,0x00,0x00,0x00,0x00    ; lea ebp,[ebp+0x00000000]
        times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/6 \
               db 0x8D,0xAD,0x00,0x00,0x00,0x00         ; lea ebp,[ebp+0x00000000]
        times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/4 \
               db 0x8D,0x6C,0x25,0x00                   ; lea ebp,[ebp+0x00]
        times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/3 \
               db 0x8D,0x6D,0x00                        ; lea ebp,[ebp+0x00]
        times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/2 \
               db 0x8B,0xED                             ; mov ebp,ebp
        times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/1 \
               db 0x90                                  ; nop
%endmacro

;  Align the next data on {2,4,8,16,..}-byte boundary.
;
;    alignz n      pad with zero bytes up to an n-byte boundary
;
%imacro alignz 1.nolist
        align %1, db 0                  ; filling zeros
%endmacro

; --------------------------------------------------------------------------
;  Argument collection for x86-64
;
;  collect_args copies the first six arguments into r10..r15 (arg1 -> r10,
;  ..., arg6 -> r15), so code following it can use one register assignment
;  under both calling conventions.  uncollect_args undoes everything
;  collect_args pushed; the two must be used as a matched pair with rsp at
;  the same value on both ends.
;
%ifdef __x86_64__

%ifdef WIN64

; collect_args (Win64)
;   In:     rcx, rdx, r8, r9 = arguments 1-4
;           rax              = base from which [rax+48] and [rax+56] are
;                              read as arguments 5 and 6.  The macro does
;                              not set rax itself.
;                              NOTE(review): presumably rax = rsp as captured
;                              right after "push rbp" at function entry --
;                              confirm against the invoking code.
;   Out:    r10..r15         = arguments 1-6
;   Saves:  r10-r15, rsi, rdi, and the full 128 bits of xmm6 and xmm7
;   Stack:  8 pushes (64 bytes) + 2 * SIZEOF_XMMWORD
;
;   xmm6/xmm7 are stored with movups: unlike movlpd it moves all 128 bits
;   of the register, and unlike movaps it does not require rsp to be
;   16-byte aligned at the point of use.
;   NOTE(review): xmm8-xmm15 are not saved here.
%imacro collect_args 0
        push    r10
        push    r11
        push    r12
        push    r13
        push    r14
        push    r15
        mov     r10, rcx
        mov     r11, rdx
        mov     r12, r8
        mov     r13, r9
        mov     r14, [rax+48]
        mov     r15, [rax+56]
        push    rsi
        push    rdi
        sub     rsp, SIZEOF_XMMWORD
        movups  XMMWORD [rsp], xmm6     ; full 128-bit save
        sub     rsp, SIZEOF_XMMWORD
        movups  XMMWORD [rsp], xmm7     ; full 128-bit save
%endmacro

; uncollect_args (Win64)
;   Restores, in reverse order, everything saved by collect_args.
%imacro uncollect_args 0
        movups  xmm7, XMMWORD [rsp]     ; full 128-bit restore
        add     rsp, SIZEOF_XMMWORD
        movups  xmm6, XMMWORD [rsp]     ; full 128-bit restore
        add     rsp, SIZEOF_XMMWORD
        pop     rdi
        pop     rsi
        pop     r15
        pop     r14
        pop     r13
        pop     r12
        pop     r11
        pop     r10
%endmacro

%else

; collect_args (non-Win64)
;   In:     rdi, rsi, rdx, rcx, r8, r9 = arguments 1-6
;   Out:    r10..r15                   = arguments 1-6
;   Saves:  r10-r15
;   Stack:  6 pushes (48 bytes)
%imacro collect_args 0
        push    r10
        push    r11
        push    r12
        push    r13
        push    r14
        push    r15
        mov     r10, rdi
        mov     r11, rsi
        mov     r12, rdx
        mov     r13, rcx
        mov     r14, r8
        mov     r15, r9
%endmacro

; uncollect_args (non-Win64)
;   Restores, in reverse order, the registers pushed by collect_args.
%imacro uncollect_args 0
        pop     r15
        pop     r14
        pop     r13
        pop     r12
        pop     r11
        pop     r10
%endmacro

%endif

%endif

; --------------------------------------------------------------------------
;  Defines picked up from the C headers
;
%include "jsimdcfg.inc"

; --------------------------------------------------------------------------