;
; jsimdext.inc - common declarations
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright 2010 D. R. Commander
;
; Based on
; x86 SIMD extension for IJG JPEG library - version 1.02
;
; Copyright (C) 1999-2006, MIYASAKA Masaru.
;
; This software is provided 'as-is', without any express or implied
; warranty.  In no event will the authors be held liable for any damages
; arising from the use of this software.
;
; Permission is granted to anyone to use this software for any purpose,
; including commercial applications, and to alter it and redistribute it
; freely, subject to the following restrictions:
;
; 1. The origin of this software must not be misrepresented; you must not
;    claim that you wrote the original software. If you use this software
;    in a product, an acknowledgment in the product documentation would be
;    appreciated but is not required.
; 2. Altered source versions must be plainly marked as such, and must not be
;    misrepresented as being the original software.
; 3. This notice may not be removed or altered from any source distribution.
;
; [TAB8]
29
; ==========================================================================
;  System-dependent configurations
;
;  The first of WIN32 / WIN64 / OBJ32 / ELF / AOUT / MACHO that is defined
;  (normally via -D on the assembler command line) selects a branch below;
;  if none is defined, the generic fallback at the end is used.
;
;  Every branch defines:
;    SEG_TEXT   - section name and attributes for code
;    SEG_CONST  - section name and attributes for read-only constants
;  Some branches additionally define EXTN(name), GOT_SYMBOL and/or PIC.

%ifdef WIN32
; ---- nasm -fwin32 -DWIN32 ... --------------------------------------------
;  * Microsoft Visual C++
;  * MinGW (Minimalist GNU for Windows)
;  * CygWin
;  * LCC-Win32

%define SEG_TEXT    .text  align=16 public use32 class=CODE
%define SEG_CONST   .rdata align=16 public use32 class=CONST

%elifdef WIN64
; ---- nasm -fwin64 -DWIN64 ... --------------------------------------------
;  * Microsoft Visual C++

%define SEG_TEXT    .text  align=16 public use64 class=CODE
%define SEG_CONST   .rdata align=16 public use64 class=CONST

; Only when MSVC is also defined are external names left undecorated here.
%ifdef MSVC
%define EXTN(name)  name                ; foo() -> foo
%endif

%elifdef OBJ32
; ---- nasm -fobj -DOBJ32 ... ----------------------------------------------
;  * Borland C++ (Win32)

%define SEG_TEXT    .text align=16 public use32 class=CODE
%define SEG_CONST   .data align=16 public use32 class=DATA

%elifdef ELF
; ---- nasm -felf[64] -DELF ... --------------------------------------------
;  * Linux
;  * *BSD family Unix using elf format
;  * Unix System V, including Solaris x86, UnixWare and SCO Unix

; Emit an empty .note.GNU-stack section so the stack is marked
; non-executable.
section .note.GNU-stack noalloc noexec nowrite progbits

%ifdef __x86_64__
%define SEG_TEXT    .text   progbits align=16
%define SEG_CONST   .rodata progbits align=16
%else
%define SEG_TEXT    .text   progbits alloc exec   nowrite align=16
%define SEG_CONST   .rodata progbits alloc noexec nowrite align=16
%endif

; External names are not decorated on ELF.
%define EXTN(name)  name                ; foo() -> foo

; To make the code position-independent, append -DPIC to the commandline.
%define GOT_SYMBOL  _GLOBAL_OFFSET_TABLE_       ; ELF supports PIC

%elifdef AOUT
; ---- nasm -faoutb/aout -DAOUT ... ----------------------------------------
;  * Older Linux using a.out format        (nasm -f aout  -DAOUT ...)
;  * *BSD family Unix using a.out format   (nasm -f aoutb -DAOUT ...)

%define SEG_TEXT    .text
%define SEG_CONST   .data

; To make the code position-independent, append -DPIC to the commandline.
%define GOT_SYMBOL  __GLOBAL_OFFSET_TABLE_      ; BSD-style a.out supports PIC

%elifdef MACHO
; ---- nasm -fmacho -DMACHO ... --------------------------------------------
;  * NeXTstep/OpenStep/Rhapsody/Darwin/MacOS X (Mach-O format)

; The original author noted that nasm does not accept align=16 on the
; Mach-O .text section, so the attribute is left commented out there.
%define SEG_TEXT    .text  ;align=16
%define SEG_CONST   .rodata align=16

; The generation of position-independent code (PIC) is the default on
; Darwin, so PIC is forced on for this format.
%define PIC
%define GOT_SYMBOL  _MACHO_PIC_         ; Mach-O style code-relative addressing

%else
; ---- Other case ----------------------------------------------------------
;  Generic fallback: plain section names, no PIC support, default EXTN.

%define SEG_TEXT    .text
%define SEG_CONST   .data

%endif
; ---- end of system-dependent configurations ------------------------------

; ==========================================================================
122
; --------------------------------------------------------------------------
;  Common types
;
;  For each type T there are three macros:
;    T            - the NASM operand-size keyword used in memory operands
;    SIZEOF_T     - size of T in bytes
;    T_BIT        - size of T in bits
;
;  All of these are plain %define macros, so they are expanded at the point
;  of use and their relative order in this file does not matter.

; -- primitive sizes, in bytes --
%define SIZEOF_BYTE     1               ; sizeof(BYTE)
%define SIZEOF_WORD     2               ; sizeof(WORD)
%define SIZEOF_DWORD    4               ; sizeof(DWORD)
%define SIZEOF_QWORD    8               ; sizeof(QWORD)
%define SIZEOF_OWORD    16              ; sizeof(OWORD)

; -- primitive sizes, in bits --
%define BYTE_BIT        8               ; CHAR_BIT in C
%define WORD_BIT        16              ; sizeof(WORD)*BYTE_BIT
%define DWORD_BIT       32              ; sizeof(DWORD)*BYTE_BIT
%define QWORD_BIT       64              ; sizeof(QWORD)*BYTE_BIT
%define OWORD_BIT       128             ; sizeof(OWORD)*BYTE_BIT

; -- general pointer type: 64 bits wide when __x86_64__ is defined,
;    32 bits wide otherwise --
%ifdef __x86_64__
%define POINTER         qword           ; general pointer type
%define SIZEOF_POINTER  SIZEOF_QWORD    ; sizeof(POINTER)
%define POINTER_BIT     QWORD_BIT       ; sizeof(POINTER)*BYTE_BIT
%else
%define POINTER         dword           ; general pointer type
%define SIZEOF_POINTER  SIZEOF_DWORD    ; sizeof(POINTER)
%define POINTER_BIT     DWORD_BIT       ; sizeof(POINTER)*BYTE_BIT
%endif

; -- signed integer type --
%define INT             dword           ; signed integer type
%define SIZEOF_INT      SIZEOF_DWORD    ; sizeof(INT)
%define INT_BIT         DWORD_BIT       ; sizeof(INT)*BYTE_BIT

; -- IEEE754 single-precision float --
%define FP32            dword           ; IEEE754 single
%define SIZEOF_FP32     SIZEOF_DWORD    ; sizeof(FP32)
%define FP32_BIT        DWORD_BIT       ; sizeof(FP32)*BYTE_BIT

; -- 64-bit value held in an MMX register --
%define MMWORD          qword           ; int64  (MMX register)
%define SIZEOF_MMWORD   SIZEOF_QWORD    ; sizeof(MMWORD)
%define MMWORD_BIT      QWORD_BIT       ; sizeof(MMWORD)*BYTE_BIT

; -- 128-bit value held in an SSE register --
; NASM is buggy and doesn't properly handle operand sizes for SSE
; instructions, so for now XMMWORD has to expand to nothing.
%define XMMWORD
%define SIZEOF_XMMWORD  SIZEOF_OWORD    ; sizeof(XMMWORD)
%define XMMWORD_BIT     OWORD_BIT       ; sizeof(XMMWORD)*BYTE_BIT

; Similar hacks for when a dword or MMWORD is loaded into an xmm# register:
; both size keywords expand to nothing.
%define XMM_DWORD
%define XMM_MMWORD
MIYASAKA Masarua2e6a9d2006-02-04 00:00:00 +0000169
; --------------------------------------------------------------------------
;  External Symbol Name
;
;  EXTN(name) yields the object-file symbol for the C identifier `name`.
;  If EXTN has not been defined by this point, the default below prepends a
;  single underscore by token-pasting with %+.
;
%ifndef EXTN
%define EXTN(name)   _ %+ name          ; foo() -> _foo
%endif
176
; --------------------------------------------------------------------------
;  Macros for position-independent code (PIC) support
;
;  Provided in every configuration:
;    GOTOFF(got,sym) - address expression for `sym`, given the register
;                      `got` that was set up by get_GOT
;    get_GOT reg     - load the PIC base into `reg`
;    pushpic reg     - push  reg      (emits nothing when PIC is off)
;    poppic  reg     - pop   reg      (emits nothing when PIC is off)
;    movpic  dst,src - mov   dst,src  (emits nothing when PIC is off)
;

; PIC can only be honoured when the object format supplied a GOT_SYMBOL.
%ifndef GOT_SYMBOL
%undef  PIC
%endif

%ifndef PIC
; ---- PIC disabled --------------------------------------------------------
; Symbols are addressed directly and all helper macros expand to nothing.

%define GOTOFF(got,sym) (sym)

%imacro get_GOT 1.nolist
%endmacro
%imacro pushpic 1.nolist
%endmacro
%imacro poppic  1.nolist
%endmacro
%imacro movpic  2.nolist
%endmacro

%else
; ---- PIC enabled ---------------------------------------------------------

%ifidn GOT_SYMBOL,_MACHO_PIC_
; ---- Mach-O --------------------------------------------------------------
; At present, nasm doesn't seem to support PIC generation for Mach-O.
; The PIC support code below is a little tricky: addresses are expressed
; relative to the label const_base, which is placed at the current
; position in the constant section.

        SECTION SEG_CONST
const_base:

%define GOTOFF(got,sym) (got) + (sym) - const_base

%imacro get_GOT 1
        ; NOTE: this macro destroys the ecx register.
        call    %%geteip                ; ecx = address of the next instruction
        add     ecx, byte (%%ref - $)
        jmp     short %%adjust
%%geteip:
        mov     ecx, POINTER [esp]      ; fetch the return address
        ret
%%adjust:
        push    ebp
        xor     ebp,ebp                 ; ebp = 0
%ifidni %1,ebx
        ; (%1 == ebx)
        ; db 0x8D,0x9C + jmp near const_base =
        ;   lea ebx, [ecx+ebp*8+(const_base-%%ref)] ; 8D,9C,E9,(offset32)
        db      0x8D,0x9C               ; 8D,9C
        jmp     near const_base         ; E9,(const_base-%%ref)
%%ref:
%else
        ; (%1 != ebx)
        ; db 0x8D,0x8C + jmp near const_base =
        ;   lea ecx, [ecx+ebp*8+(const_base-%%ref)] ; 8D,8C,E9,(offset32)
        db      0x8D,0x8C               ; 8D,8C
        jmp     near const_base         ; E9,(const_base-%%ref)
%%ref:  mov     %1, ecx
%endif
        pop     ebp
%endmacro

%else
; ---- GOT_SYMBOL != _MACHO_PIC_ -------------------------------------------
; The assembler's own ..gotoff / ..gotpc relocations are used.

%define GOTOFF(got,sym) (got) + (sym) wrt ..gotoff

%imacro get_GOT 1
        extern  GOT_SYMBOL
        call    %%geteip                ; %1 = address of the next instruction
        add     %1, GOT_SYMBOL + $$ - $ wrt ..gotpc
        jmp     short %%done
%%geteip:
        mov     %1, POINTER [esp]       ; fetch the return address
        ret
%%done:
%endmacro

%endif
; ---- end of GOT_SYMBOL == _MACHO_PIC_ test -------------------------------

; With PIC enabled these are thin wrappers around push / pop / mov.
%imacro pushpic 1.nolist
        push    %1
%endmacro
%imacro poppic  1.nolist
        pop     %1
%endmacro
%imacro movpic  2.nolist
        mov     %1,%2
%endmacro

%endif
; ---- end of PIC support --------------------------------------------------
264
; --------------------------------------------------------------------------
;  alignx n [, m]
;
;  Align the next instruction on an n-byte boundary (n = 2,4,8,16,..),
;  like ".balign n,,m" in GNU as.  The optional second argument m
;  (default 0xFFFF) is the largest amount of padding that may be emitted;
;  if more than m bytes would be needed, nothing is emitted.
;
;  Helper expressions:
;    MSKLE(x,y)  - a mask with its low bits set when
;                  (x & 0xFFFF) <= (y & 0xFFFF), and 0 otherwise
;    FILLB(b,n)  - number of bytes from position b up to the next n-byte
;                  boundary of the current section (n a power of two)
;
;  Padding of 16 bytes or more is filled entirely with single-byte nops.
;  Shorter padding is built from the multi-byte fillers listed below; each
;  "times" count is recomputed from the current position ($), so every
;  line consumes part of what is still missing.
;
;  NOTE(review): the fillers are 32-bit encodings that write ebx/ebp.  In
;  64-bit mode a 32-bit register write zero-extends into the full register,
;  so confirm that padding which can be executed is never reached while the
;  upper halves of rbx/rbp are live.
;
%define MSKLE(x,y)  (~(((y) & 0xFFFF) - ((x) & 0xFFFF)) >> 16)
%define FILLB(b,n)  (($$-(b)) & ((n)-1))

%imacro alignx 1-2.nolist 0xFFFF
%%bs:   times MSKLE(FILLB(%%bs,%1),%2) & MSKLE(16,FILLB($,%1)) & FILLB($,%1) \
               db 0x90                                  ; nop
        times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/9 \
               db 0x8D,0x9C,0x23,0x00,0x00,0x00,0x00    ; lea ebx,[ebx+0x00000000]
        times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/7 \
               db 0x8D,0xAC,0x25,0x00,0x00,0x00,0x00    ; lea ebp,[ebp+0x00000000]
        times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/6 \
               db 0x8D,0xAD,0x00,0x00,0x00,0x00         ; lea ebp,[ebp+0x00000000]
        times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/4 \
               db 0x8D,0x6C,0x25,0x00                   ; lea ebp,[ebp+0x00]
        times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/3 \
               db 0x8D,0x6D,0x00                        ; lea ebp,[ebp+0x00]
        times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/2 \
               db 0x8B,0xED                             ; mov ebp,ebp
        times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/1 \
               db 0x90                                  ; nop
%endmacro

;  alignz n
;
;  Align the next data on an n-byte boundary (n = 2,4,8,16,..), padding
;  with zero bytes.
;
%imacro alignz 1.nolist
        align %1, db 0          ; filling zeros
%endmacro
296
; --------------------------------------------------------------------------
;  collect_args / uncollect_args  (x86-64 only)
;
;  collect_args saves r10-r15 on the stack and then copies the first six
;  integer/pointer arguments into r10..r15 (in argument order), so that
;  function bodies can use the same registers under both 64-bit calling
;  conventions.  uncollect_args undoes exactly what collect_args pushed and
;  must be executed with rsp at the value collect_args left it.
;
%ifdef __x86_64__

%ifdef WIN64

;  Microsoft x64 variant.
;
;  In:    rcx, rdx, r8, r9 = arguments 1-4
;         rax              = base from which the two stack slots at +48 and
;                            +56 are read.  This macro never sets rax, so
;                            the caller must have loaded it beforehand
;                            (presumably from rsp after pushing rbp -
;                            verify against the callers).
;  Out:   r10..r15         = the four register arguments followed by the
;                            two values read from [rax+48] and [rax+56]
;  Saves: r10-r15, rsi, rdi, and the full 128 bits of xmm6 and xmm7.
;
;  The Microsoft x64 ABI makes all 128 bits of xmm6-xmm15 callee-saved.
;  Each register is therefore stored with movups, which moves the whole
;  register (a movlpd store would keep only the low 64 bits) and makes no
;  assumption about the alignment of rsp.
;
%imacro collect_args 0
        push    r10
        push    r11
        push    r12
        push    r13
        push    r14
        push    r15
        mov     r10, rcx
        mov     r11, rdx
        mov     r12, r8
        mov     r13, r9
        mov     r14, [rax+48]
        mov     r15, [rax+56]
        push    rsi
        push    rdi
        sub     rsp, SIZEOF_XMMWORD
        movups  XMMWORD [rsp], xmm6     ; save all 128 bits of xmm6
        sub     rsp, SIZEOF_XMMWORD
        movups  XMMWORD [rsp], xmm7     ; save all 128 bits of xmm7
%endmacro

;  Restores, in reverse order, everything collect_args saved.
%imacro uncollect_args 0
        movups  xmm7, XMMWORD [rsp]     ; restore all 128 bits of xmm7
        add     rsp, SIZEOF_XMMWORD
        movups  xmm6, XMMWORD [rsp]     ; restore all 128 bits of xmm6
        add     rsp, SIZEOF_XMMWORD
        pop     rdi
        pop     rsi
        pop     r15
        pop     r14
        pop     r13
        pop     r12
        pop     r11
        pop     r10
%endmacro

%else

;  Variant used when WIN64 is not defined.
;
;  In:    rdi, rsi, rdx, rcx, r8, r9 = arguments 1-6
;  Out:   r10..r15                   = arguments 1-6
;  Saves: r10-r15
;
%imacro collect_args 0
        push    r10
        push    r11
        push    r12
        push    r13
        push    r14
        push    r15
        mov     r10, rdi
        mov     r11, rsi
        mov     r12, rdx
        mov     r13, rcx
        mov     r14, r8
        mov     r15, r9
%endmacro

;  Restores, in reverse order, the registers collect_args pushed.
%imacro uncollect_args 0
        pop     r15
        pop     r14
        pop     r13
        pop     r12
        pop     r11
        pop     r10
%endmacro

%endif

%endif
366
Pierre Ossman2ae181c2009-03-09 13:21:27 +0000367; --------------------------------------------------------------------------
368; Defines picked up from the C headers
369;
Pierre Ossman3a65ef42009-03-16 13:34:18 +0000370%include "jsimdcfg.inc"
Pierre Ossman2ae181c2009-03-09 13:21:27 +0000371
MIYASAKA Masarua2e6a9d2006-02-04 00:00:00 +0000372; --------------------------------------------------------------------------