;
; jsimdext.inc - common declarations
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2010, D. R. Commander.
;
; Based on the x86 SIMD extension for IJG JPEG library - version 1.02
;
; Copyright (C) 1999-2006, MIYASAKA Masaru.
;
; This software is provided 'as-is', without any express or implied
; warranty.  In no event will the authors be held liable for any damages
; arising from the use of this software.
;
; Permission is granted to anyone to use this software for any purpose,
; including commercial applications, and to alter it and redistribute it
; freely, subject to the following restrictions:
;
; 1. The origin of this software must not be misrepresented; you must not
;    claim that you wrote the original software. If you use this software
;    in a product, an acknowledgment in the product documentation would be
;    appreciated but is not required.
; 2. Altered source versions must be plainly marked as such, and must not be
;    misrepresented as being the original software.
; 3. This notice may not be removed or altered from any source distribution.
;
; [TAB8]

; ==========================================================================
;  System-dependent configurations

; The object format is selected by a symbol defined on the assembler command
; line (-DWIN32, -DWIN64, -DOBJ32, -DELF, -DAOUT or -DMACHO).  If none of them
; is defined, the generic %else branch at the end is used.
;
; Each branch defines:
;   SEG_TEXT / SEG_CONST - section name plus attributes for code / constants
; and may additionally define:
;   GOT_SYMBOL           - only on formats where PIC is supported
;   EXTN(name)           - how a C-level symbol name is spelled in assembly
; Branches that leave EXTN or GOT_SYMBOL undefined rely on the
; "%ifndef EXTN" / "%ifndef GOT_SYMBOL" handling later in this file.
;
%ifdef WIN32    ; ----(nasm -fwin32 -DWIN32 ...)--------
; * Microsoft Visual C++
; * MinGW (Minimalist GNU for Windows)
; * CygWin
; * LCC-Win32

; -- segment definition --
;
; When assembling with Yasm (__YASM_VER__ defined) a shorter attribute list
; is used; presumably Yasm rejects the extra NASM qualifiers - TODO confirm.
%ifdef __YASM_VER__
%define SEG_TEXT    .text  align=16
%define SEG_CONST   .rdata align=16
%else
%define SEG_TEXT    .text  align=16 public use32 class=CODE
%define SEG_CONST   .rdata align=16 public use32 class=CONST
%endif

%elifdef WIN64  ; ----(nasm -fwin64 -DWIN64 ...)--------
; * Microsoft Visual C++

; -- segment definition --
;
; Same Yasm/NASM split as in the WIN32 branch, with use64 instead of use32.
%ifdef __YASM_VER__
%define SEG_TEXT    .text  align=16
%define SEG_CONST   .rdata align=16
%else
%define SEG_TEXT    .text  align=16 public use64 class=CODE
%define SEG_CONST   .rdata align=16 public use64 class=CONST
%endif
%define EXTN(name)  name                        ; foo() -> foo

%elifdef OBJ32  ; ----(nasm -fobj -DOBJ32 ...)----------
; * Borland C++ (Win32)

; -- segment definition --
;
%define SEG_TEXT    _text  align=16 public use32 class=CODE
%define SEG_CONST   _data  align=16 public use32 class=DATA

%elifdef ELF    ; ----(nasm -felf[64] -DELF ...)------------
; * Linux
; * *BSD family Unix using elf format
; * Unix System V, including Solaris x86, UnixWare and SCO Unix

; mark stack as non-executable
; (this emits a section directive as a side effect of including this file)
section .note.GNU-stack noalloc noexec nowrite progbits

; -- segment definition --
;
; The 64-bit variant omits the explicit alloc/exec/write flags that the
; 32-bit variant spells out.
%ifdef __x86_64__
%define SEG_TEXT    .text   progbits align=16
%define SEG_CONST   .rodata progbits align=16
%else
%define SEG_TEXT    .text   progbits alloc exec   nowrite align=16
%define SEG_CONST   .rodata progbits alloc noexec nowrite align=16
%endif

; To make the code position-independent, append -DPIC to the commandline
;
%define GOT_SYMBOL  _GLOBAL_OFFSET_TABLE_       ; ELF supports PIC
%define EXTN(name)  name                        ; foo() -> foo

%elifdef AOUT   ; ----(nasm -faoutb/aout -DAOUT ...)----
; * Older Linux using a.out format  (nasm -f aout -DAOUT ...)
; * *BSD family Unix using a.out format  (nasm -f aoutb -DAOUT ...)

; -- segment definition --
;
; Constants are placed in .data here (no read-only section is named).
%define SEG_TEXT    .text
%define SEG_CONST   .data

; To make the code position-independent, append -DPIC to the commandline
;
%define GOT_SYMBOL  __GLOBAL_OFFSET_TABLE_      ; BSD-style a.out supports PIC

%elifdef MACHO  ; ----(nasm -fmacho -DMACHO ...)--------
; * NeXTstep/OpenStep/Rhapsody/Darwin/MacOS X (Mach-O format)

; -- segment definition --
;
%define SEG_TEXT    .text  ;align=16    ; nasm doesn't accept align=16. why?
%define SEG_CONST   .rodata align=16

; The generation of position-independent code (PIC) is the default on Darwin.
;
; PIC is therefore forced on here, and GOT_SYMBOL is set to the marker value
; _MACHO_PIC_ that the PIC macros below test for with %ifidn.
%define PIC
%define GOT_SYMBOL  _MACHO_PIC_         ; Mach-O style code-relative addressing

%else           ; ----(Other case)----------------------

; -- segment definition --
;
; Fallback when no known format symbol was given: plain section names,
; no GOT_SYMBOL (so PIC gets disabled below) and the default EXTN.
%define SEG_TEXT    .text
%define SEG_CONST   .data

%endif  ; ----------------------------------------------

; ==========================================================================

; --------------------------------------------------------------------------
;  Common types
;
; Primitive unit sizes in bytes.  These are plain %define macros, so they are
; expanded lazily at the point of use and may be referenced by the derived
; types below (or by any later macro) regardless of definition order.
%define SIZEOF_BYTE             1               ; sizeof(BYTE)
%define SIZEOF_WORD             2               ; sizeof(WORD)
%define SIZEOF_DWORD            4               ; sizeof(DWORD)
%define SIZEOF_QWORD            8               ; sizeof(QWORD)
%define SIZEOF_OWORD            16              ; sizeof(OWORD)

; Primitive unit widths in bits.
%define BYTE_BIT                8               ; CHAR_BIT in C
%define WORD_BIT                16              ; sizeof(WORD)*BYTE_BIT
%define DWORD_BIT               32              ; sizeof(DWORD)*BYTE_BIT
%define QWORD_BIT               64              ; sizeof(QWORD)*BYTE_BIT
%define OWORD_BIT               128             ; sizeof(OWORD)*BYTE_BIT

; General pointer type: dword on 32-bit targets, qword when __x86_64__ is set.
%ifndef __x86_64__
%define POINTER                 dword           ; operand-size keyword
%define SIZEOF_POINTER          SIZEOF_DWORD    ; sizeof(POINTER)
%define POINTER_BIT             DWORD_BIT       ; sizeof(POINTER)*BYTE_BIT
%else
%define POINTER                 qword           ; operand-size keyword
%define SIZEOF_POINTER          SIZEOF_QWORD    ; sizeof(POINTER)
%define POINTER_BIT             QWORD_BIT       ; sizeof(POINTER)*BYTE_BIT
%endif

; Signed integer.
%define INT                     dword
%define SIZEOF_INT              SIZEOF_DWORD    ; sizeof(INT)
%define INT_BIT                 DWORD_BIT       ; sizeof(INT)*BYTE_BIT

; IEEE754 single-precision float.
%define FP32                    dword
%define SIZEOF_FP32             SIZEOF_DWORD    ; sizeof(FP32)
%define FP32_BIT                DWORD_BIT       ; sizeof(FP32)*BYTE_BIT

; 64-bit quantity held in an MMX register.
%define MMWORD                  qword
%define SIZEOF_MMWORD           SIZEOF_QWORD    ; sizeof(MMWORD)
%define MMWORD_BIT              QWORD_BIT       ; sizeof(MMWORD)*BYTE_BIT

; 128-bit quantity held in an SSE register.
; XMMWORD deliberately expands to nothing: NASM does not handle operand-size
; keywords on SSE instructions properly, so no size override is emitted.
%define XMMWORD
%define SIZEOF_XMMWORD          SIZEOF_OWORD    ; sizeof(XMMWORD)
%define XMMWORD_BIT             OWORD_BIT       ; sizeof(XMMWORD)*BYTE_BIT

; Same workaround for loading a dword or an MMWORD into an xmm# register:
; both size markers are intentionally empty.
%define XMM_DWORD
%define XMM_MMWORD

; --------------------------------------------------------------------------
;  External Symbol Name
;
; Fallback definition of EXTN(name), used by every target branch above that
; did not define it itself.  EXTN maps a C-level symbol name to the spelling
; used in the assembly sources.
%ifndef EXTN
; Android Modification:
; The unmodified code from upstream prepends an underscore to "name" here.
; It is unclear why.  Before removing the underscore, the code failed to
; link because the function names in the SIMD code did not match the
; callers (because of the extra underscore).  This fix only applies to
; x86 SIMD code.  x86_64 is handled properly by the code above.
;
; NOTE: these lines must use ';' as the comment leader.  '#' is not a comment
; character for the assembler, so '#'-prefixed text would be parsed as
; source whenever this fallback branch is actually taken.
%define EXTN(name)   name          ; foo() -> foo
%endif

; --------------------------------------------------------------------------
;  Macros for position-independent code (PIC) support
;
; PIC can only be honoured on formats that defined GOT_SYMBOL in the
; system-dependent section; on every other format a -DPIC request is dropped.
%ifndef GOT_SYMBOL
%undef PIC
%endif

; Interface provided by this section (in both the PIC and non-PIC case):
;   GOTOFF(got,sym)  - address expression for sym, relative to register got
;   get_GOT reg      - load the PIC base into reg
;   pushpic reg      - push reg  (expands to nothing without PIC)
;   poppic  reg      - pop  reg  (expands to nothing without PIC)
;   movpic  dst,src  - mov dst,src (expands to nothing without PIC)
%ifdef PIC ; -------------------------------------------

%ifidn GOT_SYMBOL,_MACHO_PIC_ ; --------------------

; At present, nasm doesn't seem to support PIC generation for Mach-O.
; The PIC support code below is a little tricky.

; Including this file in Mach-O PIC mode switches to SEG_CONST and places
; the label const_base there; symbols are then addressed relative to it.
        SECTION SEG_CONST
const_base:

%define GOTOFF(got,sym) (got) + (sym) - const_base

; get_GOT reg : leave the run-time address of const_base in reg.
%imacro get_GOT 1
        ; NOTE: this macro destroys ecx register.
        ; call/ret pair: %%geteip copies the return address (the address of
        ; the following "add") into ecx, then the add moves ecx on to %%ref.
        call    %%geteip
        add     ecx, byte (%%ref - $)
        jmp     short %%adjust
%%geteip:
        mov     ecx, POINTER [esp]
        ret
%%adjust:
        push    ebp
        xor     ebp,ebp         ; ebp = 0
        ; The two db bytes followed by the E9 opcode and 32-bit displacement
        ; of "jmp near" are decoded by the CPU as a single lea instruction
        ; (see the encodings below).  The jmp is never executed as a jump; it
        ; is only used to make the assembler emit (const_base - %%ref).
        ; ebp is zeroed above so the ebp*8 index term contributes nothing.
%ifidni %1,ebx  ; (%1 == ebx)
        ; db 0x8D,0x9C + jmp near const_base =
        ;   lea ebx, [ecx+ebp*8+(const_base-%%ref)] ; 8D,9C,E9,(offset32)
        db      0x8D,0x9C               ; 8D,9C
        jmp     near const_base         ; E9,(const_base-%%ref)
%%ref:
%else  ; (%1 != ebx)
        ; db 0x8D,0x8C + jmp near const_base =
        ;   lea ecx, [ecx+ebp*8+(const_base-%%ref)] ; 8D,8C,E9,(offset32)
        db      0x8D,0x8C               ; 8D,8C
        jmp     near const_base         ; E9,(const_base-%%ref)
%%ref:  mov     %1, ecx
%endif ; (%1 == ebx)
        pop     ebp
%endmacro

%else   ; GOT_SYMBOL != _MACHO_PIC_ ----------------

%define GOTOFF(got,sym) (got) + (sym) wrt ..gotoff

; get_GOT reg : load the address of GOT_SYMBOL into reg.
; %%geteip copies the return address of the call (i.e. the address of the
; "add" instruction) into reg; the add then applies the ..gotpc-relative
; offset of GOT_SYMBOL to it.  Only reg and the flags are modified.
%imacro get_GOT 1
        extern  GOT_SYMBOL
        call    %%geteip
        add     %1, GOT_SYMBOL + $$ - $ wrt ..gotpc
        jmp     short %%done
%%geteip:
        mov     %1, POINTER [esp]
        ret
%%done:
%endmacro

%endif  ; GOT_SYMBOL == _MACHO_PIC_ ----------------

; In PIC mode these three emit a real push / pop / mov.  They let callers
; write PIC-only register save/restore code that vanishes in non-PIC builds.
%imacro pushpic 1.nolist
        push    %1
%endmacro
%imacro poppic  1.nolist
        pop     %1
%endmacro
%imacro movpic  2.nolist
        mov     %1,%2
%endmacro

%else   ; !PIC -----------------------------------------

; Without PIC a symbol is addressed directly; the "got" argument is ignored.
%define GOTOFF(got,sym) (sym)

; All PIC helper macros accept the same arguments but expand to nothing.
%imacro get_GOT 1.nolist
%endmacro
%imacro pushpic 1.nolist
%endmacro
%imacro poppic  1.nolist
%endmacro
%imacro movpic  2.nolist
%endmacro

%endif  ;  PIC -----------------------------------------

; --------------------------------------------------------------------------
;  Align the next instruction on {2,4,8,16,..}-byte boundary.
;  ".balign n,,m" in GNU as
;
; MSKLE(x,y): "mask if x <= y".  For operands that fit in 16 bits this
;   evaluates to 0 when x > y and to a value with all low bits set when
;   x <= y, so "MSKLE(..) & n" yields either n or 0.
; FILLB(b,n): number of bytes needed to advance from position b to the next
;   n-byte boundary, measured from the section start ($$); n must be a power
;   of two because the result is formed with "& (n-1)".
%define MSKLE(x,y)  (~(((y) & 0xFFFF) - ((x) & 0xFFFF)) >> 16)
%define FILLB(b,n)  (($$-(b)) & ((n)-1))

; alignx n [,max]
;   %1 = alignment in bytes
;   %2 = optional limit (default 0xFFFF): every "times" count below is masked
;        with MSKLE(FILLB(%%bs,%1),%2), so nothing is emitted when the padding
;        required at the start of the macro (%%bs) exceeds this limit.
; The padding is built from filler instructions that leave all registers
; unchanged.  Each "times" line re-evaluates the remaining distance
; FILLB($,%1) at its own position:
;   - if 16 or more bytes remain, the first line fills all of them with nops;
;   - otherwise the following lines emit remaining/9, /7, /6, /4, /3, /2 and
;     /1 copies of the 7-, 7-, 6-, 4-, 3-, 2- and 1-byte fillers, in that
;     order.
; The filler opcodes are 32-bit encodings (see the per-line comments).
%imacro alignx 1-2.nolist 0xFFFF
%%bs:   times MSKLE(FILLB(%%bs,%1),%2) & MSKLE(16,FILLB($,%1)) & FILLB($,%1) \
               db 0x90                               ; nop
        times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/9 \
               db 0x8D,0x9C,0x23,0x00,0x00,0x00,0x00 ; lea ebx,[ebx+0x00000000]
        times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/7 \
               db 0x8D,0xAC,0x25,0x00,0x00,0x00,0x00 ; lea ebp,[ebp+0x00000000]
        times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/6 \
               db 0x8D,0xAD,0x00,0x00,0x00,0x00      ; lea ebp,[ebp+0x00000000]
        times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/4 \
               db 0x8D,0x6C,0x25,0x00                ; lea ebp,[ebp+0x00]
        times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/3 \
               db 0x8D,0x6D,0x00                     ; lea ebp,[ebp+0x00]
        times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/2 \
               db 0x8B,0xED                          ; mov ebp,ebp
        times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/1 \
               db 0x90                               ; nop
%endmacro

; Align the next data on {2,4,8,16,..}-byte boundary.
;
; alignz n
;   %1 = alignment in bytes
; Pads with zero bytes (rather than the assembler's default fill) up to the
; next %1-byte boundary; meant for data, not for code.
%imacro alignz 1.nolist
        align %1, db 0          ; filling zeros
%endmacro

; 64-bit only: argument marshalling helpers.
;
; collect_args copies the first six integer/pointer arguments of the current
; function into r10..r15 (arg1 -> r10, arg2 -> r11, ... arg6 -> r15), so the
; same register assignment holds under both 64-bit calling conventions.
; uncollect_args undoes the stack changes made by collect_args and must be
; invoked with rsp at the value collect_args left it.
%ifdef __x86_64__

%ifdef WIN64

; Microsoft x64 variant.
;   In:    rcx, rdx, r8, r9 = arguments 1-4
;          rax              = base address from which arguments 5 and 6 are
;                             read at offsets +48 and +56.
;                             NOTE(review): rax is not set in this file; it
;                             must be prepared by the code that invokes the
;                             macro - confirm against callers.
;   Out:   r10..r15         = arguments 1-6
;   Saves: r12-r15, rsi, rdi, xmm6, xmm7 (all callee-saved in this ABI);
;          r10 and r11 are overwritten without being saved.
;   Stack: 6 pushes (48 bytes) plus 2*SIZEOF_XMMWORD.
;          movaps requires rsp to be 16-byte aligned at both stores, which
;          holds only if rsp is 16-byte aligned on entry to the macro -
;          TODO confirm callers guarantee this.
%imacro collect_args 0
        push r12
        push r13
        push r14
        push r15
        mov r10, rcx
        mov r11, rdx
        mov r12, r8
        mov r13, r9
        mov r14, [rax+48]
        mov r15, [rax+56]
        push rsi
        push rdi
        sub     rsp, SIZEOF_XMMWORD
        movaps  XMMWORD [rsp], xmm6
        sub     rsp, SIZEOF_XMMWORD
        movaps  XMMWORD [rsp], xmm7
%endmacro

; Restores xmm7, xmm6, rdi, rsi and r15..r12 in the exact reverse order of
; the saves performed by collect_args above.
%imacro uncollect_args 0
        movaps  xmm7, XMMWORD [rsp]
        add     rsp, SIZEOF_XMMWORD
        movaps  xmm6, XMMWORD [rsp]
        add     rsp, SIZEOF_XMMWORD
        pop rdi
        pop rsi
        pop r15
        pop r14
        pop r13
        pop r12
%endmacro

%else

; System V AMD64 variant.
;   In:    rdi, rsi, rdx, rcx, r8, r9 = arguments 1-6
;   Out:   r10..r15                   = arguments 1-6
;   Saves: r10-r15 on the stack (6 pushes, 48 bytes) before overwriting them.
;   The argument registers themselves are left untouched.
%imacro collect_args 0
        push r10
        push r11
        push r12
        push r13
        push r14
        push r15
        mov r10, rdi
        mov r11, rsi
        mov r12, rdx
        mov r13, rcx
        mov r14, r8
        mov r15, r9
%endmacro

; Pops r15..r10 in the reverse order of the pushes in collect_args above.
%imacro uncollect_args 0
        pop r15
        pop r14
        pop r13
        pop r12
        pop r11
        pop r10
%endmacro

%endif

%endif

; --------------------------------------------------------------------------
;  Defines picked up from the C headers
;
%include "jsimdcfg.inc"

; --------------------------------------------------------------------------