;; -----------------------------------------------------------------------
;;
;;   Copyright 1994-2009 H. Peter Anvin - All Rights Reserved
;;   Copyright 2009-2010 Intel Corporation; author: H. Peter Anvin
;;
;;   This program is free software; you can redistribute it and/or modify
;;   it under the terms of the GNU General Public License as published by
;;   the Free Software Foundation, Inc., 53 Temple Place Ste 330,
;;   Boston MA 02111-1307, USA; either version 2 of the License, or
;;   (at your option) any later version; incorporated herein by reference.
;;
;; -----------------------------------------------------------------------

;;
;; bcopy32xx.inc
;;


;
; 32-bit bcopy routine
;
; This is the actual 32-bit portion of the bcopy and shuffle and boot
; routines.  ALL THIS CODE NEEDS TO BE POSITION-INDEPENDENT, with the
; sole exception being the actual relocation code at the beginning of
; pm_shuffle_boot.
;
; It also really needs to live all in a single segment, for the
; address calculcations to actually work.
;

		bits 32
		section .bcopyxx.text
		align 16
;
; pm_bcopy:
;
;	This is the protected-mode core of the "bcopy" routine.
;	Try to do aligned transfers; if the src and dst are relatively
;	misaligned, align the dst.
;
;	ECX is guaranteed to not be zero on entry.
;
;	Clobbers ESI, EDI, ECX.
;

pm_bcopy:
		push ebx
		push edx
		push eax

		cmp esi,-1
		je .bzero

		cmp esi,edi		; If source < destination, we might
		jb .reverse		; have to copy backwards

.forward:
		; Initial alignment
		mov edx,edi
		shr edx,1
		jnc .faa1
		movsb
		dec ecx
.faa1:
		mov al,cl
		cmp ecx,2
		jb .f_tiny

		shr edx,1
		jnc .faa2
		movsw
		sub ecx,2
.faa2:

		; Bulk transfer
		mov al,cl		; Save low bits
		shr ecx,2		; Convert to dwords
		rep movsd		; Do our business
		; At this point ecx == 0

		test al,2
		jz .fab2
		movsw
.fab2:
.f_tiny:
		test al,1
		jz .fab1
		movsb
.fab1:
.done:
		pop eax
		pop edx
		pop ebx
		ret

.reverse:
		lea eax,[esi+ecx-1]	; Point to final byte
		cmp edi,eax
		ja .forward		; No overlap, do forward copy

		std			; Reverse copy
		lea edi,[edi+ecx-1]
		mov esi,eax

		; Initial alignment
		mov edx,edi
		shr edx,1
		jc .raa1
		movsb
		dec ecx
.raa1:

		dec esi
		dec edi
		mov al,cl
		cmp ecx,2
		jb .r_tiny
		shr edx,1
		jc .raa2
		movsw
		sub ecx,2
.raa2:

		; Bulk copy
		sub esi,2
		sub edi,2
		mov al,cl		; Save low bits
		shr ecx,2
		rep movsd

		; Final alignment
.r_final:
		add esi,2
		add edi,2
		test al,2
		jz .rab2
		movsw
.rab2:
.r_tiny:
		inc esi
		inc edi
		test al,1
		jz .rab1
		movsb
.rab1:
		cld
		jmp short .done

.bzero:
		xor eax,eax

		; Initial alignment
		mov edx,edi
		shr edx,1
		jnc .zaa1
		stosb
		dec ecx
.zaa1:

		mov bl,cl
		cmp ecx,2
		jb .z_tiny
		shr edx,1
		jnc .zaa2
		stosw
		sub ecx,2
.zaa2:

		; Bulk
		mov bl,cl		; Save low bits
		shr ecx,2
		rep stosd

		test bl,2
		jz .zab2
		stosw
.zab2:
.z_tiny:
		test bl,1
		jz .zab1
		stosb
.zab1:
		jmp short .done

;
; shuffle_and_boot:
;
; This routine is used to shuffle memory around, followed by
; invoking an entry point somewhere in low memory.  This routine
; can clobber any memory outside the bcopy special area.
;
; IMPORTANT: This routine does not set up any registers.
; It is the responsibility of the caller to generate an appropriate entry
; stub; *especially* when going to real mode.
;
; Inputs:
;	ESI		-> Pointer to list of (dst, src, len) pairs(*)
;	EDI		-> Pointer to safe area for list + shuffler
;			   (must not overlap this code nor the RM stack)
;	ECX		-> Byte count of list area (for initial copy)
;
;     If src == -1: then the memory pointed to by (dst, len) is bzeroed;
;		    this is handled inside the bcopy routine.
;
;     If len == 0:  this marks the end of the list; dst indicates
;		    the entry point and src the mode (0 = pm, 1 = rm)
;
;     (*) dst, src, and len are four bytes each
;
; do_raw_shuffle_and_boot is the same entry point, but with a C ABI:
; do_raw_shuffle_and_boot(safearea, descriptors, bytecount)
;
		global do_raw_shuffle_and_boot
do_raw_shuffle_and_boot:
		mov edi,eax
		mov esi,edx

pm_shuffle:
		cli			; End interrupt service (for good)
		mov ebx,edi		; EBX <- descriptor list
		lea edx,[edi+ecx+15]	; EDX <- where to relocate our code to
		and edx,~15		; Align 16 to benefit the GDT
		call pm_bcopy
		mov esi,__bcopyxx_start	; Absolute source address
		mov edi,edx		; Absolute target address
		sub edx,esi		; EDX <- address delta
		mov ecx,__bcopyxx_dwords
		lea eax,[edx+.safe]	; Resume point
		; Relocate this code
		rep movsd
		jmp eax			; Jump to safe location
.safe:
		; Give ourselves a safe stack
		lea esp,[edx+bcopyxx_stack+__bcopyxx_end]
		add edx,bcopy_gdt	; EDX <- new GDT
		mov [edx+2],edx		; GDT self-pointer
		lgdt [edx]		; Switch to local GDT

		; Now for the actual shuffling...
.loop:
		mov edi,[ebx]
		mov esi,[ebx+4]
		mov ecx,[ebx+8]
		add ebx,12
		jecxz .done
		call pm_bcopy
		jmp .loop
.done:
		lidt [edx+RM_IDT_ptr-bcopy_gdt]	; RM-like IDT
		push ecx		; == 0, for cleaning the flags register
		and esi,esi
		jz pm_shuffle_16
		popfd			; Clean the flags
		jmp edi			; Protected mode entry

		; We have a 16-bit entry point, so we need to return
		; to 16-bit mode.  Note: EDX already points to the GDT.
pm_shuffle_16:
		mov eax,edi
		mov [edx+PM_CS16+2],ax
		mov [edx+PM_DS16+2],ax
		shr eax,16
		mov [edx+PM_CS16+4],al
		mov [edx+PM_CS16+7],ah
		mov [edx+PM_DS16+4],al
		mov [edx+PM_DS16+7],ah
		mov eax,cr0
		and al,~1
		popfd			; Clean the flags
		; No flag-changing instructions below...
		mov dx,PM_DS16
		mov ds,edx
		mov es,edx
		mov fs,edx
		mov gs,edx
		mov ss,edx
		jmp PM_CS16:0

		section	.bcopyxx.data

		alignz 16
; GDT descriptor entry
%macro desc 1
bcopy_gdt.%1:
PM_%1		equ bcopy_gdt.%1-bcopy_gdt
%endmacro

bcopy_gdt:
		dw bcopy_gdt_size-1	; Null descriptor - contains GDT
		dd bcopy_gdt		; pointer for LGDT instruction
		dw 0

		; TSS segment to keep Intel VT happy.  Intel VT is
		; unhappy about anything that doesn't smell like a
		; full-blown 32-bit OS.
	desc TSS
		dw 104-1, DummyTSS	; 08h 32-bit task state segment
		dd 00008900h		; present, dpl 0, 104 bytes @DummyTSS

	desc CS16
		dd 0000ffffh		; 10h Code segment, use16, readable,
		dd 00009b00h		; present, dpl 0, cover 64K
	desc DS16
		dd 0000ffffh		; 18h Data segment, use16, read/write,
		dd 00009300h		; present, dpl 0, cover 64K
	desc CS32
		dd 0000ffffh		; 20h Code segment, use32, readable,
		dd 00cf9b00h		; present, dpl 0, cover all 4G
	desc DS32
		dd 0000ffffh		; 28h Data segment, use32, read/write,
		dd 00cf9300h		; present, dpl 0, cover all 4G

bcopy_gdt_size:	equ $-bcopy_gdt
;
; Space for a dummy task state segment.  It should never be actually
; accessed, but just in case it is, point to a chunk of memory that
; has a chance to not be used for anything real...
;
DummyTSS	equ 0x580

		align 4
RM_IDT_ptr:	dw 0FFFFh		; Length (nonsense, but matches CPU)
		dd 0			; Offset

bcopyxx_stack	equ 128			; We want this much stack

		section .rodata
		global __syslinux_shuffler_size
		extern __bcopyxx_len
		align 4
__syslinux_shuffler_size:
		dd __bcopyxx_len

		bits 16
		section .text16