#!/usr/bin/env perl
# Copyright (c) 2018, Google Inc.
#
# Permission to use, copy, modify, and/or distribute this software for any
# purpose with or without fee is hereby granted, provided that the above
# copyright notice and this permission notice appear in all copies.
#
# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
# SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
# OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
# CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.

# This file defines helper functions for crypto/test/abi_test.h on x86_64. See
# that header for details on how to use this.
#
# For convenience, this file is linked into libcrypto, where consuming builds
# already support architecture-specific sources. The static linker should drop
# this code in non-test binaries. This includes a shared library build of
# libcrypto, provided --gc-sections (ELF), -dead_strip (Mac), or equivalent is
# used.
#
# References:
#
# SysV ABI: https://github.com/hjl-tools/x86-psABI/wiki/x86-64-psABI-1.0.pdf
# Win64 ABI: https://docs.microsoft.com/en-us/cpp/build/x64-software-conventions?view=vs-2017

use strict;

# The first command-line argument is the assembler flavour and the second is
# the output path. If the first argument contains a dot, it is treated as the
# output path instead and the flavour is left undefined.
my $flavour = shift;
my $output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

# Win64 conventions are selected by a nasm/masm/mingw64 flavour or by an output
# file ending in .asm.
my $win64 = 0;
$win64 = 1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

# Look for x86_64-xlate.pl next to this script, then in ../../perlasm/ relative
# to it. When $0 has no directory component the match fails; fall back to an
# empty prefix explicitly rather than reading whatever $1 happens to hold.
my $dir = ($0 =~ m/(.*[\/\\])[^\/\\]+$/) ? $1 : "";
my $xlate;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

# Everything printed to STDOUT from here on is piped through the translator.
# Die if the pipe cannot be created instead of silently generating nothing.
open OUT, "| \"$^X\" \"$xlate\" $flavour \"$output\""
  or die "can't call $xlate: $!";
*STDOUT = *OUT;

# @inp lists the registers that carry function inputs, in parameter order, for
# the selected calling convention.
my @inp = ("%rdi", "%rsi", "%rdx", "%rcx", "%r8", "%r9");
@inp = ("%rcx", "%rdx", "%r8", "%r9") if ($win64);

# @caller_state lists the registers the callee must preserve on behalf of its
# caller. Keep this in sync with the definition of CallerState in abi_test.h.
my @caller_state = $win64 ?
    ("%rbx", "%rbp", "%rdi", "%rsi", "%r12", "%r13", "%r14", "%r15",
     "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12",
     "%xmm13", "%xmm14", "%xmm15") :
    ("%rbx", "%rbp", "%r12", "%r13", "%r14", "%r15");

# $caller_state_size is the size of CallerState in bytes: 8 for each general
# purpose register and 16 for each XMM register.
my $caller_state_size = 0;
foreach my $saved (@caller_state) {
  my $width = $saved =~ /^%r/ ? 8 : $saved =~ /^%xmm/ ? 16 : undef;
  die "unknown register $saved" unless (defined($width));
  $caller_state_size += $width;
}

# load_caller_state returns code which loads a CallerState structure at
# $off($reg) into the registers listed in @caller_state. General purpose
# registers are loaded with movq and advance the offset by 8; XMM registers are
# loaded with movdqa and advance it by 16.
#
# No other registers are touched. $reg must not be a register in CallerState,
# because it would be overwritten partway through the sequence and the
# remaining loads would use the wrong base; this function dies if it is.
#
# $cb is an optional callback to add extra lines after each movq or movdqa. It
# is passed the offset of the slot, relative to $reg, and the name of the
# register, and its return value is appended to the output.
sub load_caller_state {
  my ($off, $reg, $cb) = @_;
  die "base register $reg is part of CallerState"
    if (grep { $_ eq $reg } @caller_state);
  my $ret = "";
  foreach (@caller_state) {
    # Remember the slot offset before advancing, for the callback.
    my $old_off = $off;
    if (/^%r/) {
      $ret .= "\tmovq\t$off($reg), $_\n";
      $off += 8;
    } elsif (/^%xmm/) {
      $ret .= "\tmovdqa\t$off($reg), $_\n";
      $off += 16;
    } else {
      die "unknown register $_";
    }
    $ret .= $cb->($old_off, $_) if (defined($cb));
  }
  return $ret;
}

# store_caller_state is the counterpart of load_caller_state: it returns code
# which writes the current values of the registers in @caller_state to
# consecutive slots starting at $off($reg), using movq (8 bytes) for general
# purpose registers and movdqa (16 bytes) for XMM registers. If $cb is given,
# it is called after each store with the slot offset, relative to $reg, and the
# register name, and its return value is appended to the output.
sub store_caller_state {
  my ($off, $reg, $cb) = @_;
  my @lines;
  foreach my $saved (@caller_state) {
    my ($insn, $width);
    if ($saved =~ /^%xmm/) {
      ($insn, $width) = ("movdqa", 16);
    } elsif ($saved =~ /^%r/) {
      ($insn, $width) = ("movq", 8);
    } else {
      die "unknown register $saved";
    }
    # The callback sees the offset of this slot, not of the next one.
    my $slot = $off;
    $off += $width;
    push @lines, "\t$insn\t$saved, $slot($reg)\n";
    push @lines, scalar($cb->($slot, $saved)) if (defined($cb));
  }
  return join("", @lines);
}

# $max_params is the largest number of parameters abi_test_trampoline can
# forward to the function under test.
my $max_params = 10;

# SysV only reserves stack space for the parameters that overflow the
# registers. Windows additionally reserves stack space for the register-based
# parameters, so those slots are counted and skipped when copying.
my $stack_params_skip = 0;
my $num_stack_params = $max_params - scalar(@inp);
if ($win64) {
  $stack_params_skip = scalar(@inp);
  $num_stack_params = $max_params;
}

# Name the trampoline's own parameters after the registers they arrive in. On
# Win64 @inp has only four entries, so $unwind is undefined there and the value
# is read from the stack instead.
my ($func, $state, $argv, $argc, $unwind) = @inp;
my $code = <<____;
.text

# abi_test_trampoline loads callee-saved registers from |state|, calls |func|
# with |argv|, then saves the callee-saved registers into |state|. It returns
# the result of |func|. If |unwind| is non-zero, this function triggers unwind
# instrumentation.
# uint64_t abi_test_trampoline(void (*func)(...), CallerState *state,
#                              const uint64_t *argv, size_t argc,
#                              int unwind);
.type	abi_test_trampoline, \@abi-omnipotent
.globl	abi_test_trampoline
.align	16
abi_test_trampoline:
.Labi_test_trampoline_seh_begin:
.cfi_startproc
	# Stack layout:
	#   8 bytes - align
	#   $caller_state_size bytes - saved caller registers
	#   8 bytes - scratch space
	#   8 bytes - saved copy of \$unwind (SysV-only)
	#   8 bytes - saved copy of \$state
	#   8 bytes - saved copy of \$func
	#   8 bytes - if needed for stack alignment
	#   8*$num_stack_params bytes - parameters for \$func
____

# Frame size: alignment slot, saved caller registers, three 8-byte slots
# (scratch, state, func) and the outgoing parameters. SysV needs one more slot
# for the saved copy of $unwind.
my $stack_alloc_size = 8 + $caller_state_size + 8*3 + 8*$num_stack_params;
$stack_alloc_size += 8 unless ($win64);

# Both SysV and Windows want a 16-byte-aligned stack at the call, and our own
# return address already offsets it by 8, so the allocation has to be 8 mod 16.
# Pad with one extra (unused) parameter slot when it is not.
unless ($stack_alloc_size % 16 == 8) {
  $stack_alloc_size += 8;
  $num_stack_params++;
}

# Offsets from %rsp after the allocation, from the bottom of the frame upwards.
my $stack_params_offset = 8 * $stack_params_skip;
my $func_offset = 8 * $num_stack_params;
my $state_offset = 8 + $func_offset;
# On Win64 |unwind| already lives in memory above our frame. On SysV it arrives
# in a register, so it gets its own slot between the state and scratch slots.
my $unwind_offset = $win64 ? $stack_alloc_size + 5*8 : $state_offset + 8;
my $scratch_offset = $state_offset + ($win64 ? 8 : 16);
my $caller_state_offset = $scratch_offset + 8;
# Prolog: allocate the whole frame with a single stack adjustment.
$code .= <<____;
	subq	\$$stack_alloc_size, %rsp
.cfi_adjust_cfa_offset	$stack_alloc_size
.Labi_test_trampoline_seh_prolog_alloc:
____
unless ($win64) {
  # SysV passes |unwind| in a register, which is about to be reused for the
  # callee's parameters, so spill it to its stack slot.
  $code .= <<____;
	movq	$unwind, $unwind_offset(%rsp)
____
}

# Save our caller's registers. We overwrite them ourselves below, and keeping
# our own copy also isolates the test infrastructure from a function under test
# that fails to preserve some register.
#
# %reg_offsets records, per register name (without the leading '%'), the
# %rsp-relative offset at which it was saved.
my %reg_offsets;
my $annotate_save = sub {
  my ($slot, $name) = @_;
  $name = substr($name, 1);
  $reg_offsets{$name} = $slot;
  # .cfi_offset is expressed relative to the CFA, which lies
  # $stack_alloc_size + 8 bytes above the current %rsp.
  my $cfa_rel = $slot - ($stack_alloc_size + 8);
  return <<____;
.cfi_offset	$name, $cfa_rel
.Labi_test_trampoline_seh_prolog_$name:
____
};
$code .= store_caller_state($caller_state_offset, "%rsp", $annotate_save);
$code .= ".Labi_test_trampoline_seh_prolog_end:\n";

# Body of abi_test_trampoline: install the register values requested in
# |state|, marshal |argv| into parameter registers and stack slots, call
# |func| (optionally single-stepping via the trap flag when |unwind| is set),
# and finally reload |state| so the post-call register values can be recorded.
$code .= load_caller_state(0, $state);
$code .= <<____;
	# Stash \$func and \$state, so they are available after the call returns.
	movq	$func, $func_offset(%rsp)
	movq	$state, $state_offset(%rsp)

	# Load parameters. Note this will clobber \$argv and \$argc, so we can
	# only use non-parameter volatile registers. There are three, and they
	# are the same between SysV and Win64: %rax, %r10, and %r11.
	movq	$argv, %r10
	movq	$argc, %r11
____
# Fill the parameter registers in order, leaving as soon as |argc| is
# exhausted. %r11 counts the remaining arguments and %r10 walks |argv|.
foreach (@inp) {
  $code .= <<____;
	dec	%r11
	js	.Largs_done
	movq	(%r10), $_
	addq	\$8, %r10
____
}
$code .= <<____;
	leaq	$stack_params_offset(%rsp), %rax
.Largs_loop:
	dec	%r11
	js	.Largs_done

	# This block should be:
	#    movq (%r10), %rtmp
	#    movq %rtmp, (%rax)
	# There are no spare registers available, so we spill into the scratch
	# space.
	movq	%r11, $scratch_offset(%rsp)
	movq	(%r10), %r11
	movq	%r11, (%rax)
	movq	$scratch_offset(%rsp), %r11

	addq	\$8, %r10
	addq	\$8, %rax
	jmp	.Largs_loop

.Largs_done:
	movq	$func_offset(%rsp), %rax
	movq	$unwind_offset(%rsp), %r10
	# |unwind| is declared as an int, so only its low 32 bits are
	# meaningful. The upper half of the register or stack slot is not
	# defined by either ABI and must not be tested.
	testl	%r10d, %r10d
	jz	.Lno_unwind

	# Set the trap flag.
	pushfq
	orq	\$0x100, 0(%rsp)
	popfq

	# Run an instruction to trigger a breakpoint immediately before the
	# call.
	nop
.globl	abi_test_unwind_start
abi_test_unwind_start:

	call	*%rax
.globl	abi_test_unwind_return
abi_test_unwind_return:

	# Clear the trap flag. Note this assumes the trap flag was clear on
	# entry. We do not support instrumenting an unwind-instrumented
	# |abi_test_trampoline|.
	pushfq
	andq	\$-0x101, 0(%rsp)	# -0x101 is ~0x100
	popfq
.globl	abi_test_unwind_stop
abi_test_unwind_stop:

	jmp	.Lcall_done

.Lno_unwind:
	call	*%rax

.Lcall_done:
	# Store what \$func did to our state, so our caller can check.
	movq	$state_offset(%rsp), $state
____
# Write the callee-saved registers, as the function under test left them, back
# into |state|.
$code .= store_caller_state(0, $state);

# Reload our own caller's registers from the frame, emitting a .cfi_restore for
# each register (named without its leading '%') right after it is reloaded.
my $annotate_restore = sub {
  my (undef, $name) = @_;
  return ".cfi_restore\t" . substr($name, 1) . "\n";
};
$code .= load_caller_state($caller_state_offset, "%rsp", $annotate_restore);

# Release the frame and return.
$code .= <<____;
	addq	\$$stack_alloc_size, %rsp
.cfi_adjust_cfa_offset	-$stack_alloc_size

	# %rax already contains \$func's return value, unmodified.
	ret
.cfi_endproc
.Labi_test_trampoline_seh_end:
.size	abi_test_trampoline,.-abi_test_trampoline
____

# abi_test_clobber_* zeros the corresponding register. These are used to test
# the ABI-testing framework. One function is emitted per general purpose
# register (xorq) followed by one per XMM register (pxor).
foreach my $reg (map("r$_", "ax", "bx", "cx", "dx", "di", "si", "bp", 8..15),
                 map("xmm$_", 0..15)) {
  my $zero_insn = $reg =~ /^xmm/ ? "pxor" : "xorq";
  $code .= <<____;
.type	abi_test_clobber_$reg, \@abi-omnipotent
.globl	abi_test_clobber_$reg
.align	16
abi_test_clobber_$reg:
	$zero_insn	%$reg, %$reg
	ret
.size	abi_test_clobber_$reg,.-abi_test_clobber_$reg
____
}

# Small helper functions: two deliberately broken unwind examples and the
# direction-flag accessors. Each function's .type directive must name the
# function it precedes.
$code .= <<____;
# abi_test_bad_unwind_wrong_register preserves the ABI, but annotates the wrong
# register in unwind metadata.
# void abi_test_bad_unwind_wrong_register(void);
.type	abi_test_bad_unwind_wrong_register, \@abi-omnipotent
.globl	abi_test_bad_unwind_wrong_register
.align	16
abi_test_bad_unwind_wrong_register:
.cfi_startproc
.Labi_test_bad_unwind_wrong_register_seh_begin:
	pushq	%r12
.cfi_push	%r13	# This should be %r12
.Labi_test_bad_unwind_wrong_register_seh_push_r13:
	# Windows evaluates epilogs directly in the unwinder, rather than using
	# unwind codes. Add a nop so there is one non-epilog point (immediately
	# before the nop) where the unwinder can observe the mistake.
	nop
	popq	%r12
.cfi_pop	%r12
	ret
.Labi_test_bad_unwind_wrong_register_seh_end:
.cfi_endproc
.size	abi_test_bad_unwind_wrong_register,.-abi_test_bad_unwind_wrong_register

# abi_test_bad_unwind_temporary preserves the ABI, but temporarily corrupts the
# storage space for a saved register, breaking unwind.
# void abi_test_bad_unwind_temporary(void);
.type	abi_test_bad_unwind_temporary, \@abi-omnipotent
.globl	abi_test_bad_unwind_temporary
.align	16
abi_test_bad_unwind_temporary:
.cfi_startproc
.Labi_test_bad_unwind_temporary_seh_begin:
	pushq	%r12
.cfi_push	%r12
.Labi_test_bad_unwind_temporary_seh_push_r12:

	movq	%r12, %rax
	inc	%rax
	movq	%rax, (%rsp)
	# Unwinding from here is incorrect. Although %r12 itself has not been
	# changed, the unwind codes say to look in (%rsp) instead.

	movq	%r12, (%rsp)
	# Unwinding is now fixed.

	popq	%r12
.cfi_pop	%r12
	ret
.Labi_test_bad_unwind_temporary_seh_end:
.cfi_endproc
.size	abi_test_bad_unwind_temporary,.-abi_test_bad_unwind_temporary

# abi_test_get_and_clear_direction_flag clears the direction flag. If the flag
# was previously set, it returns one. Otherwise, it returns zero.
# int abi_test_get_and_clear_direction_flag(void);
.type	abi_test_get_and_clear_direction_flag, \@abi-omnipotent
.globl	abi_test_get_and_clear_direction_flag
.align	16
abi_test_get_and_clear_direction_flag:
	pushfq
	popq	%rax
	andq	\$0x400, %rax
	shrq	\$10, %rax
	cld
	ret
.size abi_test_get_and_clear_direction_flag,.-abi_test_get_and_clear_direction_flag

# abi_test_set_direction_flag sets the direction flag.
# void abi_test_set_direction_flag(void);
.type	abi_test_set_direction_flag, \@abi-omnipotent
.globl	abi_test_set_direction_flag
.align	16
abi_test_set_direction_flag:
	std
	ret
.size abi_test_set_direction_flag,.-abi_test_set_direction_flag
____

# Win64 only: emit one more deliberately broken function plus the hand-written
# SEH unwind tables (.pdata/.xdata) for abi_test_trampoline and the bad-unwind
# helpers.
if ($win64) {
  $code .= <<____;
# abi_test_bad_unwind_epilog preserves the ABI, and correctly annotates the
# prolog, but the epilog does not match Win64's rules, breaking unwind during
# the epilog.
# void abi_test_bad_unwind_epilog(void);
.type	abi_test_bad_unwind_epilog, \@abi-omnipotent
.globl	abi_test_bad_unwind_epilog
.align	16
abi_test_bad_unwind_epilog:
.Labi_test_bad_unwind_epilog_seh_begin:
	pushq	%r12
.Labi_test_bad_unwind_epilog_seh_push_r12:

	nop

	# The epilog should begin here, but the nop makes it invalid.
	popq	%r12
	nop
	ret
.Labi_test_bad_unwind_epilog_seh_end:
.size	abi_test_bad_unwind_epilog,.-abi_test_bad_unwind_epilog
____

  # Add unwind metadata for SEH.
  #
  # TODO(davidben): This is all manual right now. Once we've added SEH tests,
  # add support for emitting these in x86_64-xlate.pl, probably based on MASM
  # and Yasm's unwind directives, and unify with CFI. (Sadly, NASM does not
  # support these directives.) Then push that upstream to replace the
  # error-prone and non-standard custom handlers.

  # See https://docs.microsoft.com/en-us/cpp/build/struct-unwind-code?view=vs-2017
  my $UWOP_PUSH_NONVOL = 0;
  my $UWOP_ALLOC_LARGE = 1;
  my $UWOP_ALLOC_SMALL = 2;
  my $UWOP_SAVE_NONVOL = 4;
  my $UWOP_SAVE_XMM128 = 8;

  my %UWOP_REG_NUMBER = (rax => 0, rcx => 1, rdx => 2, rbx => 3, rsp => 4,
                         rbp => 5, rsi => 6, rdi => 7,
                         map(("r$_" => $_), (8..15)));

  # Build the trampoline's unwind codes. They are emitted in the reverse of
  # prolog order: the register saves, last one first, followed by the stack
  # allocation. $num_slots counts the 16-bit unwind code slots used.
  my $unwind_codes = "";
  my $num_slots = 0;
  foreach my $saved (reverse @caller_state) {
    # foreach aliases its loop variable to the elements being iterated, so
    # derive the bare register name into a separate variable rather than
    # assigning to the loop variable, which could modify @caller_state.
    my $reg = substr($saved, 1);
    die "unknown register $reg" unless exists($reg_offsets{$reg});
    if ($reg =~ /^r/) {
      die "unknown register $reg" unless exists($UWOP_REG_NUMBER{$reg});
      # The save offset is stored scaled by 8, so it must be a multiple of 8.
      die "misaligned save offset for $reg" if ($reg_offsets{$reg} % 8 != 0);
      my $info = $UWOP_SAVE_NONVOL | ($UWOP_REG_NUMBER{$reg} << 4);
      my $value = $reg_offsets{$reg} / 8;
      $unwind_codes .= <<____;
	.byte	.Labi_test_trampoline_seh_prolog_$reg-.Labi_test_trampoline_seh_begin
	.byte	$info
	.value	$value
____
      $num_slots += 2;
    } elsif ($reg =~ /^xmm/) {
      # The save offset is stored scaled by 16, so it must be a multiple of 16.
      die "misaligned save offset for $reg" if ($reg_offsets{$reg} % 16 != 0);
      my $info = $UWOP_SAVE_XMM128 | (substr($reg, 3) << 4);
      my $value = $reg_offsets{$reg} / 16;
      $unwind_codes .= <<____;
	.byte	.Labi_test_trampoline_seh_prolog_$reg-.Labi_test_trampoline_seh_begin
	.byte	$info
	.value	$value
____
      $num_slots += 2;
    } else {
      die "unknown register $reg";
    }
  }

  if ($stack_alloc_size <= 128) {
    # UWOP_ALLOC_SMALL encodes (size - 8) / 8 in the operation info nibble and
    # takes a single slot.
    my $info = $UWOP_ALLOC_SMALL | ((($stack_alloc_size - 8) / 8) << 4);
    $unwind_codes .= <<____;
	.byte	.Labi_test_trampoline_seh_prolog_alloc-.Labi_test_trampoline_seh_begin
	.byte	$info
____
    $num_slots++;
  } else {
    # With operation info 0, UWOP_ALLOC_LARGE stores size / 8 in a single
    # 16-bit slot, which can describe at most 512K - 8 bytes. Anything larger
    # needs the three-slot form, which is not implemented here.
    die "stack allocation needs three unwind slots" if ($stack_alloc_size > 512 * 1024 - 8);
    my $info = $UWOP_ALLOC_LARGE;
    my $value = $stack_alloc_size / 8;
    $unwind_codes .= <<____;
	.byte	.Labi_test_trampoline_seh_prolog_alloc-.Labi_test_trampoline_seh_begin
	.byte	$info
	.value	$value
____
    $num_slots += 2;
  }

  $code .= <<____;
.section	.pdata
.align	4
	# https://docs.microsoft.com/en-us/cpp/build/struct-runtime-function?view=vs-2017
	.rva	.Labi_test_trampoline_seh_begin
	.rva	.Labi_test_trampoline_seh_end
	.rva	.Labi_test_trampoline_seh_info

	.rva	.Labi_test_bad_unwind_wrong_register_seh_begin
	.rva	.Labi_test_bad_unwind_wrong_register_seh_end
	.rva	.Labi_test_bad_unwind_wrong_register_seh_info

	.rva	.Labi_test_bad_unwind_temporary_seh_begin
	.rva	.Labi_test_bad_unwind_temporary_seh_end
	.rva	.Labi_test_bad_unwind_temporary_seh_info

	.rva	.Labi_test_bad_unwind_epilog_seh_begin
	.rva	.Labi_test_bad_unwind_epilog_seh_end
	.rva	.Labi_test_bad_unwind_epilog_seh_info

.section	.xdata
.align	8
.Labi_test_trampoline_seh_info:
	# https://docs.microsoft.com/en-us/cpp/build/struct-unwind-info?view=vs-2017
	.byte	1	# version 1, no flags
	.byte	.Labi_test_trampoline_seh_prolog_end-.Labi_test_trampoline_seh_begin
	.byte	$num_slots
	.byte	0	# no frame register
$unwind_codes

.align	8
.Labi_test_bad_unwind_wrong_register_seh_info:
	.byte	1	# version 1, no flags
	.byte	.Labi_test_bad_unwind_wrong_register_seh_push_r13-.Labi_test_bad_unwind_wrong_register_seh_begin
	.byte	1	# one slot
	.byte	0	# no frame register

	.byte	.Labi_test_bad_unwind_wrong_register_seh_push_r13-.Labi_test_bad_unwind_wrong_register_seh_begin
	.byte	@{[$UWOP_PUSH_NONVOL | ($UWOP_REG_NUMBER{r13} << 4)]}

.align	8
.Labi_test_bad_unwind_temporary_seh_info:
	.byte	1	# version 1, no flags
	.byte	.Labi_test_bad_unwind_temporary_seh_push_r12-.Labi_test_bad_unwind_temporary_seh_begin
	.byte	1	# one slot
	.byte	0	# no frame register

	.byte	.Labi_test_bad_unwind_temporary_seh_push_r12-.Labi_test_bad_unwind_temporary_seh_begin
	.byte	@{[$UWOP_PUSH_NONVOL | ($UWOP_REG_NUMBER{r12} << 4)]}

.align	8
.Labi_test_bad_unwind_epilog_seh_info:
	.byte	1	# version 1, no flags
	.byte	.Labi_test_bad_unwind_epilog_seh_push_r12-.Labi_test_bad_unwind_epilog_seh_begin
	.byte	1	# one slot
	.byte	0	# no frame register

	.byte	.Labi_test_bad_unwind_epilog_seh_push_r12-.Labi_test_bad_unwind_epilog_seh_begin
	.byte	@{[$UWOP_PUSH_NONVOL | ($UWOP_REG_NUMBER{r12} << 4)]}
____
}

# Send the generated assembly through the translator pipe. Write errors and a
# failing translator process may only surface when the pipe is closed, so check
# both steps instead of silently leaving a truncated or missing output file.
print STDOUT $code or die "error writing to STDOUT: $!";
close STDOUT or die "error closing STDOUT: $! (child status $?)";