/* fuc microcode util functions for nvc0 PGRAPH
 *
 * Copyright 2011 Red Hat Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 *
 * Authors: Ben Skeggs
 */

// mmio list entry: a single 32-bit word holding the first argument in the
// low bits and (second argument - 1) in bits 31:26
define(`mmctx_data', `.b32 eval((($2 - 1) << 26) | $1)')
// command queue storage: 2 words (GET, PUT) followed by 8 two-word slots
define(`queue_init', `.skip eval((2 * 4) + ((8 * 4) * 2))')

ifdef(`include_code', `
// Error codes
// (the overflow code is loaded into $r15 before the error routine is
// called when a queue has no free slot)
define(`E_BAD_COMMAND', 0x01)
define(`E_CMD_OVERFLOW', 0x02)

// Util macros to help with debugging ucode hangs etc
// Each value below is a bit number; the trace macros turn it into a
// single-bit mask before writing it to the trace scratch register.
define(`T_WAIT', 0)
define(`T_MMCTX', 1)
define(`T_STRWAIT', 2)
define(`T_STRINIT', 3)
define(`T_AUTO', 4)
define(`T_CHAN', 5)
define(`T_LOAD', 6)
define(`T_SAVE', 7)
define(`T_LCHAN', 8)
define(`T_LCTXH', 9)

// Trace helper: writes a mask with only the requested bit set to the
// I/O address 0x83c << 6.  The single argument is the bit number.
// Clobbers $r8 and $r9.
// NOTE(review): the matching clear macro writes 0x85c << 6 and carries the
// same CC_SCRATCH[7] annotation; presumably 0x83c is the set-bits alias of
// that scratch register - confirm against the falcon register docs.
define(`trace_set', `
	mov $r8 0x83c
	shl b32 $r8 6
	clear b32 $r9
	bset $r9 $1			// mask = 1 << bit number
	iowr I[$r8 + 0x000] $r9		// CC_SCRATCH[7]
')

// Trace helper: writes a mask with only the requested bit set to the
// I/O address 0x85c << 6.  The single argument is the bit number.
// Clobbers $r8 and $r9.
// NOTE(review): the matching set macro writes 0x83c << 6 with the same
// CC_SCRATCH[7] annotation; presumably 0x85c is the clear-bits alias of
// that scratch register - confirm against the falcon register docs.
define(`trace_clr', `
	mov $r8 0x85c
	shl b32 $r8 6
	clear b32 $r9
	bset $r9 $1			// mask = 1 << bit number
	iowr I[$r8 + 0x000] $r9		// CC_SCRATCH[7]
')

// queue_put - append a command/data pair to a queue
//
// In : $r13 queue pointer
//	$r14 command
//	$r15 data
//
// Queue layout: GET at +0x0, PUT at +0x4, then 8 slots of 8 bytes each
// starting at +0x8.  GET and PUT count modulo 16.
//
// Clobbers $r8 and $r9.  When the queue is full nothing is stored, $r15 is
// replaced by the overflow error code and the error routine is called.
//
queue_put:
	// full when GET with bit 3 flipped equals PUT
	ld b32 $r8 D[$r13 + 0x0]	// GET
	ld b32 $r9 D[$r13 + 0x4]	// PUT
	xor $r8 8
	cmpu b32 $r8 $r9
	bra e #queue_put_full

	// slot address = queue + 8 + (PUT & 7) * 8
	and $r8 $r9 7
	shl b32 $r8 3
	add b32 $r8 $r13
	add b32 $r8 8
	st b32 D[$r8 + 0x0] $r14	// command
	st b32 D[$r8 + 0x4] $r15	// data

	// publish the new entry: PUT = (PUT + 1) & 0xf
	add b32 $r9 1
	and $r9 0xf
	st b32 D[$r13 + 0x4] $r9
	ret

	// no free slot: report and bail out without touching the queue
	queue_put_full:
	mov $r15 E_CMD_OVERFLOW
	call #error
	ret

// queue_get - pop the oldest command/data pair off a queue
//
// In : $r13 queue pointer
//
// Out:	$p1  clear if a pair was fetched, set if the queue was empty
//	$r14 command	(written only when a pair was fetched)
//	$r15 data	(written only when a pair was fetched)
//
// Clobbers $r8 and $r9.
//
queue_get:
	ld b32 $r8 D[$r13 + 0x0]	// GET
	ld b32 $r9 D[$r13 + 0x4]	// PUT
	bset $flags $p1			// report empty unless a pair is found
	cmpu b32 $r8 $r9
	bra e #queue_get_done		// GET == PUT: nothing queued
		// slot address = queue + 8 + (GET & 7) * 8
		and $r9 $r8 7
		shl b32 $r9 3
		add b32 $r9 8
		add b32 $r9 $r13
		ld b32 $r14 D[$r9 + 0x0]	// command
		ld b32 $r15 D[$r9 + 0x4]	// data

		// release the slot: GET = (GET + 1) & 0xf
		add b32 $r8 1
		and $r8 0xf
		st b32 D[$r13 + 0x0] $r8
		bclr $flags $p1
queue_get_done:
	ret

// nv_rd32 - fetch a 32-bit nv register through the MMIO window
//
// In : $r14 register
// Out: $r15 value
//
// Overwrites $r10, $r11 and $r12; the done wait it calls also uses $r8
// and $r9.
//
nv_rd32:
	// request word: register address with the pending bit raised
	mov b32 $r12 $r14
	bset $r12 31			// MMIO_CTRL_PENDING
	mov $r11 0x728
	shl b32 $r11 6
	iowr I[$r11 + 0x000] $r12	// MMIO_CTRL
	// spin until the pending bit reads back as zero
	nv_rd32_wait:
		iord $r12 I[$r11 + 0x000]
		xbit $r12 $r12 31
		bra ne #nv_rd32_wait
	// then wait for the read-done bit before picking up the value
	mov $r10 6			// DONE_MMIO_RD
	call #wait_doneo
	iord $r15 I[$r11 + 0x100]	// MMIO_RDVAL
	ret

// nv_wr32 - store a 32-bit value to an nv register through the MMIO window
//
// In : $r14 register
//      $r15 value
//
// Overwrites $r11 and $r12.
//
nv_wr32:
	// request word: register address plus the write and pending bits
	mov b32 $r12 $r14
	bset $r12 30			// MMIO_CTRL_WRITE
	bset $r12 31			// MMIO_CTRL_PENDING
	mov $r11 0x728
	shl b32 $r11 6
	// the value has to be in place before the request is issued
	iowr I[$r11 + 0x200] $r15	// MMIO_WRVAL
	iowr I[$r11 + 0x000] $r12	// MMIO_CTRL
	// spin until the pending bit reads back as zero
	nv_wr32_wait:
		iord $r12 I[$r11 + 0x000]
		xbit $r12 $r12 31
		bra ne #nv_wr32_wait
	ret

// (re)set watchdog timer
//
// In : $r15 timeout
//
// Writes the timeout with bit 31 set to the I/O address 0x430 << 6.
// Clobbers $r8; $r15 is returned with bit 31 set.
// NOTE(review): bit 31 presumably acts as the enable bit of the timer
// register - confirm against the falcon register docs.
//
watchdog_reset:
	mov $r8 0x430
	shl b32 $r8 6
	bset $r15 31
	iowr I[$r8 + 0x000] $r15
	ret

// clear watchdog timer
//
// Writes $r0 to the same I/O address used by watchdog_reset (0x430 << 6).
// Clobbers $r8.
// NOTE(review): this relies on $r0 holding zero, as the rest of this file
// also appears to assume - confirm.
watchdog_clear:
	mov $r8 0x430
	shl b32 $r8 6
	iowr I[$r8 + 0x000] $r0
	ret

// wait_done{z,o} - wait on FUC_DONE bit to become clear/set
//
// In : $r10 bit to wait on
//
// The macro below takes the routine name and the branch condition that
// keeps the poll loop spinning: ne spins while the bit reads as one (wait
// for clear), e spins while it reads as zero (wait for set).
//
// Clobbers $r8 and $r9; $r10 is left untouched.
//
define(`wait_done', `
$1:
	trace_set(T_WAIT)
	mov $r8 0x818
	shl b32 $r8 6
	iowr I[$r8 + 0x000] $r10	// CC_SCRATCH[6] = wait bit
	// the DONE address never changes, so build it once outside the loop
	// and read the status into $r9 (already scratch at this point)
	mov $r8 0x400
	shl b32 $r8 6
	wait_done_$1:
		iord $r9 I[$r8 + 0x000]	// DONE
		xbit $r9 $r9 $r10
		bra $2 #wait_done_$1
	trace_clr(T_WAIT)
	ret
')
wait_done(wait_donez, ne)
wait_done(wait_doneo, e)

// mmctx_size - determine size of a mmio list transfer
//
// In : $r14 mmio list head
//      $r15 mmio list tail
// Out: $r15 transfer size (in bytes)
//
// Each list entry is one 32-bit word with (count - 1) in bits 31:26; the
// entry contributes count * 4 bytes.  An empty list (head == tail) yields
// a size of 0 instead of walking past the tail.
//
// Clobbers $r8 and $r9; $r14 is advanced to the tail.
//
mmctx_size:
	clear b32 $r9
	cmpu b32 $r14 $r15
	bra e #nv_mmctx_size_done	// nothing to sum up
	nv_mmctx_size_loop:
		ld b32 $r8 D[$r14]
		shr b32 $r8 26		// count - 1
		add b32 $r8 1		// count
		shl b32 $r8 2		// count * 4 bytes
		add b32 $r9 $r8
		add b32 $r14 4
		cmpu b32 $r14 $r15
		bra ne #nv_mmctx_size_loop
	nv_mmctx_size_done:
	mov b32 $r15 $r9
	ret

// mmctx_xfer - execute a list of mmio transfers
//
// In : $r10 flags
//		bit 0: direction (0 = save, 1 = load)
//		bit 1: set if first transfer
//		bit 2: set if last transfer
//	$r11 base
//	$r12 mmio list head
//	$r13 mmio list tail
//	$r14 multi_stride
//	$r15 multi_mask
//
// A base of zero leaves BASE_EN clear, and a multi_stride of zero leaves
// MULTI_EN clear (the corresponding registers are not written at all).
//
// Clobbers $r8, $r9, $r11, $r12 and $r14.  When bit 2 of the flags is
// clear, $r10 is also overwritten with the bit number for the done wait.
//
// NOTE(review): the list loop runs its body before comparing head and
// tail, so the list presumably must hold at least one entry - confirm
// against the callers.
//
mmctx_xfer:
	trace_set(T_MMCTX)
	mov $r8 0x710
	shl b32 $r8 6
	clear b32 $r9			// $r9 collects the enable bits
	or $r11 $r11			// test base for zero
	bra e #mmctx_base_disabled
		iowr I[$r8 + 0x000] $r11	// MMCTX_BASE
		bset $r9 0			// BASE_EN
	mmctx_base_disabled:
	or $r14 $r14			// test multi_stride for zero
	bra e #mmctx_multi_disabled
		iowr I[$r8 + 0x200] $r14 	// MMCTX_MULTI_STRIDE
		iowr I[$r8 + 0x300] $r15 	// MMCTX_MULTI_MASK
		bset $r9 1			// MULTI_EN
	mmctx_multi_disabled:
	add b32 $r8 0x100		// $r8 now addresses MMCTX_CTRL

	// control word: direction from flag bit 0, the queue limit, and the
	// start trigger only when flag bit 1 (first transfer) is set
	xbit $r11 $r10 0
	shl b32 $r11 16			// DIR
	bset $r11 12			// QLIMIT = 0x10
	xbit $r14 $r10 1
	shl b32 $r14 17
	or $r11 $r14			// START_TRIGGER
	iowr I[$r8 + 0x000] $r11	// MMCTX_CTRL

	// loop over the mmio list, and send requests to the hw
	mmctx_exec_loop:
		// wait for space in mmctx queue
		// (spins while the low 5 bits of MMCTX_CTRL read as zero)
		mmctx_wait_free:
			iord $r14 I[$r8 + 0x000] // MMCTX_CTRL
			and $r14 0x1f
			bra e #mmctx_wait_free

		// queue up an entry
		// (list word with the enable bits from $r9 merged in)
		ld b32 $r14 D[$r12]
		or $r14 $r9
		iowr I[$r8 + 0x300] $r14
		add b32 $r12 4
		cmpu b32 $r12 $r13
		bra ne #mmctx_exec_loop

	// flag bit 2 set (last transfer): issue the stop trigger,
	// otherwise just wait for the queue to drain
	xbit $r11 $r10 2
	bra ne #mmctx_stop
		// wait for queue to empty
		// (low 5 bits of MMCTX_CTRL equal to 0x10, the queue limit)
		mmctx_fini_wait:
			iord $r11 I[$r8 + 0x000]	// MMCTX_CTRL
			and $r11 0x1f
			cmpu b32 $r11 0x10
			bra ne #mmctx_fini_wait
		mov $r10 2				// DONE_MMCTX
		call #wait_donez
		bra #mmctx_done
	mmctx_stop:
		// same control word as above, with the stop trigger in
		// place of the start trigger
		xbit $r11 $r10 0
		shl b32 $r11 16			// DIR
		bset $r11 12			// QLIMIT = 0x10
		bset $r11 18			// STOP_TRIGGER
		iowr I[$r8 + 0x000] $r11	// MMCTX_CTRL
		mmctx_stop_wait:
			// wait for STOP_TRIGGER to clear
			iord $r11 I[$r8 + 0x000] // MMCTX_CTRL
			xbit $r11 $r11 18
			bra ne #mmctx_stop_wait
	mmctx_done:
	trace_clr(T_MMCTX)
	ret

// Wait for DONE_STRAND
//
// Waits for bit 2 of the done status to read as zero.  $r10 is saved and
// restored around the wait; $r8 and $r9 are clobbered by the wait routine.
//
// NOTE(review): bit number 2 is the same value mmctx_xfer annotates as
// DONE_MMCTX - confirm that the strand done bit really shares that number.
//
strand_wait:
	push $r10
	mov $r10 2
	call #wait_donez
	pop $r10
	ret

// unknown - call before issuing strand commands
//
// Writes 0xc to I/O address 0x24afc (the address strand_ctx_init labels
// STRAND_CMD) and then waits for the strand done bit.
// Clobbers $r8 and $r9.
//
strand_pre:
	mov $r9 0xc			// command value
	mov $r8 0x4afc
	sethi $r8 0x20000		// $r8 = 0x24afc
	iowr I[$r8] $r9
	call #strand_wait
	ret

// unknown - call after issuing strand commands
//
// Writes 0xd to I/O address 0x24afc (the address strand_ctx_init labels
// STRAND_CMD) and then waits for the strand done bit.
// Clobbers $r8 and $r9.
//
strand_post:
	mov $r9 0xd			// command value
	mov $r8 0x4afc
	sethi $r8 0x20000		// $r8 = 0x24afc
	iowr I[$r8] $r9
	call #strand_wait
	ret

// Selects strand set?!
//
// In: $r14 id
//
// Sequence: write 0xf to 0x93c and issue command 0xb, wait, then write the
// requested id to 0x93c and issue command 0xa, wait again.
//
// Clobbers $r10, $r11 and $r12 directly, plus $r8 and $r9 through the
// done wait reached via strand_wait.
//
strand_set:
	mov $r10 0x4ffc
	sethi $r10 0x20000		// $r10 = 0x24ffc
	sub b32 $r11 $r10 0x500		// $r11 = 0x24afc
	mov $r12 0xf
	iowr I[$r10 + 0x000] $r12		// 0x93c = 0xf
	mov $r12 0xb
	iowr I[$r11 + 0x000] $r12		// 0x928 = 0xb
	call #strand_wait
	iowr I[$r10 + 0x000] $r14		// 0x93c = <id>
	mov $r12 0xa
	iowr I[$r11 + 0x000] $r12		// 0x928 = 0xa
	call #strand_wait
	ret

// Initialise strand context data
//
// In : $r15 context base
// Out: $r15 context size (in bytes)
//
// Strandset(?) 3 hardcoded currently
//
// Clobbers $r8 through $r12 and $r14.
//
// NOTE(review): the base is shifted right by 8 before use, so its low 8
// bits are dropped - presumably callers pass a 256-byte aligned base;
// confirm.
// NOTE(review): the strand loop decrements the STRANDS count before
// testing it, so a count of 0 would wrap around - presumably the hardware
// always reports at least one strand; confirm.
//
strand_ctx_init:
	trace_set(T_STRINIT)
	call #strand_pre
	mov $r14 3
	call #strand_set
	mov $r10 0x46fc
	sethi $r10 0x20000		// $r10 = 0x246fc
	add b32 $r11 $r10 0x400		// $r11 = 0x24afc
	iowr I[$r10 + 0x100] $r0	// STRAND_FIRST_GENE = 0
	mov $r12 1
	iowr I[$r11 + 0x000] $r12	// STRAND_CMD = LATCH_FIRST_GENE
	call #strand_wait
	sub b32 $r12 $r0 1		// 0 - 1, all bits set
	iowr I[$r10 + 0x000] $r12	// STRAND_GENE_CNT = 0xffffffff
	mov $r12 2
	iowr I[$r11 + 0x000] $r12	// STRAND_CMD = LATCH_GENE_CNT
	call #strand_wait
	call #strand_post

	// read the size of each strand, poke the context offset of
	// each into STRAND_{SAVE,LOAD}_SWBASE now, no need to worry
	// about it later then.
	mov $r8 0x880
	shl b32 $r8 6
	iord $r9 I[$r8 + 0x000]		// STRANDS
	add b32 $r8 0x2200
	shr b32 $r14 $r15 8		// running offset, in 256-byte units
	ctx_init_strand_loop:
		iowr I[$r8 + 0x000] $r14	// STRAND_SAVE_SWBASE
		iowr I[$r8 + 0x100] $r14	// STRAND_LOAD_SWBASE
		iord $r10 I[$r8 + 0x200]	// STRAND_SIZE
		shr b32 $r10 6
		add b32 $r10 1			// (size >> 6) + 1 units per strand
		add b32 $r14 $r10
		add b32 $r8 4			// presumably selects the next strand
		sub b32 $r9 1
		bra ne #ctx_init_strand_loop

	shl b32 $r14 8			// end offset, back in bytes
	sub b32 $r15 $r14 $r15		// size = end offset - base
	trace_clr(T_STRINIT)
	ret
')