/* Intel SIMD MMX implementation of Viterbi ACS butterflies
   for 64-state (k=7) convolutional code
   Copyright 2004 Phil Karn, KA9Q
   This code may be used under the terms of the GNU Lesser General Public License (LGPL)

   int update_viterbi27_blk_mmx(struct v27 *vp,unsigned char *syms,int nbits) ; 
*/
	# MMX (64-bit SIMD) version
	# requires Pentium-MMX, Pentium-II or better

	# Byte offsets of the struct v27 fields used by this routine; the
	# struct itself is defined in viterbi27_mmx.c and these values must
	# be kept in step with it
	.equ DP,128			# pointer to the next decision slot
	.equ OLDMETRICS,132		# pointer to the old path metrics
	.equ NEWMETRICS,136		# pointer to the new path metrics

	.text
	.globl update_viterbi27_blk_mmx
	.globl Mettab27_1
	.globl Mettab27_2
	.type update_viterbi27_blk_mmx,@function
	.balign 16
	
/*
 * butterfly GROUP
 *
 * One expansion performs 8 add-compare-select butterflies in parallel,
 * one per byte lane of the MMX registers.
 *
 * In:    eax = 32 * first symbol, ebx = 32 * second symbol; these are
 *              used as byte offsets into Mettab27_1 and Mettab27_2
 *        esi -> old path metrics, edi -> new path metrics
 *        edx -> decision output for the current bit
 * Out:   16 new metric bytes at 16*GROUP(%edi)
 *        16 decision bytes (0x00 or 0xff each) at 16*GROUP(%edx)
 * Clobb: mm0-mm7; integer registers and flags are left untouched
 */
	.macro butterfly GROUP
	# Branch metric per lane: ((tab1 + tab2 + 1) >> 1) & 15
	movq (Mettab27_1+8*\GROUP)(%eax),%mm3
	movq fifteens,%mm0

	paddb (Mettab27_2+8*\GROUP)(%ebx),%mm3
	paddb ones,%mm3		# rounding term, emulates pavgb
	psrlq $1,%mm3		# 64-bit shift leaks a bit across byte lanes ...
	pand %mm0,%mm3		# ... which the 0x0f mask removes again

	movq (8*\GROUP)(%esi),%mm6	# old metrics at offset 8*GROUP
	movq ((8*\GROUP)+32)(%esi),%mm2 # old metrics at offset 8*GROUP+32
	movq %mm6,%mm1
	movq %mm2,%mm7

	paddb %mm3,%mm6		# mm6 = old[lo] + metric
	paddb %mm3,%mm2		# mm2 = old[hi] + metric
	pxor  %mm0,%mm3		# metric ^ 15, the inverted branch metric
	paddb %mm3,%mm7		# mm7 = old[hi] + inverted metric
	paddb %mm3,%mm1		# mm1 = old[lo] + inverted metric

	# Candidate pairs are (mm6,mm7) and (mm1,mm2).  The byte-wise
	# difference is compared signed against zero, so a lane becomes
	# 0xff when the first candidate of the pair is the larger one.
	pxor %mm3,%mm3		# mm3 = 0
	movq %mm6,%mm4
	movq %mm1,%mm5
	psubb %mm7,%mm4		# mm4 = mm6 - mm7
	psubb %mm2,%mm5		# mm5 = mm1 - mm2
	pcmpgtb %mm3,%mm4	# mm4 = decisions for pair one
	pcmpgtb %mm3,%mm5	# mm5 = decisions for pair two

	# Survivor select: take the second candidate where the decision
	# lane is 0xff, otherwise keep the first candidate.
	movq %mm4,%mm0
	pand %mm4,%mm7
	movq %mm5,%mm3
	pand %mm5,%mm2
	pandn %mm6,%mm0		# mm0 = ~decision & mm6
	pandn %mm1,%mm3		# mm3 = ~decision & mm1
	por %mm0,%mm7		# mm7 = survivors of pair one
	por %mm3,%mm2		# mm2 = survivors of pair two

	# Byte-interleave pair one with pair two (pair one in the even
	# bytes), for the decisions and for the surviving metrics alike.
	movq %mm4,%mm3
	movq %mm7,%mm0
	punpckhbw %mm5,%mm4	# decisions, upper four lanes of each pair
	punpcklbw %mm5,%mm3	# decisions, lower four lanes of each pair
	punpcklbw %mm2,%mm7	# metrics, lower four lanes of each pair
	punpckhbw %mm2,%mm0	# metrics, upper four lanes of each pair
	movq %mm4,(16*\GROUP+8)(%edx)
	movq %mm3,(16*\GROUP)(%edx)
	movq %mm7,(16*\GROUP)(%edi)
	movq %mm0,(16*\GROUP+8)(%edi)

	.endm

/*
 * int update_viterbi27_blk_mmx(struct v27 *vp, unsigned char *syms, int nbits)
 *
 * Runs the 32 add-compare-select butterflies of the 64-state decoder
 * once for every bit in the block.
 *
 * ABI:   32-bit x86, arguments on the stack (frame set up with ebp):
 *          8(%ebp) = vp, 12(%ebp) = syms, 16(%ebp) = nbits
 * In:    vp    - decoder state; the pointers stored at offsets DP,
 *                OLDMETRICS and NEWMETRICS are read on entry
 *        syms  - input symbols, two bytes consumed per bit
 *        nbits - number of bits to process; zero or negative does
 *                no butterfly work
 * Out:   eax = 0 on success, -1 when vp is a null pointer
 *        Per bit, 64 decision bytes are written through the decision
 *        pointer and 64 metric bytes through the new-metrics pointer,
 *        after which the two metric pointers trade places.  The final
 *        pointers are written back into vp.
 * Clobb: eax, flags, mm0-mm7 (emms is executed before a successful
 *        return); ebx, esi, edi, ebp and also edx are preserved.
 *        The syms and nbits argument slots on the stack are used as
 *        loop variables and are overwritten.
 */
update_viterbi27_blk_mmx:
	pushl %ebp
	movl %esp,%ebp
	pushl %esi
	pushl %edi
	pushl %edx
	pushl %ebx

	movl 8(%ebp),%edx		# edx = vp
	testl %edx,%edx
	jz   .Lv27_null_vp		# refuse to dereference a null vp

	movl OLDMETRICS(%edx),%esi	# esi -> old metrics
	movl NEWMETRICS(%edx),%edi	# edi -> new metrics
	movl DP(%edx),%edx		# edx -> decisions (vp no longer in edx)

.Lv27_next_bit:
	movl 16(%ebp),%eax		# eax = bits still to do
	decl %eax
	jl   .Lv27_done			# dropped below zero, block finished
	movl %eax,16(%ebp)

	movl 12(%ebp),%ebx		# ebx = syms
	movzbl (%ebx),%eax		# eax = first symbol of the pair
	addl $2,%ebx
	movl %ebx,12(%ebp)		# syms now points at the next pair
	movzbl -1(%ebx),%ebx		# ebx = second symbol of the pair

	# scale the symbols by 32 to form metric table byte offsets
	shll $5,%eax
	shll $5,%ebx

	# four groups of 8 give the 32 butterflies for this bit
	butterfly GROUP=0
	butterfly GROUP=1
	butterfly GROUP=2
	butterfly GROUP=3

	addl $64,%edx			# next 64-byte decision slot

	# the metrics just written become the old metrics of the next bit
	movl %esi,%eax
	movl %edi,%esi
	movl %eax,%edi
	jmp  .Lv27_next_bit

.Lv27_done:
	emms				# leave MMX state before returning
	movl 8(%ebp),%ebx		# ebx = vp
	movl %esi,OLDMETRICS(%ebx)	# write back the swapped metric pointers
	movl %edi,NEWMETRICS(%ebx)
	movl %edx,DP(%ebx)		# write back the advanced decision pointer
	xorl %eax,%eax			# return 0
.Lv27_exit:
	popl %ebx
	popl %edx
	popl %edi
	popl %esi
	popl %ebp
	ret

.Lv27_null_vp:
	# No MMX instruction has run on this path, so emms is not needed.
	movl $-1,%eax			# immediate -1; without the $ this is a load from address -1
	jmp  .Lv27_exit

	.size update_viterbi27_blk_mmx,.-update_viterbi27_blk_mmx

	.data
	# Per-byte MMX constants read by the butterfly macro

	.balign 8
fifteens:
	.fill 8,1,15		# 0x0f in every byte: metric mask and inversion constant

	.balign 8
ones:
	.fill 8,1,1		# 0x01 in every byte: rounding term added before the shift