# salsa20_pm.s version 20051229
# D. J. Bernstein
# Public domain.

#include <linux/linkage.h>

.text

# enter salsa20_encrypt_bytes
# salsa20_encrypt_bytes(x, m, out, bytes)
#
# Arguments are taken from the stack, relative to the entry esp:
#   arg1 = x      pointer to sixteen 32-bit state words
#   arg2 = m      source bytes
#   arg3 = out    destination bytes
#   arg4 = bytes  byte count; zero returns at once
#
# For each 64-byte block the state copy j is duplicated into the working
# words x, the mixing loop runs five times, and every output word is
# (x[k] + j[k]) xor m[k].  Words 8 and 9 of j are then incremented as one
# 64-bit value.  A final block shorter than 64 bytes is staged through the
# bounce buffer so that only the requested bytes are read and written.
# Before returning, words 8 and 9 of j are stored back to x.
#
# ebx, esi, edi and ebp are restored before returning; eax holds the
# frame size on return.

# Frame layout, as byte offsets from the 32-byte aligned esp.
	.equ	ENC_TMP, 0
	.equ	ENC_X_BACKUP, 64
	.equ	ENC_M_BACKUP, 68
	.equ	ENC_OUT_BACKUP, 72
	.equ	ENC_BYTES_BACKUP, 76
	.equ	ENC_SAVED_EAX, 80
	.equ	ENC_SAVED_EBX, 84
	.equ	ENC_SAVED_ESI, 88
	.equ	ENC_SAVED_EDI, 92
	.equ	ENC_SAVED_EBP, 96
	.equ	ENC_X, 100
	.equ	ENC_J, 164
	.equ	ENC_CTARGET, 228

# One pass over all sixteen working words, four interleaved lanes.
# On entry and exit eax, ecx, edx, ebx hold x0, x5, x10, x15; the other
# twelve words live in the frame.  Each lane updates three words (a, b, c)
# and then its register word d:
#   a ^= rol(d + c, 7)    b ^= rol(a + d, 9)
#   c ^= rol(b + a, 13)   d ^= rol(c + b, 18)
# Parameters are the x indices of a, b, c for the lanes whose d word is
# x0 (p), x5 (s), x10 (t) and x15 (w).  esi and edi are scratch.
	.macro	enc_quad_pass pa, pb, pc, sa, sb, sc, ta, tb, tc, wa, wb, wc
	# spill the d words, form d + c in every lane
	movl	%eax,ENC_X+4*0(%esp)
	movl	%edx,ENC_X+4*10(%esp)
	addl	ENC_X+4*\pc(%esp),%eax
	movl	%ecx,ENC_X+4*5(%esp)
	addl	ENC_X+4*\tc(%esp),%edx
	movl	%ebx,ENC_X+4*15(%esp)
	movl	ENC_X+4*\sc(%esp),%esi
	addl	%ecx,%esi
	movl	ENC_X+4*\wc(%esp),%edi
	addl	%ebx,%edi
	# new a words
	roll	$7,%eax
	xorl	ENC_X+4*\pa(%esp),%eax
	roll	$7,%edx
	xorl	ENC_X+4*\ta(%esp),%edx
	roll	$7,%esi
	xorl	ENC_X+4*\sa(%esp),%esi
	roll	$7,%edi
	xorl	ENC_X+4*\wa(%esp),%edi
	movl	%eax,ENC_X+4*\pa(%esp)
	movl	%edx,ENC_X+4*\ta(%esp)
	addl	ENC_X+4*0(%esp),%eax
	movl	%esi,ENC_X+4*\sa(%esp)
	addl	ENC_X+4*10(%esp),%edx
	movl	%edi,ENC_X+4*\wa(%esp)
	# new b words
	roll	$9,%eax
	xorl	ENC_X+4*\pb(%esp),%eax
	roll	$9,%edx
	xorl	ENC_X+4*\tb(%esp),%edx
	addl	%esi,%ecx
	roll	$9,%ecx
	xorl	ENC_X+4*\sb(%esp),%ecx
	addl	%edi,%ebx
	roll	$9,%ebx
	xorl	ENC_X+4*\wb(%esp),%ebx
	movl	%eax,ENC_X+4*\pb(%esp)
	movl	%edx,ENC_X+4*\tb(%esp)
	addl	ENC_X+4*\pa(%esp),%eax
	movl	%ecx,ENC_X+4*\sb(%esp)
	addl	ENC_X+4*\ta(%esp),%edx
	movl	%ebx,ENC_X+4*\wb(%esp)
	# new c words
	roll	$13,%eax
	xorl	ENC_X+4*\pc(%esp),%eax
	roll	$13,%edx
	xorl	ENC_X+4*\tc(%esp),%edx
	addl	%ecx,%esi
	roll	$13,%esi
	xorl	ENC_X+4*\sc(%esp),%esi
	addl	%ebx,%edi
	roll	$13,%edi
	xorl	ENC_X+4*\wc(%esp),%edi
	movl	%eax,ENC_X+4*\pc(%esp)
	movl	%edx,ENC_X+4*\tc(%esp)
	addl	ENC_X+4*\pb(%esp),%eax
	movl	%esi,ENC_X+4*\sc(%esp)
	addl	ENC_X+4*\tb(%esp),%edx
	movl	%edi,ENC_X+4*\wc(%esp)
	# new d words, left in eax, ecx, edx, ebx
	roll	$18,%eax
	xorl	ENC_X+4*0(%esp),%eax
	roll	$18,%edx
	xorl	ENC_X+4*10(%esp),%edx
	addl	%esi,%ecx
	roll	$18,%ecx
	xorl	ENC_X+4*5(%esp),%ecx
	addl	%edi,%ebx
	roll	$18,%ebx
	xorl	ENC_X+4*15(%esp),%ebx
	.endm

ENTRY(salsa20_encrypt_bytes)
	# Reserve 256 bytes plus (esp mod 32); esp ends up 32-byte aligned
	# and eax holds the amount subtracted.
	movl	%esp,%eax
	andl	$31,%eax
	addl	$256,%eax
	subl	%eax,%esp
	movl	%eax,ENC_SAVED_EAX(%esp)
	movl	%ebx,ENC_SAVED_EBX(%esp)
	movl	%esi,ENC_SAVED_ESI(%esp)
	movl	%edi,ENC_SAVED_EDI(%esp)
	movl	%ebp,ENC_SAVED_EBP(%esp)
	# esp + eax is the entry esp, so the arguments sit just above it:
	# edx = x, esi = m, edi = out, ebx = bytes
	movl	4(%esp,%eax),%edx
	movl	8(%esp,%eax),%esi
	movl	12(%esp,%eax),%edi
	movl	16(%esp,%eax),%ebx
	# nothing to do for a zero byte count
	testl	%ebx,%ebx
	jz	.Lenc_done

	# j[0..15] = x[0..15]; remember x for the final write-back
	.irp	idx, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
	movl	4*\idx(%edx),%eax
	movl	%eax,ENC_J+4*\idx(%esp)
	.endr
	movl	%edx,ENC_X_BACKUP(%esp)

.Lenc_next_block:
	# invariant: ebx = bytes still to process, nonzero
	cmpl	$64,%ebx
	jae	.Lenc_have_block
	# Short final block: remember the real destination, copy the input
	# into the bounce buffer and process the buffer in place.
	movl	%edi,ENC_CTARGET(%esp)
	leal	ENC_TMP(%esp),%edi
	movl	%ebx,%ecx
	rep	movsb
	leal	ENC_TMP(%esp),%edi
	leal	ENC_TMP(%esp),%esi

.Lenc_have_block:
	# esi, edi and ebx are reused as working registers by the mixing loop
	movl	%edi,ENC_OUT_BACKUP(%esp)
	movl	%esi,ENC_M_BACKUP(%esp)
	movl	%ebx,ENC_BYTES_BACKUP(%esp)

	# x[0..15] = j[0..15]
	.irp	idx, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
	movl	ENC_J+4*\idx(%esp),%eax
	movl	%eax,ENC_X+4*\idx(%esp)
	.endr

	# ebp counts passes: starts at 20, each loop trip performs four
	movl	$20,%ebp
	movl	ENC_X+4*0(%esp),%eax
	movl	ENC_X+4*5(%esp),%ecx
	movl	ENC_X+4*10(%esp),%edx
	movl	ENC_X+4*15(%esp),%ebx

.Lenc_mix:
	.rept	2
	# pass over index groups (0,4,8,12) (5,9,13,1) (10,14,2,6) (15,3,7,11)
	enc_quad_pass 4, 8, 12, 9, 13, 1, 14, 2, 6, 3, 7, 11
	# pass over index groups (0,1,2,3) (5,6,7,4) (10,11,8,9) (15,12,13,14)
	enc_quad_pass 1, 2, 3, 6, 7, 4, 11, 8, 9, 12, 13, 14
	.endr
	subl	$4,%ebp
	ja	.Lenc_mix

	# flush the four register-resident words back to the frame
	movl	%eax,ENC_X+4*0(%esp)
	movl	%ecx,ENC_X+4*5(%esp)
	movl	%edx,ENC_X+4*10(%esp)
	movl	%ebx,ENC_X+4*15(%esp)

	movl	ENC_OUT_BACKUP(%esp),%edi
	movl	ENC_M_BACKUP(%esp),%esi

	# out[k] = (x[k] + j[k]) ^ m[k], two words per step; both input
	# words are read before either output word is written
	.irp	off, 0, 8, 16, 24, 32, 40, 48, 56
	movl	ENC_X+\off(%esp),%eax
	movl	ENC_X+4+\off(%esp),%ecx
	addl	ENC_J+\off(%esp),%eax
	addl	ENC_J+4+\off(%esp),%ecx
	xorl	\off(%esi),%eax
	xorl	4+\off(%esi),%ecx
	movl	%eax,\off(%edi)
	movl	%ecx,4+\off(%edi)
	.endr

	movl	ENC_BYTES_BACKUP(%esp),%ebx
	# j9:j8 += 1, carry propagates from the low word into the high word
	addl	$1,ENC_J+4*8(%esp)
	adcl	$0,ENC_J+4*9(%esp)

	cmpl	$64,%ebx
	ja	.Lenc_more_blocks
	je	.Lenc_write_back
	# Short block: move the result from the bounce buffer (edi) to the
	# destination that was set aside earlier.
	movl	%edi,%esi
	movl	ENC_CTARGET(%esp),%edi
	movl	%ebx,%ecx
	rep	movsb

.Lenc_write_back:
	# x[8], x[9] = j8, j9 so the next call continues from here
	movl	ENC_X_BACKUP(%esp),%eax
	movl	ENC_J+4*8(%esp),%ecx
	movl	ENC_J+4*9(%esp),%edx
	movl	%ecx,32(%eax)
	movl	%edx,36(%eax)

.Lenc_done:
	movl	ENC_SAVED_EAX(%esp),%eax
	movl	ENC_SAVED_EBX(%esp),%ebx
	movl	ENC_SAVED_ESI(%esp),%esi
	movl	ENC_SAVED_EDI(%esp),%edi
	movl	ENC_SAVED_EBP(%esp),%ebp
	# release the frame: eax is the amount subtracted on entry
	addl	%eax,%esp
	ret

.Lenc_more_blocks:
	# advance past the block just written and go around again
	subl	$64,%ebx
	addl	$64,%edi
	addl	$64,%esi
	jmp	.Lenc_next_block
ENDPROC(salsa20_encrypt_bytes)
	.purgem	enc_quad_pass

# enter salsa20_keysetup
# salsa20_keysetup(x, k, kbits)
#
# Arguments are taken from the stack, relative to the entry esp:
#   arg1 = x      pointer to sixteen 32-bit state words
#   arg2 = k      key bytes
#   arg3 = kbits  key size selector, compared unsigned against 256
#
# Always stores k[0..3] into x[1..4].  When kbits is at least 256,
# x[11..14] receive k[4..7]; otherwise they receive k[0..3] again.
# x[0], x[5], x[10] and x[15] are set to fixed constants, of which x[5]
# and x[10] depend on the branch taken.  Words 6 to 9 are not touched.
#
# ebx, esi, edi and ebp are restored before returning; eax holds the
# frame size on return.

# Frame slots, as byte offsets from the 32-byte aligned esp.
	.equ	KS_SAVED_EAX, 64
	.equ	KS_SAVED_EBX, 68
	.equ	KS_SAVED_ESI, 72
	.equ	KS_SAVED_EDI, 76
	.equ	KS_SAVED_EBP, 80

# Constant words.  Stored little-endian, the long-key set reads
# "expa" "nd 3" "2-by" "te k" and the short-key set reads
# "expa" "nd 1" "6-by" "te k".
	.equ	KS_CONST_X0, 1634760805
	.equ	KS_CONST_X5_LONG, 857760878
	.equ	KS_CONST_X10_LONG, 2036477234
	.equ	KS_CONST_X5_SHORT, 824206446
	.equ	KS_CONST_X10_SHORT, 2036477238
	.equ	KS_CONST_X15, 1797285236

ENTRY(salsa20_keysetup)
	# Reserve 256 bytes plus (esp mod 32); eax holds the amount subtracted.
	movl	%esp,%eax
	andl	$31,%eax
	addl	$256,%eax
	subl	%eax,%esp
	movl	%eax,KS_SAVED_EAX(%esp)
	movl	%ebx,KS_SAVED_EBX(%esp)
	movl	%esi,KS_SAVED_ESI(%esp)
	movl	%edi,KS_SAVED_EDI(%esp)
	movl	%ebp,KS_SAVED_EBP(%esp)

	# edx = kbits, ecx = k, eax = x (eax is loaded last because it is
	# also the index used to reach the arguments)
	movl	12(%esp,%eax),%edx
	movl	8(%esp,%eax),%ecx
	movl	4(%esp,%eax),%eax

	# x[1..4] = k[0..3]; all four words are read before any is written
	movl	0(%ecx),%ebx
	movl	4(%ecx),%esi
	movl	8(%ecx),%edi
	movl	12(%ecx),%ebp
	movl	%ebx,4(%eax)
	movl	%esi,8(%eax)
	movl	%edi,12(%eax)
	movl	%ebp,16(%eax)

	cmpl	$256,%edx
	jb	.Lks_short_key

	# kbits >= 256: x[11..14] = k[4..7].  The last load overwrites the
	# key pointer, which is not needed afterwards.
	movl	16(%ecx),%edx
	movl	20(%ecx),%ebx
	movl	24(%ecx),%esi
	movl	28(%ecx),%ecx
	movl	%edx,44(%eax)
	movl	%ebx,48(%eax)
	movl	%esi,52(%eax)
	movl	%ecx,56(%eax)
	movl	$KS_CONST_X5_LONG,%edx
	movl	$KS_CONST_X10_LONG,%ebx
	jmp	.Lks_store_constants

.Lks_short_key:
	# kbits < 256: x[11..14] = k[0..3], the same words as x[1..4]
	movl	0(%ecx),%edx
	movl	4(%ecx),%ebx
	movl	8(%ecx),%esi
	movl	12(%ecx),%ecx
	movl	%edx,44(%eax)
	movl	%ebx,48(%eax)
	movl	%esi,52(%eax)
	movl	%ecx,56(%eax)
	movl	$KS_CONST_X5_SHORT,%edx
	movl	$KS_CONST_X10_SHORT,%ebx

.Lks_store_constants:
	# edx and ebx carry the branch-specific words for x[5] and x[10];
	# x[0] and x[15] are the same for both key sizes
	movl	$KS_CONST_X0,%ecx
	movl	$KS_CONST_X15,%esi
	movl	%ecx,0(%eax)
	movl	%edx,20(%eax)
	movl	%ebx,40(%eax)
	movl	%esi,60(%eax)

	movl	KS_SAVED_EAX(%esp),%eax
	movl	KS_SAVED_EBX(%esp),%ebx
	movl	KS_SAVED_ESI(%esp),%esi
	movl	KS_SAVED_EDI(%esp),%edi
	movl	KS_SAVED_EBP(%esp),%ebp
	# release the frame: eax is the amount subtracted on entry
	addl	%eax,%esp
	ret
ENDPROC(salsa20_keysetup)

# enter salsa20_ivsetup
# salsa20_ivsetup(x, iv)
#
# Arguments are taken from the stack, relative to the entry esp:
#   arg1 = x   pointer to sixteen 32-bit state words
#   arg2 = iv  pointer to two 32-bit words
#
# Stores iv[0] and iv[1] into x[6] and x[7] and clears x[8] and x[9],
# the two words that salsa20_encrypt_bytes in this file increments once
# per 64-byte block.  No other state word is touched.
#
# ebx is the only callee-saved register written here and is restored
# before returning; eax holds the frame size on return.

# Frame slots, as byte offsets from the 32-byte aligned esp.
	.equ	IV_SAVED_EAX, 64
	.equ	IV_SAVED_EBX, 68

ENTRY(salsa20_ivsetup)
	# Reserve 256 bytes plus (esp mod 32); eax holds the amount subtracted.
	movl	%esp,%eax
	andl	$31,%eax
	addl	$256,%eax
	subl	%eax,%esp
	movl	%eax,IV_SAVED_EAX(%esp)
	movl	%ebx,IV_SAVED_EBX(%esp)

	# ecx = iv, eax = x (eax is loaded last because it is also the
	# index used to reach the arguments)
	movl	8(%esp,%eax),%ecx
	movl	4(%esp,%eax),%eax

	# read both iv words before writing anything; the second load
	# overwrites the iv pointer
	movl	0(%ecx),%edx
	movl	4(%ecx),%ecx
	xorl	%ebx,%ebx

	movl	%edx,24(%eax)
	movl	%ecx,28(%eax)
	# x[8] = x[9] = 0
	movl	%ebx,32(%eax)
	movl	%ebx,36(%eax)

	movl	IV_SAVED_EAX(%esp),%eax
	movl	IV_SAVED_EBX(%esp),%ebx
	# release the frame: eax is the amount subtracted on entry
	addl	%eax,%esp
	ret
ENDPROC(salsa20_ivsetup)