@ Copyright 2007-2016 The OpenSSL Project Authors. All Rights Reserved.
@
@ Licensed under the OpenSSL license (the "License").  You may not use
@ this file except in compliance with the License.  You can obtain a copy
@ in the file LICENSE in the source distribution or at
@ https://www.openssl.org/source/license.html


@ ====================================================================
@ Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
@ project. The module is, however, dual licensed under OpenSSL and
@ CRYPTOGAMS licenses depending on where you obtain it. For further
@ details see http://www.openssl.org/~appro/cryptogams/.
@
@ Permission to use under GPL terms is granted.
@ ====================================================================

@ SHA256 block procedure for ARMv4. May 2007.

@ Performance is ~2x better than gcc 3.4 generated code and in "abso-
@ lute" terms is ~2250 cycles per 64-byte block or ~35 cycles per
@ byte [on single-issue Xscale PXA250 core].

@ July 2010.
@
@ Rescheduling for dual-issue pipeline resulted in 22% improvement on
@ Cortex A8 core and ~20 cycles per processed byte.

@ February 2011.
@
@ Profiler-assisted and platform-specific optimization resulted in 16%
@ improvement on Cortex A8 core and ~15.4 cycles per processed byte.

@ September 2013.
@
@ Add NEON implementation. On Cortex A8 it was measured to process one
@ byte in 12.5 cycles or 23% faster than integer-only code. Snapdragon
@ S4 does it in 12.5 cycles too, but it's 50% faster than integer-only
@ code (meaning that latter performs sub-optimally, nothing was done
@ about it).

@ May 2014.
@
@ Add ARMv8 code path performing at 2.0 cpb on Apple A7.

#ifndef __KERNEL__
# include <openssl/arm_arch.h>
#else
# define __ARM_ARCH__ __LINUX_ARM_ARCH__
# define __ARM_MAX_ARCH__ 7
#endif

@ Silence ARMv8 deprecated IT instruction warnings. This file is used by both
@ ARMv7 and ARMv8 processors. It does have ARMv8-only code, but those
@ instructions are manually-encoded. (See unsha256.)


.text
#if defined(__thumb2__)
.syntax	unified
.thumb
#else
.code	32
#endif


.align	5
K256:
.word	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
.word	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
.word	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
.word	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
.word	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
.word	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
.word	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
.word	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
.word	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
.word	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
.word	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
.word	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
.word	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
.word	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
.word	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
.word	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2

.word	0				@ terminator
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
LOPENSSL_armcap:
.word	OPENSSL_armcap_P-Lsha256_block_data_order
#endif
.align	5

.globl	_sha256_block_data_order
.private_extern	_sha256_block_data_order
#ifdef __thumb2__
.thumb_func	_sha256_block_data_order
#endif
_sha256_block_data_order:
Lsha256_block_data_order:
#if __ARM_ARCH__<7 && !defined(__thumb2__)
	sub	r3,pc,#8		@ _sha256_block_data_order
#else
	adr	r3,Lsha256_block_data_order
#endif
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
	ldr	r12,LOPENSSL_armcap
	ldr	r12,[r3,r12]		@ OPENSSL_armcap_P
#ifdef	__APPLE__
	ldr	r12,[r12]
#endif
	tst	r12,#ARMV8_SHA256
	bne	LARMv8
	tst	r12,#ARMV7_NEON
	bne	LNEON
#endif
	add	r2,r1,r2,lsl#6	@ len to point at the end of inp
	stmdb	sp!,{r0,r1,r2,r4-r11,lr}
	ldmia	r0,{r4,r5,r6,r7,r8,r9,r10,r11}
	sub	r14,r3,#256+32	@ K256
	sub	sp,sp,#16*4		@ alloca(X[16])
Loop:
# if __ARM_ARCH__>=7
	ldr	r2,[r1],#4
# else
	ldrb	r2,[r1,#3]
# endif
	eor	r3,r5,r6		@ magic
	eor	r12,r12,r12
#if __ARM_ARCH__>=7
	@ ldr	r2,[r1],#4			@ 0
# if 0==15
	str	r1,[sp,#17*4]			@ make room for r1
# endif
	eor	r0,r8,r8,ror#5
	add	r4,r4,r12			@ h+=Maj(a,b,c) from the past
	eor	r0,r0,r8,ror#19	@ Sigma1(e)
# ifndef __ARMEB__
	rev	r2,r2
# endif
#else
	@ ldrb	r2,[r1,#3]			@ 0
	add	r4,r4,r12			@ h+=Maj(a,b,c) from the past
	ldrb	r12,[r1,#2]
	ldrb	r0,[r1,#1]
	orr	r2,r2,r12,lsl#8
	ldrb	r12,[r1],#4
	orr	r2,r2,r0,lsl#16
# if 0==15
	str	r1,[sp,#17*4]			@ make room for r1
# endif
	eor	r0,r8,r8,ror#5
	orr	r2,r2,r12,lsl#24
	eor	r0,r0,r8,ror#19	@ Sigma1(e)
#endif
	ldr	r12,[r14],#4			@ *K256++
	add	r11,r11,r2			@ h+=X[i]
	str	r2,[sp,#0*4]
	eor	r2,r9,r10
	add	r11,r11,r0,ror#6	@ h+=Sigma1(e)
	and	r2,r2,r8
	add	r11,r11,r12			@ h+=K256[i]
	eor	r2,r2,r10			@ Ch(e,f,g)
	eor	r0,r4,r4,ror#11
	add	r11,r11,r2			@ h+=Ch(e,f,g)
#if 0==31
	and	r12,r12,#0xff
	cmp	r12,#0xf2			@ done?
#endif
#if 0<15
# if __ARM_ARCH__>=7
	ldr	r2,[r1],#4			@ prefetch
# else
	ldrb	r2,[r1,#3]
# endif
	eor	r12,r4,r5			@ a^b, b^c in next round
#else
	ldr	r2,[sp,#2*4]		@ from future BODY_16_xx
	eor	r12,r4,r5			@ a^b, b^c in next round
	ldr	r1,[sp,#15*4]	@ from future BODY_16_xx
#endif
	eor	r0,r0,r4,ror#20	@ Sigma0(a)
	and	r3,r3,r12			@ (b^c)&=(a^b)
	add	r7,r7,r11			@ d+=h
	eor	r3,r3,r5			@ Maj(a,b,c)
	add	r11,r11,r0,ror#2	@ h+=Sigma0(a)
	@ add	r11,r11,r3			@ h+=Maj(a,b,c)
#if __ARM_ARCH__>=7
	@ ldr	r2,[r1],#4			@ 1
# if 1==15
	str	r1,[sp,#17*4]			@ make room for r1
# endif
	eor	r0,r7,r7,ror#5
	add	r11,r11,r3			@ h+=Maj(a,b,c) from the past
	eor	r0,r0,r7,ror#19	@ Sigma1(e)
# ifndef __ARMEB__
	rev	r2,r2
# endif
#else
	@ ldrb	r2,[r1,#3]			@ 1
	add	r11,r11,r3			@ h+=Maj(a,b,c) from the past
	ldrb	r3,[r1,#2]
	ldrb	r0,[r1,#1]
	orr	r2,r2,r3,lsl#8
	ldrb	r3,[r1],#4
	orr	r2,r2,r0,lsl#16
# if 1==15
	str	r1,[sp,#17*4]			@ make room for r1
# endif
	eor	r0,r7,r7,ror#5
	orr	r2,r2,r3,lsl#24
	eor	r0,r0,r7,ror#19	@ Sigma1(e)
#endif
	ldr	r3,[r14],#4			@ *K256++
	add	r10,r10,r2			@ h+=X[i]
	str	r2,[sp,#1*4]
	eor	r2,r8,r9
	add	r10,r10,r0,ror#6	@ h+=Sigma1(e)
	and	r2,r2,r7
	add	r10,r10,r3			@ h+=K256[i]
	eor	r2,r2,r9			@ Ch(e,f,g)
	eor	r0,r11,r11,ror#11
	add	r10,r10,r2			@ h+=Ch(e,f,g)
#if 1==31
	and	r3,r3,#0xff
	cmp	r3,#0xf2			@ done?
#endif
#if 1<15
# if __ARM_ARCH__>=7
	ldr	r2,[r1],#4			@ prefetch
# else
	ldrb	r2,[r1,#3]
# endif
	eor	r3,r11,r4			@ a^b, b^c in next round
#else
	ldr	r2,[sp,#3*4]		@ from future BODY_16_xx
	eor	r3,r11,r4			@ a^b, b^c in next round
	ldr	r1,[sp,#0*4]	@ from future BODY_16_xx
#endif
	eor	r0,r0,r11,ror#20	@ Sigma0(a)
	and	r12,r12,r3			@ (b^c)&=(a^b)
	add	r6,r6,r10			@ d+=h
	eor	r12,r12,r4			@ Maj(a,b,c)
	add	r10,r10,r0,ror#2	@ h+=Sigma0(a)
	@ add	r10,r10,r12			@ h+=Maj(a,b,c)
#if __ARM_ARCH__>=7
	@ ldr	r2,[r1],#4			@ 2
# if 2==15
	str	r1,[sp,#17*4]			@ make room for r1
# endif
	eor	r0,r6,r6,ror#5
	add	r10,r10,r12			@ h+=Maj(a,b,c) from the past
	eor	r0,r0,r6,ror#19	@ Sigma1(e)
# ifndef __ARMEB__
	rev	r2,r2
# endif
#else
	@ ldrb	r2,[r1,#3]			@ 2
	add	r10,r10,r12			@ h+=Maj(a,b,c) from the past
	ldrb	r12,[r1,#2]
	ldrb	r0,[r1,#1]
	orr	r2,r2,r12,lsl#8
	ldrb	r12,[r1],#4
	orr	r2,r2,r0,lsl#16
# if 2==15
	str	r1,[sp,#17*4]			@ make room for r1
# endif
	eor	r0,r6,r6,ror#5
	orr	r2,r2,r12,lsl#24
	eor	r0,r0,r6,ror#19	@ Sigma1(e)
#endif
	ldr	r12,[r14],#4			@ *K256++
	add	r9,r9,r2			@ h+=X[i]
	str	r2,[sp,#2*4]
	eor	r2,r7,r8
	add	r9,r9,r0,ror#6	@ h+=Sigma1(e)
	and	r2,r2,r6
	add	r9,r9,r12			@ h+=K256[i]
	eor	r2,r2,r8			@ Ch(e,f,g)
	eor	r0,r10,r10,ror#11
	add	r9,r9,r2			@ h+=Ch(e,f,g)
#if 2==31
	and	r12,r12,#0xff
	cmp	r12,#0xf2			@ done?
#endif
#if 2<15
# if __ARM_ARCH__>=7
	ldr	r2,[r1],#4			@ prefetch
# else
	ldrb	r2,[r1,#3]
# endif
	eor	r12,r10,r11			@ a^b, b^c in next round
#else
	ldr	r2,[sp,#4*4]		@ from future BODY_16_xx
	eor	r12,r10,r11			@ a^b, b^c in next round
	ldr	r1,[sp,#1*4]	@ from future BODY_16_xx
#endif
	eor	r0,r0,r10,ror#20	@ Sigma0(a)
	and	r3,r3,r12			@ (b^c)&=(a^b)
	add	r5,r5,r9			@ d+=h
	eor	r3,r3,r11			@ Maj(a,b,c)
	add	r9,r9,r0,ror#2	@ h+=Sigma0(a)
	@ add	r9,r9,r3			@ h+=Maj(a,b,c)
#if __ARM_ARCH__>=7
	@ ldr	r2,[r1],#4			@ 3
# if 3==15
	str	r1,[sp,#17*4]			@ make room for r1
# endif
	eor	r0,r5,r5,ror#5
	add	r9,r9,r3			@ h+=Maj(a,b,c) from the past
	eor	r0,r0,r5,ror#19	@ Sigma1(e)
# ifndef __ARMEB__
	rev	r2,r2
# endif
#else
	@ ldrb	r2,[r1,#3]			@ 3
	add	r9,r9,r3			@ h+=Maj(a,b,c) from the past
	ldrb	r3,[r1,#2]
	ldrb	r0,[r1,#1]
	orr	r2,r2,r3,lsl#8
	ldrb	r3,[r1],#4
	orr	r2,r2,r0,lsl#16
# if 3==15
	str	r1,[sp,#17*4]			@ make room for r1
# endif
	eor	r0,r5,r5,ror#5
	orr	r2,r2,r3,lsl#24
	eor	r0,r0,r5,ror#19	@ Sigma1(e)
#endif
	ldr	r3,[r14],#4			@ *K256++
	add	r8,r8,r2			@ h+=X[i]
	str	r2,[sp,#3*4]
	eor	r2,r6,r7
	add	r8,r8,r0,ror#6	@ h+=Sigma1(e)
	and	r2,r2,r5
	add	r8,r8,r3			@ h+=K256[i]
	eor	r2,r2,r7			@ Ch(e,f,g)
	eor	r0,r9,r9,ror#11
	add	r8,r8,r2			@ h+=Ch(e,f,g)
#if 3==31
	and	r3,r3,#0xff
	cmp	r3,#0xf2			@ done?
#endif
#if 3<15
# if __ARM_ARCH__>=7
	ldr	r2,[r1],#4			@ prefetch
# else
	ldrb	r2,[r1,#3]
# endif
	eor	r3,r9,r10			@ a^b, b^c in next round
#else
	ldr	r2,[sp,#5*4]		@ from future BODY_16_xx
	eor	r3,r9,r10			@ a^b, b^c in next round
	ldr	r1,[sp,#2*4]	@ from future BODY_16_xx
#endif
	eor	r0,r0,r9,ror#20	@ Sigma0(a)
	and	r12,r12,r3			@ (b^c)&=(a^b)
	add	r4,r4,r8			@ d+=h
	eor	r12,r12,r10			@ Maj(a,b,c)
	add	r8,r8,r0,ror#2	@ h+=Sigma0(a)
	@ add	r8,r8,r12			@ h+=Maj(a,b,c)
#if __ARM_ARCH__>=7
	@ ldr	r2,[r1],#4			@ 4
# if 4==15
	str	r1,[sp,#17*4]			@ make room for r1
# endif
	eor	r0,r4,r4,ror#5
	add	r8,r8,r12			@ h+=Maj(a,b,c) from the past
	eor	r0,r0,r4,ror#19	@ Sigma1(e)
# ifndef __ARMEB__
	rev	r2,r2
# endif
#else
	@ ldrb	r2,[r1,#3]			@ 4
	add	r8,r8,r12			@ h+=Maj(a,b,c) from the past
	ldrb	r12,[r1,#2]
	ldrb	r0,[r1,#1]
	orr	r2,r2,r12,lsl#8
	ldrb	r12,[r1],#4
	orr	r2,r2,r0,lsl#16
# if 4==15
	str	r1,[sp,#17*4]			@ make room for r1
# endif
	eor	r0,r4,r4,ror#5
	orr	r2,r2,r12,lsl#24
	eor	r0,r0,r4,ror#19	@ Sigma1(e)
#endif
	ldr	r12,[r14],#4			@ *K256++
	add	r7,r7,r2			@ h+=X[i]
	str	r2,[sp,#4*4]
	eor	r2,r5,r6
	add	r7,r7,r0,ror#6	@ h+=Sigma1(e)
	and	r2,r2,r4
	add	r7,r7,r12			@ h+=K256[i]
	eor	r2,r2,r6			@ Ch(e,f,g)
	eor	r0,r8,r8,ror#11
	add	r7,r7,r2			@ h+=Ch(e,f,g)
#if 4==31
	and	r12,r12,#0xff
	cmp	r12,#0xf2			@ done?
#endif
#if 4<15
# if __ARM_ARCH__>=7
	ldr	r2,[r1],#4			@ prefetch
# else
	ldrb	r2,[r1,#3]
# endif
	eor	r12,r8,r9			@ a^b, b^c in next round
#else
	ldr	r2,[sp,#6*4]		@ from future BODY_16_xx
	eor	r12,r8,r9			@ a^b, b^c in next round
	ldr	r1,[sp,#3*4]	@ from future BODY_16_xx
#endif
	eor	r0,r0,r8,ror#20	@ Sigma0(a)
	and	r3,r3,r12			@ (b^c)&=(a^b)
	add	r11,r11,r7			@ d+=h
	eor	r3,r3,r9			@ Maj(a,b,c)
	add	r7,r7,r0,ror#2	@ h+=Sigma0(a)
	@ add	r7,r7,r3			@ h+=Maj(a,b,c)
#if __ARM_ARCH__>=7
	@ ldr	r2,[r1],#4			@ 5
# if 5==15
	str	r1,[sp,#17*4]			@ make room for r1
# endif
	eor	r0,r11,r11,ror#5
	add	r7,r7,r3			@ h+=Maj(a,b,c) from the past
	eor	r0,r0,r11,ror#19	@ Sigma1(e)
# ifndef __ARMEB__
	rev	r2,r2
# endif
#else
	@ ldrb	r2,[r1,#3]			@ 5
	add	r7,r7,r3			@ h+=Maj(a,b,c) from the past
	ldrb	r3,[r1,#2]
	ldrb	r0,[r1,#1]
	orr	r2,r2,r3,lsl#8
	ldrb	r3,[r1],#4
	orr	r2,r2,r0,lsl#16
# if 5==15
	str	r1,[sp,#17*4]			@ make room for r1
# endif
	eor	r0,r11,r11,ror#5
	orr	r2,r2,r3,lsl#24
	eor	r0,r0,r11,ror#19	@ Sigma1(e)
#endif
	ldr	r3,[r14],#4			@ *K256++
	add	r6,r6,r2			@ h+=X[i]
	str	r2,[sp,#5*4]
	eor	r2,r4,r5
	add	r6,r6,r0,ror#6	@ h+=Sigma1(e)
	and	r2,r2,r11
	add	r6,r6,r3			@ h+=K256[i]
	eor	r2,r2,r5			@ Ch(e,f,g)
	eor	r0,r7,r7,ror#11
	add	r6,r6,r2			@ h+=Ch(e,f,g)
#if 5==31
	and	r3,r3,#0xff
	cmp	r3,#0xf2			@ done?
#endif
#if 5<15
# if __ARM_ARCH__>=7
	ldr	r2,[r1],#4			@ prefetch
# else
	ldrb	r2,[r1,#3]
# endif
	eor	r3,r7,r8			@ a^b, b^c in next round
#else
	ldr	r2,[sp,#7*4]		@ from future BODY_16_xx
	eor	r3,r7,r8			@ a^b, b^c in next round
	ldr	r1,[sp,#4*4]	@ from future BODY_16_xx
#endif
	eor	r0,r0,r7,ror#20	@ Sigma0(a)
	and	r12,r12,r3			@ (b^c)&=(a^b)
	add	r10,r10,r6			@ d+=h
	eor	r12,r12,r8			@ Maj(a,b,c)
	add	r6,r6,r0,ror#2	@ h+=Sigma0(a)
	@ add	r6,r6,r12			@ h+=Maj(a,b,c)
#if __ARM_ARCH__>=7
	@ ldr	r2,[r1],#4			@ 6
# if 6==15
	str	r1,[sp,#17*4]			@ make room for r1
# endif
	eor	r0,r10,r10,ror#5
	add	r6,r6,r12			@ h+=Maj(a,b,c) from the past
	eor	r0,r0,r10,ror#19	@ Sigma1(e)
# ifndef __ARMEB__
	rev	r2,r2
# endif
#else
	@ ldrb	r2,[r1,#3]			@ 6
	add	r6,r6,r12			@ h+=Maj(a,b,c) from the past
	ldrb	r12,[r1,#2]
	ldrb	r0,[r1,#1]
	orr	r2,r2,r12,lsl#8
	ldrb	r12,[r1],#4
	orr	r2,r2,r0,lsl#16
# if 6==15
	str	r1,[sp,#17*4]			@ make room for r1
# endif
	eor	r0,r10,r10,ror#5
	orr	r2,r2,r12,lsl#24
	eor	r0,r0,r10,ror#19	@ Sigma1(e)
#endif
	ldr	r12,[r14],#4			@ *K256++
	add	r5,r5,r2			@ h+=X[i]
	str	r2,[sp,#6*4]
	eor	r2,r11,r4
	add	r5,r5,r0,ror#6	@ h+=Sigma1(e)
	and	r2,r2,r10
	add	r5,r5,r12			@ h+=K256[i]
	eor	r2,r2,r4			@ Ch(e,f,g)
	eor	r0,r6,r6,ror#11
	add	r5,r5,r2			@ h+=Ch(e,f,g)
#if 6==31
	and	r12,r12,#0xff
	cmp	r12,#0xf2			@ done?
#endif
#if 6<15
# if __ARM_ARCH__>=7
	ldr	r2,[r1],#4			@ prefetch
# else
	ldrb	r2,[r1,#3]
# endif
	eor	r12,r6,r7			@ a^b, b^c in next round
#else
	ldr	r2,[sp,#8*4]		@ from future BODY_16_xx
	eor	r12,r6,r7			@ a^b, b^c in next round
	ldr	r1,[sp,#5*4]	@ from future BODY_16_xx
#endif
	eor	r0,r0,r6,ror#20	@ Sigma0(a)
	and	r3,r3,r12			@ (b^c)&=(a^b)
	add	r9,r9,r5			@ d+=h
	eor	r3,r3,r7			@ Maj(a,b,c)
	add	r5,r5,r0,ror#2	@ h+=Sigma0(a)
	@ add	r5,r5,r3			@ h+=Maj(a,b,c)
#if __ARM_ARCH__>=7
	@ ldr	r2,[r1],#4			@ 7
# if 7==15
	str	r1,[sp,#17*4]			@ make room for r1
# endif
	eor	r0,r9,r9,ror#5
	add	r5,r5,r3			@ h+=Maj(a,b,c) from the past
	eor	r0,r0,r9,ror#19	@ Sigma1(e)
# ifndef __ARMEB__
	rev	r2,r2
# endif
#else
	@ ldrb	r2,[r1,#3]			@ 7
	add	r5,r5,r3			@ h+=Maj(a,b,c) from the past
	ldrb	r3,[r1,#2]
	ldrb	r0,[r1,#1]
	orr	r2,r2,r3,lsl#8
	ldrb	r3,[r1],#4
	orr	r2,r2,r0,lsl#16
# if 7==15
	str	r1,[sp,#17*4]			@ make room for r1
# endif
	eor	r0,r9,r9,ror#5
	orr	r2,r2,r3,lsl#24
	eor	r0,r0,r9,ror#19	@ Sigma1(e)
#endif
	ldr	r3,[r14],#4			@ *K256++
	add	r4,r4,r2			@ h+=X[i]
	str	r2,[sp,#7*4]
	eor	r2,r10,r11
	add	r4,r4,r0,ror#6	@ h+=Sigma1(e)
	and	r2,r2,r9
	add	r4,r4,r3			@ h+=K256[i]
	eor	r2,r2,r11			@ Ch(e,f,g)
	eor	r0,r5,r5,ror#11
	add	r4,r4,r2			@ h+=Ch(e,f,g)
#if 7==31
	and	r3,r3,#0xff
	cmp	r3,#0xf2			@ done?
#endif
#if 7<15
# if __ARM_ARCH__>=7
	ldr	r2,[r1],#4			@ prefetch
# else
	ldrb	r2,[r1,#3]
# endif
	eor	r3,r5,r6			@ a^b, b^c in next round
#else
	ldr	r2,[sp,#9*4]		@ from future BODY_16_xx
	eor	r3,r5,r6			@ a^b, b^c in next round
	ldr	r1,[sp,#6*4]	@ from future BODY_16_xx
#endif
	eor	r0,r0,r5,ror#20	@ Sigma0(a)
	and	r12,r12,r3			@ (b^c)&=(a^b)
	add	r8,r8,r4			@ d+=h
	eor	r12,r12,r6			@ Maj(a,b,c)
	add	r4,r4,r0,ror#2	@ h+=Sigma0(a)
	@ add	r4,r4,r12			@ h+=Maj(a,b,c)
#if __ARM_ARCH__>=7
	@ ldr	r2,[r1],#4			@ 8
# if 8==15
	str	r1,[sp,#17*4]			@ make room for r1
# endif
	eor	r0,r8,r8,ror#5
	add	r4,r4,r12			@ h+=Maj(a,b,c) from the past
	eor	r0,r0,r8,ror#19	@ Sigma1(e)
# ifndef __ARMEB__
	rev	r2,r2
# endif
#else
	@ ldrb	r2,[r1,#3]			@ 8
	add	r4,r4,r12			@ h+=Maj(a,b,c) from the past
	ldrb	r12,[r1,#2]
	ldrb	r0,[r1,#1]
	orr	r2,r2,r12,lsl#8
	ldrb	r12,[r1],#4
	orr	r2,r2,r0,lsl#16
# if 8==15
	str	r1,[sp,#17*4]			@ make room for r1
# endif
	eor	r0,r8,r8,ror#5
	orr	r2,r2,r12,lsl#24
	eor	r0,r0,r8,ror#19	@ Sigma1(e)
#endif
	ldr	r12,[r14],#4			@ *K256++
	add	r11,r11,r2			@ h+=X[i]
	str	r2,[sp,#8*4]
	eor	r2,r9,r10
	add	r11,r11,r0,ror#6	@ h+=Sigma1(e)
	and	r2,r2,r8
	add	r11,r11,r12			@ h+=K256[i]
	eor	r2,r2,r10			@ Ch(e,f,g)
	eor	r0,r4,r4,ror#11
	add	r11,r11,r2			@ h+=Ch(e,f,g)
#if 8==31
	and	r12,r12,#0xff
	cmp	r12,#0xf2			@ done?
#endif
#if 8<15
# if __ARM_ARCH__>=7
	ldr	r2,[r1],#4			@ prefetch
# else
	ldrb	r2,[r1,#3]
# endif
	eor	r12,r4,r5			@ a^b, b^c in next round
#else
	ldr	r2,[sp,#10*4]		@ from future BODY_16_xx
	eor	r12,r4,r5			@ a^b, b^c in next round
	ldr	r1,[sp,#7*4]	@ from future BODY_16_xx
#endif
	eor	r0,r0,r4,ror#20	@ Sigma0(a)
	and	r3,r3,r12			@ (b^c)&=(a^b)
	add	r7,r7,r11			@ d+=h
	eor	r3,r3,r5			@ Maj(a,b,c)
	add	r11,r11,r0,ror#2	@ h+=Sigma0(a)
	@ add	r11,r11,r3			@ h+=Maj(a,b,c)
#if __ARM_ARCH__>=7
	@ ldr	r2,[r1],#4			@ 9
# if 9==15
	str	r1,[sp,#17*4]			@ make room for r1
# endif
	eor	r0,r7,r7,ror#5
	add	r11,r11,r3			@ h+=Maj(a,b,c) from the past
	eor	r0,r0,r7,ror#19	@ Sigma1(e)
# ifndef __ARMEB__
	rev	r2,r2
# endif
#else
	@ ldrb	r2,[r1,#3]			@ 9
	add	r11,r11,r3			@ h+=Maj(a,b,c) from the past
	ldrb	r3,[r1,#2]
	ldrb	r0,[r1,#1]
	orr	r2,r2,r3,lsl#8
	ldrb	r3,[r1],#4
	orr	r2,r2,r0,lsl#16
# if 9==15
	str	r1,[sp,#17*4]			@ make room for r1
# endif
	eor	r0,r7,r7,ror#5
	orr	r2,r2,r3,lsl#24
	eor	r0,r0,r7,ror#19	@ Sigma1(e)
#endif
	ldr	r3,[r14],#4			@ *K256++
	add	r10,r10,r2			@ h+=X[i]
	str	r2,[sp,#9*4]
	eor	r2,r8,r9
	add	r10,r10,r0,ror#6	@ h+=Sigma1(e)
	and	r2,r2,r7
	add	r10,r10,r3			@ h+=K256[i]
	eor	r2,r2,r9			@ Ch(e,f,g)
	eor	r0,r11,r11,ror#11
	add	r10,r10,r2			@ h+=Ch(e,f,g)
#if 9==31
	and	r3,r3,#0xff
	cmp	r3,#0xf2			@ done?
#endif
#if 9<15
# if __ARM_ARCH__>=7
	ldr	r2,[r1],#4			@ prefetch
# else
	ldrb	r2,[r1,#3]
# endif
	eor	r3,r11,r4			@ a^b, b^c in next round
#else
	ldr	r2,[sp,#11*4]		@ from future BODY_16_xx
	eor	r3,r11,r4			@ a^b, b^c in next round
	ldr	r1,[sp,#8*4]	@ from future BODY_16_xx
#endif
	eor	r0,r0,r11,ror#20	@ Sigma0(a)
	and	r12,r12,r3			@ (b^c)&=(a^b)
	add	r6,r6,r10			@ d+=h
	eor	r12,r12,r4			@ Maj(a,b,c)
	add	r10,r10,r0,ror#2	@ h+=Sigma0(a)
	@ add	r10,r10,r12			@ h+=Maj(a,b,c)
#if __ARM_ARCH__>=7
	@ ldr	r2,[r1],#4			@ 10
# if 10==15
	str	r1,[sp,#17*4]			@ make room for r1
# endif
	eor	r0,r6,r6,ror#5
	add	r10,r10,r12			@ h+=Maj(a,b,c) from the past
	eor	r0,r0,r6,ror#19	@ Sigma1(e)
# ifndef __ARMEB__
	rev	r2,r2
# endif
#else
	@ ldrb	r2,[r1,#3]			@ 10
	add	r10,r10,r12			@ h+=Maj(a,b,c) from the past
	ldrb	r12,[r1,#2]
	ldrb	r0,[r1,#1]
	orr	r2,r2,r12,lsl#8
	ldrb	r12,[r1],#4
	orr	r2,r2,r0,lsl#16
# if 10==15
	str	r1,[sp,#17*4]			@ make room for r1
# endif
	eor	r0,r6,r6,ror#5
	orr	r2,r2,r12,lsl#24
	eor	r0,r0,r6,ror#19	@ Sigma1(e)
#endif
	ldr	r12,[r14],#4			@ *K256++
	add	r9,r9,r2			@ h+=X[i]
	str	r2,[sp,#10*4]
	eor	r2,r7,r8
	add	r9,r9,r0,ror#6	@ h+=Sigma1(e)
	and	r2,r2,r6
	add	r9,r9,r12			@ h+=K256[i]
	eor	r2,r2,r8			@ Ch(e,f,g)
	eor	r0,r10,r10,ror#11
	add	r9,r9,r2			@ h+=Ch(e,f,g)
#if 10==31
	and	r12,r12,#0xff
	cmp	r12,#0xf2			@ done?
#endif
#if 10<15
# if __ARM_ARCH__>=7
	ldr	r2,[r1],#4			@ prefetch
# else
	ldrb	r2,[r1,#3]
# endif
	eor	r12,r10,r11			@ a^b, b^c in next round
#else
	ldr	r2,[sp,#12*4]		@ from future BODY_16_xx
	eor	r12,r10,r11			@ a^b, b^c in next round
	ldr	r1,[sp,#9*4]	@ from future BODY_16_xx
#endif
	eor	r0,r0,r10,ror#20	@ Sigma0(a)
	and	r3,r3,r12			@ (b^c)&=(a^b)
	add	r5,r5,r9			@ d+=h
	eor	r3,r3,r11			@ Maj(a,b,c)
	add	r9,r9,r0,ror#2	@ h+=Sigma0(a)
	@ add	r9,r9,r3			@ h+=Maj(a,b,c)
#if __ARM_ARCH__>=7
	@ ldr	r2,[r1],#4			@ 11
# if 11==15
	str	r1,[sp,#17*4]			@ make room for r1
# endif
	eor	r0,r5,r5,ror#5
	add	r9,r9,r3			@ h+=Maj(a,b,c) from the past
	eor	r0,r0,r5,ror#19	@ Sigma1(e)
# ifndef __ARMEB__
	rev	r2,r2
# endif
#else
	@ ldrb	r2,[r1,#3]			@ 11
	add	r9,r9,r3			@ h+=Maj(a,b,c) from the past
	ldrb	r3,[r1,#2]
	ldrb	r0,[r1,#1]
	orr	r2,r2,r3,lsl#8
	ldrb	r3,[r1],#4
	orr	r2,r2,r0,lsl#16
# if 11==15
	str	r1,[sp,#17*4]			@ make room for r1
# endif
	eor	r0,r5,r5,ror#5
	orr	r2,r2,r3,lsl#24
	eor	r0,r0,r5,ror#19	@ Sigma1(e)
#endif
	ldr	r3,[r14],#4			@ *K256++
	add	r8,r8,r2			@ h+=X[i]
	str	r2,[sp,#11*4]
	eor	r2,r6,r7
	add	r8,r8,r0,ror#6	@ h+=Sigma1(e)
	and	r2,r2,r5
	add	r8,r8,r3			@ h+=K256[i]
	eor	r2,r2,r7			@ Ch(e,f,g)
	eor	r0,r9,r9,ror#11
	add	r8,r8,r2			@ h+=Ch(e,f,g)
#if 11==31
	and	r3,r3,#0xff
	cmp	r3,#0xf2			@ done?
#endif
#if 11<15
# if __ARM_ARCH__>=7
	ldr	r2,[r1],#4			@ prefetch
# else
	ldrb	r2,[r1,#3]
# endif
	eor	r3,r9,r10			@ a^b, b^c in next round
#else
	ldr	r2,[sp,#13*4]		@ from future BODY_16_xx
	eor	r3,r9,r10			@ a^b, b^c in next round
	ldr	r1,[sp,#10*4]	@ from future BODY_16_xx
#endif
	eor	r0,r0,r9,ror#20	@ Sigma0(a)
	and	r12,r12,r3			@ (b^c)&=(a^b)
	add	r4,r4,r8			@ d+=h
	eor	r12,r12,r10			@ Maj(a,b,c)
	add	r8,r8,r0,ror#2	@ h+=Sigma0(a)
	@ add	r8,r8,r12			@ h+=Maj(a,b,c)
#if __ARM_ARCH__>=7
	@ ldr	r2,[r1],#4			@ 12
# if 12==15
	str	r1,[sp,#17*4]			@ make room for r1
# endif
	eor	r0,r4,r4,ror#5
	add	r8,r8,r12			@ h+=Maj(a,b,c) from the past
	eor	r0,r0,r4,ror#19	@ Sigma1(e)
# ifndef __ARMEB__
	rev	r2,r2
# endif
#else
	@ ldrb	r2,[r1,#3]			@ 12
	add	r8,r8,r12			@ h+=Maj(a,b,c) from the past
	ldrb	r12,[r1,#2]
	ldrb	r0,[r1,#1]
	orr	r2,r2,r12,lsl#8
	ldrb	r12,[r1],#4
	orr	r2,r2,r0,lsl#16
# if 12==15
	str	r1,[sp,#17*4]			@ make room for r1
# endif
	eor	r0,r4,r4,ror#5
	orr	r2,r2,r12,lsl#24
	eor	r0,r0,r4,ror#19	@ Sigma1(e)
#endif
	ldr	r12,[r14],#4			@ *K256++
	add	r7,r7,r2			@ h+=X[i]
	str	r2,[sp,#12*4]
	eor	r2,r5,r6
	add	r7,r7,r0,ror#6	@ h+=Sigma1(e)
	and	r2,r2,r4
	add	r7,r7,r12			@ h+=K256[i]
	eor	r2,r2,r6			@ Ch(e,f,g)
	eor	r0,r8,r8,ror#11
	add	r7,r7,r2			@ h+=Ch(e,f,g)
#if 12==31
	and	r12,r12,#0xff
	cmp	r12,#0xf2			@ done?
#endif
#if 12<15
# if __ARM_ARCH__>=7
	ldr	r2,[r1],#4			@ prefetch
# else
	ldrb	r2,[r1,#3]
# endif
	eor	r12,r8,r9			@ a^b, b^c in next round
#else
	ldr	r2,[sp,#14*4]		@ from future BODY_16_xx
	eor	r12,r8,r9			@ a^b, b^c in next round
	ldr	r1,[sp,#11*4]	@ from future BODY_16_xx
#endif
	eor	r0,r0,r8,ror#20	@ Sigma0(a)
	and	r3,r3,r12			@ (b^c)&=(a^b)
	add	r11,r11,r7			@ d+=h
	eor	r3,r3,r9			@ Maj(a,b,c)
	add	r7,r7,r0,ror#2	@ h+=Sigma0(a)
	@ add	r7,r7,r3			@ h+=Maj(a,b,c)
#if __ARM_ARCH__>=7
	@ ldr	r2,[r1],#4			@ 13
# if 13==15
	str	r1,[sp,#17*4]			@ make room for r1
# endif
	eor	r0,r11,r11,ror#5
	add	r7,r7,r3			@ h+=Maj(a,b,c) from the past
	eor	r0,r0,r11,ror#19	@ Sigma1(e)
# ifndef __ARMEB__
	rev	r2,r2
# endif
#else
	@ ldrb	r2,[r1,#3]			@ 13
	add	r7,r7,r3			@ h+=Maj(a,b,c) from the past
	ldrb	r3,[r1,#2]
	ldrb	r0,[r1,#1]
	orr	r2,r2,r3,lsl#8
	ldrb	r3,[r1],#4
	orr	r2,r2,r0,lsl#16
# if 13==15
	str	r1,[sp,#17*4]			@ make room for r1
# endif
	eor	r0,r11,r11,ror#5
	orr	r2,r2,r3,lsl#24
	eor	r0,r0,r11,ror#19	@ Sigma1(e)
#endif
	ldr	r3,[r14],#4			@ *K256++
	add	r6,r6,r2			@ h+=X[i]
	str	r2,[sp,#13*4]
	eor	r2,r4,r5
	add	r6,r6,r0,ror#6	@ h+=Sigma1(e)
	and	r2,r2,r11
	add	r6,r6,r3			@ h+=K256[i]
	eor	r2,r2,r5			@ Ch(e,f,g)
	eor	r0,r7,r7,ror#11
	add	r6,r6,r2			@ h+=Ch(e,f,g)
#if 13==31
	and	r3,r3,#0xff
	cmp	r3,#0xf2			@ done?
#endif
#if 13<15
# if __ARM_ARCH__>=7
	ldr	r2,[r1],#4			@ prefetch
# else
	ldrb	r2,[r1,#3]
# endif
	eor	r3,r7,r8			@ a^b, b^c in next round
#else
	ldr	r2,[sp,#15*4]		@ from future BODY_16_xx
	eor	r3,r7,r8			@ a^b, b^c in next round
	ldr	r1,[sp,#12*4]	@ from future BODY_16_xx
#endif
	eor	r0,r0,r7,ror#20	@ Sigma0(a)
	and	r12,r12,r3			@ (b^c)&=(a^b)
	add	r10,r10,r6			@ d+=h
	eor	r12,r12,r8			@ Maj(a,b,c)
	add	r6,r6,r0,ror#2	@ h+=Sigma0(a)
	@ add	r6,r6,r12			@ h+=Maj(a,b,c)
#if __ARM_ARCH__>=7
	@ ldr	r2,[r1],#4			@ 14
# if 14==15
	str	r1,[sp,#17*4]			@ make room for r1
# endif
	eor	r0,r10,r10,ror#5
	add	r6,r6,r12			@ h+=Maj(a,b,c) from the past
	eor	r0,r0,r10,ror#19	@ Sigma1(e)
# ifndef __ARMEB__
	rev	r2,r2
# endif
#else
	@ ldrb	r2,[r1,#3]			@ 14
	add	r6,r6,r12			@ h+=Maj(a,b,c) from the past
	ldrb	r12,[r1,#2]
	ldrb	r0,[r1,#1]
	orr	r2,r2,r12,lsl#8
	ldrb	r12,[r1],#4
	orr	r2,r2,r0,lsl#16
# if 14==15
	str	r1,[sp,#17*4]			@ make room for r1
# endif
	eor	r0,r10,r10,ror#5
	orr	r2,r2,r12,lsl#24
	eor	r0,r0,r10,ror#19	@ Sigma1(e)
#endif
	ldr	r12,[r14],#4			@ *K256++
	add	r5,r5,r2			@ h+=X[i]
	str	r2,[sp,#14*4]
	eor	r2,r11,r4
	add	r5,r5,r0,ror#6	@ h+=Sigma1(e)
	and	r2,r2,r10
	add	r5,r5,r12			@ h+=K256[i]
	eor	r2,r2,r4			@ Ch(e,f,g)
	eor	r0,r6,r6,ror#11
	add	r5,r5,r2			@ h+=Ch(e,f,g)
#if 14==31
	and	r12,r12,#0xff
	cmp	r12,#0xf2			@ done?
#endif
#if 14<15
# if __ARM_ARCH__>=7
	ldr	r2,[r1],#4			@ prefetch
# else
	ldrb	r2,[r1,#3]
# endif
	eor	r12,r6,r7			@ a^b, b^c in next round
#else
	ldr	r2,[sp,#0*4]		@ from future BODY_16_xx
	eor	r12,r6,r7			@ a^b, b^c in next round
	ldr	r1,[sp,#13*4]	@ from future BODY_16_xx
#endif
	eor	r0,r0,r6,ror#20	@ Sigma0(a)
	and	r3,r3,r12			@ (b^c)&=(a^b)
	add	r9,r9,r5			@ d+=h
	eor	r3,r3,r7			@ Maj(a,b,c)
	add	r5,r5,r0,ror#2	@ h+=Sigma0(a)
	@ add	r5,r5,r3			@ h+=Maj(a,b,c)
#if __ARM_ARCH__>=7
	@ ldr	r2,[r1],#4			@ 15
# if 15==15
	str	r1,[sp,#17*4]			@ make room for r1
# endif
	eor	r0,r9,r9,ror#5
	add	r5,r5,r3			@ h+=Maj(a,b,c) from the past
	eor	r0,r0,r9,ror#19	@ Sigma1(e)
# ifndef __ARMEB__
	rev	r2,r2
# endif
#else
	@ ldrb	r2,[r1,#3]			@ 15
	add	r5,r5,r3			@ h+=Maj(a,b,c) from the past
	ldrb	r3,[r1,#2]
	ldrb	r0,[r1,#1]
	orr	r2,r2,r3,lsl#8
	ldrb	r3,[r1],#4
	orr	r2,r2,r0,lsl#16
# if 15==15
	str	r1,[sp,#17*4]			@ make room for r1
# endif
	eor	r0,r9,r9,ror#5
	orr	r2,r2,r3,lsl#24
	eor	r0,r0,r9,ror#19	@ Sigma1(e)
#endif
	ldr	r3,[r14],#4			@ *K256++
	add	r4,r4,r2			@ h+=X[i]
	str	r2,[sp,#15*4]
	eor	r2,r10,r11
	add	r4,r4,r0,ror#6	@ h+=Sigma1(e)
	and	r2,r2,r9
	add	r4,r4,r3			@ h+=K256[i]
	eor	r2,r2,r11			@ Ch(e,f,g)
	eor	r0,r5,r5,ror#11
	add	r4,r4,r2			@ h+=Ch(e,f,g)
#if 15==31
	and	r3,r3,#0xff
	cmp	r3,#0xf2			@ done?
#endif
#if 15<15
# if __ARM_ARCH__>=7
	ldr	r2,[r1],#4			@ prefetch
# else
	ldrb	r2,[r1,#3]
# endif
	eor	r3,r5,r6			@ a^b, b^c in next round
#else
	ldr	r2,[sp,#1*4]		@ from future BODY_16_xx
	eor	r3,r5,r6			@ a^b, b^c in next round
	ldr	r1,[sp,#14*4]	@ from future BODY_16_xx
#endif
	eor	r0,r0,r5,ror#20	@ Sigma0(a)
	and	r12,r12,r3			@ (b^c)&=(a^b)
	add	r8,r8,r4			@ d+=h
	eor	r12,r12,r6			@ Maj(a,b,c)
	add	r4,r4,r0,ror#2	@ h+=Sigma0(a)
	@ add	r4,r4,r12			@ h+=Maj(a,b,c)
Lrounds_16_xx:
	@ ldr	r2,[sp,#1*4]		@ 16
	@ ldr	r1,[sp,#14*4]
	mov	r0,r2,ror#7
	add	r4,r4,r12			@ h+=Maj(a,b,c) from the past
	mov	r12,r1,ror#17
	eor	r0,r0,r2,ror#18
	eor	r12,r12,r1,ror#19
	eor	r0,r0,r2,lsr#3	@ sigma0(X[i+1])
	ldr	r2,[sp,#0*4]
	eor	r12,r12,r1,lsr#10	@ sigma1(X[i+14])
	ldr	r1,[sp,#9*4]

	add	r12,r12,r0
	eor	r0,r8,r8,ror#5	@ from BODY_00_15
	add	r2,r2,r12
	eor	r0,r0,r8,ror#19	@ Sigma1(e)
	add	r2,r2,r1			@ X[i]
	ldr	r12,[r14],#4			@ *K256++
	add	r11,r11,r2			@ h+=X[i]
	str	r2,[sp,#0*4]
	eor	r2,r9,r10
	add	r11,r11,r0,ror#6	@ h+=Sigma1(e)
	and	r2,r2,r8
	add	r11,r11,r12			@ h+=K256[i]
	eor	r2,r2,r10			@ Ch(e,f,g)
	eor	r0,r4,r4,ror#11
	add	r11,r11,r2			@ h+=Ch(e,f,g)
#if 16==31
	and	r12,r12,#0xff
	cmp	r12,#0xf2			@ done?
#endif
#if 16<15
# if __ARM_ARCH__>=7
	ldr	r2,[r1],#4			@ prefetch
# else
	ldrb	r2,[r1,#3]
# endif
	eor	r12,r4,r5			@ a^b, b^c in next round
#else
	ldr	r2,[sp,#2*4]		@ from future BODY_16_xx
	eor	r12,r4,r5			@ a^b, b^c in next round
	ldr	r1,[sp,#15*4]	@ from future BODY_16_xx
#endif
	eor	r0,r0,r4,ror#20	@ Sigma0(a)
	and	r3,r3,r12			@ (b^c)&=(a^b)
	add	r7,r7,r11			@ d+=h
	eor	r3,r3,r5			@ Maj(a,b,c)
	add	r11,r11,r0,ror#2	@ h+=Sigma0(a)
	@ add	r11,r11,r3			@ h+=Maj(a,b,c)
	@ ldr	r2,[sp,#2*4]		@ 17
	@ ldr	r1,[sp,#15*4]
	mov	r0,r2,ror#7
	add	r11,r11,r3			@ h+=Maj(a,b,c) from the past
	mov	r3,r1,ror#17
	eor	r0,r0,r2,ror#18
	eor	r3,r3,r1,ror#19
	eor	r0,r0,r2,lsr#3	@ sigma0(X[i+1])
	ldr	r2,[sp,#1*4]
	eor	r3,r3,r1,lsr#10	@ sigma1(X[i+14])
	ldr	r1,[sp,#10*4]

	add	r3,r3,r0
	eor	r0,r7,r7,ror#5	@ from BODY_00_15
	add	r2,r2,r3
	eor	r0,r0,r7,ror#19	@ Sigma1(e)
	add	r2,r2,r1			@ X[i]
	ldr	r3,[r14],#4			@ *K256++
	add	r10,r10,r2			@ h+=X[i]
	str	r2,[sp,#1*4]
	eor	r2,r8,r9
	add	r10,r10,r0,ror#6	@ h+=Sigma1(e)
	and	r2,r2,r7
	add	r10,r10,r3			@ h+=K256[i]
	eor	r2,r2,r9			@ Ch(e,f,g)
	eor	r0,r11,r11,ror#11
	add	r10,r10,r2			@ h+=Ch(e,f,g)
#if 17==31
	and	r3,r3,#0xff
	cmp	r3,#0xf2			@ done?
#endif
#if 17<15
# if __ARM_ARCH__>=7
	ldr	r2,[r1],#4			@ prefetch
# else
	ldrb	r2,[r1,#3]
# endif
	eor	r3,r11,r4			@ a^b, b^c in next round
#else
	ldr	r2,[sp,#3*4]		@ from future BODY_16_xx
	eor	r3,r11,r4			@ a^b, b^c in next round
	ldr	r1,[sp,#0*4]	@ from future BODY_16_xx
#endif
	eor	r0,r0,r11,ror#20	@ Sigma0(a)
	and	r12,r12,r3			@ (b^c)&=(a^b)
	add	r6,r6,r10			@ d+=h
	eor	r12,r12,r4			@ Maj(a,b,c)
	add	r10,r10,r0,ror#2	@ h+=Sigma0(a)
	@ add	r10,r10,r12			@ h+=Maj(a,b,c)
	@ ldr	r2,[sp,#3*4]		@ 18
	@ ldr	r1,[sp,#0*4]
	mov	r0,r2,ror#7
	add	r10,r10,r12			@ h+=Maj(a,b,c) from the past
	mov	r12,r1,ror#17
	eor	r0,r0,r2,ror#18
	eor	r12,r12,r1,ror#19
	eor	r0,r0,r2,lsr#3	@ sigma0(X[i+1])
	ldr	r2,[sp,#2*4]
	eor	r12,r12,r1,lsr#10	@ sigma1(X[i+14])
	ldr	r1,[sp,#11*4]

	add	r12,r12,r0
	eor	r0,r6,r6,ror#5	@ from BODY_00_15
	add	r2,r2,r12
	eor	r0,r0,r6,ror#19	@ Sigma1(e)
	add	r2,r2,r1			@ X[i]
	ldr	r12,[r14],#4			@ *K256++
	add	r9,r9,r2			@ h+=X[i]
	str	r2,[sp,#2*4]
	eor	r2,r7,r8
	add	r9,r9,r0,ror#6	@ h+=Sigma1(e)
	and	r2,r2,r6
	add	r9,r9,r12			@ h+=K256[i]
	eor	r2,r2,r8			@ Ch(e,f,g)
	eor	r0,r10,r10,ror#11
	add	r9,r9,r2			@ h+=Ch(e,f,g)
#if 18==31
	and	r12,r12,#0xff
	cmp	r12,#0xf2			@ done?
#endif
#if 18<15
# if __ARM_ARCH__>=7
	ldr	r2,[r1],#4			@ prefetch
# else
	ldrb	r2,[r1,#3]
# endif
	eor	r12,r10,r11			@ a^b, b^c in next round
#else
	ldr	r2,[sp,#4*4]		@ from future BODY_16_xx
	eor	r12,r10,r11			@ a^b, b^c in next round
	ldr	r1,[sp,#1*4]	@ from future BODY_16_xx
#endif
	eor	r0,r0,r10,ror#20	@ Sigma0(a)
	and	r3,r3,r12			@ (b^c)&=(a^b)
	add	r5,r5,r9			@ d+=h
	eor	r3,r3,r11			@ Maj(a,b,c)
	add	r9,r9,r0,ror#2	@ h+=Sigma0(a)
	@ add	r9,r9,r3			@ h+=Maj(a,b,c)
	@ ldr	r2,[sp,#4*4]		@ 19
	@ ldr	r1,[sp,#1*4]
	mov	r0,r2,ror#7
	add	r9,r9,r3			@ h+=Maj(a,b,c) from the past
	mov	r3,r1,ror#17
	eor	r0,r0,r2,ror#18
	eor	r3,r3,r1,ror#19
	eor	r0,r0,r2,lsr#3	@ sigma0(X[i+1])
	ldr	r2,[sp,#3*4]
	eor	r3,r3,r1,lsr#10	@ sigma1(X[i+14])
	ldr	r1,[sp,#12*4]

	add	r3,r3,r0
	eor	r0,r5,r5,ror#5	@ from BODY_00_15
	add	r2,r2,r3
	eor	r0,r0,r5,ror#19	@ Sigma1(e)
	add	r2,r2,r1			@ X[i]
	ldr	r3,[r14],#4			@ *K256++
	add	r8,r8,r2			@ h+=X[i]
	str	r2,[sp,#3*4]
	eor	r2,r6,r7
	add	r8,r8,r0,ror#6	@ h+=Sigma1(e)
	and	r2,r2,r5
	add	r8,r8,r3			@ h+=K256[i]
	eor	r2,r2,r7			@ Ch(e,f,g)
	eor	r0,r9,r9,ror#11
	add	r8,r8,r2			@ h+=Ch(e,f,g)
#if 19==31
	and	r3,r3,#0xff
	cmp	r3,#0xf2			@ done?
#endif
#if 19<15
# if __ARM_ARCH__>=7
	ldr	r2,[r1],#4			@ prefetch
# else
	ldrb	r2,[r1,#3]
# endif
	eor	r3,r9,r10			@ a^b, b^c in next round
#else
	ldr	r2,[sp,#5*4]		@ from future BODY_16_xx
	eor	r3,r9,r10			@ a^b, b^c in next round
	ldr	r1,[sp,#2*4]	@ from future BODY_16_xx
#endif
	eor	r0,r0,r9,ror#20	@ Sigma0(a)
	and	r12,r12,r3			@ (b^c)&=(a^b)
	add	r4,r4,r8			@ d+=h
	eor	r12,r12,r10			@ Maj(a,b,c)
	add	r8,r8,r0,ror#2	@ h+=Sigma0(a)
	@ add	r8,r8,r12			@ h+=Maj(a,b,c)
	@ ldr	r2,[sp,#5*4]		@ 20
	@ ldr	r1,[sp,#2*4]
	mov	r0,r2,ror#7
	add	r8,r8,r12			@ h+=Maj(a,b,c) from the past
	mov	r12,r1,ror#17
	eor	r0,r0,r2,ror#18
	eor	r12,r12,r1,ror#19
	eor	r0,r0,r2,lsr#3	@ sigma0(X[i+1])
	ldr	r2,[sp,#4*4]
	eor	r12,r12,r1,lsr#10	@ sigma1(X[i+14])
	ldr	r1,[sp,#13*4]

	add	r12,r12,r0
	eor	r0,r4,r4,ror#5	@ from BODY_00_15
	add	r2,r2,r12
	eor	r0,r0,r4,ror#19	@ Sigma1(e)
	add	r2,r2,r1			@ X[i]
	ldr	r12,[r14],#4			@ *K256++
	add	r7,r7,r2			@ h+=X[i]
	str	r2,[sp,#4*4]
	eor	r2,r5,r6
	add	r7,r7,r0,ror#6	@ h+=Sigma1(e)
	and	r2,r2,r4
	add	r7,r7,r12			@ h+=K256[i]
	eor	r2,r2,r6			@ Ch(e,f,g)
	eor	r0,r8,r8,ror#11
	add	r7,r7,r2			@ h+=Ch(e,f,g)
#if 20==31
	and	r12,r12,#0xff
	cmp	r12,#0xf2			@ done?
#endif
#if 20<15
# if __ARM_ARCH__>=7
	ldr	r2,[r1],#4			@ prefetch
# else
	ldrb	r2,[r1,#3]
# endif
	eor	r12,r8,r9			@ a^b, b^c in next round
#else
	ldr	r2,[sp,#6*4]		@ from future BODY_16_xx
	eor	r12,r8,r9			@ a^b, b^c in next round
	ldr	r1,[sp,#3*4]	@ from future BODY_16_xx
#endif
	eor	r0,r0,r8,ror#20	@ Sigma0(a)
	and	r3,r3,r12			@ (b^c)&=(a^b)
	add	r11,r11,r7			@ d+=h
	eor	r3,r3,r9			@ Maj(a,b,c)
	add	r7,r7,r0,ror#2	@ h+=Sigma0(a)
	@ add	r7,r7,r3			@ h+=Maj(a,b,c)
	@ ldr	r2,[sp,#6*4]		@ 21
	@ ldr	r1,[sp,#3*4]
	mov	r0,r2,ror#7
	add	r7,r7,r3			@ h+=Maj(a,b,c) from the past
	mov	r3,r1,ror#17
	eor	r0,r0,r2,ror#18
	eor	r3,r3,r1,ror#19
	eor	r0,r0,r2,lsr#3	@ sigma0(X[i+1])
	ldr	r2,[sp,#5*4]
	eor	r3,r3,r1,lsr#10	@ sigma1(X[i+14])
	ldr	r1,[sp,#14*4]

	add	r3,r3,r0
	eor	r0,r11,r11,ror#5	@ from BODY_00_15
	add	r2,r2,r3
	eor	r0,r0,r11,ror#19	@ Sigma1(e)
	add	r2,r2,r1			@ X[i]
	ldr	r3,[r14],#4			@ *K256++
	add	r6,r6,r2			@ h+=X[i]
	str	r2,[sp,#5*4]
	eor	r2,r4,r5
	add	r6,r6,r0,ror#6	@ h+=Sigma1(e)
	and	r2,r2,r11
	add	r6,r6,r3			@ h+=K256[i]
	eor	r2,r2,r5			@ Ch(e,f,g)
	eor	r0,r7,r7,ror#11
	add	r6,r6,r2			@ h+=Ch(e,f,g)
#if 21==31
	and	r3,r3,#0xff
	cmp	r3,#0xf2			@ done?
#endif
#if 21<15
# if __ARM_ARCH__>=7
	ldr	r2,[r1],#4			@ prefetch
# else
	ldrb	r2,[r1,#3]
# endif
	eor	r3,r7,r8			@ a^b, b^c in next round
#else
	ldr	r2,[sp,#7*4]		@ from future BODY_16_xx
	eor	r3,r7,r8			@ a^b, b^c in next round
	ldr	r1,[sp,#4*4]	@ from future BODY_16_xx
#endif
	eor	r0,r0,r7,ror#20	@ Sigma0(a)
	and	r12,r12,r3			@ (b^c)&=(a^b)
	add	r10,r10,r6			@ d+=h
	eor	r12,r12,r8			@ Maj(a,b,c)
	add	r6,r6,r0,ror#2	@ h+=Sigma0(a)
	@ add	r6,r6,r12			@ h+=Maj(a,b,c)
	@ ldr	r2,[sp,#7*4]		@ 22
	@ ldr	r1,[sp,#4*4]
	mov	r0,r2,ror#7
	add	r6,r6,r12			@ h+=Maj(a,b,c) from the past
	mov	r12,r1,ror#17
	eor	r0,r0,r2,ror#18
	eor	r12,r12,r1,ror#19
	eor	r0,r0,r2,lsr#3	@ sigma0(X[i+1])
	ldr	r2,[sp,#6*4]
	eor	r12,r12,r1,lsr#10	@ sigma1(X[i+14])
	ldr	r1,[sp,#15*4]

	add	r12,r12,r0
	eor	r0,r10,r10,ror#5	@ from BODY_00_15
	add	r2,r2,r12
	eor	r0,r0,r10,ror#19	@ Sigma1(e)
	add	r2,r2,r1			@ X[i]
	ldr	r12,[r14],#4			@ *K256++
	add	r5,r5,r2			@ h+=X[i]
	str	r2,[sp,#6*4]
	eor	r2,r11,r4
	add	r5,r5,r0,ror#6	@ h+=Sigma1(e)
	and	r2,r2,r10
	add	r5,r5,r12			@ h+=K256[i]
	eor	r2,r2,r4			@ Ch(e,f,g)
	eor	r0,r6,r6,ror#11
	add	r5,r5,r2			@ h+=Ch(e,f,g)
#if 22==31
	and	r12,r12,#0xff
	cmp	r12,#0xf2			@ done?
#endif
#if 22<15
# if __ARM_ARCH__>=7
	ldr	r2,[r1],#4			@ prefetch
# else
	ldrb	r2,[r1,#3]
# endif
	eor	r12,r6,r7			@ a^b, b^c in next round
#else
	ldr	r2,[sp,#8*4]		@ from future BODY_16_xx
	eor	r12,r6,r7			@ a^b, b^c in next round
	ldr	r1,[sp,#5*4]	@ from future BODY_16_xx
#endif
	eor	r0,r0,r6,ror#20	@ Sigma0(a)
	and	r3,r3,r12			@ (b^c)&=(a^b)
	add	r9,r9,r5			@ d+=h
	eor	r3,r3,r7			@ Maj(a,b,c)
	add	r5,r5,r0,ror#2	@ h+=Sigma0(a)
	@ add	r5,r5,r3			@ h+=Maj(a,b,c)
	@ ldr	r2,[sp,#8*4]		@ 23
	@ ldr	r1,[sp,#5*4]
	mov	r0,r2,ror#7
	add	r5,r5,r3			@ h+=Maj(a,b,c) from the past
	mov	r3,r1,ror#17
	eor	r0,r0,r2,ror#18
	eor	r3,r3,r1,ror#19
	eor	r0,r0,r2,lsr#3	@ sigma0(X[i+1])
	ldr	r2,[sp,#7*4]
	eor	r3,r3,r1,lsr#10	@ sigma1(X[i+14])
	ldr	r1,[sp,#0*4]

	add	r3,r3,r0
	eor	r0,r9,r9,ror#5	@ from BODY_00_15
	add	r2,r2,r3
	eor	r0,r0,r9,ror#19	@ Sigma1(e)
	add	r2,r2,r1			@ X[i]
	ldr	r3,[r14],#4			@ *K256++
	add	r4,r4,r2			@ h+=X[i]
	str	r2,[sp,#7*4]
	eor	r2,r10,r11
	add	r4,r4,r0,ror#6	@ h+=Sigma1(e)
	and	r2,r2,r9
	add	r4,r4,r3			@ h+=K256[i]
	eor	r2,r2,r11			@ Ch(e,f,g)
	eor	r0,r5,r5,ror#11
	add	r4,r4,r2			@ h+=Ch(e,f,g)
#if 23==31
	and	r3,r3,#0xff
	cmp	r3,#0xf2			@ done?
#endif
#if 23<15
# if __ARM_ARCH__>=7
	ldr	r2,[r1],#4			@ prefetch
# else
	ldrb	r2,[r1,#3]
# endif
	eor	r3,r5,r6			@ a^b, b^c in next round
#else
	ldr	r2,[sp,#9*4]		@ from future BODY_16_xx
	eor	r3,r5,r6			@ a^b, b^c in next round
	ldr	r1,[sp,#6*4]	@ from future BODY_16_xx
#endif
	eor	r0,r0,r5,ror#20	@ Sigma0(a)
	and	r12,r12,r3			@ (b^c)&=(a^b)
	add	r8,r8,r4			@ d+=h
	eor	r12,r12,r6			@ Maj(a,b,c)
	add	r4,r4,r0,ror#2	@ h+=Sigma0(a)
	@ add	r4,r4,r12			@ h+=Maj(a,b,c)
	@ ldr	r2,[sp,#9*4]		@ 24
	@ ldr	r1,[sp,#6*4]
	mov	r0,r2,ror#7
	add	r4,r4,r12			@ h+=Maj(a,b,c) from the past
	mov	r12,r1,ror#17
	eor	r0,r0,r2,ror#18
	eor	r12,r12,r1,ror#19
	eor	r0,r0,r2,lsr#3	@ sigma0(X[i+1])
	ldr	r2,[sp,#8*4]
	eor	r12,r12,r1,lsr#10	@ sigma1(X[i+14])
	ldr	r1,[sp,#1*4]

	add	r12,r12,r0
	eor	r0,r8,r8,ror#5	@ from BODY_00_15
	add	r2,r2,r12
	eor	r0,r0,r8,ror#19	@ Sigma1(e)
	add	r2,r2,r1			@ X[i]
	ldr	r12,[r14],#4			@ *K256++
	add	r11,r11,r2			@ h+=X[i]
	str	r2,[sp,#8*4]
	eor	r2,r9,r10
	add	r11,r11,r0,ror#6	@ h+=Sigma1(e)
	and	r2,r2,r8
	add	r11,r11,r12			@ h+=K256[i]
	eor	r2,r2,r10			@ Ch(e,f,g)
	eor	r0,r4,r4,ror#11
	add	r11,r11,r2			@ h+=Ch(e,f,g)
#if 24==31
	and	r12,r12,#0xff
	cmp	r12,#0xf2			@ done?
#endif
#if 24<15
# if __ARM_ARCH__>=7
	ldr	r2,[r1],#4			@ prefetch
# else
	ldrb	r2,[r1,#3]
# endif
	eor	r12,r4,r5			@ a^b, b^c in next round
#else
	ldr	r2,[sp,#10*4]		@ from future BODY_16_xx
	eor	r12,r4,r5			@ a^b, b^c in next round
	ldr	r1,[sp,#7*4]	@ from future BODY_16_xx
#endif
	eor	r0,r0,r4,ror#20	@ Sigma0(a)
	and	r3,r3,r12			@ (b^c)&=(a^b)
	add	r7,r7,r11			@ d+=h
	eor	r3,r3,r5			@ Maj(a,b,c)
	add	r11,r11,r0,ror#2	@ h+=Sigma0(a)
	@ add	r11,r11,r3			@ h+=Maj(a,b,c)
	@ ldr	r2,[sp,#10*4]		@ 25
	@ ldr	r1,[sp,#7*4]
	mov	r0,r2,ror#7
	add	r11,r11,r3			@ h+=Maj(a,b,c) from the past
	mov	r3,r1,ror#17
	eor	r0,r0,r2,ror#18
	eor	r3,r3,r1,ror#19
	eor	r0,r0,r2,lsr#3	@ sigma0(X[i+1])
	ldr	r2,[sp,#9*4]
	eor	r3,r3,r1,lsr#10	@ sigma1(X[i+14])
	ldr	r1,[sp,#2*4]

	add	r3,r3,r0
	eor	r0,r7,r7,ror#5	@ from BODY_00_15
	add	r2,r2,r3
	eor	r0,r0,r7,ror#19	@ Sigma1(e)
	add	r2,r2,r1			@ X[i]
	ldr	r3,[r14],#4			@ *K256++
	add	r10,r10,r2			@ h+=X[i]
	str	r2,[sp,#9*4]
	eor	r2,r8,r9
	add	r10,r10,r0,ror#6	@ h+=Sigma1(e)
	and	r2,r2,r7
	add	r10,r10,r3			@ h+=K256[i]
	eor	r2,r2,r9			@ Ch(e,f,g)
	eor	r0,r11,r11,ror#11
	add	r10,r10,r2			@ h+=Ch(e,f,g)
#if 25==31
	and	r3,r3,#0xff
	cmp	r3,#0xf2			@ done?
#endif
#if 25<15
# if __ARM_ARCH__>=7
	ldr	r2,[r1],#4			@ prefetch
# else
	ldrb	r2,[r1,#3]
# endif
	eor	r3,r11,r4			@ a^b, b^c in next round
#else
	ldr	r2,[sp,#11*4]		@ from future BODY_16_xx
	eor	r3,r11,r4			@ a^b, b^c in next round
	ldr	r1,[sp,#8*4]	@ from future BODY_16_xx
#endif
	eor	r0,r0,r11,ror#20	@ Sigma0(a)
	and	r12,r12,r3			@ (b^c)&=(a^b)
	add	r6,r6,r10			@ d+=h
	eor	r12,r12,r4			@ Maj(a,b,c)
	add	r10,r10,r0,ror#2	@ h+=Sigma0(a)
	@ add	r10,r10,r12			@ h+=Maj(a,b,c)
	@ ldr	r2,[sp,#11*4]		@ 26
	@ ldr	r1,[sp,#8*4]
	mov	r0,r2,ror#7
	add	r10,r10,r12			@ h+=Maj(a,b,c) from the past
	mov	r12,r1,ror#17
	eor	r0,r0,r2,ror#18
	eor	r12,r12,r1,ror#19
	eor	r0,r0,r2,lsr#3	@ sigma0(X[i+1])
	ldr	r2,[sp,#10*4]
	eor	r12,r12,r1,lsr#10	@ sigma1(X[i+14])
	ldr	r1,[sp,#3*4]

	add	r12,r12,r0
	eor	r0,r6,r6,ror#5	@ from BODY_00_15
	add	r2,r2,r12
	eor	r0,r0,r6,ror#19	@ Sigma1(e)
	add	r2,r2,r1			@ X[i]
	ldr	r12,[r14],#4			@ *K256++
	add	r9,r9,r2			@ h+=X[i]
	str	r2,[sp,#10*4]
	eor	r2,r7,r8
	add	r9,r9,r0,ror#6	@ h+=Sigma1(e)
	and	r2,r2,r6
	add	r9,r9,r12			@ h+=K256[i]
	eor	r2,r2,r8			@ Ch(e,f,g)
	eor	r0,r10,r10,ror#11
	add	r9,r9,r2			@ h+=Ch(e,f,g)
#if 26==31
	and	r12,r12,#0xff
	cmp	r12,#0xf2			@ done?
#endif
#if 26<15
# if __ARM_ARCH__>=7
	ldr	r2,[r1],#4			@ prefetch
# else
	ldrb	r2,[r1,#3]
# endif
	eor	r12,r10,r11			@ a^b, b^c in next round
#else
	ldr	r2,[sp,#12*4]		@ from future BODY_16_xx
	eor	r12,r10,r11			@ a^b, b^c in next round
	ldr	r1,[sp,#9*4]	@ from future BODY_16_xx
#endif
	eor	r0,r0,r10,ror#20	@ Sigma0(a)
	and	r3,r3,r12			@ (b^c)&=(a^b)
	add	r5,r5,r9			@ d+=h
	eor	r3,r3,r11			@ Maj(a,b,c)
	add	r9,r9,r0,ror#2	@ h+=Sigma0(a)
	@ add	r9,r9,r3			@ h+=Maj(a,b,c)
	@ ldr	r2,[sp,#12*4]		@ 27
	@ ldr	r1,[sp,#9*4]
	mov	r0,r2,ror#7
	add	r9,r9,r3			@ h+=Maj(a,b,c) from the past
	mov	r3,r1,ror#17
	eor	r0,r0,r2,ror#18
	eor	r3,r3,r1,ror#19
	eor	r0,r0,r2,lsr#3	@ sigma0(X[i+1])
	ldr	r2,[sp,#11*4]
	eor	r3,r3,r1,lsr#10	@ sigma1(X[i+14])
	ldr	r1,[sp,#4*4]

	add	r3,r3,r0
	eor	r0,r5,r5,ror#5	@ from BODY_00_15
	add	r2,r2,r3
	eor	r0,r0,r5,ror#19	@ Sigma1(e)
	add	r2,r2,r1			@ X[i]
	ldr	r3,[r14],#4			@ *K256++
	add	r8,r8,r2			@ h+=X[i]
	str	r2,[sp,#11*4]
	eor	r2,r6,r7
	add	r8,r8,r0,ror#6	@ h+=Sigma1(e)
	and	r2,r2,r5
	add	r8,r8,r3			@ h+=K256[i]
	eor	r2,r2,r7			@ Ch(e,f,g)
	eor	r0,r9,r9,ror#11
	add	r8,r8,r2			@ h+=Ch(e,f,g)
#if 27==31
	and	r3,r3,#0xff
	cmp	r3,#0xf2			@ done?
#endif
#if 27<15
# if __ARM_ARCH__>=7
	ldr	r2,[r1],#4			@ prefetch
# else
	ldrb	r2,[r1,#3]
# endif
	eor	r3,r9,r10			@ a^b, b^c in next round
#else
	ldr	r2,[sp,#13*4]		@ from future BODY_16_xx
	eor	r3,r9,r10			@ a^b, b^c in next round
	ldr	r1,[sp,#10*4]	@ from future BODY_16_xx
#endif
	eor	r0,r0,r9,ror#20	@ Sigma0(a)
	and	r12,r12,r3			@ (b^c)&=(a^b)
	add	r4,r4,r8			@ d+=h
	eor	r12,r12,r10			@ Maj(a,b,c)
	add	r8,r8,r0,ror#2	@ h+=Sigma0(a)
	@ add	r8,r8,r12			@ h+=Maj(a,b,c)
	@ ldr	r2,[sp,#13*4]		@ 28
	@ ldr	r1,[sp,#10*4]
	mov	r0,r2,ror#7
	add	r8,r8,r12			@ h+=Maj(a,b,c) from the past
	mov	r12,r1,ror#17
	eor	r0,r0,r2,ror#18
	eor	r12,r12,r1,ror#19
	eor	r0,r0,r2,lsr#3	@ sigma0(X[i+1])
	ldr	r2,[sp,#12*4]
	eor	r12,r12,r1,lsr#10	@ sigma1(X[i+14])
	ldr	r1,[sp,#5*4]

	add	r12,r12,r0
	eor	r0,r4,r4,ror#5	@ from BODY_00_15
	add	r2,r2,r12
	eor	r0,r0,r4,ror#19	@ Sigma1(e)
	add	r2,r2,r1			@ X[i]
	ldr	r12,[r14],#4			@ *K256++
	add	r7,r7,r2			@ h+=X[i]
	str	r2,[sp,#12*4]
	eor	r2,r5,r6
	add	r7,r7,r0,ror#6	@ h+=Sigma1(e)
	and	r2,r2,r4
	add	r7,r7,r12			@ h+=K256[i]
	eor	r2,r2,r6			@ Ch(e,f,g)
	eor	r0,r8,r8,ror#11
	add	r7,r7,r2			@ h+=Ch(e,f,g)
#if 28==31
	and	r12,r12,#0xff
	cmp	r12,#0xf2			@ done?
#endif
#if 28<15
# if __ARM_ARCH__>=7
	ldr	r2,[r1],#4			@ prefetch
# else
	ldrb	r2,[r1,#3]
# endif
	eor	r12,r8,r9			@ a^b, b^c in next round
#else
	ldr	r2,[sp,#14*4]		@ from future BODY_16_xx
	eor	r12,r8,r9			@ a^b, b^c in next round
	ldr	r1,[sp,#11*4]	@ from future BODY_16_xx
#endif
	eor	r0,r0,r8,ror#20	@ Sigma0(a)
	and	r3,r3,r12			@ (b^c)&=(a^b)
	add	r11,r11,r7			@ d+=h
	eor	r3,r3,r9			@ Maj(a,b,c)
	add	r7,r7,r0,ror#2	@ h+=Sigma0(a)
	@ add	r7,r7,r3			@ h+=Maj(a,b,c)
	@ ldr	r2,[sp,#14*4]		@ 29
	@ ldr	r1,[sp,#11*4]
	mov	r0,r2,ror#7
	add	r7,r7,r3			@ h+=Maj(a,b,c) from the past
	mov	r3,r1,ror#17
	eor	r0,r0,r2,ror#18
	eor	r3,r3,r1,ror#19
	eor	r0,r0,r2,lsr#3	@ sigma0(X[i+1])
	ldr	r2,[sp,#13*4]
	eor	r3,r3,r1,lsr#10	@ sigma1(X[i+14])
	ldr	r1,[sp,#6*4]

	add	r3,r3,r0
	eor	r0,r11,r11,ror#5	@ from BODY_00_15
	add	r2,r2,r3
	eor	r0,r0,r11,ror#19	@ Sigma1(e)
	add	r2,r2,r1			@ X[i]
	ldr	r3,[r14],#4			@ *K256++
	add	r6,r6,r2			@ h+=X[i]
	str	r2,[sp,#13*4]
	eor	r2,r4,r5
	add	r6,r6,r0,ror#6	@ h+=Sigma1(e)
	and	r2,r2,r11
	add	r6,r6,r3			@ h+=K256[i]
	eor	r2,r2,r5			@ Ch(e,f,g)
	eor	r0,r7,r7,ror#11
	add	r6,r6,r2			@ h+=Ch(e,f,g)
#if 29==31
	and	r3,r3,#0xff
	cmp	r3,#0xf2			@ done?
#endif
#if 29<15
# if __ARM_ARCH__>=7
	ldr	r2,[r1],#4			@ prefetch
# else
	ldrb	r2,[r1,#3]
# endif
	eor	r3,r7,r8			@ a^b, b^c in next round
#else
	ldr	r2,[sp,#15*4]		@ from future BODY_16_xx
	eor	r3,r7,r8			@ a^b, b^c in next round
	ldr	r1,[sp,#12*4]	@ from future BODY_16_xx
#endif
	eor	r0,r0,r7,ror#20	@ Sigma0(a)
	and	r12,r12,r3			@ (b^c)&=(a^b)
	add	r10,r10,r6			@ d+=h
	eor	r12,r12,r8			@ Maj(a,b,c)
	add	r6,r6,r0,ror#2	@ h+=Sigma0(a)
	@ add	r6,r6,r12			@ h+=Maj(a,b,c)
	@ ldr	r2,[sp,#15*4]		@ 30
	@ ldr	r1,[sp,#12*4]
	mov	r0,r2,ror#7
	add	r6,r6,r12			@ h+=Maj(a,b,c) from the past
	mov	r12,r1,ror#17
	eor	r0,r0,r2,ror#18
	eor	r12,r12,r1,ror#19
	eor	r0,r0,r2,lsr#3	@ sigma0(X[i+1])
	ldr	r2,[sp,#14*4]
	eor	r12,r12,r1,lsr#10	@ sigma1(X[i+14])
	ldr	r1,[sp,#7*4]

	add	r12,r12,r0
	eor	r0,r10,r10,ror#5	@ from BODY_00_15
	add	r2,r2,r12
	eor	r0,r0,r10,ror#19	@ Sigma1(e)
	add	r2,r2,r1			@ X[i]
	ldr	r12,[r14],#4			@ *K256++
	add	r5,r5,r2			@ h+=X[i]
	str	r2,[sp,#14*4]
	eor	r2,r11,r4
	add	r5,r5,r0,ror#6	@ h+=Sigma1(e)
	and	r2,r2,r10
	add	r5,r5,r12			@ h+=K256[i]
	eor	r2,r2,r4			@ Ch(e,f,g)
	eor	r0,r6,r6,ror#11
	add	r5,r5,r2			@ h+=Ch(e,f,g)
#if 30==31
	and	r12,r12,#0xff
	cmp	r12,#0xf2			@ done?
#endif
#if 30<15
# if __ARM_ARCH__>=7
	ldr	r2,[r1],#4			@ prefetch
# else
	ldrb	r2,[r1,#3]
# endif
	eor	r12,r6,r7			@ a^b, b^c in next round
#else
	ldr	r2,[sp,#0*4]		@ from future BODY_16_xx
	eor	r12,r6,r7			@ a^b, b^c in next round
	ldr	r1,[sp,#13*4]	@ from future BODY_16_xx
#endif
	eor	r0,r0,r6,ror#20	@ Sigma0(a)
	and	r3,r3,r12			@ (b^c)&=(a^b)
	add	r9,r9,r5			@ d+=h
	eor	r3,r3,r7			@ Maj(a,b,c)
	add	r5,r5,r0,ror#2	@ h+=Sigma0(a)
	@ add	r5,r5,r3			@ h+=Maj(a,b,c)
	@ ldr	r2,[sp,#0*4]		@ 31
	@ ldr	r1,[sp,#13*4]
	mov	r0,r2,ror#7
	add	r5,r5,r3			@ h+=Maj(a,b,c) from the past
	mov	r3,r1,ror#17
	eor	r0,r0,r2,ror#18
	eor	r3,r3,r1,ror#19
	eor	r0,r0,r2,lsr#3	@ sigma0(X[i+1])
	ldr	r2,[sp,#15*4]
	eor	r3,r3,r1,lsr#10	@ sigma1(X[i+14])
	ldr	r1,[sp,#8*4]

	add	r3,r3,r0
	eor	r0,r9,r9,ror#5	@ from BODY_00_15
	add	r2,r2,r3
	eor	r0,r0,r9,ror#19	@ Sigma1(e)
	add	r2,r2,r1			@ X[i]
	ldr	r3,[r14],#4			@ *K256++
	add	r4,r4,r2			@ h+=X[i]
	str	r2,[sp,#15*4]
	eor	r2,r10,r11
	add	r4,r4,r0,ror#6	@ h+=Sigma1(e)
	and	r2,r2,r9
	add	r4,r4,r3			@ h+=K256[i]
	eor	r2,r2,r11			@ Ch(e,f,g)
	eor	r0,r5,r5,ror#11
	add	r4,r4,r2			@ h+=Ch(e,f,g)
#if 31==31
	and	r3,r3,#0xff
	cmp	r3,#0xf2			@ done?
#endif
#if 31<15
# if __ARM_ARCH__>=7
	ldr	r2,[r1],#4			@ prefetch
# else
	ldrb	r2,[r1,#3]
# endif
	eor	r3,r5,r6			@ a^b, b^c in next round
#else
	ldr	r2,[sp,#1*4]		@ from future BODY_16_xx
	eor	r3,r5,r6			@ a^b, b^c in next round
	ldr	r1,[sp,#14*4]	@ from future BODY_16_xx
#endif
	eor	r0,r0,r5,ror#20	@ Sigma0(a)
	and	r12,r12,r3			@ (b^c)&=(a^b)
	add	r8,r8,r4			@ d+=h
	eor	r12,r12,r6			@ Maj(a,b,c)
	add	r4,r4,r0,ror#2	@ h+=Sigma0(a)
	@ add	r4,r4,r12			@ h+=Maj(a,b,c)
#if __ARM_ARCH__>=7
	ite	eq			@ Thumb2 thing, sanity check in ARM
#endif
	ldreq	r3,[sp,#16*4]		@ pull ctx
	bne	Lrounds_16_xx

	add	r4,r4,r12		@ h+=Maj(a,b,c) from the past
	ldr	r0,[r3,#0]
	ldr	r2,[r3,#4]
	ldr	r12,[r3,#8]
	add	r4,r4,r0
	ldr	r0,[r3,#12]
	add	r5,r5,r2
	ldr	r2,[r3,#16]
	add	r6,r6,r12
	ldr	r12,[r3,#20]
	add	r7,r7,r0
	ldr	r0,[r3,#24]
	add	r8,r8,r2
	ldr	r2,[r3,#28]
	add	r9,r9,r12
	ldr	r1,[sp,#17*4]		@ pull inp
	ldr	r12,[sp,#18*4]		@ pull inp+len
	add	r10,r10,r0
	add	r11,r11,r2
	stmia	r3,{r4,r5,r6,r7,r8,r9,r10,r11}
	cmp	r1,r12
	sub	r14,r14,#256	@ rewind Ktbl
	bne	Loop

	add	sp,sp,#19*4	@ destroy frame
#if __ARM_ARCH__>=5
	ldmia	sp!,{r4,r5,r6,r7,r8,r9,r10,r11,pc}
#else
	ldmia	sp!,{r4,r5,r6,r7,r8,r9,r10,r11,lr}
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
.word	0xe12fff1e			@ interoperable with Thumb ISA:-)
#endif

#if __ARM_MAX_ARCH__>=7



.globl	_sha256_block_data_order_neon
.private_extern	_sha256_block_data_order_neon
#ifdef __thumb2__
.thumb_func	_sha256_block_data_order_neon
#endif
.align	5
.skip	16
_sha256_block_data_order_neon:
LNEON:
	stmdb	sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr}

	sub	r11,sp,#16*4+16
	adr	r14,K256
	bic	r11,r11,#15		@ align for 128-bit stores
	mov	r12,sp
	mov	sp,r11			@ alloca
	add	r2,r1,r2,lsl#6	@ len to point at the end of inp

	vld1.8	{q0},[r1]!
	vld1.8	{q1},[r1]!
	vld1.8	{q2},[r1]!
	vld1.8	{q3},[r1]!
	vld1.32	{q8},[r14,:128]!
	vld1.32	{q9},[r14,:128]!
	vld1.32	{q10},[r14,:128]!
	vld1.32	{q11},[r14,:128]!
	vrev32.8	q0,q0		@ yes, even on
	str	r0,[sp,#64]
	vrev32.8	q1,q1		@ big-endian
	str	r1,[sp,#68]
	mov	r1,sp
	vrev32.8	q2,q2
	str	r2,[sp,#72]
	vrev32.8	q3,q3
	str	r12,[sp,#76]		@ save original sp
	vadd.i32	q8,q8,q0
	vadd.i32	q9,q9,q1
	vst1.32	{q8},[r1,:128]!
	vadd.i32	q10,q10,q2
	vst1.32	{q9},[r1,:128]!
	vadd.i32	q11,q11,q3
	vst1.32	{q10},[r1,:128]!
	vst1.32	{q11},[r1,:128]!

	ldmia	r0,{r4,r5,r6,r7,r8,r9,r10,r11}
	sub	r1,r1,#64
	ldr	r2,[sp,#0]
	eor	r12,r12,r12
	eor	r3,r5,r6
	b	L_00_48

.align	4
L_00_48:
	vext.8	q8,q0,q1,#4
	add	r11,r11,r2
	eor	r2,r9,r10
	eor	r0,r8,r8,ror#5
	vext.8	q9,q2,q3,#4
	add	r4,r4,r12
	and	r2,r2,r8
	eor	r12,r0,r8,ror#19
	vshr.u32	q10,q8,#7
	eor	r0,r4,r4,ror#11
	eor	r2,r2,r10
	vadd.i32	q0,q0,q9
	add	r11,r11,r12,ror#6
	eor	r12,r4,r5
	vshr.u32	q9,q8,#3
	eor	r0,r0,r4,ror#20
	add	r11,r11,r2
	vsli.32	q10,q8,#25
	ldr	r2,[sp,#4]
	and	r3,r3,r12
	vshr.u32	q11,q8,#18
	add	r7,r7,r11
	add	r11,r11,r0,ror#2
	eor	r3,r3,r5
	veor	q9,q9,q10
	add	r10,r10,r2
	vsli.32	q11,q8,#14
	eor	r2,r8,r9
	eor	r0,r7,r7,ror#5
	vshr.u32	d24,d7,#17
	add	r11,r11,r3
	and	r2,r2,r7
	veor	q9,q9,q11
	eor	r3,r0,r7,ror#19
	eor	r0,r11,r11,ror#11
	vsli.32	d24,d7,#15
	eor	r2,r2,r9
	add	r10,r10,r3,ror#6
	vshr.u32	d25,d7,#10
	eor	r3,r11,r4
	eor	r0,r0,r11,ror#20
	vadd.i32	q0,q0,q9
	add	r10,r10,r2
	ldr	r2,[sp,#8]
	veor	d25,d25,d24
	and	r12,r12,r3
	add	r6,r6,r10
	vshr.u32	d24,d7,#19
	add	r10,r10,r0,ror#2
	eor	r12,r12,r4
	vsli.32	d24,d7,#13
	add	r9,r9,r2
	eor	r2,r7,r8
	veor	d25,d25,d24
	eor	r0,r6,r6,ror#5
	add	r10,r10,r12
	vadd.i32	d0,d0,d25
	and	r2,r2,r6
	eor	r12,r0,r6,ror#19
	vshr.u32	d24,d0,#17
	eor	r0,r10,r10,ror#11
	eor	r2,r2,r8
	vsli.32	d24,d0,#15
	add	r9,r9,r12,ror#6
	eor	r12,r10,r11
	vshr.u32	d25,d0,#10
	eor	r0,r0,r10,ror#20
	add	r9,r9,r2
	veor	d25,d25,d24
	ldr	r2,[sp,#12]
	and	r3,r3,r12
	vshr.u32	d24,d0,#19
	add	r5,r5,r9
	add	r9,r9,r0,ror#2
	eor	r3,r3,r11
	vld1.32	{q8},[r14,:128]!
	add	r8,r8,r2
	vsli.32	d24,d0,#13
	eor	r2,r6,r7
	eor	r0,r5,r5,ror#5
	veor	d25,d25,d24
	add	r9,r9,r3
	and	r2,r2,r5
	vadd.i32	d1,d1,d25
	eor	r3,r0,r5,ror#19
	eor	r0,r9,r9,ror#11
	vadd.i32	q8,q8,q0
	eor	r2,r2,r7
	add	r8,r8,r3,ror#6
	eor	r3,r9,r10
	eor	r0,r0,r9,ror#20
	add	r8,r8,r2
	ldr	r2,[sp,#16]
	and	r12,r12,r3
	add	r4,r4,r8
	vst1.32	{q8},[r1,:128]!
	add	r8,r8,r0,ror#2
	eor	r12,r12,r10
	vext.8	q8,q1,q2,#4
	add	r7,r7,r2
	eor	r2,r5,r6
	eor	r0,r4,r4,ror#5
	vext.8	q9,q3,q0,#4
	add	r8,r8,r12
	and	r2,r2,r4
	eor	r12,r0,r4,ror#19
	vshr.u32	q10,q8,#7
	eor	r0,r8,r8,ror#11
	eor	r2,r2,r6
	vadd.i32	q1,q1,q9
	add	r7,r7,r12,ror#6
	eor	r12,r8,r9
	vshr.u32	q9,q8,#3
	eor	r0,r0,r8,ror#20
	add	r7,r7,r2
	vsli.32	q10,q8,#25
	ldr	r2,[sp,#20]
	and	r3,r3,r12
	vshr.u32	q11,q8,#18
	add	r11,r11,r7
	add	r7,r7,r0,ror#2
	eor	r3,r3,r9
	veor	q9,q9,q10
	add	r6,r6,r2
	vsli.32	q11,q8,#14
	eor	r2,r4,r5
	eor	r0,r11,r11,ror#5
	vshr.u32	d24,d1,#17
	add	r7,r7,r3
	and	r2,r2,r11
	veor	q9,q9,q11
	eor	r3,r0,r11,ror#19
	eor	r0,r7,r7,ror#11
	vsli.32	d24,d1,#15
	eor	r2,r2,r5
	add	r6,r6,r3,ror#6
	vshr.u32	d25,d1,#10
	eor	r3,r7,r8
	eor	r0,r0,r7,ror#20
	vadd.i32	q1,q1,q9
	add	r6,r6,r2
	ldr	r2,[sp,#24]
	veor	d25,d25,d24
	and	r12,r12,r3
	add	r10,r10,r6
	vshr.u32	d24,d1,#19
	add	r6,r6,r0,ror#2
	eor	r12,r12,r8
	vsli.32	d24,d1,#13
	add	r5,r5,r2
	eor	r2,r11,r4
	veor	d25,d25,d24
	eor	r0,r10,r10,ror#5
	add	r6,r6,r12
	vadd.i32	d2,d2,d25
	and	r2,r2,r10
	eor	r12,r0,r10,ror#19
	vshr.u32	d24,d2,#17
	eor	r0,r6,r6,ror#11
	eor	r2,r2,r4
	vsli.32	d24,d2,#15
	add	r5,r5,r12,ror#6
	eor	r12,r6,r7
	vshr.u32	d25,d2,#10
	eor	r0,r0,r6,ror#20
	add	r5,r5,r2
	veor	d25,d25,d24
	ldr	r2,[sp,#28]
	and	r3,r3,r12
	vshr.u32	d24,d2,#19
	add	r9,r9,r5
	add	r5,r5,r0,ror#2
	eor	r3,r3,r7
	vld1.32	{q8},[r14,:128]!
	add	r4,r4,r2
	vsli.32	d24,d2,#13
	eor	r2,r10,r11
	eor	r0,r9,r9,ror#5
	veor	d25,d25,d24
	add	r5,r5,r3
	and	r2,r2,r9
	vadd.i32	d3,d3,d25
	eor	r3,r0,r9,ror#19
	eor	r0,r5,r5,ror#11
	vadd.i32	q8,q8,q1
	eor	r2,r2,r11
	add	r4,r4,r3,ror#6
	eor	r3,r5,r6
	eor	r0,r0,r5,ror#20
	add	r4,r4,r2
	ldr	r2,[sp,#32]
	and	r12,r12,r3
	add	r8,r8,r4
	vst1.32	{q8},[r1,:128]!
	add	r4,r4,r0,ror#2
	eor	r12,r12,r6
	vext.8	q8,q2,q3,#4
	add	r11,r11,r2
	eor	r2,r9,r10
	eor	r0,r8,r8,ror#5
	vext.8	q9,q0,q1,#4
	add	r4,r4,r12
	and	r2,r2,r8
	eor	r12,r0,r8,ror#19
	vshr.u32	q10,q8,#7
	eor	r0,r4,r4,ror#11
	eor	r2,r2,r10
	vadd.i32	q2,q2,q9
	add	r11,r11,r12,ror#6
	eor	r12,r4,r5
	vshr.u32	q9,q8,#3
	eor	r0,r0,r4,ror#20
	add	r11,r11,r2
	vsli.32	q10,q8,#25
	ldr	r2,[sp,#36]
	and	r3,r3,r12
	vshr.u32	q11,q8,#18
	add	r7,r7,r11
	add	r11,r11,r0,ror#2
	eor	r3,r3,r5
	veor	q9,q9,q10
	add	r10,r10,r2
	vsli.32	q11,q8,#14
	eor	r2,r8,r9
	eor	r0,r7,r7,ror#5
	vshr.u32	d24,d3,#17
	add	r11,r11,r3
	and	r2,r2,r7
	veor	q9,q9,q11
	eor	r3,r0,r7,ror#19
	eor	r0,r11,r11,ror#11
	vsli.32	d24,d3,#15
	eor	r2,r2,r9
	add	r10,r10,r3,ror#6
	vshr.u32	d25,d3,#10
	eor	r3,r11,r4
	eor	r0,r0,r11,ror#20
	vadd.i32	q2,q2,q9
	add	r10,r10,r2
	ldr	r2,[sp,#40]
	veor	d25,d25,d24
	and	r12,r12,r3
	add	r6,r6,r10
	vshr.u32	d24,d3,#19
	add	r10,r10,r0,ror#2
	eor	r12,r12,r4
	vsli.32	d24,d3,#13
	add	r9,r9,r2
	eor	r2,r7,r8
	veor	d25,d25,d24
	eor	r0,r6,r6,ror#5
	add	r10,r10,r12
	vadd.i32	d4,d4,d25
	and	r2,r2,r6
	eor	r12,r0,r6,ror#19
	vshr.u32	d24,d4,#17
	eor	r0,r10,r10,ror#11
	eor	r2,r2,r8
	vsli.32	d24,d4,#15
	add	r9,r9,r12,ror#6
	eor	r12,r10,r11
	vshr.u32	d25,d4,#10
	eor	r0,r0,r10,ror#20
	add	r9,r9,r2
	veor	d25,d25,d24
	ldr	r2,[sp,#44]
	and	r3,r3,r12
	vshr.u32	d24,d4,#19
	add	r5,r5,r9
	add	r9,r9,r0,ror#2
	eor	r3,r3,r11
	vld1.32	{q8},[r14,:128]!
	add	r8,r8,r2
	vsli.32	d24,d4,#13
	eor	r2,r6,r7
	eor	r0,r5,r5,ror#5
	veor	d25,d25,d24
	add	r9,r9,r3
	and	r2,r2,r5
	vadd.i32	d5,d5,d25
	eor	r3,r0,r5,ror#19
	eor	r0,r9,r9,ror#11
	vadd.i32	q8,q8,q2
	eor	r2,r2,r7
	add	r8,r8,r3,ror#6
	eor	r3,r9,r10
	eor	r0,r0,r9,ror#20
	add	r8,r8,r2
	ldr	r2,[sp,#48]
	and	r12,r12,r3
	add	r4,r4,r8
	vst1.32	{q8},[r1,:128]!
	add	r8,r8,r0,ror#2
	eor	r12,r12,r10
	vext.8	q8,q3,q0,#4
	add	r7,r7,r2
	eor	r2,r5,r6
	eor	r0,r4,r4,ror#5
	vext.8	q9,q1,q2,#4
	add	r8,r8,r12
	and	r2,r2,r4
	eor	r12,r0,r4,ror#19
	vshr.u32	q10,q8,#7
	eor	r0,r8,r8,ror#11
	eor	r2,r2,r6
	vadd.i32	q3,q3,q9
	add	r7,r7,r12,ror#6
	eor	r12,r8,r9
	vshr.u32	q9,q8,#3
	eor	r0,r0,r8,ror#20
	add	r7,r7,r2
	vsli.32	q10,q8,#25
	ldr	r2,[sp,#52]
	and	r3,r3,r12
	vshr.u32	q11,q8,#18
	add	r11,r11,r7
	add	r7,r7,r0,ror#2
	eor	r3,r3,r9
	veor	q9,q9,q10
	add	r6,r6,r2
	vsli.32	q11,q8,#14
	eor	r2,r4,r5
	eor	r0,r11,r11,ror#5
	vshr.u32	d24,d5,#17
	add	r7,r7,r3
	and	r2,r2,r11
	veor	q9,q9,q11
	eor	r3,r0,r11,ror#19
	eor	r0,r7,r7,ror#11
	vsli.32	d24,d5,#15
	eor	r2,r2,r5
	add	r6,r6,r3,ror#6
	vshr.u32	d25,d5,#10
	eor	r3,r7,r8
	eor	r0,r0,r7,ror#20
	vadd.i32	q3,q3,q9
	add	r6,r6,r2
	ldr	r2,[sp,#56]
	veor	d25,d25,d24
	and	r12,r12,r3
	add	r10,r10,r6
	vshr.u32	d24,d5,#19
	add	r6,r6,r0,ror#2
	eor	r12,r12,r8
	vsli.32	d24,d5,#13
	add	r5,r5,r2
	eor	r2,r11,r4
	veor	d25,d25,d24
	eor	r0,r10,r10,ror#5
	add	r6,r6,r12
	vadd.i32	d6,d6,d25
	and	r2,r2,r10
	eor	r12,r0,r10,ror#19
	vshr.u32	d24,d6,#17
	eor	r0,r6,r6,ror#11
	eor	r2,r2,r4
	vsli.32	d24,d6,#15
	add	r5,r5,r12,ror#6
	eor	r12,r6,r7
	vshr.u32	d25,d6,#10
	eor	r0,r0,r6,ror#20
	add	r5,r5,r2
	veor	d25,d25,d24
	ldr	r2,[sp,#60]
	and	r3,r3,r12
	vshr.u32	d24,d6,#19
	add	r9,r9,r5
	add	r5,r5,r0,ror#2
	eor	r3,r3,r7
	vld1.32	{q8},[r14,:128]!
	add	r4,r4,r2
	vsli.32	d24,d6,#13
	eor	r2,r10,r11
	eor	r0,r9,r9,ror#5
	veor	d25,d25,d24
	add	r5,r5,r3
	and	r2,r2,r9
	vadd.i32	d7,d7,d25
	eor	r3,r0,r9,ror#19
	eor	r0,r5,r5,ror#11
	vadd.i32	q8,q8,q3
	eor	r2,r2,r11
	add	r4,r4,r3,ror#6
	eor	r3,r5,r6
	eor	r0,r0,r5,ror#20
	add	r4,r4,r2
	ldr	r2,[r14]
	and	r12,r12,r3
	add	r8,r8,r4
	vst1.32	{q8},[r1,:128]!
	add	r4,r4,r0,ror#2
	eor	r12,r12,r6
	teq	r2,#0				@ check for K256 terminator
	ldr	r2,[sp,#0]
	sub	r1,r1,#64
	bne	L_00_48

	ldr	r1,[sp,#68]
	ldr	r0,[sp,#72]
	sub	r14,r14,#256	@ rewind r14
	teq	r1,r0
	it	eq
	subeq	r1,r1,#64		@ avoid SEGV
	vld1.8	{q0},[r1]!		@ load next input block
	vld1.8	{q1},[r1]!
	vld1.8	{q2},[r1]!
	vld1.8	{q3},[r1]!
	it	ne
	strne	r1,[sp,#68]
	mov	r1,sp
	add	r11,r11,r2
	eor	r2,r9,r10
	eor	r0,r8,r8,ror#5
	add	r4,r4,r12
	vld1.32	{q8},[r14,:128]!
	and	r2,r2,r8
	eor	r12,r0,r8,ror#19
	eor	r0,r4,r4,ror#11
	eor	r2,r2,r10
	vrev32.8	q0,q0
	add	r11,r11,r12,ror#6
	eor	r12,r4,r5
	eor	r0,r0,r4,ror#20
	add	r11,r11,r2
	vadd.i32	q8,q8,q0
	ldr	r2,[sp,#4]
	and	r3,r3,r12
	add	r7,r7,r11
	add	r11,r11,r0,ror#2
	eor	r3,r3,r5
	add	r10,r10,r2
	eor	r2,r8,r9
	eor	r0,r7,r7,ror#5
	add	r11,r11,r3
	and	r2,r2,r7
	eor	r3,r0,r7,ror#19
	eor	r0,r11,r11,ror#11
	eor	r2,r2,r9
	add	r10,r10,r3,ror#6
	eor	r3,r11,r4
	eor	r0,r0,r11,ror#20
	add	r10,r10,r2
	ldr	r2,[sp,#8]
	and	r12,r12,r3
	add	r6,r6,r10
	add	r10,r10,r0,ror#2
	eor	r12,r12,r4
	add	r9,r9,r2
	eor	r2,r7,r8
	eor	r0,r6,r6,ror#5
	add	r10,r10,r12
	and	r2,r2,r6
	eor	r12,r0,r6,ror#19
	eor	r0,r10,r10,ror#11
	eor	r2,r2,r8
	add	r9,r9,r12,ror#6
	eor	r12,r10,r11
	eor	r0,r0,r10,ror#20
	add	r9,r9,r2
	ldr	r2,[sp,#12]
	and	r3,r3,r12
	add	r5,r5,r9
	add	r9,r9,r0,ror#2
	eor	r3,r3,r11
	add	r8,r8,r2
	eor	r2,r6,r7
	eor	r0,r5,r5,ror#5
	add	r9,r9,r3
	and	r2,r2,r5
	eor	r3,r0,r5,ror#19
	eor	r0,r9,r9,ror#11
	eor	r2,r2,r7
	add	r8,r8,r3,ror#6
	eor	r3,r9,r10
	eor	r0,r0,r9,ror#20
	add	r8,r8,r2
	ldr	r2,[sp,#16]
	and	r12,r12,r3
	add	r4,r4,r8
	add	r8,r8,r0,ror#2
	eor	r12,r12,r10
	vst1.32	{q8},[r1,:128]!
	add	r7,r7,r2
	eor	r2,r5,r6
	eor	r0,r4,r4,ror#5
	add	r8,r8,r12
	vld1.32	{q8},[r14,:128]!
	and	r2,r2,r4
	eor	r12,r0,r4,ror#19
	eor	r0,r8,r8,ror#11
	eor	r2,r2,r6
	vrev32.8	q1,q1
	add	r7,r7,r12,ror#6
	eor	r12,r8,r9
	eor	r0,r0,r8,ror#20
	add	r7,r7,r2
	vadd.i32	q8,q8,q1
	ldr	r2,[sp,#20]
	and	r3,r3,r12
	add	r11,r11,r7
	add	r7,r7,r0,ror#2
	eor	r3,r3,r9
	add	r6,r6,r2
	eor	r2,r4,r5
	eor	r0,r11,r11,ror#5
	add	r7,r7,r3
	and	r2,r2,r11
	eor	r3,r0,r11,ror#19
	eor	r0,r7,r7,ror#11
	eor	r2,r2,r5
	add	r6,r6,r3,ror#6
	eor	r3,r7,r8
	eor	r0,r0,r7,ror#20
	add	r6,r6,r2
	ldr	r2,[sp,#24]
	and	r12,r12,r3
	add	r10,r10,r6
	add	r6,r6,r0,ror#2
	eor	r12,r12,r8
	add	r5,r5,r2
	eor	r2,r11,r4
	eor	r0,r10,r10,ror#5
	add	r6,r6,r12
	and	r2,r2,r10
	eor	r12,r0,r10,ror#19
	eor	r0,r6,r6,ror#11
	eor	r2,r2,r4
	add	r5,r5,r12,ror#6
	eor	r12,r6,r7
	eor	r0,r0,r6,ror#20
	add	r5,r5,r2
	ldr	r2,[sp,#28]
	and	r3,r3,r12
	add	r9,r9,r5
	add	r5,r5,r0,ror#2
	eor	r3,r3,r7
	add	r4,r4,r2
	eor	r2,r10,r11
	eor	r0,r9,r9,ror#5
	add	r5,r5,r3
	and	r2,r2,r9
	eor	r3,r0,r9,ror#19
	eor	r0,r5,r5,ror#11
	eor	r2,r2,r11
	add	r4,r4,r3,ror#6
	eor	r3,r5,r6
	eor	r0,r0,r5,ror#20
	add	r4,r4,r2
	ldr	r2,[sp,#32]
	and	r12,r12,r3
	add	r8,r8,r4
	add	r4,r4,r0,ror#2
	eor	r12,r12,r6
	vst1.32	{q8},[r1,:128]!
	add	r11,r11,r2
	eor	r2,r9,r10
	eor	r0,r8,r8,ror#5
	add	r4,r4,r12
	vld1.32	{q8},[r14,:128]!
	and	r2,r2,r8
	eor	r12,r0,r8,ror#19
	eor	r0,r4,r4,ror#11
	eor	r2,r2,r10
	vrev32.8	q2,q2
	add	r11,r11,r12,ror#6
	eor	r12,r4,r5
	eor	r0,r0,r4,ror#20
	add	r11,r11,r2
	vadd.i32	q8,q8,q2
	ldr	r2,[sp,#36]
	and	r3,r3,r12
	add	r7,r7,r11
	add	r11,r11,r0,ror#2
	eor	r3,r3,r5
	add	r10,r10,r2
	eor	r2,r8,r9
	eor	r0,r7,r7,ror#5
	add	r11,r11,r3
	and	r2,r2,r7
	eor	r3,r0,r7,ror#19
	eor	r0,r11,r11,ror#11
	eor	r2,r2,r9
	add	r10,r10,r3,ror#6
	eor	r3,r11,r4
	eor	r0,r0,r11,ror#20
	add	r10,r10,r2
	ldr	r2,[sp,#40]
	and	r12,r12,r3
	add	r6,r6,r10
	add	r10,r10,r0,ror#2
	eor	r12,r12,r4
	add	r9,r9,r2
	eor	r2,r7,r8
	eor	r0,r6,r6,ror#5
	add	r10,r10,r12
	and	r2,r2,r6
	eor	r12,r0,r6,ror#19
	eor	r0,r10,r10,ror#11
	eor	r2,r2,r8
	add	r9,r9,r12,ror#6
	eor	r12,r10,r11
	eor	r0,r0,r10,ror#20
	add	r9,r9,r2
	ldr	r2,[sp,#44]
	and	r3,r3,r12
	add	r5,r5,r9
	add	r9,r9,r0,ror#2
	eor	r3,r3,r11
	add	r8,r8,r2
	eor	r2,r6,r7
	eor	r0,r5,r5,ror#5
	add	r9,r9,r3
	and	r2,r2,r5
	eor	r3,r0,r5,ror#19
	eor	r0,r9,r9,ror#11
	eor	r2,r2,r7
	add	r8,r8,r3,ror#6
	eor	r3,r9,r10
	eor	r0,r0,r9,ror#20
	add	r8,r8,r2
	ldr	r2,[sp,#48]
	and	r12,r12,r3
	add	r4,r4,r8
	add	r8,r8,r0,ror#2
	eor	r12,r12,r10
	vst1.32	{q8},[r1,:128]!
	add	r7,r7,r2
	eor	r2,r5,r6
	eor	r0,r4,r4,ror#5
	add	r8,r8,r12
	vld1.32	{q8},[r14,:128]!
	and	r2,r2,r4
	eor	r12,r0,r4,ror#19
	eor	r0,r8,r8,ror#11
	eor	r2,r2,r6
	vrev32.8	q3,q3
	add	r7,r7,r12,ror#6
	eor	r12,r8,r9
	eor	r0,r0,r8,ror#20
	add	r7,r7,r2
	vadd.i32	q8,q8,q3
	ldr	r2,[sp,#52]
	and	r3,r3,r12
	add	r11,r11,r7
	add	r7,r7,r0,ror#2
	eor	r3,r3,r9
	add	r6,r6,r2
	eor	r2,r4,r5
	eor	r0,r11,r11,ror#5
	add	r7,r7,r3
	and	r2,r2,r11
	eor	r3,r0,r11,ror#19
	eor	r0,r7,r7,ror#11
	eor	r2,r2,r5
	add	r6,r6,r3,ror#6
	eor	r3,r7,r8
	eor	r0,r0,r7,ror#20
	add	r6,r6,r2
	ldr	r2,[sp,#56]
	and	r12,r12,r3
	add	r10,r10,r6
	add	r6,r6,r0,ror#2
	eor	r12,r12,r8
	add	r5,r5,r2
	eor	r2,r11,r4
	eor	r0,r10,r10,ror#5
	add	r6,r6,r12
	and	r2,r2,r10
	eor	r12,r0,r10,ror#19
	eor	r0,r6,r6,ror#11
	eor	r2,r2,r4
	add	r5,r5,r12,ror#6
	eor	r12,r6,r7
	eor	r0,r0,r6,ror#20
	add	r5,r5,r2
	ldr	r2,[sp,#60]
	and	r3,r3,r12
	add	r9,r9,r5
	add	r5,r5,r0,ror#2
	eor	r3,r3,r7
	add	r4,r4,r2
	eor	r2,r10,r11
	eor	r0,r9,r9,ror#5
	add	r5,r5,r3
	and	r2,r2,r9
	eor	r3,r0,r9,ror#19
	eor	r0,r5,r5,ror#11
	eor	r2,r2,r11
	add	r4,r4,r3,ror#6
	eor	r3,r5,r6
	eor	r0,r0,r5,ror#20
	add	r4,r4,r2
	ldr	r2,[sp,#64]
	and	r12,r12,r3
	add	r8,r8,r4
	add	r4,r4,r0,ror#2
	eor	r12,r12,r6
	vst1.32	{q8},[r1,:128]!
	ldr	r0,[r2,#0]
	add	r4,r4,r12			@ h+=Maj(a,b,c) from the past
	ldr	r12,[r2,#4]
	ldr	r3,[r2,#8]
	ldr	r1,[r2,#12]
	add	r4,r4,r0			@ accumulate
	ldr	r0,[r2,#16]
	add	r5,r5,r12
	ldr	r12,[r2,#20]
	add	r6,r6,r3
	ldr	r3,[r2,#24]
	add	r7,r7,r1
	ldr	r1,[r2,#28]
	add	r8,r8,r0
	str	r4,[r2],#4
	add	r9,r9,r12
	str	r5,[r2],#4
	add	r10,r10,r3
	str	r6,[r2],#4
	add	r11,r11,r1
	str	r7,[r2],#4
	stmia	r2,{r8,r9,r10,r11}

	ittte	ne
	movne	r1,sp
	ldrne	r2,[sp,#0]
	eorne	r12,r12,r12
	ldreq	sp,[sp,#76]			@ restore original sp
	itt	ne
	eorne	r3,r5,r6
	bne	L_00_48

	ldmia	sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,pc}

#endif
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)

# if defined(__thumb2__)
#  define INST(a,b,c,d)	.byte	c,d|0xc,a,b
# else
#  define INST(a,b,c,d)	.byte	a,b,c,d
# endif

#ifdef __thumb2__
.thumb_func	sha256_block_data_order_armv8
#endif
.align	5
sha256_block_data_order_armv8:
LARMv8:
	vld1.32	{q0,q1},[r0]
	sub	r3,r3,#256+32
	add	r2,r1,r2,lsl#6	@ len to point at the end of inp
	b	Loop_v8

.align	4
Loop_v8:
	vld1.8	{q8,q9},[r1]!
	vld1.8	{q10,q11},[r1]!
	vld1.32	{q12},[r3]!
	vrev32.8	q8,q8
	vrev32.8	q9,q9
	vrev32.8	q10,q10
	vrev32.8	q11,q11
	vmov	q14,q0	@ offload
	vmov	q15,q1
	teq	r1,r2
	vld1.32	{q13},[r3]!
	vadd.i32	q12,q12,q8
	INST(0xe2,0x03,0xfa,0xf3)	@ sha256su0 q8,q9
	vmov	q2,q0
	INST(0x68,0x0c,0x02,0xf3)	@ sha256h q0,q1,q12
	INST(0x68,0x2c,0x14,0xf3)	@ sha256h2 q1,q2,q12
	INST(0xe6,0x0c,0x64,0xf3)	@ sha256su1 q8,q10,q11
	vld1.32	{q12},[r3]!
	vadd.i32	q13,q13,q9
	INST(0xe4,0x23,0xfa,0xf3)	@ sha256su0 q9,q10
	vmov	q2,q0
	INST(0x6a,0x0c,0x02,0xf3)	@ sha256h q0,q1,q13
	INST(0x6a,0x2c,0x14,0xf3)	@ sha256h2 q1,q2,q13
	INST(0xe0,0x2c,0x66,0xf3)	@ sha256su1 q9,q11,q8
	vld1.32	{q13},[r3]!
	vadd.i32	q12,q12,q10
	INST(0xe6,0x43,0xfa,0xf3)	@ sha256su0 q10,q11
	vmov	q2,q0
	INST(0x68,0x0c,0x02,0xf3)	@ sha256h q0,q1,q12
	INST(0x68,0x2c,0x14,0xf3)	@ sha256h2 q1,q2,q12
	INST(0xe2,0x4c,0x60,0xf3)	@ sha256su1 q10,q8,q9
	vld1.32	{q12},[r3]!
	vadd.i32	q13,q13,q11
	INST(0xe0,0x63,0xfa,0xf3)	@ sha256su0 q11,q8
	vmov	q2,q0
	INST(0x6a,0x0c,0x02,0xf3)	@ sha256h q0,q1,q13
	INST(0x6a,0x2c,0x14,0xf3)	@ sha256h2 q1,q2,q13
	INST(0xe4,0x6c,0x62,0xf3)	@ sha256su1 q11,q9,q10
	vld1.32	{q13},[r3]!
	vadd.i32	q12,q12,q8
	INST(0xe2,0x03,0xfa,0xf3)	@ sha256su0 q8,q9
	vmov	q2,q0
	INST(0x68,0x0c,0x02,0xf3)	@ sha256h q0,q1,q12
	INST(0x68,0x2c,0x14,0xf3)	@ sha256h2 q1,q2,q12
	INST(0xe6,0x0c,0x64,0xf3)	@ sha256su1 q8,q10,q11
	vld1.32	{q12},[r3]!
	vadd.i32	q13,q13,q9
	INST(0xe4,0x23,0xfa,0xf3)	@ sha256su0 q9,q10
	vmov	q2,q0
	INST(0x6a,0x0c,0x02,0xf3)	@ sha256h q0,q1,q13
	INST(0x6a,0x2c,0x14,0xf3)	@ sha256h2 q1,q2,q13
	INST(0xe0,0x2c,0x66,0xf3)	@ sha256su1 q9,q11,q8
	vld1.32	{q13},[r3]!
	vadd.i32	q12,q12,q10
	INST(0xe6,0x43,0xfa,0xf3)	@ sha256su0 q10,q11
	vmov	q2,q0
	INST(0x68,0x0c,0x02,0xf3)	@ sha256h q0,q1,q12
	INST(0x68,0x2c,0x14,0xf3)	@ sha256h2 q1,q2,q12
	INST(0xe2,0x4c,0x60,0xf3)	@ sha256su1 q10,q8,q9
	vld1.32	{q12},[r3]!
	vadd.i32	q13,q13,q11
	INST(0xe0,0x63,0xfa,0xf3)	@ sha256su0 q11,q8
	vmov	q2,q0
	INST(0x6a,0x0c,0x02,0xf3)	@ sha256h q0,q1,q13
	INST(0x6a,0x2c,0x14,0xf3)	@ sha256h2 q1,q2,q13
	INST(0xe4,0x6c,0x62,0xf3)	@ sha256su1 q11,q9,q10
	vld1.32	{q13},[r3]!
	vadd.i32	q12,q12,q8
	INST(0xe2,0x03,0xfa,0xf3)	@ sha256su0 q8,q9
	vmov	q2,q0
	INST(0x68,0x0c,0x02,0xf3)	@ sha256h q0,q1,q12
	INST(0x68,0x2c,0x14,0xf3)	@ sha256h2 q1,q2,q12
	INST(0xe6,0x0c,0x64,0xf3)	@ sha256su1 q8,q10,q11
	vld1.32	{q12},[r3]!
	vadd.i32	q13,q13,q9
	INST(0xe4,0x23,0xfa,0xf3)	@ sha256su0 q9,q10
	vmov	q2,q0
	INST(0x6a,0x0c,0x02,0xf3)	@ sha256h q0,q1,q13
	INST(0x6a,0x2c,0x14,0xf3)	@ sha256h2 q1,q2,q13
	INST(0xe0,0x2c,0x66,0xf3)	@ sha256su1 q9,q11,q8
	vld1.32	{q13},[r3]!
	vadd.i32	q12,q12,q10
	INST(0xe6,0x43,0xfa,0xf3)	@ sha256su0 q10,q11
	vmov	q2,q0
	INST(0x68,0x0c,0x02,0xf3)	@ sha256h q0,q1,q12
	INST(0x68,0x2c,0x14,0xf3)	@ sha256h2 q1,q2,q12
	INST(0xe2,0x4c,0x60,0xf3)	@ sha256su1 q10,q8,q9
	vld1.32	{q12},[r3]!
	vadd.i32	q13,q13,q11
	INST(0xe0,0x63,0xfa,0xf3)	@ sha256su0 q11,q8
	vmov	q2,q0
	INST(0x6a,0x0c,0x02,0xf3)	@ sha256h q0,q1,q13
	INST(0x6a,0x2c,0x14,0xf3)	@ sha256h2 q1,q2,q13
	INST(0xe4,0x6c,0x62,0xf3)	@ sha256su1 q11,q9,q10
	vld1.32	{q13},[r3]!
	vadd.i32	q12,q12,q8
	vmov	q2,q0
	INST(0x68,0x0c,0x02,0xf3)	@ sha256h q0,q1,q12
	INST(0x68,0x2c,0x14,0xf3)	@ sha256h2 q1,q2,q12

	vld1.32	{q12},[r3]!
	vadd.i32	q13,q13,q9
	vmov	q2,q0
	INST(0x6a,0x0c,0x02,0xf3)	@ sha256h q0,q1,q13
	INST(0x6a,0x2c,0x14,0xf3)	@ sha256h2 q1,q2,q13

	vld1.32	{q13},[r3]
	vadd.i32	q12,q12,q10
	sub	r3,r3,#256-16	@ rewind
	vmov	q2,q0
	INST(0x68,0x0c,0x02,0xf3)	@ sha256h q0,q1,q12
	INST(0x68,0x2c,0x14,0xf3)	@ sha256h2 q1,q2,q12

	vadd.i32	q13,q13,q11
	vmov	q2,q0
	INST(0x6a,0x0c,0x02,0xf3)	@ sha256h q0,q1,q13
	INST(0x6a,0x2c,0x14,0xf3)	@ sha256h2 q1,q2,q13

	vadd.i32	q0,q0,q14
	vadd.i32	q1,q1,q15
	it	ne
	bne	Loop_v8

	vst1.32	{q0,q1},[r0]

	bx	lr		@ bx lr

#endif
.byte	83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,52,47,78,69,79,78,47,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align	2
.align	2
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
.comm	_OPENSSL_armcap_P,4
.non_lazy_symbol_pointer
OPENSSL_armcap_P:
.indirect_symbol	_OPENSSL_armcap_P
.long	0
.private_extern	_OPENSSL_armcap_P
#endif