.global RestoreRegisters_NEON
  .global ReverseLine_NEON
  .global ReverseLineUV_NEON
  .global SaveRegisters_NEON
  .global TransposeWx8_NEON
  .global TransposeUVWx8_NEON
  .type RestoreRegisters_NEON, function
  .type ReverseLine_NEON, function
  .type ReverseLineUV_NEON, function
  .type SaveRegisters_NEON, function
  .type TransposeWx8_NEON, function
  .type TransposeUVWx8_NEON, function

@ void ReverseLine_NEON (const uint8* src, uint8* dst, int width)
@ r0 const uint8* src
@ r1 uint8* dst
@ r2 width
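@
@ Reference only (not assembled): a rough C sketch of what this routine
@ computes.  The uint8 typedef is assumed from the rest of libyuv and the
@ _C name is purely illustrative:
@
@   void ReverseLine_C(const uint8* src, uint8* dst, int width) {
@     int i;
@     for (i = 0; i < width; ++i)
@       dst[width - 1 - i] = src[i];
@   }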
ReverseLine_NEON:

  @ compute where to start writing destination
  add         r1, r2      @ dst + width

  @ work on segments that are multiples of 16
  lsrs        r3, r2, #4

  @ the output is written in two blocks: 8 bytes followed
  @ by another 8.  reading is done sequentially, from left to
  @ right.  writing is done from right to left in block sizes.
  @ r1, the destination pointer, is incremented by 8 after writing
  @ the first of the two blocks, so that 8 needs to be subtracted
  @ off along with 16 to get to the next location: -(8 + 16) = -24.
  mov         r3, #-24

  beq         Lline_residuals

  @ back off the destination by the size of the register that
  @ is going to be reversed
  sub         r1, #16

  @ the loop needs to run on blocks of 16.  what will be left
  @ over is either a negative number, the residual count that
  @ still needs to be done, or 0.  if this isn't subtracted off
  @ here the loop will run one extra time.
  sub         r2, #16

Lsegments_of_16:
    vld1.8      {q0}, [r0]!               @ src += 16

    @ reverse the bytes within each 64 bit half.  there is no single
    @ instruction to reverse all 128 bits in one go.
    vrev64.8    q0, q0

    @ because the full 128 bits could not be reversed in one go,
    @ write the two 64 bit halves out in reverse order.
    vst1.8      {d1}, [r1]!
    vst1.8      {d0}, [r1], r3            @ dst -= 16

    subs        r2, #16
    bge         Lsegments_of_16

  @ add 16 back to the counter.  if the result is 0 there are no
  @ residuals, so return
  adds        r2, #16
  bxeq        lr

  add         r1, #16

Lline_residuals:

  mov         r3, #-3     @ dst is post-incremented by 1 after the first store; -3 nets -2 per pair

  sub         r1, #2
  subs        r2, #2
  @ check for 16*n+1 scenarios where segments_of_2 should not
  @ be run, but there is something left over.
  blt         Lsegment_of_1

@ do this in neon registers as per
@ http://blogs.arm.com/software-enablement/196-coding-for-neon-part-2-dealing-with-leftovers/
Lsegments_of_2:
    vld2.8      {d0[0], d1[0]}, [r0]!     @ src += 2

    vst1.8      {d1[0]}, [r1]!
    vst1.8      {d0[0]}, [r1], r3         @ dst -= 2

    subs        r2, #2
    bge         Lsegments_of_2

  adds        r2, #2
  bxeq        lr

Lsegment_of_1:
  add         r1, #1
  vld1.8      {d0[0]}, [r0]
  vst1.8      {d0[0]}, [r1]

  bx          lr

@ void TransposeWx8_NEON (const uint8* src, int src_stride,
@                         uint8* dst, int dst_stride,
@                         int w)
@ r0 const uint8* src
@ r1 int src_stride
@ r2 uint8* dst
@ r3 int dst_stride
@ stack int w
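@
@ Reference only (not assembled): a rough C sketch of the transpose this
@ routine performs; the _C name is illustrative.  Each of the w source
@ columns (8 bytes tall) becomes one destination row:
@
@   void TransposeWx8_C(const uint8* src, int src_stride,
@                       uint8* dst, int dst_stride, int w) {
@     int i, j;
@     for (i = 0; i < w; ++i)
@       for (j = 0; j < 8; ++j)
@         dst[i * dst_stride + j] = src[j * src_stride + i];
@   }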
TransposeWx8_NEON:
  push        {r4,r8,r9,lr}

  ldr         r8, [sp, #16]        @ width

  @ loops are on blocks of 8.  the loop will stop when the
  @ counter gets to or below 0.  starting the counter
  @ at w-8 allows for this
  sub         r8, #8

@ handle 8x8 blocks.  this should be the majority of the plane
Lloop_8x8:
    mov         r9, r0

    vld1.8      {d0}, [r9], r1
    vld1.8      {d1}, [r9], r1
    vld1.8      {d2}, [r9], r1
    vld1.8      {d3}, [r9], r1
    vld1.8      {d4}, [r9], r1
    vld1.8      {d5}, [r9], r1
    vld1.8      {d6}, [r9], r1
    vld1.8      {d7}, [r9]

    vtrn.8      d1, d0
    vtrn.8      d3, d2
    vtrn.8      d5, d4
    vtrn.8      d7, d6

    vtrn.16     d1, d3
    vtrn.16     d0, d2
    vtrn.16     d5, d7
    vtrn.16     d4, d6

    vtrn.32     d1, d5
    vtrn.32     d0, d4
    vtrn.32     d3, d7
    vtrn.32     d2, d6

    vrev16.8    q0, q0
    vrev16.8    q1, q1
    vrev16.8    q2, q2
    vrev16.8    q3, q3
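
    @ the 8x8 byte transpose is now complete; destination rows 0 to 7
    @ are in d1, d0, d3, d2, d5, d4, d7, d6 and are written out below
    @ in that order.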

    mov         r9, r2

    vst1.8      {d1}, [r9], r3
    vst1.8      {d0}, [r9], r3
    vst1.8      {d3}, [r9], r3
    vst1.8      {d2}, [r9], r3
    vst1.8      {d5}, [r9], r3
    vst1.8      {d4}, [r9], r3
    vst1.8      {d7}, [r9], r3
    vst1.8      {d6}, [r9]

    add         r0, #8            @ src += 8
    add         r2, r3, lsl #3    @ dst += 8 * dst_stride
    subs        r8,  #8           @ w   -= 8
    bge         Lloop_8x8

  @ add 8 back to counter.  if the result is 0 there are
  @ no residuals.
  adds        r8, #8
  beq         Ldone

  @ some residual, so between 1 and 7 source columns left to transpose
  cmp         r8, #2
  blt         Lblock_1x8

  cmp         r8, #4
  blt         Lblock_2x8

Lblock_4x8:
  mov         r9, r0
  vld1.32     {d0[0]}, [r9], r1
  vld1.32     {d0[1]}, [r9], r1
  vld1.32     {d1[0]}, [r9], r1
  vld1.32     {d1[1]}, [r9], r1
  vld1.32     {d2[0]}, [r9], r1
  vld1.32     {d2[1]}, [r9], r1
  vld1.32     {d3[0]}, [r9], r1
  vld1.32     {d3[1]}, [r9]

  mov         r9, r2

  adr         r12, vtbl_4x4_transpose
  vld1.8      {q3}, [r12]

  vtbl.8      d4, {d0, d1}, d6
  vtbl.8      d5, {d0, d1}, d7
  vtbl.8      d0, {d2, d3}, d6
  vtbl.8      d1, {d2, d3}, d7

  @ TODO: rework shuffle above to write
  @       out with 4 instead of 8 writes
  vst1.32     {d4[0]}, [r9], r3
  vst1.32     {d4[1]}, [r9], r3
  vst1.32     {d5[0]}, [r9], r3
  vst1.32     {d5[1]}, [r9]

  add         r9, r2, #4
  vst1.32     {d0[0]}, [r9], r3
  vst1.32     {d0[1]}, [r9], r3
  vst1.32     {d1[0]}, [r9], r3
  vst1.32     {d1[1]}, [r9]

  add         r0, #4            @ src += 4
  add         r2, r3, lsl #2    @ dst += 4 * dst_stride
  subs        r8,  #4           @ w   -= 4
  beq         Ldone

  @ some residual, check to see if it includes a 2x8 block,
  @ or less
  cmp         r8, #2
  blt         Lblock_1x8

Lblock_2x8:
  mov         r9, r0
  vld1.16     {d0[0]}, [r9], r1
  vld1.16     {d1[0]}, [r9], r1
  vld1.16     {d0[1]}, [r9], r1
  vld1.16     {d1[1]}, [r9], r1
  vld1.16     {d0[2]}, [r9], r1
  vld1.16     {d1[2]}, [r9], r1
  vld1.16     {d0[3]}, [r9], r1
  vld1.16     {d1[3]}, [r9]

  vtrn.8      d0, d1

  mov         r9, r2

  vst1.64     {d0}, [r9], r3
  vst1.64     {d1}, [r9]

  add         r0, #2            @ src += 2
  add         r2, r3, lsl #1    @ dst += 2 * dst_stride
  subs        r8,  #2           @ w   -= 2
  beq         Ldone

Lblock_1x8:
  vld1.8      {d0[0]}, [r0], r1
  vld1.8      {d0[1]}, [r0], r1
  vld1.8      {d0[2]}, [r0], r1
  vld1.8      {d0[3]}, [r0], r1
  vld1.8      {d0[4]}, [r0], r1
  vld1.8      {d0[5]}, [r0], r1
  vld1.8      {d0[6]}, [r0], r1
  vld1.8      {d0[7]}, [r0]

  vst1.64     {d0}, [r2]

Ldone:

  pop         {r4,r8,r9,pc}

vtbl_4x4_transpose:
  .byte  0,  4,  8, 12,  1,  5,  9, 13,  2,  6, 10, 14,  3,  7, 11, 15
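@ each group of four indices above gathers one column of a row-major
@ 4x4 byte block, so the vtbl lookups that use this table perform a
@ 4x4 byte transpose.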

@ void SaveRegisters_NEON (unsigned long long* store)
@ r0 unsigned long long* store
SaveRegisters_NEON:
  vst1.i64    {d8, d9, d10, d11}, [r0]!
  vst1.i64    {d12, d13, d14, d15}, [r0]!
  bx          lr

@ void RestoreRegisters_NEON (unsigned long long* store)
@ r0 unsigned long long* store
RestoreRegisters_NEON:
  vld1.i64    {d8, d9, d10, d11}, [r0]!
  vld1.i64    {d12, d13, d14, d15}, [r0]!
  bx          lr
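
@ Reference only (not assembled): a rough sketch of how a caller might pair
@ the two routines above; the buffer size and names are illustrative:
@
@   unsigned long long store[8];       /* 64 bytes, room for d8-d15 */
@   SaveRegisters_NEON(store);
@   /* ... run NEON code that clobbers the callee-saved q4-q7 ... */
@   RestoreRegisters_NEON(store);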

@ void ReverseLineUV_NEON (const uint8* src,
@                          uint8* dst_a,
@                          uint8* dst_b,
@                          int width)
@ r0 const uint8* src
@ r1 uint8* dst_a
@ r2 uint8* dst_b
@ r3 width
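@
@ Reference only (not assembled): a rough C sketch of what this routine
@ computes; the _C name is illustrative.  src is an interleaved UV line
@ and width counts output elements per plane:
@
@   void ReverseLineUV_C(const uint8* src, uint8* dst_a, uint8* dst_b,
@                        int width) {
@     int i;
@     for (i = 0; i < width; ++i) {
@       dst_a[width - 1 - i] = src[2 * i + 0];
@       dst_b[width - 1 - i] = src[2 * i + 1];
@     }
@   }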
ReverseLineUV_NEON:

  @ compute where to start writing destination
  add         r1, r1, r3      @ dst_a + width
  add         r2, r2, r3      @ dst_b + width

  @ work on input segments that are multiples of 16 bytes.  the
  @ width that has been passed counts output elements, which are
  @ half the size of the input, so shift by 3 rather than 4.
  lsrs        r12, r3, #3

  beq         Lline_residuals_di

  @ each destination line is written backward in 8 byte blocks.
  @ the -8 post-index steps the pointer back a block after each store.
  mov         r12, #-8

  @ back off the destinations by the size of the register that
  @ is going to be reversed
  sub         r1, r1, #8
  sub         r2, r2, #8

  @ the loop needs to run on blocks of 8.  what will be left
  @ over is either a negative number, the residual count that
  @ still needs to be done, or 0.  if this isn't subtracted off
  @ here the loop will run one extra time.
  sub         r3, r3, #8

Lsegments_of_8_di:
    vld2.8      {d0, d1}, [r0]!         @ src += 16

    @ reverse the bytes in the 64 bit segments
    vrev64.8    q0, q0

    vst1.8      {d0}, [r1], r12         @ dst_a -= 8
    vst1.8      {d1}, [r2], r12         @ dst_b -= 8

    subs        r3, r3, #8
    bge         Lsegments_of_8_di

  @ add 8 back to the counter.  if the result is 0 there are no
  @ residuals, so return
  adds        r3, r3, #8
  bxeq        lr

  add         r1, r1, #8
  add         r2, r2, #8

Lline_residuals_di:

  mov         r12, #-1    @ step the destination pointers back one byte per store

  sub         r1, r1, #1
  sub         r2, r2, #1

@ do this in neon registers as per
@ http://blogs.arm.com/software-enablement/196-coding-for-neon-part-2-dealing-with-leftovers/
Lsegments_of_1:
    vld2.8      {d0[0], d1[0]}, [r0]!     @ src += 2

    vst1.8      {d0[0]}, [r1], r12        @ dst_a -= 1
    vst1.8      {d1[0]}, [r2], r12        @ dst_b -= 1

    subs        r3, r3, #1
    bgt         Lsegments_of_1

  bx          lr

@ void TransposeUVWx8_NEON (const uint8* src, int src_stride,
@                           uint8* dst_a, int dst_stride_a,
@                           uint8* dst_b, int dst_stride_b,
@                           int width)
@ r0 const uint8* src
@ r1 int src_stride
@ r2 uint8* dst_a
@ r3 int dst_stride_a
@ stack uint8* dst_b
@ stack int dst_stride_b
@ stack int width
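@
@ Reference only (not assembled): a rough C sketch of the transpose this
@ routine performs; the _C name is illustrative.  src is an interleaved
@ UV plane, w pairs wide by 8 rows; each plane is transposed into its
@ own destination:
@
@   void TransposeUVWx8_C(const uint8* src, int src_stride,
@                         uint8* dst_a, int dst_stride_a,
@                         uint8* dst_b, int dst_stride_b, int w) {
@     int i, j;
@     for (i = 0; i < w; ++i) {
@       for (j = 0; j < 8; ++j) {
@         dst_a[i * dst_stride_a + j] = src[j * src_stride + 2 * i + 0];
@         dst_b[i * dst_stride_b + j] = src[j * src_stride + 2 * i + 1];
@       }
@     }
@   }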
TransposeUVWx8_NEON:
  push        {r4-r9,lr}

  ldr         r4, [sp, #28]         @ dst_b
  ldr         r5, [sp, #32]         @ dst_stride_b
  ldr         r8, [sp, #36]         @ width
  @ loops are on blocks of 8.  the loop will stop when the
  @ counter gets to or below 0.  starting the counter
  @ at w-8 allows for this
  sub         r8, #8

@ handle 8x8 blocks.  this should be the majority of the plane
Lloop_8x8_di:
    mov         r9, r0

    vld2.8      {d0,  d1},  [r9], r1
    vld2.8      {d2,  d3},  [r9], r1
    vld2.8      {d4,  d5},  [r9], r1
    vld2.8      {d6,  d7},  [r9], r1
    vld2.8      {d8,  d9},  [r9], r1
    vld2.8      {d10, d11}, [r9], r1
    vld2.8      {d12, d13}, [r9], r1
    vld2.8      {d14, d15}, [r9]

    vtrn.8      q1, q0
    vtrn.8      q3, q2
    vtrn.8      q5, q4
    vtrn.8      q7, q6

    vtrn.16     q1, q3
    vtrn.16     q0, q2
    vtrn.16     q5, q7
    vtrn.16     q4, q6

    vtrn.32     q1, q5
    vtrn.32     q0, q4
    vtrn.32     q3, q7
    vtrn.32     q2, q6

    vrev16.8    q0, q0
    vrev16.8    q1, q1
    vrev16.8    q2, q2
    vrev16.8    q3, q3
    vrev16.8    q4, q4
    vrev16.8    q5, q5
    vrev16.8    q6, q6
    vrev16.8    q7, q7

    mov         r9, r2

    vst1.8      {d2},  [r9], r3
    vst1.8      {d0},  [r9], r3
    vst1.8      {d6},  [r9], r3
    vst1.8      {d4},  [r9], r3
    vst1.8      {d10}, [r9], r3
    vst1.8      {d8},  [r9], r3
    vst1.8      {d14}, [r9], r3
    vst1.8      {d12}, [r9]

    mov         r9, r4

    vst1.8      {d3},  [r9], r5
    vst1.8      {d1},  [r9], r5
    vst1.8      {d7},  [r9], r5
    vst1.8      {d5},  [r9], r5
    vst1.8      {d11}, [r9], r5
    vst1.8      {d9},  [r9], r5
    vst1.8      {d15}, [r9], r5
    vst1.8      {d13}, [r9]

    add         r0, #8*2          @ src   += 8*2
    add         r2, r3, lsl #3    @ dst_a += 8 * dst_stride_a
    add         r4, r5, lsl #3    @ dst_b += 8 * dst_stride_b
    subs        r8,  #8           @ w     -= 8
    bge         Lloop_8x8_di

  @ add 8 back to counter.  if the result is 0 there are
  @ no residuals.
  adds        r8, #8
  beq         Ldone_di

  @ some residual, so between 1 and 7 source columns left to transpose
  cmp         r8, #2
  blt         Lblock_1x8_di

  cmp         r8, #4
  blt         Lblock_2x8_di

@ TODO(frkoenig) : clean this up
Lblock_4x8_di:
  mov         r9, r0
  vld1.64     {d0}, [r9], r1
  vld1.64     {d1}, [r9], r1
  vld1.64     {d2}, [r9], r1
  vld1.64     {d3}, [r9], r1
  vld1.64     {d4}, [r9], r1
  vld1.64     {d5}, [r9], r1
  vld1.64     {d6}, [r9], r1
  vld1.64     {d7}, [r9]

  adr         r12, vtbl_4x4_transpose_di
  vld1.8      {q7}, [r12]

  vtrn.8      q0, q1
  vtrn.8      q2, q3

  vtbl.8      d8,  {d0, d1}, d14
  vtbl.8      d9,  {d0, d1}, d15
  vtbl.8      d10, {d2, d3}, d14
  vtbl.8      d11, {d2, d3}, d15
  vtbl.8      d12, {d4, d5}, d14
  vtbl.8      d13, {d4, d5}, d15
  vtbl.8      d0,  {d6, d7}, d14
  vtbl.8      d1,  {d6, d7}, d15

  mov         r9, r2

  vst1.32     {d8[0]},  [r9], r3
  vst1.32     {d8[1]},  [r9], r3
  vst1.32     {d9[0]},  [r9], r3
  vst1.32     {d9[1]},  [r9], r3

  add         r9, r2, #4
  vst1.32     {d12[0]}, [r9], r3
  vst1.32     {d12[1]}, [r9], r3
  vst1.32     {d13[0]}, [r9], r3
  vst1.32     {d13[1]}, [r9]

  mov         r9, r4

  vst1.32     {d10[0]}, [r9], r5
  vst1.32     {d10[1]}, [r9], r5
  vst1.32     {d11[0]}, [r9], r5
  vst1.32     {d11[1]}, [r9], r5

  add         r9, r4, #4
  vst1.32     {d0[0]},  [r9], r5
  vst1.32     {d0[1]},  [r9], r5
  vst1.32     {d1[0]},  [r9], r5
  vst1.32     {d1[1]},  [r9]

  add         r0, #4*2          @ src   += 4 * 2
  add         r2, r3, lsl #2    @ dst_a += 4 * dst_stride_a
  add         r4, r5, lsl #2    @ dst_b += 4 * dst_stride_b
  subs        r8,  #4           @ w     -= 4
  beq         Ldone_di

  @ some residual, check to see if it includes a 2x8 block,
  @ or less
  cmp         r8, #2
  blt         Lblock_1x8_di

Lblock_2x8_di:
  mov         r9, r0
  vld2.16     {d0[0], d2[0]}, [r9], r1
  vld2.16     {d1[0], d3[0]}, [r9], r1
  vld2.16     {d0[1], d2[1]}, [r9], r1
  vld2.16     {d1[1], d3[1]}, [r9], r1
  vld2.16     {d0[2], d2[2]}, [r9], r1
  vld2.16     {d1[2], d3[2]}, [r9], r1
  vld2.16     {d0[3], d2[3]}, [r9], r1
  vld2.16     {d1[3], d3[3]}, [r9]

  vtrn.8      d0, d1
  vtrn.8      d2, d3

  mov         r9, r2

  vst1.64     {d0}, [r9], r3
  vst1.64     {d2}, [r9]

  mov         r9, r4

  vst1.64     {d1}, [r9], r5
  vst1.64     {d3}, [r9]

  add         r0, #2*2          @ src   += 2 * 2
  add         r2, r3, lsl #1    @ dst_a += 2 * dst_stride_a
  add         r4, r5, lsl #1    @ dst_b += 2 * dst_stride_b
  subs        r8,  #2           @ w     -= 2
  beq         Ldone_di

Lblock_1x8_di:
  vld2.8      {d0[0], d1[0]}, [r0], r1
  vld2.8      {d0[1], d1[1]}, [r0], r1
  vld2.8      {d0[2], d1[2]}, [r0], r1
  vld2.8      {d0[3], d1[3]}, [r0], r1
  vld2.8      {d0[4], d1[4]}, [r0], r1
  vld2.8      {d0[5], d1[5]}, [r0], r1
  vld2.8      {d0[6], d1[6]}, [r0], r1
  vld2.8      {d0[7], d1[7]}, [r0]

  vst1.64     {d0}, [r2]
  vst1.64     {d1}, [r4]

Ldone_di:
  pop         {r4-r9, pc}

vtbl_4x4_transpose_di:
  .byte  0,  8,  1,  9,  2, 10,  3, 11,  4, 12,  5, 13,  6, 14,  7, 15
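@ each pair of indices above picks one byte from the low half of the
@ table registers and the matching byte from the high half (0,8, 1,9, ...),
@ a zip-style shuffle used by the 4x8 deinterleaving block above.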