/*
 * arch/score/lib/csum_partial.S
 *
 * Score Processor version.
 *
 * Copyright (C) 2009 Sunplus Core Technology Co., Ltd.
 *  Lennox Wu <lennox.wu@sunplusct.com>
 *  Chen Liqin <liqin.chen@sunplusct.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, see the file COPYING, or write
 * to the Free Software Foundation, Inc.,
 * 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
 */
#include <linux/linkage.h>

/*
 * ADDC(sum, reg): sum += reg, then add 1 more if the addition wrapped.
 *
 * The wrap test compares reg with the new sum: when reg <= sum the
 * "bleu 9f" branch skips the increment, otherwise (reg > sum, i.e. the
 * add wrapped around) 1 is added to sum.
 *
 * Side effects: executes cmp.c, so any condition set by an earlier
 * ".c" instruction is replaced; defines numeric local label 9.
 */
#define ADDC(sum,reg)			\
	add	sum, sum, reg;		\
	cmp.c	reg, sum;		\
	bleu	9f;			\
	addi	sum, 0x1;		\
9:

/*
 * CSUM_BIGCHUNK(src, offset, sum): accumulate the eight 32-bit words at
 * src+offset+0x00 .. src+offset+0x1c (32 bytes) into sum with ADDC.
 *
 * The words are loaded four at a time into r8-r11, so r8-r11 are
 * overwritten. src itself is not advanced; callers add to src afterwards.
 * The trailing backslash on the last line continues the macro onto the
 * empty line that follows it.
 */
#define CSUM_BIGCHUNK(src, offset, sum)		\
	lw	r8, [src, offset + 0x00];	\
	lw	r9, [src, offset + 0x04];	\
	lw	r10, [src, offset + 0x08];	\
	lw	r11, [src, offset + 0x0c];	\
	ADDC(sum, r8);				\
	ADDC(sum, r9);				\
	ADDC(sum, r10);				\
	ADDC(sum, r11);				\
	lw	r8, [src, offset + 0x10];	\
	lw	r9, [src, offset + 0x14];	\
	lw	r10, [src, offset + 0x18];	\
	lw	r11, [src, offset + 0x1c];	\
	ADDC(sum, r8);				\
	ADDC(sum, r9);				\
	ADDC(sum, r10);				\
	ADDC(sum, r11);				\

/*
 * Register aliases used below.
 *   src (r4)  - current read pointer into the buffer
 *   sum (r27) - running checksum accumulator
 *   dest (r5) - NOTE(review): this alias is not referenced anywhere in
 *               the code shown here; r5 is used by name as the remaining
 *               byte count. Confirm before relying on the name.
 *
 * Other registers, as used by the code in this file:
 *   r6      - value added to the folded result just before returning
 *             (the "passed partial csum")
 *   r25     - odd-start flag: non-zero when the buffer began on an odd
 *             address; selects the final byte swap in fold
 *   r10     - byte count handed to small_csumcpy
 *   r8-r11, r26 - scratch
 *   r3      - both exits use "br r3" (presumably the return address
 *             register - confirm against the Score ABI)
 */
#define src r4
#define dest r5
#define sum r27

	.text
/*
 * small_csumcpy: checksum the last few bytes, fold, and return.
 *
 * Entered by branch (not by call) from csum_partial, either directly
 * when len < 8 or through maybe_end_cruft with the 0-3 leftover bytes.
 * In:  r10 = bytes left, src = read pointer, sum = accumulator,
 *      r25 = odd-start flag (may still be 0 on the direct path),
 *      r6  = value to add after folding.
 * Out: r4 = folded 16-bit sum plus r6 (with carry wrap), then br r3.
 */
/* unknown src alignment and < 8 bytes to go */
small_csumcpy:
	mv	r5, r10
	ldi	r9, 0x0
	/*
	 * If r25 is already 1 the leading odd byte was consumed in
	 * csum_partial. The "equal" condition from this cmpi.c is still in
	 * effect at pass_small_set_t7, so "beq aligned" is taken there.
	 */
	cmpi.c	r25, 0x1
	beq	pass_small_set_t7	/* already set, jump to pass_small_set_t7 */
	andri.c	r25,r4 , 0x1		/* r25 = src & 1: is src 2-byte aligned? */

pass_small_set_t7:
	beq	aligned			/* even address (or odd byte already done) */
	cmpi.c	r5, 0x0
	beq	fold			/* nothing to sum */
	/* Odd start: take one byte, placed in bits 8-15 of the addend. */
	lbu	r9, [src]
	slli	r9,r9, 0x8		/* Little endian */
	ADDC(sum, r9)
	addi	src, 0x1
	subi.c	r5, 0x1

	/* len still a full word */
aligned:
	andri.c	r8, r5, 0x4		/* bit 2 of len set, i.e. 4 bytes to take? */
	beq	len_less_4bytes

	/* Still a full word (4 bytes) to go, and src is at least 2-byte aligned. */
	andri.c	r8, src, 0x3		/* src 4-byte aligned? then a single lw will do */
	beq	four_byte_aligned
	/* Only 2-byte aligned: read the word as two halfwords. */
	lhu	r9, [src]
	addi	src, 2
	ADDC(sum, r9)
	lhu	r9, [src]
	addi	src, 2
	ADDC(sum, r9)
	b	len_less_4bytes

four_byte_aligned:			/* Len >= 4 and four byte aligned */
	lw	r9, [src]
	addi	src, 4
	ADDC(sum, r9)

len_less_4bytes:			/* 2 byte aligned and fewer than 4 bytes left */
	andri.c	r8, r5, 0x2		/* bit 1 of len set: one halfword to take */
	beq	len_less_2bytes
	lhu	r9, [src]
	addi	src, 0x2		/* src += 2 */
	ADDC(sum, r9)

len_less_2bytes:			/* 0 or 1 byte left */
	andri.c	r8, r5, 0x1
	beq	fold			/* bit 0 clear --> no byte left --> fold */
	lbu	r9, [src]		/* trailing byte, added unshifted */

fold_ADDC:
	ADDC(sum, r9)
fold:
	/*
	 * Fold the 32-bit accumulator to 16 bits: add (sum << 16) to sum
	 * so that the upper half holds low16 + high16, note whether that
	 * add wrapped, shift the result down, and add the wrap back in.
	 */
	slli	r26, sum, 16
	add	sum, sum, r26
	cmp.c	r26, sum
	srli	sum, sum, 16
	bleu	1f			/* if r26 <= sum: no wrap */
	addi	sum, 0x1		/* r26 > sum: add the carry back */
1:
	/*
	 * Odd buffer alignment? r25 is set in csum_partial, or above in
	 * small_csumcpy on the short path. If set, swap the two bytes of
	 * the 16-bit result.
	 */
	cmpi.c	r25, 0x0
	beq	1f
	slli	r26, sum, 8
	srli	sum, sum, 8
	or	sum, sum, r26
	andi	sum, 0xffff
1:
	.set	optimize
	/* Add the passed partial csum. */
	ADDC(sum, r6)
	mv	r4, sum			/* result is handed back in r4 */
	br	r3
	.set	volatile

	.align	5
/*
 * csum_partial: sum a buffer as 32-bit words with end-around carry,
 * fold to 16 bits, and add the value passed in r6.
 *
 * In:  r4 (src) = buffer address, r5 = length in bytes, r6 = value
 *      added to the folded result.
 * Out: r4 = result; returns with "br r3".
 * Overwrites r5, r8-r11, r25, r26, r27 (sum) and the condition flags.
 *
 * Outline: buffers shorter than 8 bytes go straight to small_csumcpy.
 * Otherwise src is aligned to 2 and then 4 bytes; if at least 56 bytes
 * remain it is further aligned to 8, 16 and 32 bytes, then summed in
 * 128-, 64- and 32-byte chunks. Remaining whole words are handled in
 * end_words and the last 0-3 bytes in small_csumcpy.
 */
ENTRY(csum_partial)
	ldi	sum, 0
	ldi	r25, 0
	mv	r10, r5			/* small_csumcpy takes its count from r10 */
	cmpi.c	r5, 0x8
	blt	small_csumcpy		/* < 8 (signed) bytes to sum */
	/*
	 * NOTE(review): if blt above is a signed less-than, len == 0 has
	 * already branched away, so this test and the "out" exit look
	 * unreachable - confirm.
	 */
	cmpi.c	r5, 0x0
	beq	out
	andri.c	r25, src, 0x1		/* odd buffer? r25 doubles as the swap flag */

	beq	word_align
hword_align:				/* 1 byte */
	lbu	r8, [src]
	subi	r5, 0x1
	slli	r8, r8, 8		/* same placement as the odd byte in small_csumcpy */
	ADDC(sum, r8)
	addi	src, 0x1

word_align:				/* 2 bytes */
	andri.c	r8, src, 0x2		/* already 4-byte aligned? */
	beq	dword_align		/* bit 1 clear: yes, skip the halfword */
	lhu	r8, [src]
	subi	r5, 0x2
	ADDC(sum, r8)
	addi	src, 0x2

dword_align:				/* 4 bytes */
	/*
	 * Fewer than 56 bytes left: skip the chunked path. do_end_words
	 * then takes its count from r26 = len.
	 */
	mv	r26, r5			/* only used when len < 56 */
	ldi	r8, 56
	cmp.c	r8, r5
	bgtu	do_end_words		/* if r5 (len) < r8 (56), unsigned */
	andri.c	r26, src, 0x4
	beq	qword_align
	lw	r8, [src]
	subi	r5, 0x4
	ADDC(sum, r8)
	addi	src, 0x4

qword_align:				/* 8 bytes */
	andri.c	r26, src, 0x8
	beq	oword_align
	lw	r8, [src, 0x0]
	lw	r9, [src, 0x4]
	subi	r5, 0x8			/* len -= 0x8 */
	ADDC(sum, r8)
	ADDC(sum, r9)
	addi	src, 0x8

oword_align:				/* 16 bytes */
	andri.c	r26, src, 0x10
	beq	begin_movement
	lw	r10, [src, 0x08]
	lw	r11, [src, 0x0c]
	lw	r8, [src, 0x00]
	lw	r9, [src, 0x04]
	ADDC(sum, r10)
	ADDC(sum, r11)
	ADDC(sum, r8)
	ADDC(sum, r9)
	subi	r5, 0x10
	addi	src, 0x10

	/*
	 * From here on r5 is no longer decremented: the chunk stages test
	 * individual bits of r5 (>> 7, 0x40, 0x20, 0x1c, 0x3) instead.
	 */
begin_movement:
	srli.c	r26, r5, 0x7		/* r26 = len / 128; len >= 128? */
	beq	1f			/* len < 128 */

	/* r26 = number of 128-byte chunks, from the srli.c just above */
move_128bytes:
	CSUM_BIGCHUNK(src, 0x00, sum)
	CSUM_BIGCHUNK(src, 0x20, sum)
	CSUM_BIGCHUNK(src, 0x40, sum)
	CSUM_BIGCHUNK(src, 0x60, sum)
	subi.c	r26, 0x01		/* one 128-byte chunk done */
	/*
	 * bne below has to see the result of subi.c, so this relies on
	 * plain addi (no ".c") not changing the condition - presumably so;
	 * confirm against the ISA manual.
	 */
	addi	src, 0x80
	bne	move_128bytes

1:	/* fewer than 128 bytes left; take 64 bytes here if bit 6 of len is set */
	andri.c	r10, r5, 0x40
	beq	1f

move_64bytes:
	CSUM_BIGCHUNK(src, 0x00, sum)
	CSUM_BIGCHUNK(src, 0x20, sum)
	addi	src, 0x40

1:	/* fewer than 64 bytes left */
	andri	r26, r5, 0x1c		/* 0x1c = 28: bytes in whole words below 32 */
	andri.c	r10, r5, 0x20
	beq	do_end_words		/* decided by the andri.c just above */

move_32bytes:
	CSUM_BIGCHUNK(src, 0x00, sum)
	andri	r26, r5, 0x1c
	addri	src, src, 0x20

do_end_words:				/* fewer than 32 bytes of whole words left */
	/*
	 * r26 = byte count of the remaining whole words: len itself when
	 * arriving from dword_align (len < 56), otherwise len & 0x1c.
	 */
	cmpi.c	r26, 0x0
	beq	maybe_end_cruft		/* no whole word left */
	srli	r26, r26, 0x2		/* bytes -> words */

end_words:
	lw	r8, [src]
	subi.c	r26, 0x1		/* unit is 4 bytes */
	ADDC(sum, r8)
	addi	src, 0x4
	/* ADDC ran its own cmp.c, so r26 is tested again for the branch */
	cmpi.c	r26, 0x0
	bne	end_words		/* r26 != 0 */

maybe_end_cruft:			/* fewer than 4 bytes left */
	andri	r10, r5, 0x3		/* leftover byte count for small_csumcpy */

small_memcpy:
	mv	r5, r10
	j	small_csumcpy

	/* Exit without folding and without adding r6 (see note at "beq out"). */
out:
	mv	r4, sum
	br	r3
END(csum_partial)