/*
 * arch/score/lib/csum_partial.S
 *
 * Score Processor version.
 *
 * Copyright (C) 2009 Sunplus Core Technology Co., Ltd.
 *  Lennox Wu <lennox.wu@sunplusct.com>
 *  Chen Liqin <liqin.chen@sunplusct.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, see the file COPYING, or write
 * to the Free Software Foundation, Inc.,
 * 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
 */
#include <linux/linkage.h>
/*
 * ADDC(sum, reg) - add reg into sum with end-around carry.
 *
 * After the add, an unsigned result that is smaller than the addend means
 * the addition wrapped, so one is added back in (ones'-complement
 * accumulation).  "bleu" skips the correction when reg <= sum.
 *
 * Side effects: overwrites the condition flags (cmp.c) and defines the
 * numeric local label 9.  Because the flags are clobbered, an expansion
 * must not be placed between a flag-setting instruction and the branch
 * that is meant to consume those flags.
 */
#define ADDC(sum,reg)			\
	add	sum, sum, reg;		\
	cmp.c	reg, sum;		\
	bleu	9f;			\
	addi	sum, 0x1;		\
9:

/*
 * CSUM_BIGCHUNK(src, offset, sum) - accumulate 32 bytes into sum.
 *
 * Loads the eight consecutive words at [src + offset + 0x00 .. 0x1c] in
 * two groups of four (through r8-r11) and feeds each one to ADDC.
 *
 * src is only read, never advanced; the caller steps the pointer.
 * Clobbers r8, r9, r10, r11 and the condition flags (via ADDC).
 */
#define CSUM_BIGCHUNK(src, offset, sum)		\
	lw	r8, [src, offset + 0x00];	\
	lw	r9, [src, offset + 0x04];	\
	lw	r10, [src, offset + 0x08];	\
	lw	r11, [src, offset + 0x0c];	\
	ADDC(sum, r8);				\
	ADDC(sum, r9);				\
	ADDC(sum, r10);				\
	ADDC(sum, r11);				\
	lw	r8, [src, offset + 0x10];	\
	lw	r9, [src, offset + 0x14];	\
	lw	r10, [src, offset + 0x18];	\
	lw	r11, [src, offset + 0x1c];	\
	ADDC(sum, r8);				\
	ADDC(sum, r9);				\
	ADDC(sum, r10);				\
	ADDC(sum, r11);

/*
 * Register aliases.
 *
 *   src  (r4)  - read pointer into the buffer being summed; r4 is also
 *                where the result is placed just before returning.
 *   dest (r5)  - not referenced by name in the code visible in this file;
 *                r5 itself is used as the remaining byte count.
 *                NOTE(review): the name looks like a leftover from a
 *                copy routine - confirm before relying on it.
 *   sum  (r27) - running checksum accumulator.
 */
#define src	r4
#define dest	r5
#define sum	r27

/*
 * small_csumcpy - sum the last few bytes, fold to 16 bits and return.
 *
 * Shared tail of csum_partial.  It is entered in two ways:
 *   - directly from the csum_partial entry when the length compares
 *     (signed) below 8, with unknown src alignment;
 *   - through small_memcpy once the word loops have left fewer than
 *     four bytes.
 *
 * In:	src (r4)   next byte to sum
 *	r10        bytes still to sum (copied into r5 first thing)
 *	r25        1 when csum_partial already consumed a leading odd
 *	           byte; otherwise it is recomputed here as src & 1
 *	sum (r27)  running accumulator
 *	r6         partial checksum to add to the folded result
 *	r3         address branched to on exit
 * Out:	r4         folded checksum with r6 added in
 * Scratch: r5, r8, r9, r26, condition flags
 */
	.text
small_csumcpy:
	mv	r5, r10			/* r5 = bytes still to sum */
	ldi	r9, 0x0
	cmpi.c	r25, 0x1
	beq	pass_small_set_t7	/* odd leading byte already handled */
	andri.c	r25, r4, 0x1		/* r25 = src & 1: odd start address? */

pass_small_set_t7:
	/*
	 * The flags tested here come from whichever instruction ran last:
	 * the cmpi.c above (equal when r25 was already 1) or the andri.c
	 * (zero result when src is even).  Either way "equal" means no
	 * leading odd byte has to be consumed here.
	 */
	beq	aligned
	cmpi.c	r5, 0x0
	beq	fold			/* nothing left to sum */
	lbu	r9, [src]
	slli	r9, r9, 0x8		/* little endian: odd byte goes in bits 15..8 */
	ADDC(sum, r9)
	addi	src, 0x1
	subi.c	r5, 0x1			/* flags unused: next test is andri.c */

	/*
	 * src is now 2-byte aligned.  The remaining length is below 8 on
	 * both entry paths (for non-negative lengths), so bits 2, 1 and 0
	 * of r5 say whether a word, a halfword and a byte are still due.
	 * r5 is not decremented from here on; only its bits are tested.
	 */
aligned:
	andri.c	r8, r5, 0x4		/* at least 4 bytes left? */
	beq	len_less_4bytes

	/* Four bytes to go: one lw if src is word aligned, else two lhu. */
	andri.c	r8, src, 0x3
	beq	four_byte_aligned
	lhu	r9, [src]
	addi	src, 2
	ADDC(sum, r9)
	lhu	r9, [src]
	addi	src, 2
	ADDC(sum, r9)
	b	len_less_4bytes

four_byte_aligned:			/* len >= 4 and src word aligned */
	lw	r9, [src]
	addi	src, 4
	ADDC(sum, r9)

len_less_4bytes:			/* src 2-byte aligned, fewer than 4 bytes left */
	andri.c	r8, r5, 0x2
	beq	len_less_2bytes
	lhu	r9, [src]
	addi	src, 0x2		/* src += 2 */
	ADDC(sum, r9)

len_less_2bytes:			/* at most one byte left */
	andri.c	r8, r5, 0x1
	beq	fold			/* bit 0 clear: nothing left */
	lbu	r9, [src]		/* trailing byte, added unshifted */

fold_ADDC:
	ADDC(sum, r9)
fold:
	/*
	 * Fold to 16 bits: add the low half (moved into the high half) to
	 * sum, so the top 16 bits become high + low; detect the carry out
	 * of that add with the same unsigned compare ADDC uses.  The srli
	 * sits between cmp.c and bleu, so the branch still acts on the
	 * cmp.c result (srli has no .c suffix here).
	 */
	slli	r26, sum, 16
	add	sum, sum, r26
	cmp.c	r26, sum
	srli	sum, sum, 16
	bleu	1f			/* r26 <= sum: no carry */
	addi	sum, 0x1		/* r26 > sum: add the carry back */
1:
	/*
	 * Odd buffer start?  r25 is non-zero when a leading odd byte was
	 * summed (set in csum_partial or above); in that case swap the two
	 * bytes of the 16-bit result.
	 */
	cmpi.c	r25, 0x0
	beq	1f
	slli	r26, sum, 8
	srli	sum, sum, 8
	or	sum, sum, r26
	andi	sum, 0xffff
1:
	/* NOTE(review): .set optimize/.set volatile are assembler mode
	 * switches; their exact effect is not visible in this file. */
	.set	optimize
	/* Add the passed partial csum. */
	ADDC(sum, r6)
	mv	r4, sum			/* result is returned in r4 */
	br	r3
	.set	volatile

/*
 * csum_partial - ones'-complement checksum over a byte buffer.
 *
 * In:	src (r4)  buffer address
 *	r5        length in bytes
 *	r6        partial checksum to add in (consumed in small_csumcpy)
 *	r3        address branched to on exit
 * Out:	r4        resulting checksum
 * Scratch: r8-r11, r25, r26, sum (r27), condition flags
 * NOTE(review): presumably the C-callable csum_partial(buff, len, sum);
 * the prototype is not visible in this file.
 *
 * Outline:
 *   1. Lengths that compare (signed) below 8 go straight to small_csumcpy.
 *   2. Consume 1 and then 2 bytes as needed to make src word aligned;
 *      r25 remembers whether the buffer started on an odd address.
 *   3. With fewer than 56 bytes left, skip to the word loop.  Otherwise
 *      consume 4, 8 and 16 bytes as needed so src is 32-byte aligned,
 *      then sum 128-byte blocks in a loop, then one 64-byte and one
 *      32-byte block selected by bits 6 and 5 of the length.
 *   4. Sum the remaining whole words, then jump to small_csumcpy for the
 *      last 0-3 bytes, the fold and the return.
 *
 * r5 is decremented only by the alignment steps; the block stages test
 * individual bits of it instead of counting it down.
 */
	.align	5
ENTRY(csum_partial)
	ldi	sum, 0
	ldi	r25, 0
	mv	r10, r5			/* small_csumcpy takes its length from r10 */
	cmpi.c	r5, 0x8
	blt	small_csumcpy		/* < 8 (signed) bytes to sum */
	/*
	 * NOTE(review): r5 >= 8 on this path, so the zero test below can
	 * never succeed and "out" appears unreachable.  "out" would also
	 * return without adding r6.  Left as-is.
	 */
	cmpi.c	r5, 0x0
	beq	out
	andri.c	r25, src, 0x1		/* r25 = 1 if the buffer starts odd */

	beq	word_align
hword_align:				/* consume 1 byte -> src 2-byte aligned */
	lbu	r8, [src]
	subi	r5, 0x1
	slli	r8, r8, 8		/* odd byte goes in bits 15..8; fold swaps back */
	ADDC(sum, r8)
	addi	src, 0x1

word_align:				/* consume 2 bytes -> src 4-byte aligned */
	andri.c	r8, src, 0x2
	beq	dword_align		/* bit 1 clear: already word aligned */
	lhu	r8, [src]
	subi	r5, 0x2
	ADDC(sum, r8)
	addi	src, 0x2

dword_align:				/* consume 4 bytes -> src 8-byte aligned */
	mv	r26, r5			/* byte count for do_end_words on the short path */
	ldi	r8, 56
	cmp.c	r8, r5
	bgtu	do_end_words		/* 56 > len (unsigned): words only */
	andri.c	r26, src, 0x4
	beq	qword_align
	lw	r8, [src]
	subi	r5, 0x4
	ADDC(sum, r8)
	addi	src, 0x4

qword_align:				/* consume 8 bytes -> src 16-byte aligned */
	andri.c	r26, src, 0x8
	beq	oword_align
	lw	r8, [src, 0x0]
	lw	r9, [src, 0x4]
	subi	r5, 0x8			/* len -= 8 */
	ADDC(sum, r8)
	ADDC(sum, r9)
	addi	src, 0x8

oword_align:				/* consume 16 bytes -> src 32-byte aligned */
	andri.c	r26, src, 0x10
	beq	begin_movement
	lw	r10, [src, 0x08]
	lw	r11, [src, 0x0c]
	lw	r8, [src, 0x00]
	lw	r9, [src, 0x04]
	ADDC(sum, r10)
	ADDC(sum, r11)
	ADDC(sum, r8)
	ADDC(sum, r9)
	subi	r5, 0x10
	addi	src, 0x10

begin_movement:
	srli.c	r26, r5, 0x7		/* r26 = len / 128 = number of 128-byte blocks */
	beq	1f			/* none: len < 128 */

	/*
	 * r26 is the block count computed by the srli.c above.  The closing
	 * bne acts on the subi.c result; the addi between them has no .c
	 * suffix.
	 */
move_128bytes:
	CSUM_BIGCHUNK(src, 0x00, sum)
	CSUM_BIGCHUNK(src, 0x20, sum)
	CSUM_BIGCHUNK(src, 0x40, sum)
	CSUM_BIGCHUNK(src, 0x60, sum)
	subi.c	r26, 0x01
	addi	src, 0x80
	bne	move_128bytes

1:					/* fewer than 128 bytes left: one 64-byte block? */
	andri.c	r10, r5, 0x40
	beq	1f

move_64bytes:
	CSUM_BIGCHUNK(src, 0x00, sum)
	CSUM_BIGCHUNK(src, 0x20, sum)
	addi	src, 0x40

1:					/* fewer than 64 bytes left: one 32-byte block? */
	andri	r26, r5, 0x1c		/* r26 = bytes in whole words below 32 (0x1c = 28) */
	andri.c	r10, r5, 0x20
	beq	do_end_words		/* decided by the andri.c on bit 5 */

move_32bytes:
	CSUM_BIGCHUNK(src, 0x00, sum)
	andri	r26, r5, 0x1c		/* same value as computed just above */
	addri	src, src, 0x20

do_end_words:				/* whole words left over */
	/*
	 * r26 holds a byte count: the whole of r5 when dword_align branched
	 * here (len < 56), otherwise r5 & 0x1c.  Shifting right by two
	 * turns it into the number of words to sum.
	 */
	cmpi.c	r26, 0x0
	beq	maybe_end_cruft
	srli	r26, r26, 0x2

end_words:
	lw	r8, [src]
	subi.c	r26, 0x1		/* one word done */
	ADDC(sum, r8)
	addi	src, 0x4
	cmpi.c	r26, 0x0		/* needed: ADDC overwrote the subi.c flags */
	bne	end_words		/* r26 != 0 */

maybe_end_cruft:			/* fewer than 4 bytes left */
	andri	r10, r5, 0x3

small_memcpy:
	mv	r5, r10
	j	small_csumcpy		/* tail bytes, fold and return */

out:
	mv	r4, sum
	br	r3

END(csum_partial)