/* arch/score/lib/checksum.S - kernel/bruno - Git at Google */

 /*
  * arch/score/lib/checksum.S
  *
  * Score Processor version.
  *
  * Copyright (C) 2009 Sunplus Core Technology Co., Ltd.
  *  Lennox Wu <lennox.wu@sunplusct.com>
  *  Chen Liqin <liqin.chen@sunplusct.com>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
  * the Free Software Foundation; either version 2 of the License, or
  * (at your option) any later version.
  *
  * This program is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
  *
  * You should have received a copy of the GNU General Public License
  * along with this program; if not, see the file COPYING, or write
  * to the Free Software Foundation, Inc.,
  * 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  */
 #include <linux/linkage.h>

 /*
  * ADDC(sum, reg): sum += reg with end-around carry.
  *
  * After the add, "reg > sum" (unsigned) can only be true if the 32-bit
  * addition wrapped, so in that case 1 is added back into sum.
  *
  * sum and reg must be different registers.  The condition flags are
  * overwritten by cmp.c.  The numeric local label 9 is used for the
  * skip-over branch, so the macro can be expanded any number of times.
  */
 #define ADDC(sum,reg)			\
 	add	sum, sum, reg;		\
 	cmp.c	reg, sum;		\
 	bleu	9f;			\
 	addi	sum, 0x1;		\
 9:

 /*
  * CSUM_BIGCHUNK(src, offset, sum): add the eight 32-bit words at
  * [src + offset + 0x00] .. [src + offset + 0x1c] (32 bytes) into sum,
  * using ADDC for every word so carries are wrapped back in.
  *
  * The words are fetched with lw in two groups of four.  src is not
  * advanced; the caller steps it.  Clobbers r8, r9, r10, r11 and the
  * condition flags.
  *
  * Note: the last line of the macro ends in a backslash, so the line
  * following the macro is swallowed into the definition and has to stay
  * empty.
  */
 #define CSUM_BIGCHUNK(src, offset, sum)		\
 	lw	r8, [src, offset + 0x00];	\
 	lw	r9, [src, offset + 0x04];	\
 	lw	r10, [src, offset + 0x08];	\
 	lw	r11, [src, offset + 0x0c];	\
 	ADDC(sum, r8);				\
 	ADDC(sum, r9);				\
 	ADDC(sum, r10);				\
 	ADDC(sum, r11);				\
 	lw	r8, [src, offset + 0x10];	\
 	lw	r9, [src, offset + 0x14];	\
 	lw	r10, [src, offset + 0x18]; 	\
 	lw	r11, [src, offset + 0x1c]; 	\
 	ADDC(sum, r8);				\
 	ADDC(sum, r9);				\
 	ADDC(sum, r10);				\
 	ADDC(sum, r11);				\

 /* Register aliases used by the checksum code below. */
 /* src: address of the next data to be summed; r4 also carries the result on return. */
 #define src r4
 /* dest: not referenced by the code visible in this file; r5 itself is used as the byte count. */
 #define dest r5
 /* sum: running 32-bit checksum accumulator. */
 #define sum r27

 	.text
 /*
  * small_csumcpy: sum the last few bytes, fold, and return.
  *
  * Reached from csum_partial either directly (length below 8, any source
  * alignment) or after the bulk loops (0-3 bytes left, source aligned).
  *
  * In:	r10  number of bytes still to sum (copied into r5 here)
  *	src  address of those bytes
  *	sum  checksum accumulated so far
  *	r25  1 if the odd leading byte was already consumed by csum_partial,
  *	     otherwise 0 (it is then recomputed here as src & 1)
  *	r6   partial checksum supplied by the caller
  * Out:	r4   final checksum; control leaves through "br r3"
  * Uses:	r5, r8, r9, r26 and the condition flags.
  *
  * The code depends on flags surviving instructions that have no ".c"
  * suffix, so the instruction order must not be changed casually.
  */
 small_csumcpy:
 	mv	r5, r10
 	ldi	r9, 0x0
 	cmpi.c	r25, 0x1
 	beq pass_small_set_t7	/* r25 == 1: flags stay "equal", so the next beq goes to aligned */
 	andri.c	r25,r4 , 0x1	/* r25 = src & 1; "equal" (zero) means src is at least 2-byte aligned */

 pass_small_set_t7:
 	beq	aligned
 	/* src is odd: consume one byte so the rest is 2-byte aligned */
 	cmpi.c	r5, 0x0
 	beq	fold
 	lbu	r9, [src]
 	slli	r9,r9, 0x8	/* odd-address byte goes in bits 8-15; the swap after fold undoes this */
 	ADDC(sum, r9)
 	addi	src, 0x1
 	subi.c	r5, 0x1

 	/* src is 2-byte aligned from here; the bits of r5 select what is left */
 aligned:
 	andri.c r8, r5, 0x4	/* bit 2 of the count set: a 4-byte group remains */
 	beq	len_less_4bytes

 	/* Four bytes to sum: one lw if src is 4-byte aligned, else two lhu. */
 	andri.c	r8, src, 0x3	/* zero means src is 4-byte aligned */
 	beq	four_byte_aligned
 	lhu 	r9, [src]
 	addi	src, 2
 	ADDC(sum, r9)
 	lhu 	r9, [src]
 	addi	src, 2
 	ADDC(sum, r9)
 	b len_less_4bytes

 four_byte_aligned:		/* 4-byte group, src 4-byte aligned */
 	lw	r9, [src]
 	addi	src, 4
 	ADDC(sum, r9)

 len_less_4bytes:		/* src 2-byte aligned; bit 1 of the count selects a halfword */
 	andri.c r8, r5, 0x2
 	beq	len_less_2bytes
 	lhu	r9, [src]
 	addi	src, 0x2	/* src += 2 */
 	ADDC(sum, r9)

 len_less_2bytes:		/* bit 0 of the count selects one trailing byte */
 	andri.c r8, r5, 0x1
 	beq 	fold		/* no trailing byte: go straight to the fold */
 	lbu	r9, [src]	/* trailing byte is added unshifted (bits 0-7) */

 fold_ADDC:
 	ADDC(sum, r9)
 fold:
 	/*
 	 * Fold the 32-bit sum to 16 bits: adding (sum << 16) to sum leaves
 	 * high + low in the upper half; a wrap of that add is the carry,
 	 * which is added back after shifting the result down.
 	 */
 	slli	r26, sum, 16
 	add	sum, sum, r26
 	cmp.c	r26, sum
 	srli	sum, sum, 16	/* the bleu below still tests the cmp.c result */
 	bleu 	1f 		/* r26 <= sum: the add did not wrap */
 	addi	sum, 0x1 	/* r26 > sum: wrapped, add the carry */
 1:
 	/* Buffer started on an odd address (r25 != 0): swap the two bytes of the 16-bit result */
 	cmpi.c	r25, 0x0
 	beq	1f
 	slli	r26, sum, 8
 	srli	sum, sum, 8
 	or	sum, sum, r26
 	andi	sum, 0xffff
 1:
 	/* NOTE(review): .set optimize/.set volatile are assembler mode switches; exact effect not visible here - confirm */
 	.set	optimize
 	/* Add the partial checksum passed in r6, with end-around carry. */
 	ADDC(sum, r6)
 	mv	r4, sum		/* result is returned in r4 */
 	br	r3		/* branch to the address in r3 (presumably the return address - confirm) */
 	.set	volatile

 	.align	5
 /*
  * csum_partial: checksum a buffer with 32-bit end-around-carry adds.
  *
  * In:	r4 (src)  buffer address, any alignment
  *	r5        length in bytes
  *	r6        partial checksum from the caller; it is added in by the
  *	          shared tail code (small_csumcpy/fold)
  * Out:	r4        final checksum; the return ("br r3") is in the tail code
  * Uses:	r8-r11 (load temporaries), r10 (byte count handed to
  *	small_csumcpy), r25 (1 if the buffer started on an odd address),
  *	r26 (block/word counter), r27 (sum) and the condition flags.
  *
  * Flow:
  *   - length < 8 (signed): everything is done by small_csumcpy.
  *   - otherwise the leading 1 and 2 bytes are consumed to reach 4-byte
  *     alignment (r5 is reduced accordingly);
  *   - length < 56: the rest is summed word by word;
  *   - length >= 56: src is aligned up to 32 bytes, then 128-, 64- and
  *     32-byte blocks are summed, then the remaining words.  r5 is not
  *     reduced by the block loops; its bits select which blocks run.
  *   - the final 0-3 bytes, the fold and the return are done by
  *     small_csumcpy.
  *
  * Flags set by ".c" instructions are consumed several instructions later
  * in places, so instruction order matters.
  */
 ENTRY(csum_partial)
 	ldi sum, 0
 	ldi r25, 0
 	mv r10, r5			/* small_csumcpy takes its byte count in r10 */
 	cmpi.c	r5, 0x8
 	blt	small_csumcpy		/* < 8 (signed) bytes to sum */

 	/*
 	 * The length is at least 8 here, so no zero-length test is needed.
 	 * (Every exit goes through small_csumcpy, which adds the caller's
 	 * partial checksum from r6.)
 	 */
 	andri.c	r25, src, 0x1		/* r25 = 1 if the buffer starts on an odd address */

 	beq	word_align
 hword_align:				/* consume 1 byte to reach 2-byte alignment */
 	lbu	r8, [src]
 	subi	r5, 0x1
 	slli	r8, r8, 8		/* same placement as the odd byte in small_csumcpy */
 	ADDC(sum, r8)
 	addi	src, 0x1

 word_align:				/* consume 2 bytes to reach 4-byte alignment */
 	andri.c r8, src, 0x2		/* zero means src is already 4-byte aligned */
 	beq	dword_align
 	lhu	r8, [src]
 	subi	r5, 0x2
 	ADDC(sum, r8)
 	addi	src, 0x2

 dword_align:				/* src is 4-byte aligned */
 	mv 	r26, r5			/* byte count for do_end_words on the short path */
 	ldi 	r8, 56
 	cmp.c	r8, r5
 	bgtu	do_end_words		/* 56 > len (unsigned): sum word by word */
 	andri.c	r26, src, 0x4
 	beq	qword_align
 	lw	r8, [src]
 	subi	r5, 0x4
 	ADDC(sum, r8)
 	addi	src, 0x4

 qword_align:				/* src is 8-byte aligned */
 	andri.c r26, src, 0x8
 	beq	oword_align
 	lw	r8, [src, 0x0]
 	lw	r9, [src, 0x4]
 	subi	r5, 0x8			/* len -= 8 */
 	ADDC(sum, r8)
 	ADDC(sum, r9)
 	addi	src, 0x8

 oword_align:				/* src is 16-byte aligned */
 	andri.c	r26, src, 0x10
 	beq	begin_movement
 	lw	r10, [src, 0x08]
 	lw	r11, [src, 0x0c]
 	lw	r8, [src, 0x00]
 	lw	r9, [src, 0x04]
 	ADDC(sum, r10)
 	ADDC(sum, r11)
 	ADDC(sum, r8)
 	ADDC(sum, r9)
 	subi	r5, 0x10
 	addi	src, 0x10

 begin_movement:				/* src is 32-byte aligned */
 	srli.c	r26, r5, 0x7		/* r26 = len / 128 = number of 128-byte blocks */
 	beq	1f			/* none: len < 128 */

 /* r26 counts the 128-byte blocks still to sum */
 move_128bytes:
 	CSUM_BIGCHUNK(src, 0x00, sum)
 	CSUM_BIGCHUNK(src, 0x20, sum)
 	CSUM_BIGCHUNK(src, 0x40, sum)
 	CSUM_BIGCHUNK(src, 0x60, sum)
 	subi.c	r26, 0x01		/* one block done; sets the flags for bne */
 	addi	src, 0x80
 	bne	move_128bytes		/* tests the subi.c result */

 1:	/* bit 6 of len: one 64-byte block */
 	andri.c	r10, r5, 0x40
 	beq	1f

 move_64bytes:
 	CSUM_BIGCHUNK(src, 0x00, sum)
 	CSUM_BIGCHUNK(src, 0x20, sum)
 	addi	src, 0x40

 1:					/* bit 5 of len: one 32-byte block */
 	andri	r26, r5, 0x1c		/* r26 = bytes left in whole words (0..28) */
 	andri.c	r10, r5, 0x20
 	beq	do_end_words		/* tests the andri.c result */

 move_32bytes:
 	CSUM_BIGCHUNK(src, 0x00, sum)
 	andri	r26, r5, 0x1c
 	addri	src, src, 0x20

 do_end_words:
 	/*
 	 * r26 = bytes to sum word-wise: the whole remaining length when
 	 * arriving from dword_align (len < 56), else len & 0x1c.  On both
 	 * paths a non-zero r26 is at least 4, so the word count below is
 	 * at least 1.
 	 */
 	cmpi.c	r26, 0x0
 	beq	maybe_end_cruft		/* no whole word left */
 	srli	r26, r26, 0x2		/* bytes -> words */

 end_words:
 	lw	r8, [src]
 	subi.c	r26, 0x1		/* one word done */
 	ADDC(sum, r8)			/* overwrites the flags, hence the cmpi.c below */
 	addi	src, 0x4
 	cmpi.c	r26, 0x0
 	bne	end_words		/* r26 != 0 */

 maybe_end_cruft:			/* 0-3 bytes left */
 	andri	r10, r5, 0x3

 small_memcpy:
 	mv	r5, r10
 	j	small_csumcpy		/* tail bytes, fold, add r6, return */

 END(csum_partial)
	/*
	* arch/score/lib/checksum.S
	*
	* Score Processor version.
	*
	* Copyright (C) 2009 Sunplus Core Technology Co., Ltd.
	* Lennox Wu <lennox.wu@sunplusct.com>
	* Chen Liqin <liqin.chen@sunplusct.com>
	*
	* This program is free software; you can redistribute it and/or modify
	* it under the terms of the GNU General Public License as published by
	* the Free Software Foundation; either version 2 of the License, or
	* (at your option) any later version.
	*
	* This program is distributed in the hope that it will be useful,
	* but WITHOUT ANY WARRANTY; without even the implied warranty of
	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	* GNU General Public License for more details.
	*
	* You should have received a copy of the GNU General Public License
	* along with this program; if not, see the file COPYING, or write
	* to the Free Software Foundation, Inc.,
	* 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
	*/
	#include <linux/linkage.h>

	/*
	 * ADDC(sum, reg): sum += reg with end-around carry.
	 *
	 * After the add, "reg > sum" (unsigned) can only be true if the 32-bit
	 * addition wrapped, so in that case 1 is added back into sum.
	 *
	 * sum and reg must be different registers.  The condition flags are
	 * overwritten by cmp.c.  The numeric local label 9 is used for the
	 * skip-over branch, so the macro can be expanded any number of times.
	 */
	#define ADDC(sum,reg) \
	add sum, sum, reg; \
	cmp.c reg, sum; \
	bleu 9f; \
	addi sum, 0x1; \
	9:

	/*
	 * CSUM_BIGCHUNK(src, offset, sum): add the eight 32-bit words at
	 * [src + offset + 0x00] .. [src + offset + 0x1c] (32 bytes) into sum,
	 * using ADDC for every word so carries are wrapped back in.
	 *
	 * The words are fetched with lw in two groups of four.  src is not
	 * advanced; the caller steps it.  Clobbers r8, r9, r10, r11 and the
	 * condition flags.
	 *
	 * Note: the last line of the macro ends in a backslash, so the line
	 * following the macro is swallowed into the definition and has to stay
	 * empty.
	 */
	#define CSUM_BIGCHUNK(src, offset, sum) \
	lw r8, [src, offset + 0x00]; \
	lw r9, [src, offset + 0x04]; \
	lw r10, [src, offset + 0x08]; \
	lw r11, [src, offset + 0x0c]; \
	ADDC(sum, r8); \
	ADDC(sum, r9); \
	ADDC(sum, r10); \
	ADDC(sum, r11); \
	lw r8, [src, offset + 0x10]; \
	lw r9, [src, offset + 0x14]; \
	lw r10, [src, offset + 0x18]; \
	lw r11, [src, offset + 0x1c]; \
	ADDC(sum, r8); \
	ADDC(sum, r9); \
	ADDC(sum, r10); \
	ADDC(sum, r11); \

	/* Register aliases used by the checksum code below. */
	/* src: address of the next data to be summed; r4 also carries the result on return. */
	#define src r4
	/* dest: not referenced by the code visible in this file; r5 itself is used as the byte count. */
	#define dest r5
	/* sum: running 32-bit checksum accumulator. */
	#define sum r27

	.text
	/*
	 * small_csumcpy: sum the last few bytes, fold, and return.
	 *
	 * Reached from csum_partial either directly (length below 8, any source
	 * alignment) or after the bulk loops (0-3 bytes left, source aligned).
	 *
	 * In:   r10  number of bytes still to sum (copied into r5 here)
	 *       src  address of those bytes
	 *       sum  checksum accumulated so far
	 *       r25  1 if the odd leading byte was already consumed by
	 *            csum_partial, otherwise 0 (recomputed here as src & 1)
	 *       r6   partial checksum supplied by the caller
	 * Out:  r4   final checksum; control leaves through "br r3"
	 * Uses: r5, r8, r9, r26 and the condition flags.
	 *
	 * The code depends on flags surviving instructions that have no ".c"
	 * suffix, so the instruction order must not be changed casually.
	 */
	small_csumcpy:
	mv r5, r10
	ldi r9, 0x0
	cmpi.c r25, 0x1
	beq pass_small_set_t7 /* r25 == 1: flags stay "equal", so the next beq goes to aligned */
	andri.c r25,r4 , 0x1 /* r25 = src & 1; "equal" (zero) means src is at least 2-byte aligned */

	pass_small_set_t7:
	beq aligned
	/* src is odd: consume one byte so the rest is 2-byte aligned */
	cmpi.c r5, 0x0
	beq fold
	lbu r9, [src]
	slli r9,r9, 0x8 /* odd-address byte goes in bits 8-15; the swap after fold undoes this */
	ADDC(sum, r9)
	addi src, 0x1
	subi.c r5, 0x1

	/* src is 2-byte aligned from here; the bits of r5 select what is left */
	aligned:
	andri.c r8, r5, 0x4 /* bit 2 of the count set: a 4-byte group remains */
	beq len_less_4bytes

	/* Four bytes to sum: one lw if src is 4-byte aligned, else two lhu. */
	andri.c r8, src, 0x3 /* zero means src is 4-byte aligned */
	beq four_byte_aligned
	lhu r9, [src]
	addi src, 2
	ADDC(sum, r9)
	lhu r9, [src]
	addi src, 2
	ADDC(sum, r9)
	b len_less_4bytes

	four_byte_aligned: /* 4-byte group, src 4-byte aligned */
	lw r9, [src]
	addi src, 4
	ADDC(sum, r9)

	len_less_4bytes: /* src 2-byte aligned; bit 1 of the count selects a halfword */
	andri.c r8, r5, 0x2
	beq len_less_2bytes
	lhu r9, [src]
	addi src, 0x2 /* src += 2 */
	ADDC(sum, r9)

	len_less_2bytes: /* bit 0 of the count selects one trailing byte */
	andri.c r8, r5, 0x1
	beq fold /* no trailing byte: go straight to the fold */
	lbu r9, [src] /* trailing byte is added unshifted (bits 0-7) */

	fold_ADDC:
	ADDC(sum, r9)
	fold:
	/*
	 * Fold the 32-bit sum to 16 bits: adding (sum << 16) to sum leaves
	 * high + low in the upper half; a wrap of that add is the carry,
	 * which is added back after shifting the result down.
	 */
	slli r26, sum, 16
	add sum, sum, r26
	cmp.c r26, sum
	srli sum, sum, 16 /* the bleu below still tests the cmp.c result */
	bleu 1f /* r26 <= sum: the add did not wrap */
	addi sum, 0x1 /* r26 > sum: wrapped, add the carry */
	1:
	/* Buffer started on an odd address (r25 != 0): swap the two bytes of the 16-bit result */
	cmpi.c r25, 0x0
	beq 1f
	slli r26, sum, 8
	srli sum, sum, 8
	or sum, sum, r26
	andi sum, 0xffff
	1:
	/* NOTE(review): .set optimize/.set volatile are assembler mode switches; exact effect not visible here - confirm */
	.set optimize
	/* Add the partial checksum passed in r6, with end-around carry. */
	ADDC(sum, r6)
	mv r4, sum /* result is returned in r4 */
	br r3 /* branch to the address in r3 (presumably the return address - confirm) */
	.set volatile

	.align 5
	/*
	 * csum_partial: checksum a buffer with 32-bit end-around-carry adds.
	 *
	 * In:   r4 (src)  buffer address, any alignment
	 *       r5        length in bytes
	 *       r6        partial checksum from the caller; it is added in by
	 *                 the shared tail code (small_csumcpy/fold)
	 * Out:  r4        final checksum; the return ("br r3") is in the tail code
	 * Uses: r8-r11 (load temporaries), r10 (byte count handed to
	 *       small_csumcpy), r25 (1 if the buffer started on an odd address),
	 *       r26 (block/word counter), r27 (sum) and the condition flags.
	 *
	 * Flow:
	 *   - length < 8 (signed): everything is done by small_csumcpy.
	 *   - otherwise the leading 1 and 2 bytes are consumed to reach 4-byte
	 *     alignment (r5 is reduced accordingly);
	 *   - length < 56: the rest is summed word by word;
	 *   - length >= 56: src is aligned up to 32 bytes, then 128-, 64- and
	 *     32-byte blocks are summed, then the remaining words.  r5 is not
	 *     reduced by the block loops; its bits select which blocks run.
	 *   - the final 0-3 bytes, the fold and the return are done by
	 *     small_csumcpy.
	 *
	 * Flags set by ".c" instructions are consumed several instructions later
	 * in places, so instruction order matters.
	 */
	ENTRY(csum_partial)
	ldi sum, 0
	ldi r25, 0
	mv r10, r5 /* small_csumcpy takes its byte count in r10 */
	cmpi.c r5, 0x8
	blt small_csumcpy /* < 8 (signed) bytes to sum */

	/*
	 * The length is at least 8 here, so no zero-length test is needed.
	 * (Every exit goes through small_csumcpy, which adds the caller's
	 * partial checksum from r6.)
	 */
	andri.c r25, src, 0x1 /* r25 = 1 if the buffer starts on an odd address */

	beq word_align
	hword_align: /* consume 1 byte to reach 2-byte alignment */
	lbu r8, [src]
	subi r5, 0x1
	slli r8, r8, 8 /* same placement as the odd byte in small_csumcpy */
	ADDC(sum, r8)
	addi src, 0x1

	word_align: /* consume 2 bytes to reach 4-byte alignment */
	andri.c r8, src, 0x2 /* zero means src is already 4-byte aligned */
	beq dword_align
	lhu r8, [src]
	subi r5, 0x2
	ADDC(sum, r8)
	addi src, 0x2

	dword_align: /* src is 4-byte aligned */
	mv r26, r5 /* byte count for do_end_words on the short path */
	ldi r8, 56
	cmp.c r8, r5
	bgtu do_end_words /* 56 > len (unsigned): sum word by word */
	andri.c r26, src, 0x4
	beq qword_align
	lw r8, [src]
	subi r5, 0x4
	ADDC(sum, r8)
	addi src, 0x4

	qword_align: /* src is 8-byte aligned */
	andri.c r26, src, 0x8
	beq oword_align
	lw r8, [src, 0x0]
	lw r9, [src, 0x4]
	subi r5, 0x8 /* len -= 8 */
	ADDC(sum, r8)
	ADDC(sum, r9)
	addi src, 0x8

	oword_align: /* src is 16-byte aligned */
	andri.c r26, src, 0x10
	beq begin_movement
	lw r10, [src, 0x08]
	lw r11, [src, 0x0c]
	lw r8, [src, 0x00]
	lw r9, [src, 0x04]
	ADDC(sum, r10)
	ADDC(sum, r11)
	ADDC(sum, r8)
	ADDC(sum, r9)
	subi r5, 0x10
	addi src, 0x10

	begin_movement: /* src is 32-byte aligned */
	srli.c r26, r5, 0x7 /* r26 = len / 128 = number of 128-byte blocks */
	beq 1f /* none: len < 128 */

	/* r26 counts the 128-byte blocks still to sum */
	move_128bytes:
	CSUM_BIGCHUNK(src, 0x00, sum)
	CSUM_BIGCHUNK(src, 0x20, sum)
	CSUM_BIGCHUNK(src, 0x40, sum)
	CSUM_BIGCHUNK(src, 0x60, sum)
	subi.c r26, 0x01 /* one block done; sets the flags for bne */
	addi src, 0x80
	bne move_128bytes /* tests the subi.c result */

	1: /* bit 6 of len: one 64-byte block */
	andri.c r10, r5, 0x40
	beq 1f

	move_64bytes:
	CSUM_BIGCHUNK(src, 0x00, sum)
	CSUM_BIGCHUNK(src, 0x20, sum)
	addi src, 0x40

	1: /* bit 5 of len: one 32-byte block */
	andri r26, r5, 0x1c /* r26 = bytes left in whole words (0..28) */
	andri.c r10, r5, 0x20
	beq do_end_words /* tests the andri.c result */

	move_32bytes:
	CSUM_BIGCHUNK(src, 0x00, sum)
	andri r26, r5, 0x1c
	addri src, src, 0x20

	do_end_words:
	/*
	 * r26 = bytes to sum word-wise: the whole remaining length when
	 * arriving from dword_align (len < 56), else len & 0x1c.  On both
	 * paths a non-zero r26 is at least 4, so the word count below is
	 * at least 1.
	 */
	cmpi.c r26, 0x0
	beq maybe_end_cruft /* no whole word left */
	srli r26, r26, 0x2 /* bytes -> words */

	end_words:
	lw r8, [src]
	subi.c r26, 0x1 /* one word done */
	ADDC(sum, r8) /* overwrites the flags, hence the cmpi.c below */
	addi src, 0x4
	cmpi.c r26, 0x0
	bne end_words /* r26 != 0 */

	maybe_end_cruft: /* 0-3 bytes left */
	andri r10, r5, 0x3

	small_memcpy:
	mv r5, r10
	j small_csumcpy /* tail bytes, fold, add r6, return */

	END(csum_partial)