/* libavcodec/alpha/motion_est_mvi_asm.S - vendor/opensource/ffmpeg - Git at Google */

 /*
  * Alpha optimized DSP utils
  * Copyright (c) 2002 Falk Hueffner <falk@debian.org>
  *
  * This file is part of FFmpeg.
  *
  * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
  * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
  * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */

 #include "regdef.h"

 /* Some nicer register names.  */
 /* ta..td are extra temporaries beyond t0..t9.  td is the assembler
    temporary AT, which is why ".set noat" is given below.  */
 #define ta t10
 #define tb t11
 #define tc t12
 #define td AT
 /* Danger: these overlap with the argument list and the return value */
 /* te..tg alias argument registers a5, a4, a3 and th aliases the return
    register v0, so they are only free once the value they shadow is dead.  */
 #define te a5
 #define tf a4
 #define tg a3
 #define th v0

         /* AT is named explicitly (as td and in the profiling hook), so the
            assembler must not claim it for itself.  */
         .set noat
         /* The code is hand-scheduled; ask the assembler to keep the
            instruction order as written.  */
         .set noreorder
         /* Select the pca56 target.  */
         .arch pca56
         .text

 /*****************************************************************************
  * int pix_abs16x16_mvi_asm(uint8_t *pix1, uint8_t *pix2, int line_size)
  *
  * Returns in v0 the sum of the perr (MVI pixel error, i.e. per-byte
  * absolute difference) results over 16-byte-wide rows of pix1 and pix2.
  *
  * Registers as the body uses them:
  *   a1:  pix1 ("ref"), always read with aligned ldq
  *   a2:  pix2, may be unaligned; (a2 & 7) selects the loop
  *   a3:  byte stride added to a1 and a2 after each row
  *   a4:  h, rows still to process
  *   v0:  running sum and return value
  *   a0, a5 (te): overwritten as scratch in the aligned loop
  *
  * NOTE(review): the prototype above lists three parameters, yet the body
  * takes pix1/pix2/stride from a1/a2/a3 and a row count from a4.  Presumably
  * the C declaration has an extra leading argument in a0 and a trailing h;
  * confirm against the caller.
  *
  * The unaligned loop consumes 2 rows per pass and the aligned loop 4; each
  * exits only when a4 reaches exactly zero, so h is assumed to be a non-zero
  * multiple of the rows handled per pass -- TODO confirm.
  *
  * This code is written with a pca56 in mind. For ev6, one should
  * really take the increased latency of 3 cycles for MVI instructions
  * into account.
  *
  * It is important to keep the loading and first use of a register as
  * far apart as possible, because if a register is accessed before it
  * has been fetched from memory, the CPU will stall.
  */
         .align 4
         .globl pix_abs16x16_mvi_asm
         .ent pix_abs16x16_mvi_asm
 pix_abs16x16_mvi_asm:
         /* No stack frame is set up and no registers are saved.  */
         .frame sp, 0, ra, 0
         .prologue 0

 #if CONFIG_GPROF
         /* Profiling build only: call _mcount, using AT both for the target
            address and for the return link.  */
         lda     AT, _mcount
         jsr     AT, (AT), _mcount
 #endif

         and     a2, 7, t0       # t0 = pix2 & 7 (offset within a quadword)
         clr     v0              # sum = 0
         beq     t0, $aligned    # pix2 8-byte aligned: take the ldq-only loop
         .align 4
 $unaligned:
         /* Two rows per iteration.  Each 16-byte pix2 row is fetched as three
            enclosing aligned quadwords (ldq_u) and realigned with
            extql/extqh before being compared against pix1.  */
         /* Registers:
            line 0:
            t0:  left_u -> left lo -> left
            t1:  mid
            t2:  right_u -> right hi -> right
            t3:  ref left
            t4:  ref right
            line 1:
            t5:  left_u -> left lo -> left
            t6:  mid
            t7:  right_u -> right hi -> right
            t8:  ref left
            t9:  ref right
            temp:
            ta:  left hi
            tb:  right lo
            tc:  error left
            td:  error right  */

         /* load line 0 */
         ldq_u   t0, 0(a2)       # left_u
         ldq_u   t1, 8(a2)       # mid
         ldq_u   t2, 16(a2)      # right_u
         ldq     t3, 0(a1)       # ref left (plain ldq: assumes pix1 is 8-byte aligned)
         ldq     t4, 8(a1)       # ref right
         addq    a1, a3, a1      # pix1 += stride
         addq    a2, a3, a2      # pix2 += stride
         /* load line 1 */
         ldq_u   t5, 0(a2)       # left_u
         ldq_u   t6, 8(a2)       # mid
         ldq_u   t7, 16(a2)      # right_u
         ldq     t8, 0(a1)       # ref left
         ldq     t9, 8(a1)       # ref right
         addq    a1, a3, a1      # pix1 += stride
         addq    a2, a3, a2      # pix2 += stride
         /* calc line 0 */
         /* extql/extqh shift by the low three bits of a2.  a2 has already
            been advanced by two rows here, so those bits match the row being
            assembled only if a3 is a multiple of 8 -- NOTE(review): confirm
            that callers guarantee this.  */
         extql   t0, a2, t0      # left lo
         extqh   t1, a2, ta      # left hi
         extql   t1, a2, tb      # right lo
         or      t0, ta, t0      # left
         extqh   t2, a2, t2      # right hi
         perr    t3, t0, tc      # error left
         or      t2, tb, t2      # right
         perr    t4, t2, td      # error right
         addq    v0, tc, v0      # add error left
         addq    v0, td, v0      # add error right
         /* calc line 1 */
         extql   t5, a2, t5      # left lo
         extqh   t6, a2, ta      # left hi
         extql   t6, a2, tb      # right lo
         or      t5, ta, t5      # left
         extqh   t7, a2, t7      # right hi
         perr    t8, t5, tc      # error left
         or      t7, tb, t7      # right
         perr    t9, t7, td      # error right
         addq    v0, tc, v0      # add error left
         addq    v0, td, v0      # add error right
         /* loop */
         subq    a4,  2, a4      # h -= 2
         bne     a4, $unaligned  # repeat until h is exactly 0
         ret

         .align 4
 $aligned:
         /* Four rows per iteration.  All loads are issued first and the
            perr/addq work follows, keeping each load away from its first
            use.  */
         /* load line 0 */
         ldq     t0, 0(a2)       # left
         ldq     t1, 8(a2)       # right
         addq    a2, a3, a2      # pix2 += stride
         ldq     t2, 0(a1)       # ref left
         ldq     t3, 8(a1)       # ref right
         addq    a1, a3, a1      # pix1 += stride
         /* load line 1 */
         ldq     t4, 0(a2)       # left
         ldq     t5, 8(a2)       # right
         addq    a2, a3, a2      # pix2 += stride
         ldq     t6, 0(a1)       # ref left
         ldq     t7, 8(a1)       # ref right
         addq    a1, a3, a1      # pix1 += stride
         /* load line 2 */
         ldq     t8, 0(a2)       # left
         ldq     t9, 8(a2)       # right
         addq    a2, a3, a2      # pix2 += stride
         ldq     ta, 0(a1)       # ref left
         ldq     tb, 8(a1)       # ref right
         addq    a1, a3, a1      # pix1 += stride
         /* load line 3 */
         ldq     tc, 0(a2)       # left
         ldq     td, 8(a2)       # right
         addq    a2, a3, a2      # pix2 += stride
         ldq     te, 0(a1)       # ref left (te is a5)
         ldq     a0, 8(a1)       # ref right (a0 overwritten as scratch)
         /* calc line 0 */
         perr    t0, t2, t0      # error left
         addq    a1, a3, a1      # pix1 += stride (for line 3, slotted between the perr ops)
         perr    t1, t3, t1      # error right
         addq    v0, t0, v0      # add error left
         /* calc line 1 */
         perr    t4, t6, t0      # error left
         addq    v0, t1, v0      # add error right (line 0)
         perr    t5, t7, t1      # error right
         addq    v0, t0, v0      # add error left
         /* calc line 2 */
         perr    t8, ta, t0      # error left
         addq    v0, t1, v0      # add error right (line 1)
         perr    t9, tb, t1      # error right
         addq    v0, t0, v0      # add error left
         /* calc line 3 */
         perr    tc, te, t0      # error left
         addq    v0, t1, v0      # add error right (line 2)
         perr    td, a0, t1      # error right
         addq    v0, t0, v0      # add error left
         addq    v0, t1, v0      # add error right
         /* loop */
         subq    a4,  4, a4      # h -= 4
         bne     a4, $aligned    # repeat until h is exactly 0
         ret
         .end pix_abs16x16_mvi_asm
	/*
	* Alpha optimized DSP utils
	* Copyright (c) 2002 Falk Hueffner <falk@debian.org>
	*
	* This file is part of FFmpeg.
	*
	* FFmpeg is free software; you can redistribute it and/or
	* modify it under the terms of the GNU Lesser General Public
	* License as published by the Free Software Foundation; either
	* version 2.1 of the License, or (at your option) any later version.
	*
	* FFmpeg is distributed in the hope that it will be useful,
	* but WITHOUT ANY WARRANTY; without even the implied warranty of
	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
	* Lesser General Public License for more details.
	*
	* You should have received a copy of the GNU Lesser General Public
	* License along with FFmpeg; if not, write to the Free Software
	* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
	*/

	#include "regdef.h"

	/* Some nicer register names. */
	/* ta..td are extra temporaries beyond t0..t9.  td is the assembler
	   temporary AT, which is why ".set noat" is given below. */
	#define ta t10
	#define tb t11
	#define tc t12
	#define td AT
	/* Danger: these overlap with the argument list and the return value */
	/* te..tg alias argument registers a5, a4, a3 and th aliases the return
	   register v0, so they are only free once the value they shadow is dead. */
	#define te a5
	#define tf a4
	#define tg a3
	#define th v0

	/* AT is named explicitly (as td and in the profiling hook), so the
	   assembler must not claim it for itself. */
	.set noat
	/* The code is hand-scheduled; ask the assembler to keep the
	   instruction order as written. */
	.set noreorder
	/* Select the pca56 target. */
	.arch pca56
	.text

	/*****************************************************************************
	* int pix_abs16x16_mvi_asm(uint8_t *pix1, uint8_t *pix2, int line_size)
	*
	* Returns in v0 the sum of the perr (MVI pixel error, i.e. per-byte
	* absolute difference) results over 16-byte-wide rows of pix1 and pix2.
	*
	* Registers as the body uses them:
	*   a1:  pix1 ("ref"), always read with aligned ldq
	*   a2:  pix2, may be unaligned; (a2 & 7) selects the loop
	*   a3:  byte stride added to a1 and a2 after each row
	*   a4:  h, rows still to process
	*   v0:  running sum and return value
	*   a0, a5 (te): overwritten as scratch in the aligned loop
	*
	* NOTE(review): the prototype above lists three parameters, yet the body
	* takes pix1/pix2/stride from a1/a2/a3 and a row count from a4. Presumably
	* the C declaration has an extra leading argument in a0 and a trailing h;
	* confirm against the caller.
	*
	* NOTE(review): a routine with this same global label, and the same
	* $unaligned/$aligned labels, already appears earlier in this file.
	* Defining them twice will not assemble; confirm which copy should remain.
	*
	* The unaligned loop consumes 2 rows per pass and the aligned loop 4; each
	* exits only when a4 reaches exactly zero, so h is assumed to be a non-zero
	* multiple of the rows handled per pass -- TODO confirm.
	*
	* This code is written with a pca56 in mind. For ev6, one should
	* really take the increased latency of 3 cycles for MVI instructions
	* into account.
	*
	* It is important to keep the loading and first use of a register as
	* far apart as possible, because if a register is accessed before it
	* has been fetched from memory, the CPU will stall.
	*/
	.align 4
	.globl pix_abs16x16_mvi_asm
	.ent pix_abs16x16_mvi_asm
	pix_abs16x16_mvi_asm:
	/* No stack frame is set up and no registers are saved. */
	.frame sp, 0, ra, 0
	.prologue 0

	#if CONFIG_GPROF
	/* Profiling build only: call _mcount, using AT both for the target
	   address and for the return link. */
	lda AT, _mcount
	jsr AT, (AT), _mcount
	#endif

	and a2, 7, t0 # t0 = pix2 & 7 (offset within a quadword)
	clr v0 # sum = 0
	beq t0, $aligned # pix2 8-byte aligned: take the ldq-only loop
	.align 4
	$unaligned:
	/* Two rows per iteration. Each 16-byte pix2 row is fetched as three
	   enclosing aligned quadwords (ldq_u) and realigned with
	   extql/extqh before being compared against pix1. */
	/* Registers:
	line 0:
	t0: left_u -> left lo -> left
	t1: mid
	t2: right_u -> right hi -> right
	t3: ref left
	t4: ref right
	line 1:
	t5: left_u -> left lo -> left
	t6: mid
	t7: right_u -> right hi -> right
	t8: ref left
	t9: ref right
	temp:
	ta: left hi
	tb: right lo
	tc: error left
	td: error right */

	/* load line 0 */
	ldq_u t0, 0(a2) # left_u
	ldq_u t1, 8(a2) # mid
	ldq_u t2, 16(a2) # right_u
	ldq t3, 0(a1) # ref left (plain ldq: assumes pix1 is 8-byte aligned)
	ldq t4, 8(a1) # ref right
	addq a1, a3, a1 # pix1 += stride
	addq a2, a3, a2 # pix2 += stride
	/* load line 1 */
	ldq_u t5, 0(a2) # left_u
	ldq_u t6, 8(a2) # mid
	ldq_u t7, 16(a2) # right_u
	ldq t8, 0(a1) # ref left
	ldq t9, 8(a1) # ref right
	addq a1, a3, a1 # pix1 += stride
	addq a2, a3, a2 # pix2 += stride
	/* calc line 0 */
	/* extql/extqh shift by the low three bits of a2. a2 has already
	   been advanced by two rows here, so those bits match the row being
	   assembled only if a3 is a multiple of 8 -- NOTE(review): confirm
	   that callers guarantee this. */
	extql t0, a2, t0 # left lo
	extqh t1, a2, ta # left hi
	extql t1, a2, tb # right lo
	or t0, ta, t0 # left
	extqh t2, a2, t2 # right hi
	perr t3, t0, tc # error left
	or t2, tb, t2 # right
	perr t4, t2, td # error right
	addq v0, tc, v0 # add error left
	addq v0, td, v0 # add error right
	/* calc line 1 */
	extql t5, a2, t5 # left lo
	extqh t6, a2, ta # left hi
	extql t6, a2, tb # right lo
	or t5, ta, t5 # left
	extqh t7, a2, t7 # right hi
	perr t8, t5, tc # error left
	or t7, tb, t7 # right
	perr t9, t7, td # error right
	addq v0, tc, v0 # add error left
	addq v0, td, v0 # add error right
	/* loop */
	subq a4, 2, a4 # h -= 2
	bne a4, $unaligned # repeat until h is exactly 0
	ret

	.align 4
	$aligned:
	/* Four rows per iteration. All loads are issued first and the
	   perr/addq work follows, keeping each load away from its first use. */
	/* load line 0 */
	ldq t0, 0(a2) # left
	ldq t1, 8(a2) # right
	addq a2, a3, a2 # pix2 += stride
	ldq t2, 0(a1) # ref left
	ldq t3, 8(a1) # ref right
	addq a1, a3, a1 # pix1 += stride
	/* load line 1 */
	ldq t4, 0(a2) # left
	ldq t5, 8(a2) # right
	addq a2, a3, a2 # pix2 += stride
	ldq t6, 0(a1) # ref left
	ldq t7, 8(a1) # ref right
	addq a1, a3, a1 # pix1 += stride
	/* load line 2 */
	ldq t8, 0(a2) # left
	ldq t9, 8(a2) # right
	addq a2, a3, a2 # pix2 += stride
	ldq ta, 0(a1) # ref left
	ldq tb, 8(a1) # ref right
	addq a1, a3, a1 # pix1 += stride
	/* load line 3 */
	ldq tc, 0(a2) # left
	ldq td, 8(a2) # right
	addq a2, a3, a2 # pix2 += stride
	ldq te, 0(a1) # ref left (te is a5)
	ldq a0, 8(a1) # ref right (a0 overwritten as scratch)
	/* calc line 0 */
	perr t0, t2, t0 # error left
	addq a1, a3, a1 # pix1 += stride (for line 3, slotted between the perr ops)
	perr t1, t3, t1 # error right
	addq v0, t0, v0 # add error left
	/* calc line 1 */
	perr t4, t6, t0 # error left
	addq v0, t1, v0 # add error right (line 0)
	perr t5, t7, t1 # error right
	addq v0, t0, v0 # add error left
	/* calc line 2 */
	perr t8, ta, t0 # error left
	addq v0, t1, v0 # add error right (line 1)
	perr t9, tb, t1 # error right
	addq v0, t0, v0 # add error left
	/* calc line 3 */
	perr tc, te, t0 # error left
	addq v0, t1, v0 # add error right (line 2)
	perr td, a0, t1 # error right
	addq v0, t0, v0 # add error left
	addq v0, t1, v0 # add error right
	/* loop */
	subq a4, 4, a4 # h -= 4
	bne a4, $aligned # repeat until h is exactly 0
	ret
	.end pix_abs16x16_mvi_asm