/* libavcodec/ppc/fft_altivec.c - vendor/opensource/ffmpeg - Git at Google */

 /*
  * FFT/IFFT transforms
  * AltiVec-enabled
  * Copyright (c) 2003 Romain Dolbeau <romain@dolbeau.org>
  * Based on code Copyright (c) 2002 Fabrice Bellard
  *
  * This file is part of FFmpeg.
  *
  * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
  * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
  * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 #include "libavcodec/dsputil.h"

 #include "gcc_fixes.h"

 #include "dsputil_ppc.h"
 #include "util_altivec.h"
 /**
  * Do a complex FFT with the parameters defined in ff_fft_init(). The
  * input data must have been permuted beforehand with the s->revtab table.
  * No 1.0/sqrt(n) normalization is done.
  *
  * AltiVec-enabled version. The code mirrors the SSE version, except that
  * successive MUL + ADD/SUB pairs have been merged into fused multiply-add
  * ('vec_madd' / 'vec_nmsub' in AltiVec).
  *
  * Requirements:
  *  - 'z' must be 16-byte aligned (it is accessed with vec_ld/vec_st);
  *  - every FFTComplex must be an 8-byte aligned pair of floats;
  *  - s->nbits must be at least 2: the first stage consumes four complex
  *    values per iteration and has no path for shorter transforms.
  *
  * @param s FFT context; nbits, inverse and exptab1 are read
  * @param z array of (1 << s->nbits) complex values, transformed in place
  */
 void ff_fft_calc_altivec(FFTContext *s, FFTComplex *z)
 {
 POWERPC_PERF_DECLARE(altivec_fft_num, s->nbits >= 6);
     /* vec_splat_u32() expects a small integer literal, hence 0 and not 0.0 */
     register const vector float vczero = (const vector float)vec_splat_u32(0);

     int ln = s->nbits;
     int j, np;
     int nblocks, nloops;
     register FFTComplex *p, *q;
     FFTComplex *cptr, *cptr1;
     int k;

 POWERPC_PERF_START_COUNT(altivec_fft_num, s->nbits >= 6);

     np = 1 << ln;

     /* pass 0 and 1: each iteration handles two vectors, i.e. four complex */
     {
         vector float *r, a, b, a1, c1, c2;

         r = (vector float *)&z[0];

         /* NOTE(review): vcii() comes from the project headers; it presumably
          * builds a vector of +1.0 ('p') / -1.0 ('n') constants -- confirm. */
         c1 = vcii(p,p,n,n);

         /* the sign pattern of the pass 1 butterfly depends on the direction */
         if (s->inverse) {
             c2 = vcii(p,p,n,p);
         } else {
             c2 = vcii(p,p,p,n);
         }

         j = (np >> 2);
         do {
             a = vec_ld(0, r);
             a1 = vec_ld(sizeof(vector float), r);

             b = vec_perm(a,a,vcprmle(1,0,3,2));
             a = vec_madd(a,c1,b);
             /* do the pass 0 butterfly */

             b = vec_perm(a1,a1,vcprmle(1,0,3,2));
             b = vec_madd(a1,c1,b);
             /* do the pass 0 butterfly */

             /* multiply third by -i */
             b = vec_perm(b,b,vcprmle(2,3,1,0));

             /* do the pass 1 butterfly */
             vec_st(vec_madd(b,c2,a), 0, r);
             vec_st(vec_nmsub(b,c2,a), sizeof(vector float), r);

             r += 2;
         } while (--j != 0);
     }
     /* pass 2 .. ln-1 */

     nblocks = np >> 3;
     nloops = 1 << 2;

     cptr1 = s->exptab1;
     /*
      * Entry-tested on purpose: for nbits == 2 there is no pass left
      * (nblocks == 0) and the inner do/while loops must not run at all,
      * otherwise their counters would start at 0 and wrap.
      */
     while (nblocks != 0) {
         p = z;
         q = z + nloops;
         j = nblocks;
         do {
             /* every block of a pass restarts at the same table position */
             cptr = cptr1;
             k = nloops >> 1;
             do {
                 vector float a,b,c,t1;

                 a = vec_ld(0, (float*)p);
                 b = vec_ld(0, (float*)q);

                 /* complex mul */
                 c = vec_ld(0, (float*)cptr);
                 /*  cre*re cim*re */
                 t1 = vec_madd(c, vec_perm(b,b,vcprmle(2,2,0,0)),vczero);
                 c = vec_ld(sizeof(vector float), (float*)cptr);
                 /*  -cim*im cre*im */
                 b = vec_madd(c, vec_perm(b,b,vcprmle(3,3,1,1)),t1);

                 /* butterfly */
                 vec_st(vec_add(a,b), 0, (float*)p);
                 vec_st(vec_sub(a,b), 0, (float*)q);

                 /* one vector holds two complex values; two table vectors
                  * (four FFTComplex entries) were consumed */
                 p += 2;
                 q += 2;
                 cptr += 4;
             } while (--k);

             /* skip the half that was just processed through 'q' */
             p += nloops;
             q += nloops;
         } while (--j);
         /* (nloops >> 1) iterations * 4 entries were used by this pass */
         cptr1 += nloops * 2;
         nblocks = nblocks >> 1;
         nloops = nloops << 1;
     }

 POWERPC_PERF_STOP_COUNT(altivec_fft_num, s->nbits >= 6);
 }
	/*
	* FFT/IFFT transforms
	* AltiVec-enabled
	* Copyright (c) 2003 Romain Dolbeau <romain@dolbeau.org>
	* Based on code Copyright (c) 2002 Fabrice Bellard
	*
	* This file is part of FFmpeg.
	*
	* FFmpeg is free software; you can redistribute it and/or
	* modify it under the terms of the GNU Lesser General Public
	* License as published by the Free Software Foundation; either
	* version 2.1 of the License, or (at your option) any later version.
	*
	* FFmpeg is distributed in the hope that it will be useful,
	* but WITHOUT ANY WARRANTY; without even the implied warranty of
	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
	* Lesser General Public License for more details.
	*
	* You should have received a copy of the GNU Lesser General Public
	* License along with FFmpeg; if not, write to the Free Software
	* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
	*/
	#include "libavcodec/dsputil.h"

	#include "gcc_fixes.h"

	#include "dsputil_ppc.h"
	#include "util_altivec.h"
	/**
	 * Do a complex FFT with the parameters defined in ff_fft_init(). The
	 * input data must have been permuted beforehand with the s->revtab table.
	 * No 1.0/sqrt(n) normalization is done.
	 *
	 * AltiVec-enabled version. The code mirrors the SSE version, except that
	 * successive MUL + ADD/SUB pairs have been merged into fused multiply-add
	 * ('vec_madd' / 'vec_nmsub' in AltiVec).
	 *
	 * Requirements:
	 *  - 'z' must be 16-byte aligned (it is accessed with vec_ld/vec_st);
	 *  - every FFTComplex must be an 8-byte aligned pair of floats;
	 *  - s->nbits must be at least 2: the first stage consumes four complex
	 *    values per iteration and has no path for shorter transforms.
	 *
	 * @param s FFT context; nbits, inverse and exptab1 are read
	 * @param z array of (1 << s->nbits) complex values, transformed in place
	 */
	void ff_fft_calc_altivec(FFTContext *s, FFTComplex *z)
	{
	POWERPC_PERF_DECLARE(altivec_fft_num, s->nbits >= 6);
	    /* vec_splat_u32() expects a small integer literal, hence 0 and not 0.0 */
	    register const vector float vczero = (const vector float)vec_splat_u32(0);

	    int ln = s->nbits;
	    int j, np;
	    int nblocks, nloops;
	    register FFTComplex *p, *q;
	    FFTComplex *cptr, *cptr1;
	    int k;

	POWERPC_PERF_START_COUNT(altivec_fft_num, s->nbits >= 6);

	    np = 1 << ln;

	    /* pass 0 and 1: each iteration handles two vectors, i.e. four complex */
	    {
	        vector float *r, a, b, a1, c1, c2;

	        r = (vector float *)&z[0];

	        /* NOTE(review): vcii() comes from the project headers; it presumably
	         * builds a vector of +1.0 ('p') / -1.0 ('n') constants -- confirm. */
	        c1 = vcii(p,p,n,n);

	        /* the sign pattern of the pass 1 butterfly depends on the direction */
	        if (s->inverse) {
	            c2 = vcii(p,p,n,p);
	        } else {
	            c2 = vcii(p,p,p,n);
	        }

	        j = (np >> 2);
	        do {
	            a = vec_ld(0, r);
	            a1 = vec_ld(sizeof(vector float), r);

	            b = vec_perm(a,a,vcprmle(1,0,3,2));
	            a = vec_madd(a,c1,b);
	            /* do the pass 0 butterfly */

	            b = vec_perm(a1,a1,vcprmle(1,0,3,2));
	            b = vec_madd(a1,c1,b);
	            /* do the pass 0 butterfly */

	            /* multiply third by -i */
	            b = vec_perm(b,b,vcprmle(2,3,1,0));

	            /* do the pass 1 butterfly */
	            vec_st(vec_madd(b,c2,a), 0, r);
	            vec_st(vec_nmsub(b,c2,a), sizeof(vector float), r);

	            r += 2;
	        } while (--j != 0);
	    }
	    /* pass 2 .. ln-1 */

	    nblocks = np >> 3;
	    nloops = 1 << 2;

	    cptr1 = s->exptab1;
	    /*
	     * Entry-tested on purpose: for nbits == 2 there is no pass left
	     * (nblocks == 0) and the inner do/while loops must not run at all,
	     * otherwise their counters would start at 0 and wrap.
	     */
	    while (nblocks != 0) {
	        p = z;
	        q = z + nloops;
	        j = nblocks;
	        do {
	            /* every block of a pass restarts at the same table position */
	            cptr = cptr1;
	            k = nloops >> 1;
	            do {
	                vector float a,b,c,t1;

	                a = vec_ld(0, (float*)p);
	                b = vec_ld(0, (float*)q);

	                /* complex mul */
	                c = vec_ld(0, (float*)cptr);
	                /*  cre*re cim*re */
	                t1 = vec_madd(c, vec_perm(b,b,vcprmle(2,2,0,0)),vczero);
	                c = vec_ld(sizeof(vector float), (float*)cptr);
	                /*  -cim*im cre*im */
	                b = vec_madd(c, vec_perm(b,b,vcprmle(3,3,1,1)),t1);

	                /* butterfly */
	                vec_st(vec_add(a,b), 0, (float*)p);
	                vec_st(vec_sub(a,b), 0, (float*)q);

	                /* one vector holds two complex values; two table vectors
	                 * (four FFTComplex entries) were consumed */
	                p += 2;
	                q += 2;
	                cptr += 4;
	            } while (--k);

	            /* skip the half that was just processed through 'q' */
	            p += nloops;
	            q += nloops;
	        } while (--j);
	        /* (nloops >> 1) iterations * 4 entries were used by this pass */
	        cptr1 += nloops * 2;
	        nblocks = nblocks >> 1;
	        nloops = nloops << 1;
	    }

	POWERPC_PERF_STOP_COUNT(altivec_fft_num, s->nbits >= 6);
	}