| ; |
| ; Copyright (c) 2010 The WebM project authors. All Rights Reserved. |
| ; |
| ; Use of this source code is governed by a BSD-style license |
| ; that can be found in the LICENSE file in the root of the source |
| ; tree. An additional intellectual property rights grant can be found |
| ; in the file PATENTS. All contributing project authors may |
| ; be found in the AUTHORS file in the root of the source tree. |
| ; |
| |
| |
| .globl mbloop_filter_horizontal_edge_y_ppc |
| .globl loop_filter_horizontal_edge_y_ppc |
| .globl mbloop_filter_vertical_edge_y_ppc |
| .globl loop_filter_vertical_edge_y_ppc |
| |
| .globl mbloop_filter_horizontal_edge_uv_ppc |
| .globl loop_filter_horizontal_edge_uv_ppc |
| .globl mbloop_filter_vertical_edge_uv_ppc |
| .globl loop_filter_vertical_edge_uv_ppc |
| |
| .globl loop_filter_simple_horizontal_edge_ppc |
| .globl loop_filter_simple_vertical_edge_ppc |
| |
| .text |
| ;# We often need to perform transposes (and other transpose-like operations) |
| ;# on matrices of data. This is simplified by the fact that we usually |
| ;# operate on hunks of data whose dimensions are powers of 2, or at least |
| ;# divisible by highish powers of 2. |
| ;# |
| ;# These operations can be very confusing. They become more straightforward |
| ;# when we think of them as permutations of address bits: Concatenate a |
| ;# group of vector registers and think of it as occupying a block of |
| ;# memory beginning at address zero. The low four bits 0...3 of the |
| ;# address then correspond to position within a register, the higher-order |
| ;# address bits select the register. |
| ;# |
| ;# Although register selection, at the code level, is arbitrary, things |
| ;# are simpler if we use contiguous ranges of register numbers, simpler |
| ;# still if the low-order bits of the register number correspond to |
| ;# conceptual address bits. We do this whenever reasonable. |
| ;# |
| ;# A 16x16 transpose can then be thought of as an operation on |
| ;# a 256-element block of memory. It takes 8 bits 0...7 to address this |
| ;# memory and the effect of a transpose is to interchange address bit |
| ;# 0 with 4, 1 with 5, 2 with 6, and 3 with 7. Bits 0...3 index the |
| ;# column, which is interchanged with the row addressed by bits 4..7. |
| ;# |
| ;# The altivec merge instructions provide a rapid means of effecting |
| ;# many of these transforms. They operate at three widths (8,16,32). |
| ;# Writing V(x) for vector register #x, paired merges permute address |
| ;# indices as follows. |
| ;# |
| ;# 0->1 1->2 2->3 3->(4+d) (4+s)->0: |
| ;# |
| ;# vmrghb V( x), V( y), V( y + (1<<s)) |
| ;# vmrglb V( x + (1<<d)), V( y), V( y + (1<<s)) |
| ;# |
| ;# |
| ;# =0= 1->2 2->3 3->(4+d) (4+s)->1: |
| ;# |
| ;# vmrghh V( x), V( y), V( y + (1<<s)) |
| ;# vmrglh V( x + (1<<d)), V( y), V( y + (1<<s)) |
| ;# |
| ;# |
| ;# =0= =1= 2->3 3->(4+d) (4+s)->2: |
| ;# |
| ;# vmrghw V( x), V( y), V( y + (1<<s)) |
| ;# vmrglw V( x + (1<<d)), V( y), V( y + (1<<s)) |
| ;# |
| ;# |
| ;# Unfortunately, there is no doubleword merge instruction. |
| ;# The following sequence uses "vperm" is a substitute. |
| ;# Assuming that the selection masks b_hihi and b_lolo (defined in LFppc.c) |
| ;# are in registers Vhihi and Vlolo, we can also effect the permutation |
| ;# |
| ;# =0= =1= =2= 3->(4+d) (4+s)->3 by the sequence: |
| ;# |
| ;# vperm V( x), V( y), V( y + (1<<s)), Vhihi |
| ;# vperm V( x + (1<<d)), V( y), V( y + (1<<s)), Vlolo |
| ;# |
| ;# |
| ;# Except for bits s and d, the other relationships between register |
| ;# number (= high-order part of address) bits are at the disposal of |
| ;# the programmer. |
| ;# |
| |
| ;# To avoid excess transposes, we filter all 3 vertical luma subblock |
| ;# edges together. This requires a single 16x16 transpose, which, in |
| ;# the above language, amounts to the following permutation of address |
| ;# indices: 0<->4 1<->5 2<->6 3<->7, which we accomplish by |
| ;# 4 iterations of the cyclic transform 0->1->2->3->4->5->6->7->0. |
| ;# |
| ;# Except for the fact that the destination registers get written |
| ;# before we are done referencing the old contents, the cyclic transform |
| ;# is effected by |
| ;# |
| ;# x = 0; do { |
| ;# vmrghb V(2x), V(x), V(x+8); |
| ;# vmrghb V(2x+1), V(x), V(x+8); |
| ;# } while( ++x < 8); |
| ;# |
| ;# For clarity, and because we can afford it, we do this transpose |
| ;# using all 32 registers, alternating the banks 0..15 and 16 .. 31, |
| ;# leaving the final result in 16 .. 31, as the lower registers are |
| ;# used in the filtering itself. |
| ;# |
| .macro Tpair A, B, X, Y |
| vmrghb \A, \X, \Y |
| vmrglb \B, \X, \Y |
| .endm |
| |
| ;# Each step takes 8*2 = 16 instructions |
| |
| .macro t16_even |
| Tpair v16,v17, v0,v8 |
| Tpair v18,v19, v1,v9 |
| Tpair v20,v21, v2,v10 |
| Tpair v22,v23, v3,v11 |
| Tpair v24,v25, v4,v12 |
| Tpair v26,v27, v5,v13 |
| Tpair v28,v29, v6,v14 |
| Tpair v30,v31, v7,v15 |
| .endm |
| |
| .macro t16_odd |
| Tpair v0,v1, v16,v24 |
| Tpair v2,v3, v17,v25 |
| Tpair v4,v5, v18,v26 |
| Tpair v6,v7, v19,v27 |
| Tpair v8,v9, v20,v28 |
| Tpair v10,v11, v21,v29 |
| Tpair v12,v13, v22,v30 |
| Tpair v14,v15, v23,v31 |
| .endm |
| |
| ;# Whole transpose takes 4*16 = 64 instructions |
| |
| .macro t16_full |
| t16_odd |
| t16_even |
| t16_odd |
| t16_even |
| .endm |
| |
| ;# Vertical edge filtering requires transposes. For the simple filter, |
| ;# we need to convert 16 rows of 4 pels each into 4 registers of 16 pels |
| ;# each. Writing 0 ... 63 for the pixel indices, the desired result is: |
| ;# |
| ;# v0 = 0 1 ... 14 15 |
| ;# v1 = 16 17 ... 30 31 |
| ;# v2 = 32 33 ... 47 48 |
| ;# v3 = 49 50 ... 62 63 |
| ;# |
| ;# In frame-buffer memory, the layout is: |
| ;# |
| ;# 0 16 32 48 |
| ;# 1 17 33 49 |
| ;# ... |
| ;# 15 31 47 63. |
| ;# |
| ;# We begin by reading the data 32 bits at a time (using scalar operations) |
| ;# into a temporary array, reading the rows of the array into vector registers, |
| ;# with the following layout: |
| ;# |
| ;# v0 = 0 16 32 48 4 20 36 52 8 24 40 56 12 28 44 60 |
| ;# v1 = 1 17 33 49 5 21 ... 45 61 |
| ;# v2 = 2 18 ... 46 62 |
| ;# v3 = 3 19 ... 47 63 |
| ;# |
| ;# From the "address-bit" perspective discussed above, we simply need to |
| ;# interchange bits 0 <-> 4 and 1 <-> 5, leaving bits 2 and 3 alone. |
| ;# In other words, we transpose each of the four 4x4 submatrices. |
| ;# |
| ;# This transformation is its own inverse, and we need to perform it |
| ;# again before writing the pixels back into the frame buffer. |
| ;# |
| ;# It acts in place on registers v0...v3, uses v4...v7 as temporaries, |
| ;# and assumes that v14/v15 contain the b_hihi/b_lolo selectors |
| ;# defined above. We think of both groups of 4 registers as having |
| ;# "addresses" {0,1,2,3} * 16. |
| ;# |
| .macro Transpose4times4x4 Vlo, Vhi |
| |
| ;# d=s=0 0->1 1->2 2->3 3->4 4->0 =5= |
| |
| vmrghb v4, v0, v1 |
| vmrglb v5, v0, v1 |
| vmrghb v6, v2, v3 |
| vmrglb v7, v2, v3 |
| |
| ;# d=0 s=1 =0= 1->2 2->3 3->4 4->5 5->1 |
| |
| vmrghh v0, v4, v6 |
| vmrglh v1, v4, v6 |
| vmrghh v2, v5, v7 |
| vmrglh v3, v5, v7 |
| |
| ;# d=s=0 =0= =1= 2->3 3->4 4->2 =5= |
| |
| vmrghw v4, v0, v1 |
| vmrglw v5, v0, v1 |
| vmrghw v6, v2, v3 |
| vmrglw v7, v2, v3 |
| |
| ;# d=0 s=1 =0= =1= =2= 3->4 4->5 5->3 |
| |
| vperm v0, v4, v6, \Vlo |
| vperm v1, v4, v6, \Vhi |
| vperm v2, v5, v7, \Vlo |
| vperm v3, v5, v7, \Vhi |
| .endm |
| ;# end Transpose4times4x4 |
| |
| |
| ;# Normal mb vertical edge filter transpose. |
| ;# |
| ;# We read 8 columns of data, initially in the following pattern: |
| ;# |
| ;# (0,0) (1,0) ... (7,0) (0,1) (1,1) ... (7,1) |
| ;# (0,2) (1,2) ... (7,2) (0,3) (1,3) ... (7,3) |
| ;# ... |
| ;# (0,14) (1,14) .. (7,14) (0,15) (1,15) .. (7,15) |
| ;# |
| ;# and wish to convert to: |
| ;# |
| ;# (0,0) ... (0,15) |
| ;# (1,0) ... (1,15) |
| ;# ... |
| ;# (7,0) ... (7,15). |
| ;# |
| ;# In "address bit" language, we wish to map |
| ;# |
| ;# 0->4 1->5 2->6 3->0 4->1 5->2 6->3, i.e., I -> (I+4) mod 7. |
| ;# |
| ;# This can be accomplished by 4 iterations of the cyclic transform |
| ;# |
| ;# I -> (I+1) mod 7; |
| ;# |
| ;# each iteration can be realized by (d=0, s=2): |
| ;# |
| ;# x = 0; do Tpair( V(2x),V(2x+1), V(x),V(x+4)) while( ++x < 4); |
| ;# |
| ;# The input/output is in registers v0...v7. We use v10...v17 as mirrors; |
| ;# preserving v8 = sign converter. |
| ;# |
| ;# Inverse transpose is similar, except here I -> (I+3) mod 7 and the |
| ;# result lands in the "mirror" registers v10...v17 |
| ;# |
| .macro t8x16_odd |
| Tpair v10, v11, v0, v4 |
| Tpair v12, v13, v1, v5 |
| Tpair v14, v15, v2, v6 |
| Tpair v16, v17, v3, v7 |
| .endm |
| |
| .macro t8x16_even |
| Tpair v0, v1, v10, v14 |
| Tpair v2, v3, v11, v15 |
| Tpair v4, v5, v12, v16 |
| Tpair v6, v7, v13, v17 |
| .endm |
| |
| .macro transpose8x16_fwd |
| t8x16_odd |
| t8x16_even |
| t8x16_odd |
| t8x16_even |
| .endm |
| |
| .macro transpose8x16_inv |
| t8x16_odd |
| t8x16_even |
| t8x16_odd |
| .endm |
| |
| .macro Transpose16x16 |
| vmrghb v0, v16, v24 |
| vmrglb v1, v16, v24 |
| vmrghb v2, v17, v25 |
| vmrglb v3, v17, v25 |
| vmrghb v4, v18, v26 |
| vmrglb v5, v18, v26 |
| vmrghb v6, v19, v27 |
| vmrglb v7, v19, v27 |
| vmrghb v8, v20, v28 |
| vmrglb v9, v20, v28 |
| vmrghb v10, v21, v29 |
| vmrglb v11, v21, v29 |
| vmrghb v12, v22, v30 |
| vmrglb v13, v22, v30 |
| vmrghb v14, v23, v31 |
| vmrglb v15, v23, v31 |
| vmrghb v16, v0, v8 |
| vmrglb v17, v0, v8 |
| vmrghb v18, v1, v9 |
| vmrglb v19, v1, v9 |
| vmrghb v20, v2, v10 |
| vmrglb v21, v2, v10 |
| vmrghb v22, v3, v11 |
| vmrglb v23, v3, v11 |
| vmrghb v24, v4, v12 |
| vmrglb v25, v4, v12 |
| vmrghb v26, v5, v13 |
| vmrglb v27, v5, v13 |
| vmrghb v28, v6, v14 |
| vmrglb v29, v6, v14 |
| vmrghb v30, v7, v15 |
| vmrglb v31, v7, v15 |
| vmrghb v0, v16, v24 |
| vmrglb v1, v16, v24 |
| vmrghb v2, v17, v25 |
| vmrglb v3, v17, v25 |
| vmrghb v4, v18, v26 |
| vmrglb v5, v18, v26 |
| vmrghb v6, v19, v27 |
| vmrglb v7, v19, v27 |
| vmrghb v8, v20, v28 |
| vmrglb v9, v20, v28 |
| vmrghb v10, v21, v29 |
| vmrglb v11, v21, v29 |
| vmrghb v12, v22, v30 |
| vmrglb v13, v22, v30 |
| vmrghb v14, v23, v31 |
| vmrglb v15, v23, v31 |
| vmrghb v16, v0, v8 |
| vmrglb v17, v0, v8 |
| vmrghb v18, v1, v9 |
| vmrglb v19, v1, v9 |
| vmrghb v20, v2, v10 |
| vmrglb v21, v2, v10 |
| vmrghb v22, v3, v11 |
| vmrglb v23, v3, v11 |
| vmrghb v24, v4, v12 |
| vmrglb v25, v4, v12 |
| vmrghb v26, v5, v13 |
| vmrglb v27, v5, v13 |
| vmrghb v28, v6, v14 |
| vmrglb v29, v6, v14 |
| vmrghb v30, v7, v15 |
| vmrglb v31, v7, v15 |
| .endm |
| |
| ;# load_g loads a global vector (whose address is in the local variable Gptr) |
| ;# into vector register Vreg. Trashes r0 |
| .macro load_g Vreg, Gptr |
| lwz r0, \Gptr |
| lvx \Vreg, 0, r0 |
| .endm |
| |
| ;# exploit the saturation here. if the answer is negative |
| ;# it will be clamped to 0. orring 0 with a positive |
| ;# number will be the positive number (abs) |
| ;# RES = abs( A-B), trashes TMP |
| .macro Abs RES, TMP, A, B |
| vsububs \RES, \A, \B |
| vsububs \TMP, \B, \A |
| vor \RES, \RES, \TMP |
| .endm |
| |
| ;# RES = Max( RES, abs( A-B)), trashes TMP |
| .macro max_abs RES, TMP, A, B |
| vsububs \TMP, \A, \B |
| vmaxub \RES, \RES, \TMP |
| vsububs \TMP, \B, \A |
| vmaxub \RES, \RES, \TMP |
| .endm |
| |
| .macro Masks |
| ;# build masks |
| ;# input is all 8 bit unsigned (0-255). need to |
| ;# do abs(vala-valb) > limit. but no need to compare each |
| ;# value to the limit. find the max of the absolute differences |
| ;# and compare that to the limit. |
| ;# First hev |
| Abs v14, v13, v2, v3 ;# |P1 - P0| |
| max_abs v14, v13, v5, v4 ;# |Q1 - Q0| |
| |
| vcmpgtub v10, v14, v10 ;# HEV = true if thresh exceeded |
| |
| ;# Next limit |
| max_abs v14, v13, v0, v1 ;# |P3 - P2| |
| max_abs v14, v13, v1, v2 ;# |P2 - P1| |
| max_abs v14, v13, v6, v5 ;# |Q2 - Q1| |
| max_abs v14, v13, v7, v6 ;# |Q3 - Q2| |
| |
| vcmpgtub v9, v14, v9 ;# R = true if limit exceeded |
| |
| ;# flimit |
| Abs v14, v13, v3, v4 ;# |P0 - Q0| |
| |
| vcmpgtub v8, v14, v8 ;# X = true if flimit exceeded |
| |
| vor v8, v8, v9 ;# R = true if flimit or limit exceeded |
| ;# done building masks |
| .endm |
| |
| .macro build_constants RFL, RLI, RTH, FL, LI, TH |
| ;# build constants |
| lvx \FL, 0, \RFL ;# flimit |
| lvx \LI, 0, \RLI ;# limit |
| lvx \TH, 0, \RTH ;# thresh |
| |
| vspltisb v11, 8 |
| vspltisb v12, 4 |
| vslb v11, v11, v12 ;# 0x80808080808080808080808080808080 |
| .endm |
| |
| .macro load_data_y |
| ;# setup strides/pointers to be able to access |
| ;# all of the data |
| add r5, r4, r4 ;# r5 = 2 * stride |
| sub r6, r3, r5 ;# r6 -> 2 rows back |
| neg r7, r4 ;# r7 = -stride |
| |
| ;# load 16 pixels worth of data to work on |
| sub r0, r6, r5 ;# r0 -> 4 rows back (temp) |
| lvx v0, 0, r0 ;# P3 (read only) |
| lvx v1, r7, r6 ;# P2 |
| lvx v2, 0, r6 ;# P1 |
| lvx v3, r7, r3 ;# P0 |
| lvx v4, 0, r3 ;# Q0 |
| lvx v5, r4, r3 ;# Q1 |
| lvx v6, r5, r3 ;# Q2 |
| add r0, r3, r5 ;# r0 -> 2 rows fwd (temp) |
| lvx v7, r4, r0 ;# Q3 (read only) |
| .endm |
| |
| ;# Expects |
| ;# v10 == HEV |
| ;# v13 == tmp |
| ;# v14 == tmp |
| .macro common_adjust P0, Q0, P1, Q1, HEV_PRESENT |
| vxor \P1, \P1, v11 ;# SP1 |
| vxor \P0, \P0, v11 ;# SP0 |
| vxor \Q0, \Q0, v11 ;# SQ0 |
| vxor \Q1, \Q1, v11 ;# SQ1 |
| |
| vsubsbs v13, \P1, \Q1 ;# f = c (P1 - Q1) |
| .if \HEV_PRESENT |
| vand v13, v13, v10 ;# f &= hev |
| .endif |
| vsubsbs v14, \Q0, \P0 ;# -126 <= X = Q0-P0 <= +126 |
| vaddsbs v13, v13, v14 |
| vaddsbs v13, v13, v14 |
| vaddsbs v13, v13, v14 ;# A = c( c(P1-Q1) + 3*(Q0-P0)) |
| |
| vandc v13, v13, v8 ;# f &= mask |
| |
| vspltisb v8, 3 |
| vspltisb v9, 4 |
| |
| vaddsbs v14, v13, v9 ;# f1 = c (f+4) |
| vaddsbs v15, v13, v8 ;# f2 = c (f+3) |
| |
| vsrab v13, v14, v8 ;# f1 >>= 3 |
| vsrab v15, v15, v8 ;# f2 >>= 3 |
| |
| vsubsbs \Q0, \Q0, v13 ;# u1 = c (SQ0 - f1) |
| vaddsbs \P0, \P0, v15 ;# u2 = c (SP0 + f2) |
| .endm |
| |
| .macro vp8_mbfilter |
| Masks |
| |
| ;# start the fitering here |
| vxor v1, v1, v11 ;# SP2 |
| vxor v2, v2, v11 ;# SP1 |
| vxor v3, v3, v11 ;# SP0 |
| vxor v4, v4, v11 ;# SQ0 |
| vxor v5, v5, v11 ;# SQ1 |
| vxor v6, v6, v11 ;# SQ2 |
| |
| ;# add outer taps if we have high edge variance |
| vsubsbs v13, v2, v5 ;# f = c (SP1-SQ1) |
| |
| vsubsbs v14, v4, v3 ;# SQ0-SP0 |
| vaddsbs v13, v13, v14 |
| vaddsbs v13, v13, v14 |
| vaddsbs v13, v13, v14 ;# f = c( c(SP1-SQ1) + 3*(SQ0-SP0)) |
| |
| vandc v13, v13, v8 ;# f &= mask |
| vand v15, v13, v10 ;# f2 = f & hev |
| |
| ;# save bottom 3 bits so that we round one side +4 and the other +3 |
| vspltisb v8, 3 |
| vspltisb v9, 4 |
| |
| vaddsbs v14, v15, v9 ;# f1 = c (f+4) |
| vaddsbs v15, v15, v8 ;# f2 = c (f+3) |
| |
| vsrab v14, v14, v8 ;# f1 >>= 3 |
| vsrab v15, v15, v8 ;# f2 >>= 3 |
| |
| vsubsbs v4, v4, v14 ;# u1 = c (SQ0 - f1) |
| vaddsbs v3, v3, v15 ;# u2 = c (SP0 + f2) |
| |
| ;# only apply wider filter if not high edge variance |
| vandc v13, v13, v10 ;# f &= ~hev |
| |
| vspltisb v9, 2 |
| vnor v8, v8, v8 |
| vsrb v9, v8, v9 ;# 0x3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f |
| vupkhsb v9, v9 ;# 0x003f003f003f003f003f003f003f003f |
| vspltisb v8, 9 |
| |
| ;# roughly 1/7th difference across boundary |
| vspltish v10, 7 |
| vmulosb v14, v8, v13 ;# A = c( c(P1-Q1) + 3*(Q0-P0)) |
| vmulesb v15, v8, v13 |
| vaddshs v14, v14, v9 ;# += 63 |
| vaddshs v15, v15, v9 |
| vsrah v14, v14, v10 ;# >>= 7 |
| vsrah v15, v15, v10 |
| vmrglh v10, v15, v14 |
| vmrghh v15, v15, v14 |
| |
| vpkshss v10, v15, v10 ;# X = saturated down to bytes |
| |
| vsubsbs v6, v6, v10 ;# subtract from Q and add to P |
| vaddsbs v1, v1, v10 |
| |
| vxor v6, v6, v11 |
| vxor v1, v1, v11 |
| |
| ;# roughly 2/7th difference across boundary |
| vspltish v10, 7 |
| vaddubm v12, v8, v8 |
| vmulosb v14, v12, v13 ;# A = c( c(P1-Q1) + 3*(Q0-P0)) |
| vmulesb v15, v12, v13 |
| vaddshs v14, v14, v9 |
| vaddshs v15, v15, v9 |
| vsrah v14, v14, v10 ;# >>= 7 |
| vsrah v15, v15, v10 |
| vmrglh v10, v15, v14 |
| vmrghh v15, v15, v14 |
| |
| vpkshss v10, v15, v10 ;# X = saturated down to bytes |
| |
| vsubsbs v5, v5, v10 ;# subtract from Q and add to P |
| vaddsbs v2, v2, v10 |
| |
| vxor v5, v5, v11 |
| vxor v2, v2, v11 |
| |
| ;# roughly 3/7th difference across boundary |
| vspltish v10, 7 |
| vaddubm v12, v12, v8 |
| vmulosb v14, v12, v13 ;# A = c( c(P1-Q1) + 3*(Q0-P0)) |
| vmulesb v15, v12, v13 |
| vaddshs v14, v14, v9 |
| vaddshs v15, v15, v9 |
| vsrah v14, v14, v10 ;# >>= 7 |
| vsrah v15, v15, v10 |
| vmrglh v10, v15, v14 |
| vmrghh v15, v15, v14 |
| |
| vpkshss v10, v15, v10 ;# X = saturated down to bytes |
| |
| vsubsbs v4, v4, v10 ;# subtract from Q and add to P |
| vaddsbs v3, v3, v10 |
| |
| vxor v4, v4, v11 |
| vxor v3, v3, v11 |
| .endm |
| |
| .macro SBFilter |
| Masks |
| |
| common_adjust v3, v4, v2, v5, 1 |
| |
| ;# outer tap adjustments |
| vspltisb v8, 1 |
| |
| vaddubm v13, v13, v8 ;# f += 1 |
| vsrab v13, v13, v8 ;# f >>= 1 |
| |
| vandc v13, v13, v10 ;# f &= ~hev |
| |
| vsubsbs v5, v5, v13 ;# u1 = c (SQ1 - f) |
| vaddsbs v2, v2, v13 ;# u2 = c (SP1 + f) |
| |
| vxor v2, v2, v11 |
| vxor v3, v3, v11 |
| vxor v4, v4, v11 |
| vxor v5, v5, v11 |
| .endm |
| |
| .align 2 |
| mbloop_filter_horizontal_edge_y_ppc: |
| mfspr r11, 256 ;# get old VRSAVE |
| oris r12, r11, 0xffff |
| mtspr 256, r12 ;# set VRSAVE |
| |
| build_constants r5, r6, r7, v8, v9, v10 |
| |
| load_data_y |
| |
| vp8_mbfilter |
| |
| stvx v1, r7, r6 ;# P2 |
| stvx v2, 0, r6 ;# P1 |
| stvx v3, r7, r3 ;# P0 |
| stvx v4, 0, r3 ;# Q0 |
| stvx v5, r4, r3 ;# Q1 |
| stvx v6, r5, r3 ;# Q2 |
| |
| mtspr 256, r11 ;# reset old VRSAVE |
| |
| blr |
| |
| .align 2 |
| ;# r3 unsigned char *s |
| ;# r4 int p |
| ;# r5 const signed char *flimit |
| ;# r6 const signed char *limit |
| ;# r7 const signed char *thresh |
| loop_filter_horizontal_edge_y_ppc: |
| mfspr r11, 256 ;# get old VRSAVE |
| oris r12, r11, 0xffff |
| mtspr 256, r12 ;# set VRSAVE |
| |
| build_constants r5, r6, r7, v8, v9, v10 |
| |
| load_data_y |
| |
| SBFilter |
| |
| stvx v2, 0, r6 ;# P1 |
| stvx v3, r7, r3 ;# P0 |
| stvx v4, 0, r3 ;# Q0 |
| stvx v5, r4, r3 ;# Q1 |
| |
| mtspr 256, r11 ;# reset old VRSAVE |
| |
| blr |
| |
| ;# Filtering a vertical mb. Each mb is aligned on a 16 byte boundary. |
| ;# So we can read in an entire mb aligned. However if we want to filter the mb |
| ;# edge we run into problems. For the loopfilter we require 4 bytes before the mb |
| ;# and 4 after for a total of 8 bytes. Reading 16 bytes inorder to get 4 is a bit |
| ;# of a waste. So this is an even uglier way to get around that. |
| ;# Using the regular register file words are read in and then saved back out to |
| ;# memory to align and order them up. Then they are read in using the |
| ;# vector register file. |
| .macro RLVmb V, R |
| lwzux r0, r3, r4 |
| stw r0, 4(\R) |
| lwz r0,-4(r3) |
| stw r0, 0(\R) |
| lwzux r0, r3, r4 |
| stw r0,12(\R) |
| lwz r0,-4(r3) |
| stw r0, 8(\R) |
| lvx \V, 0, \R |
| .endm |
| |
| .macro WLVmb V, R |
| stvx \V, 0, \R |
| lwz r0,12(\R) |
| stwux r0, r3, r4 |
| lwz r0, 8(\R) |
| stw r0,-4(r3) |
| lwz r0, 4(\R) |
| stwux r0, r3, r4 |
| lwz r0, 0(\R) |
| stw r0,-4(r3) |
| .endm |
| |
| .align 2 |
| ;# r3 unsigned char *s |
| ;# r4 int p |
| ;# r5 const signed char *flimit |
| ;# r6 const signed char *limit |
| ;# r7 const signed char *thresh |
| mbloop_filter_vertical_edge_y_ppc: |
| mfspr r11, 256 ;# get old VRSAVE |
| oris r12, r11, 0xffff |
| ori r12, r12, 0xc000 |
| mtspr 256, r12 ;# set VRSAVE |
| |
| la r9, -48(r1) ;# temporary space for reading in vectors |
| sub r3, r3, r4 |
| |
| RLVmb v0, r9 |
| RLVmb v1, r9 |
| RLVmb v2, r9 |
| RLVmb v3, r9 |
| RLVmb v4, r9 |
| RLVmb v5, r9 |
| RLVmb v6, r9 |
| RLVmb v7, r9 |
| |
| transpose8x16_fwd |
| |
| build_constants r5, r6, r7, v8, v9, v10 |
| |
| vp8_mbfilter |
| |
| transpose8x16_inv |
| |
| add r3, r3, r4 |
| neg r4, r4 |
| |
| WLVmb v17, r9 |
| WLVmb v16, r9 |
| WLVmb v15, r9 |
| WLVmb v14, r9 |
| WLVmb v13, r9 |
| WLVmb v12, r9 |
| WLVmb v11, r9 |
| WLVmb v10, r9 |
| |
| mtspr 256, r11 ;# reset old VRSAVE |
| |
| blr |
| |
| .macro RL V, R, P |
| lvx \V, 0, \R |
| add \R, \R, \P |
| .endm |
| |
| .macro WL V, R, P |
| stvx \V, 0, \R |
| add \R, \R, \P |
| .endm |
| |
| .macro Fil P3, P2, P1, P0, Q0, Q1, Q2, Q3 |
| ;# K = |P0-P1| already |
| Abs v14, v13, \Q0, \Q1 ;# M = |Q0-Q1| |
| vmaxub v14, v14, v4 ;# M = max( |P0-P1|, |Q0-Q1|) |
| vcmpgtub v10, v14, v0 |
| |
| Abs v4, v5, \Q2, \Q3 ;# K = |Q2-Q3| = next |P0-P1] |
| |
| max_abs v14, v13, \Q1, \Q2 ;# M = max( M, |Q1-Q2|) |
| max_abs v14, v13, \P1, \P2 ;# M = max( M, |P1-P2|) |
| max_abs v14, v13, \P2, \P3 ;# M = max( M, |P2-P3|) |
| |
| vmaxub v14, v14, v4 ;# M = max interior abs diff |
| vcmpgtub v9, v14, v2 ;# M = true if int_l exceeded |
| |
| Abs v14, v13, \P0, \Q0 ;# X = Abs( P0-Q0) |
| vcmpgtub v8, v14, v3 ;# X = true if edge_l exceeded |
| vor v8, v8, v9 ;# M = true if edge_l or int_l exceeded |
| |
| ;# replace P1,Q1 w/signed versions |
| common_adjust \P0, \Q0, \P1, \Q1, 1 |
| |
| vaddubm v13, v13, v1 ;# -16 <= M <= 15, saturation irrelevant |
| vsrab v13, v13, v1 |
| vandc v13, v13, v10 ;# adjust P1,Q1 by (M+1)>>1 if ! hev |
| vsubsbs \Q1, \Q1, v13 |
| vaddsbs \P1, \P1, v13 |
| |
| vxor \P1, \P1, v11 ;# P1 |
| vxor \P0, \P0, v11 ;# P0 |
| vxor \Q0, \Q0, v11 ;# Q0 |
| vxor \Q1, \Q1, v11 ;# Q1 |
| .endm |
| |
| |
| .align 2 |
| ;# r3 unsigned char *s |
| ;# r4 int p |
| ;# r5 const signed char *flimit |
| ;# r6 const signed char *limit |
| ;# r7 const signed char *thresh |
| loop_filter_vertical_edge_y_ppc: |
| mfspr r11, 256 ;# get old VRSAVE |
| oris r12, r11, 0xffff |
| ori r12, r12, 0xffff |
| mtspr 256, r12 ;# set VRSAVE |
| |
| addi r9, r3, 0 |
| RL v16, r9, r4 |
| RL v17, r9, r4 |
| RL v18, r9, r4 |
| RL v19, r9, r4 |
| RL v20, r9, r4 |
| RL v21, r9, r4 |
| RL v22, r9, r4 |
| RL v23, r9, r4 |
| RL v24, r9, r4 |
| RL v25, r9, r4 |
| RL v26, r9, r4 |
| RL v27, r9, r4 |
| RL v28, r9, r4 |
| RL v29, r9, r4 |
| RL v30, r9, r4 |
| lvx v31, 0, r9 |
| |
| Transpose16x16 |
| |
| vspltisb v1, 1 |
| |
| build_constants r5, r6, r7, v3, v2, v0 |
| |
| Abs v4, v5, v19, v18 ;# K(v14) = first |P0-P1| |
| |
| Fil v16, v17, v18, v19, v20, v21, v22, v23 |
| Fil v20, v21, v22, v23, v24, v25, v26, v27 |
| Fil v24, v25, v26, v27, v28, v29, v30, v31 |
| |
| Transpose16x16 |
| |
| addi r9, r3, 0 |
| WL v16, r9, r4 |
| WL v17, r9, r4 |
| WL v18, r9, r4 |
| WL v19, r9, r4 |
| WL v20, r9, r4 |
| WL v21, r9, r4 |
| WL v22, r9, r4 |
| WL v23, r9, r4 |
| WL v24, r9, r4 |
| WL v25, r9, r4 |
| WL v26, r9, r4 |
| WL v27, r9, r4 |
| WL v28, r9, r4 |
| WL v29, r9, r4 |
| WL v30, r9, r4 |
| stvx v31, 0, r9 |
| |
| mtspr 256, r11 ;# reset old VRSAVE |
| |
| blr |
| |
| ;# -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- UV FILTERING -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- |
| .macro active_chroma_sel V |
| andi. r7, r3, 8 ;# row origin modulo 16 |
| add r7, r7, r7 ;# selects selectors |
| lis r12, _chromaSelectors@ha |
| la r0, _chromaSelectors@l(r12) |
| lwzux r0, r7, r0 ;# leave selector addr in r7 |
| |
| lvx \V, 0, r0 ;# mask to concatenate active U,V pels |
| .endm |
| |
| .macro hread_uv Dest, U, V, Offs, VMask |
| lvx \U, \Offs, r3 |
| lvx \V, \Offs, r4 |
| vperm \Dest, \U, \V, \VMask ;# Dest = active part of U then V |
| .endm |
| |
| .macro hwrite_uv New, U, V, Offs, Umask, Vmask |
| vperm \U, \New, \U, \Umask ;# Combine new pels with siblings |
| vperm \V, \New, \V, \Vmask |
| stvx \U, \Offs, r3 ;# Write to frame buffer |
| stvx \V, \Offs, r4 |
| .endm |
| |
| ;# Process U,V in parallel. |
| .macro load_chroma_h |
| neg r9, r5 ;# r9 = -1 * stride |
| add r8, r9, r9 ;# r8 = -2 * stride |
| add r10, r5, r5 ;# r10 = 2 * stride |
| |
| active_chroma_sel v12 |
| |
| ;# P3, Q3 are read-only; need not save addresses or sibling pels |
| add r6, r8, r8 ;# r6 = -4 * stride |
| hread_uv v0, v14, v15, r6, v12 |
| add r6, r10, r5 ;# r6 = 3 * stride |
| hread_uv v7, v14, v15, r6, v12 |
| |
| ;# Others are read/write; save addresses and sibling pels |
| |
| add r6, r8, r9 ;# r6 = -3 * stride |
| hread_uv v1, v16, v17, r6, v12 |
| hread_uv v2, v18, v19, r8, v12 |
| hread_uv v3, v20, v21, r9, v12 |
| hread_uv v4, v22, v23, 0, v12 |
| hread_uv v5, v24, v25, r5, v12 |
| hread_uv v6, v26, v27, r10, v12 |
| .endm |
| |
| .macro uresult_sel V |
| load_g \V, 4(r7) |
| .endm |
| |
| .macro vresult_sel V |
| load_g \V, 8(r7) |
| .endm |
| |
| ;# always write P1,P0,Q0,Q1 |
| .macro store_chroma_h |
| uresult_sel v11 |
| vresult_sel v12 |
| hwrite_uv v2, v18, v19, r8, v11, v12 |
| hwrite_uv v3, v20, v21, r9, v11, v12 |
| hwrite_uv v4, v22, v23, 0, v11, v12 |
| hwrite_uv v5, v24, v25, r5, v11, v12 |
| .endm |
| |
| .align 2 |
| ;# r3 unsigned char *u |
| ;# r4 unsigned char *v |
| ;# r5 int p |
| ;# r6 const signed char *flimit |
| ;# r7 const signed char *limit |
| ;# r8 const signed char *thresh |
| mbloop_filter_horizontal_edge_uv_ppc: |
| mfspr r11, 256 ;# get old VRSAVE |
| oris r12, r11, 0xffff |
| ori r12, r12, 0xffff |
| mtspr 256, r12 ;# set VRSAVE |
| |
| build_constants r6, r7, r8, v8, v9, v10 |
| |
| load_chroma_h |
| |
| vp8_mbfilter |
| |
| store_chroma_h |
| |
| hwrite_uv v1, v16, v17, r6, v11, v12 ;# v1 == P2 |
| hwrite_uv v6, v26, v27, r10, v11, v12 ;# v6 == Q2 |
| |
| mtspr 256, r11 ;# reset old VRSAVE |
| |
| blr |
| |
| .align 2 |
| ;# r3 unsigned char *u |
| ;# r4 unsigned char *v |
| ;# r5 int p |
| ;# r6 const signed char *flimit |
| ;# r7 const signed char *limit |
| ;# r8 const signed char *thresh |
| loop_filter_horizontal_edge_uv_ppc: |
| mfspr r11, 256 ;# get old VRSAVE |
| oris r12, r11, 0xffff |
| ori r12, r12, 0xffff |
| mtspr 256, r12 ;# set VRSAVE |
| |
| build_constants r6, r7, r8, v8, v9, v10 |
| |
| load_chroma_h |
| |
| SBFilter |
| |
| store_chroma_h |
| |
| mtspr 256, r11 ;# reset old VRSAVE |
| |
| blr |
| |
| .macro R V, R |
| lwzux r0, r3, r5 |
| stw r0, 4(\R) |
| lwz r0,-4(r3) |
| stw r0, 0(\R) |
| lwzux r0, r4, r5 |
| stw r0,12(\R) |
| lwz r0,-4(r4) |
| stw r0, 8(\R) |
| lvx \V, 0, \R |
| .endm |
| |
| |
| .macro W V, R |
| stvx \V, 0, \R |
| lwz r0,12(\R) |
| stwux r0, r4, r5 |
| lwz r0, 8(\R) |
| stw r0,-4(r4) |
| lwz r0, 4(\R) |
| stwux r0, r3, r5 |
| lwz r0, 0(\R) |
| stw r0,-4(r3) |
| .endm |
| |
| .macro chroma_vread R |
| sub r3, r3, r5 ;# back up one line for simplicity |
| sub r4, r4, r5 |
| |
| R v0, \R |
| R v1, \R |
| R v2, \R |
| R v3, \R |
| R v4, \R |
| R v5, \R |
| R v6, \R |
| R v7, \R |
| |
| transpose8x16_fwd |
| .endm |
| |
| .macro chroma_vwrite R |
| |
| transpose8x16_inv |
| |
| add r3, r3, r5 |
| add r4, r4, r5 |
| neg r5, r5 ;# Write rows back in reverse order |
| |
| W v17, \R |
| W v16, \R |
| W v15, \R |
| W v14, \R |
| W v13, \R |
| W v12, \R |
| W v11, \R |
| W v10, \R |
| .endm |
| |
| .align 2 |
| ;# r3 unsigned char *u |
| ;# r4 unsigned char *v |
| ;# r5 int p |
| ;# r6 const signed char *flimit |
| ;# r7 const signed char *limit |
| ;# r8 const signed char *thresh |
| mbloop_filter_vertical_edge_uv_ppc: |
| mfspr r11, 256 ;# get old VRSAVE |
| oris r12, r11, 0xffff |
| ori r12, r12, 0xc000 |
| mtspr 256, r12 ;# set VRSAVE |
| |
| la r9, -48(r1) ;# temporary space for reading in vectors |
| |
| chroma_vread r9 |
| |
| build_constants r6, r7, r8, v8, v9, v10 |
| |
| vp8_mbfilter |
| |
| chroma_vwrite r9 |
| |
| mtspr 256, r11 ;# reset old VRSAVE |
| |
| blr |
| |
| .align 2 |
| ;# r3 unsigned char *u |
| ;# r4 unsigned char *v |
| ;# r5 int p |
| ;# r6 const signed char *flimit |
| ;# r7 const signed char *limit |
| ;# r8 const signed char *thresh |
| loop_filter_vertical_edge_uv_ppc: |
| mfspr r11, 256 ;# get old VRSAVE |
| oris r12, r11, 0xffff |
| ori r12, r12, 0xc000 |
| mtspr 256, r12 ;# set VRSAVE |
| |
| la r9, -48(r1) ;# temporary space for reading in vectors |
| |
| chroma_vread r9 |
| |
| build_constants r6, r7, r8, v8, v9, v10 |
| |
| SBFilter |
| |
| chroma_vwrite r9 |
| |
| mtspr 256, r11 ;# reset old VRSAVE |
| |
| blr |
| |
| ;# -=-=-=-=-=-=-=-=-=-=-=-=-=-= SIMPLE LOOP FILTER =-=-=-=-=-=-=-=-=-=-=-=-=-=- |
| |
| .macro vp8_simple_filter |
| Abs v14, v13, v1, v2 ;# M = abs( P0 - Q0) |
| vcmpgtub v8, v14, v8 ;# v5 = true if _over_ limit |
| |
| ;# preserve unsigned v0 and v3 |
| common_adjust v1, v2, v0, v3, 0 |
| |
| vxor v1, v1, v11 |
| vxor v2, v2, v11 ;# cvt Q0, P0 back to pels |
| .endm |
| |
| .macro simple_vertical |
| addi r8, 0, 16 |
| addi r7, r5, 32 |
| |
| lvx v0, 0, r5 |
| lvx v1, r8, r5 |
| lvx v2, 0, r7 |
| lvx v3, r8, r7 |
| |
| lis r12, _B_hihi@ha |
| la r0, _B_hihi@l(r12) |
| lvx v16, 0, r0 |
| |
| lis r12, _B_lolo@ha |
| la r0, _B_lolo@l(r12) |
| lvx v17, 0, r0 |
| |
| Transpose4times4x4 v16, v17 |
| vp8_simple_filter |
| |
| vxor v0, v0, v11 |
| vxor v3, v3, v11 ;# cvt Q0, P0 back to pels |
| |
| Transpose4times4x4 v16, v17 |
| |
| stvx v0, 0, r5 |
| stvx v1, r8, r5 |
| stvx v2, 0, r7 |
| stvx v3, r8, r7 |
| .endm |
| |
| .align 2 |
| ;# r3 unsigned char *s |
| ;# r4 int p |
| ;# r5 const signed char *flimit |
| loop_filter_simple_horizontal_edge_ppc: |
| mfspr r11, 256 ;# get old VRSAVE |
| oris r12, r11, 0xffff |
| mtspr 256, r12 ;# set VRSAVE |
| |
| ;# build constants |
| lvx v8, 0, r5 ;# flimit |
| |
| vspltisb v11, 8 |
| vspltisb v12, 4 |
| vslb v11, v11, v12 ;# 0x80808080808080808080808080808080 |
| |
| neg r5, r4 ;# r5 = -1 * stride |
| add r6, r5, r5 ;# r6 = -2 * stride |
| |
| lvx v0, r6, r3 ;# v0 = P1 = 16 pels two rows above edge |
| lvx v1, r5, r3 ;# v1 = P0 = 16 pels one row above edge |
| lvx v2, 0, r3 ;# v2 = Q0 = 16 pels one row below edge |
| lvx v3, r4, r3 ;# v3 = Q1 = 16 pels two rows below edge |
| |
| vp8_simple_filter |
| |
| stvx v1, r5, r3 ;# store P0 |
| stvx v2, 0, r3 ;# store Q0 |
| |
| mtspr 256, r11 ;# reset old VRSAVE |
| |
| blr |
| |
| .macro RLV Offs |
| stw r0, (\Offs*4)(r5) |
| lwzux r0, r7, r4 |
| .endm |
| |
| .macro WLV Offs |
| lwz r0, (\Offs*4)(r5) |
| stwux r0, r7, r4 |
| .endm |
| |
| .align 2 |
| ;# r3 unsigned char *s |
| ;# r4 int p |
| ;# r5 const signed char *flimit |
| loop_filter_simple_vertical_edge_ppc: |
| mfspr r11, 256 ;# get old VRSAVE |
| oris r12, r11, 0xffff |
| ori r12, r12, 0xc000 |
| mtspr 256, r12 ;# set VRSAVE |
| |
| ;# build constants |
| lvx v8, 0, r5 ;# flimit |
| |
| vspltisb v11, 8 |
| vspltisb v12, 4 |
| vslb v11, v11, v12 ;# 0x80808080808080808080808080808080 |
| |
| la r5, -96(r1) ;# temporary space for reading in vectors |
| |
| ;# Store 4 pels at word "Offs" in temp array, then advance r7 |
| ;# to next row and read another 4 pels from the frame buffer. |
| |
| subi r7, r3, 2 ;# r7 -> 2 pels before start |
| lwzx r0, 0, r7 ;# read first 4 pels |
| |
| ;# 16 unaligned word accesses |
| RLV 0 |
| RLV 4 |
| RLV 8 |
| RLV 12 |
| RLV 1 |
| RLV 5 |
| RLV 9 |
| RLV 13 |
| RLV 2 |
| RLV 6 |
| RLV 10 |
| RLV 14 |
| RLV 3 |
| RLV 7 |
| RLV 11 |
| |
| stw r0, (15*4)(r5) ;# write last 4 pels |
| |
| simple_vertical |
| |
| ;# Read temp array, write frame buffer. |
| subi r7, r3, 2 ;# r7 -> 2 pels before start |
| lwzx r0, 0, r5 ;# read/write first 4 pels |
| stwx r0, 0, r7 |
| |
| WLV 4 |
| WLV 8 |
| WLV 12 |
| WLV 1 |
| WLV 5 |
| WLV 9 |
| WLV 13 |
| WLV 2 |
| WLV 6 |
| WLV 10 |
| WLV 14 |
| WLV 3 |
| WLV 7 |
| WLV 11 |
| WLV 15 |
| |
| mtspr 256, r11 ;# reset old VRSAVE |
| |
| blr |
| |
| .data |
| |
| _chromaSelectors: |
| .long _B_hihi |
| .long _B_Ures0 |
| .long _B_Vres0 |
| .long 0 |
| .long _B_lolo |
| .long _B_Ures8 |
| .long _B_Vres8 |
| .long 0 |
| |
| .align 4 |
| _B_Vres8: |
| .byte 16, 17, 18, 19, 20, 21, 22, 23, 8, 9, 10, 11, 12, 13, 14, 15 |
| |
| .align 4 |
| _B_Ures8: |
| .byte 16, 17, 18, 19, 20, 21, 22, 23, 0, 1, 2, 3, 4, 5, 6, 7 |
| |
| .align 4 |
| _B_lolo: |
| .byte 8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31 |
| |
| .align 4 |
| _B_Vres0: |
| .byte 8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31 |
| .align 4 |
| _B_Ures0: |
| .byte 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 |
| |
| .align 4 |
| _B_hihi: |
| .byte 0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23 |