ARMv6 versions of the X(N)PROD31 macros and the MULT32 macro. Saves about 1 MHz, or roughly 3%, when decoding Vorbis on the Gigabeat S. · tsiry-sandratraina.com/rockbox-zig@7afea91

+86 -56

1 changed file

expand all

apps

codecs

lib

asm_arm.h

+86 -56

apps/codecs/lib/asm_arm.h

··· 19 19 #if !defined(_V_WIDE_MATH) && !defined(_LOW_ACCURACY_) 20 20 #define _V_WIDE_MATH 21 21 22 + #if ARM_ARCH >= 6 22 23 static inline int32_t MULT32(int32_t x, int32_t y) { 23 - int lo,hi; 24 - asm volatile("smull\t%0, %1, %2, %3" 24 + int32_t hi; 25 + asm volatile("smmul %[hi], %[x], %[y] \n\t" 26 + : [hi] "=&r" (hi) 27 + : [x] "r" (x), [y] "r" (y) ); 28 + return(hi); 29 + } 30 + #else 31 + static inline int32_t MULT32(int32_t x, int32_t y) { 32 + int32_t lo, hi; 33 + asm volatile("smull\t%0, %1, %2, %3 \n\t" 25 34 : "=&r"(lo),"=&r"(hi) 26 35 : "r"(x),"r"(y) ); 27 36 return(hi); 28 37 } 38 + #endif 29 39 30 40 static inline int32_t MULT31(int32_t x, int32_t y) { 31 41 return MULT32(x,y)<<1; 32 42 } 33 43 34 44 static inline int32_t MULT31_SHIFT15(int32_t x, int32_t y) { 35 - int lo,hi; 45 + int32_t lo,hi; 36 46 asm volatile("smull %0, %1, %2, %3\n\t" 37 47 "movs %0, %0, lsr #15\n\t" 38 48 "adc %1, %0, %1, lsl #17\n\t" ··· 44 54 45 55 #define XPROD32(a, b, t, v, x, y) \ 46 56 { \ 47 - long l; \ 48 - asm( "smull %0, %1, %3, %5\n\t" \ 49 - "rsb %2, %6, #0\n\t" \ 50 - "smlal %0, %1, %4, %6\n\t" \ 51 - "smull %0, %2, %3, %2\n\t" \ 52 - "smlal %0, %2, %4, %5" \ 53 - : "=&r" (l), "=&r" (x), "=&r" (y) \ 54 - : "r" ((a)), "r" ((b)), "r" ((t)), "r" ((v)) ); \ 57 + int32_t l; \ 58 + asm("smull %0, %1, %3, %5\n\t" \ 59 + "rsb %2, %6, #0\n\t" \ 60 + "smlal %0, %1, %4, %6\n\t" \ 61 + "smull %0, %2, %3, %2\n\t" \ 62 + "smlal %0, %2, %4, %5" \ 63 + : "=&r" (l), "=&r" (x), "=&r" (y) \ 64 + : "r" ((a)), "r" ((b)), "r" ((t)), "r" ((v)) ); \ 55 65 } 56 66 57 - static inline void XPROD31(int32_t a, int32_t b, 58 - int32_t t, int32_t v, 59 - int32_t *x, int32_t *y) 60 - { 61 - int x1, y1, l; 62 - asm( "smull %0, %1, %3, %5\n\t" 63 - "rsb %2, %6, #0\n\t" 64 - "smlal %0, %1, %4, %6\n\t" 65 - "smull %0, %2, %3, %2\n\t" 66 - "smlal %0, %2, %4, %5" 67 - : "=&r" (l), "=&r" (x1), "=&r" (y1) 68 - : "r" (a), "r" (b), "r" (t), "r" (v) ); 69 - *x = x1 << 1; 70 - *y = y1 << 1; 67 + #if 
ARM_ARCH >= 6 68 + /* These may yield slightly different result from the macros below 69 + because only the high 32 bits of the multiplications are accumulated while 70 + the below macros use a 64 bit accumulator that is truncated to 32 bits.*/ 71 + #define XPROD31_R(_a, _b, _t, _v, _x, _y)\ 72 + {\ 73 + int32_t x1, y1;\ 74 + asm("smmul %[x1], %[t], %[a] \n\t"\ 75 + "smmul %[y1], %[t], %[b] \n\t"\ 76 + "smmla %[x1], %[v], %[b], %[x1] \n\t"\ 77 + "smmls %[y1], %[v], %[a], %[y1] \n\t"\ 78 + : [x1] "=&r" (x1), [y1] "=&r" (y1)\ 79 + : [a] "r" (_a), [b] "r" (_b), [t] "r" (_t), [v] "r" (_v) );\ 80 + _x = x1 << 1;\ 81 + _y = y1 << 1;\ 71 82 } 72 83 73 - static inline void XNPROD31(int32_t a, int32_t b, 74 - int32_t t, int32_t v, 75 - int32_t *x, int32_t *y) 76 - { 77 - int x1, y1, l; 78 - asm( "smull %0, %1, %3, %5\n\t" 79 - "rsb %2, %4, #0\n\t" 80 - "smlal %0, %1, %2, %6\n\t" 81 - "smull %0, %2, %4, %5\n\t" 82 - "smlal %0, %2, %3, %6" 83 - : "=&r" (l), "=&r" (x1), "=&r" (y1) 84 - : "r" (a), "r" (b), "r" (t), "r" (v) ); 85 - *x = x1 << 1; 86 - *y = y1 << 1; 84 + #define XNPROD31_R(_a, _b, _t, _v, _x, _y)\ 85 + {\ 86 + int32_t x1, y1;\ 87 + asm("smmul %[x1], %[t], %[a] \n\t"\ 88 + "smmul %[y1], %[t], %[b] \n\t"\ 89 + "smmls %[x1], %[v], %[b], %[x1] \n\t"\ 90 + "smmla %[y1], %[v], %[a], %[y1] \n\t"\ 91 + : [x1] "=&r" (x1), [y1] "=&r" (y1)\ 92 + : [a] "r" (_a), [b] "r" (_b), [t] "r" (_t), [v] "r" (_v) );\ 93 + _x = x1 << 1;\ 94 + _y = y1 << 1;\ 87 95 } 88 - 96 + #else 89 97 #define XPROD31_R(_a, _b, _t, _v, _x, _y)\ 90 98 {\ 91 - int x1, y1, l;\ 92 - asm( "smull %0, %1, %5, %3\n\t"\ 93 - "rsb %2, %3, #0\n\t"\ 94 - "smlal %0, %1, %6, %4\n\t"\ 95 - "smull %0, %2, %6, %2\n\t"\ 96 - "smlal %0, %2, %5, %4"\ 97 - : "=&r" (l), "=&r" (x1), "=&r" (y1)\ 98 - : "r" (_a), "r" (_b), "r" (_t), "r" (_v) );\ 99 + int32_t x1, y1, l;\ 100 + asm("smull %0, %1, %5, %3\n\t"\ 101 + "rsb %2, %3, #0\n\t"\ 102 + "smlal %0, %1, %6, %4\n\t"\ 103 + "smull %0, %2, %6, %2\n\t"\ 104 + "smlal %0, %2, %5, 
%4"\ 105 + : "=&r" (l), "=&r" (x1), "=&r" (y1)\ 106 + : "r" (_a), "r" (_b), "r" (_t), "r" (_v) );\ 99 107 _x = x1 << 1;\ 100 108 _y = y1 << 1;\ 101 109 } 102 110 103 111 #define XNPROD31_R(_a, _b, _t, _v, _x, _y)\ 104 112 {\ 105 - int x1, y1, l;\ 106 - asm( "smull %0, %1, %5, %3\n\t"\ 107 - "rsb %2, %4, #0\n\t"\ 108 - "smlal %0, %1, %6, %2\n\t"\ 109 - "smull %0, %2, %5, %4\n\t"\ 110 - "smlal %0, %2, %6, %3"\ 111 - : "=&r" (l), "=&r" (x1), "=&r" (y1)\ 112 - : "r" (_a), "r" (_b), "r" (_t), "r" (_v) );\ 113 + int32_t x1, y1, l;\ 114 + asm("smull %0, %1, %5, %3\n\t"\ 115 + "rsb %2, %4, #0\n\t"\ 116 + "smlal %0, %1, %6, %2\n\t"\ 117 + "smull %0, %2, %5, %4\n\t"\ 118 + "smlal %0, %2, %6, %3"\ 119 + : "=&r" (l), "=&r" (x1), "=&r" (y1)\ 120 + : "r" (_a), "r" (_b), "r" (_t), "r" (_v) );\ 113 121 _x = x1 << 1;\ 114 122 _y = y1 << 1;\ 115 123 } 124 + #endif 125 + 126 + static inline void XPROD31(int32_t a, int32_t b, 127 + int32_t t, int32_t v, 128 + int32_t *x, int32_t *y) 129 + { 130 + int32_t _x1, _y1; 131 + XPROD31_R(a, b, t, v, _x1, _y1); 132 + *x = _x1; 133 + *y = _y1; 134 + } 135 + 136 + static inline void XNPROD31(int32_t a, int32_t b, 137 + int32_t t, int32_t v, 138 + int32_t *x, int32_t *y) 139 + { 140 + int32_t _x1, _y1; 141 + XNPROD31_R(a, b, t, v, _x1, _y1); 142 + *x = _x1; 143 + *y = _y1; 144 + } 145 + 116 146 117 147 #ifndef _V_VECT_OPS 118 148 #define _V_VECT_OPS

Configure Feed

Configure Feed