asm-generic/div64: optimize/simplify __div64_const32()

Several years later I just realized that this code could be greatly
simplified.

First, let's formalize the need for overflow handling in
__arch_xprod_64(). Assuming n = UINT64_MAX, there are 2 cases where
an overflow may occur:

1) If a bias must be added, we have m_lo * n_lo + m, or
m_lo * 0xffffffff + ((m_hi << 32) + m_lo), or
((m_lo << 32) - m_lo) + ((m_hi << 32) + m_lo), or
(m_lo + m_hi) << 32, which must be < (1 << 64). So the criterion for no
overflow is m_lo + m_hi < (1 << 32).

2) The cross product m_lo * n_hi + m_hi * n_lo, or
m_lo * 0xffffffff + m_hi * 0xffffffff, or
((m_lo << 32) - m_lo) + ((m_hi << 32) - m_hi). Adding the top
result from the previous step (m_lo + m_hi) to this, we get
(m_lo + m_hi) << 32 again, so the same criterion applies.

So let's have a straightforward and simpler version when this is true.
Otherwise some reordering allows for taking care of possible overflows
without any actual conditionals. And avoid generating both code
variants by making sure this is considered only if m is perceived as
constant by the compiler.

This, in turn, allows for greatly simplifying __div64_const32(). The
"special case" may go as well, since the regular case works just fine
without needing a bias. Then reduction should be applied all the time, as
minimizing m is the key.

Signed-off-by: Nicolas Pitre <npitre@baylibre.com>
Signed-off-by: Arnd Bergmann <arnd@arndb.de>

authored by

Nicolas Pitre and committed by

Arnd Bergmann 2 years ago 00a31dd3 1dc82675

+34 -78

1 changed file

expand all

include

asm-generic

div64.h

+34 -78

include/asm-generic/div64.h

··· 74 74 * do the trick here). \ 75 75 */ \ 76 76 uint64_t ___res, ___x, ___t, ___m, ___n = (n); \ 77 - uint32_t ___p, ___bias; \ 77 + uint32_t ___p; \ 78 + bool ___bias = false; \ 78 79 \ 79 80 /* determine MSB of b */ \ 80 81 ___p = 1 << ilog2(___b); \ ··· 88 87 ___x = ~0ULL / ___b * ___b - 1; \ 89 88 \ 90 89 /* test our ___m with res = m * x / (p << 64) */ \ 91 - ___res = ((___m & 0xffffffff) * (___x & 0xffffffff)) >> 32; \ 92 - ___t = ___res += (___m & 0xffffffff) * (___x >> 32); \ 93 - ___res += (___x & 0xffffffff) * (___m >> 32); \ 94 - ___t = (___res < ___t) ? (1ULL << 32) : 0; \ 95 - ___res = (___res >> 32) + ___t; \ 96 - ___res += (___m >> 32) * (___x >> 32); \ 97 - ___res /= ___p; \ 90 + ___res = (___m & 0xffffffff) * (___x & 0xffffffff); \ 91 + ___t = (___m & 0xffffffff) * (___x >> 32) + (___res >> 32); \ 92 + ___res = (___m >> 32) * (___x >> 32) + (___t >> 32); \ 93 + ___t = (___m >> 32) * (___x & 0xffffffff) + (___t & 0xffffffff);\ 94 + ___res = (___res + (___t >> 32)) / ___p; \ 98 95 \ 99 - /* Now sanitize and optimize what we've got. */ \ 100 - if (~0ULL % (___b / (___b & -___b)) == 0) { \ 101 - /* special case, can be simplified to ... */ \ 102 - ___n /= (___b & -___b); \ 103 - ___m = ~0ULL / (___b / (___b & -___b)); \ 104 - ___p = 1; \ 105 - ___bias = 1; \ 106 - } else if (___res != ___x / ___b) { \ 96 + /* Now validate what we've got. */ \ 97 + if (___res != ___x / ___b) { \ 107 98 /* \ 108 99 * We can't get away without a bias to compensate \ 109 100 * for bit truncation errors. To avoid it we'd need an \ ··· 104 111 * \ 105 112 * Instead we do m = p / b and n / b = (n * m + m) / p. \ 106 113 */ \ 107 - ___bias = 1; \ 114 + ___bias = true; \ 108 115 /* Compute m = (p << 64) / b */ \ 109 116 ___m = (~0ULL / ___b) * ___p; \ 110 117 ___m += ((~0ULL % ___b + 1) * ___p) / ___b; \ 111 - } else { \ 112 - /* \ 113 - * Reduce m / p, and try to clear bit 31 of m when \ 114 - * possible, otherwise that'll need extra overflow \ 115 - * handling later. 
\ 116 - */ \ 117 - uint32_t ___bits = -(___m & -___m); \ 118 - ___bits |= ___m >> 32; \ 119 - ___bits = (~___bits) << 1; \ 120 - /* \ 121 - * If ___bits == 0 then setting bit 31 is unavoidable. \ 122 - * Simply apply the maximum possible reduction in that \ 123 - * case. Otherwise the MSB of ___bits indicates the \ 124 - * best reduction we should apply. \ 125 - */ \ 126 - if (!___bits) { \ 127 - ___p /= (___m & -___m); \ 128 - ___m /= (___m & -___m); \ 129 - } else { \ 130 - ___p >>= ilog2(___bits); \ 131 - ___m >>= ilog2(___bits); \ 132 - } \ 133 - /* No bias needed. */ \ 134 - ___bias = 0; \ 135 118 } \ 136 119 \ 120 + /* Reduce m / p to help avoid overflow handling later. */ \ 121 + ___p /= (___m & -___m); \ 122 + ___m /= (___m & -___m); \ 123 + \ 137 124 /* \ 138 - * Now we have a combination of 2 conditions: \ 139 - * \ 140 - * 1) whether or not we need to apply a bias, and \ 141 - * \ 142 - * 2) whether or not there might be an overflow in the cross \ 143 - * product determined by (___m & ((1 << 63) | (1 << 31))). \ 144 - * \ 145 - * Select the best way to do (m_bias + m * n) / (1 << 64). \ 125 + * Perform (m_bias + m * n) / (1 << 64). \ 146 126 * From now on there will be actual runtime code generated. \ 147 127 */ \ 148 128 ___res = __arch_xprod_64(___m, ___n, ___bias); \ ··· 131 165 * Semantic: retval = ((bias ? m : 0) + m * n) >> 64 132 166 * 133 167 * The product is a 128-bit value, scaled down to 64 bits. 134 - * Assuming constant propagation to optimize away unused conditional code. 168 + * Hoping for compile-time optimization of conditional code. 135 169 * Architectures may provide their own optimized assembly implementation. 
136 170 */ 137 171 static inline uint64_t __arch_xprod_64(const uint64_t m, uint64_t n, bool bias) ··· 140 174 uint32_t m_hi = m >> 32; 141 175 uint32_t n_lo = n; 142 176 uint32_t n_hi = n >> 32; 143 - uint64_t res; 144 - uint32_t res_lo, res_hi, tmp; 177 + uint64_t x, y; 145 178 146 - if (!bias) { 147 - res = ((uint64_t)m_lo * n_lo) >> 32; 148 - } else if (!(m & ((1ULL << 63) | (1ULL << 31)))) { 149 - /* there can't be any overflow here */ 150 - res = (m + (uint64_t)m_lo * n_lo) >> 32; 179 + /* Determine if overflow handling can be dispensed with. */ 180 + bool no_ovf = __builtin_constant_p(m) && 181 + ((m >> 32) + (m & 0xffffffff) < 0x100000000); 182 + 183 + if (no_ovf) { 184 + x = (uint64_t)m_lo * n_lo + (bias ? m : 0); 185 + x >>= 32; 186 + x += (uint64_t)m_lo * n_hi; 187 + x += (uint64_t)m_hi * n_lo; 188 + x >>= 32; 189 + x += (uint64_t)m_hi * n_hi; 151 190 } else { 152 - res = m + (uint64_t)m_lo * n_lo; 153 - res_lo = res >> 32; 154 - res_hi = (res_lo < m_hi); 155 - res = res_lo | ((uint64_t)res_hi << 32); 191 + x = (uint64_t)m_lo * n_lo + (bias ? m_lo : 0); 192 + y = (uint64_t)m_lo * n_hi + (uint32_t)(x >> 32) + (bias ? m_hi : 0); 193 + x = (uint64_t)m_hi * n_hi + (uint32_t)(y >> 32); 194 + y = (uint64_t)m_hi * n_lo + (uint32_t)y; 195 + x += (uint32_t)(y >> 32); 156 196 } 157 197 158 - if (!(m & ((1ULL << 63) | (1ULL << 31)))) { 159 - /* there can't be any overflow here */ 160 - res += (uint64_t)m_lo * n_hi; 161 - res += (uint64_t)m_hi * n_lo; 162 - res >>= 32; 163 - } else { 164 - res += (uint64_t)m_lo * n_hi; 165 - tmp = res >> 32; 166 - res += (uint64_t)m_hi * n_lo; 167 - res_lo = res >> 32; 168 - res_hi = (res_lo < tmp); 169 - res = res_lo | ((uint64_t)res_hi << 32); 170 - } 171 - 172 - res += (uint64_t)m_hi * n_hi; 173 - 174 - return res; 198 + return x; 175 199 } 176 200 #endif 177 201

Configure Feed

Configure Feed