x86: move the XOR code to lib/raid/ · tjh.dev/kernel@77fd47e

+31 -487

arch/x86/include/asm/xor.h

··· 2 2 #ifndef _ASM_X86_XOR_H 3 3 #define _ASM_X86_XOR_H 4 4 5 - /* 6 - * Optimized RAID-5 checksumming functions for SSE. 7 - */ 5 + #include <asm/cpufeature.h> 6 + #include <asm-generic/xor.h> 7 + 8 + extern struct xor_block_template xor_block_pII_mmx; 9 + extern struct xor_block_template xor_block_p5_mmx; 10 + extern struct xor_block_template xor_block_sse; 11 + extern struct xor_block_template xor_block_sse_pf64; 12 + extern struct xor_block_template xor_block_avx; 8 13 9 14 /* 10 - * Cache avoiding checksumming functions utilizing KNI instructions 11 - * Copyright (C) 1999 Zach Brown (with obvious credit due Ingo) 12 - */ 13 - 14 - /* 15 - * Based on 16 - * High-speed RAID5 checksumming functions utilizing SSE instructions. 17 - * Copyright (C) 1998 Ingo Molnar. 18 - */ 19 - 20 - /* 21 - * x86-64 changes / gcc fixes from Andi Kleen. 22 - * Copyright 2002 Andi Kleen, SuSE Labs. 15 + * When SSE is available, use it as it can write around L2. We may also be able 16 + * to load into the L1 only depending on how the cpu deals with a load to a line 17 + * that is being prefetched. 23 18 * 24 - * This hasn't been optimized for the hammer yet, but there are likely 25 - * no advantages to be gotten from x86-64 here anyways. 19 + * When AVX is available, force using it as it is better by all measures. 20 + * 21 + * 32-bit without MMX can fall back to the generic routines. 
26 22 */ 27 - 28 - #include <asm/fpu/api.h> 29 - 30 - #ifdef CONFIG_X86_32 31 - /* reduce register pressure */ 32 - # define XOR_CONSTANT_CONSTRAINT "i" 33 - #else 34 - # define XOR_CONSTANT_CONSTRAINT "re" 35 - #endif 36 - 37 - #define OFFS(x) "16*("#x")" 38 - #define PF_OFFS(x) "256+16*("#x")" 39 - #define PF0(x) " prefetchnta "PF_OFFS(x)"(%[p1]) ;\n" 40 - #define LD(x, y) " movaps "OFFS(x)"(%[p1]), %%xmm"#y" ;\n" 41 - #define ST(x, y) " movaps %%xmm"#y", "OFFS(x)"(%[p1]) ;\n" 42 - #define PF1(x) " prefetchnta "PF_OFFS(x)"(%[p2]) ;\n" 43 - #define PF2(x) " prefetchnta "PF_OFFS(x)"(%[p3]) ;\n" 44 - #define PF3(x) " prefetchnta "PF_OFFS(x)"(%[p4]) ;\n" 45 - #define PF4(x) " prefetchnta "PF_OFFS(x)"(%[p5]) ;\n" 46 - #define XO1(x, y) " xorps "OFFS(x)"(%[p2]), %%xmm"#y" ;\n" 47 - #define XO2(x, y) " xorps "OFFS(x)"(%[p3]), %%xmm"#y" ;\n" 48 - #define XO3(x, y) " xorps "OFFS(x)"(%[p4]), %%xmm"#y" ;\n" 49 - #define XO4(x, y) " xorps "OFFS(x)"(%[p5]), %%xmm"#y" ;\n" 50 - #define NOP(x) 51 - 52 - #define BLK64(pf, op, i) \ 53 - pf(i) \ 54 - op(i, 0) \ 55 - op(i + 1, 1) \ 56 - op(i + 2, 2) \ 57 - op(i + 3, 3) 58 - 59 - static void 60 - xor_sse_2(unsigned long bytes, unsigned long * __restrict p1, 61 - const unsigned long * __restrict p2) 23 + #define arch_xor_init arch_xor_init 24 + static __always_inline void __init arch_xor_init(void) 62 25 { 63 - unsigned long lines = bytes >> 8; 64 - 65 - kernel_fpu_begin(); 66 - 67 - asm volatile( 68 - #undef BLOCK 69 - #define BLOCK(i) \ 70 - LD(i, 0) \ 71 - LD(i + 1, 1) \ 72 - PF1(i) \ 73 - PF1(i + 2) \ 74 - LD(i + 2, 2) \ 75 - LD(i + 3, 3) \ 76 - PF0(i + 4) \ 77 - PF0(i + 6) \ 78 - XO1(i, 0) \ 79 - XO1(i + 1, 1) \ 80 - XO1(i + 2, 2) \ 81 - XO1(i + 3, 3) \ 82 - ST(i, 0) \ 83 - ST(i + 1, 1) \ 84 - ST(i + 2, 2) \ 85 - ST(i + 3, 3) \ 86 - 87 - 88 - PF0(0) 89 - PF0(2) 90 - 91 - " .align 32 ;\n" 92 - " 1: ;\n" 93 - 94 - BLOCK(0) 95 - BLOCK(4) 96 - BLOCK(8) 97 - BLOCK(12) 98 - 99 - " add %[inc], %[p1] ;\n" 100 - " add %[inc], %[p2] ;\n" 
101 - " dec %[cnt] ;\n" 102 - " jnz 1b ;\n" 103 - : [cnt] "+r" (lines), 104 - [p1] "+r" (p1), [p2] "+r" (p2) 105 - : [inc] XOR_CONSTANT_CONSTRAINT (256UL) 106 - : "memory"); 107 - 108 - kernel_fpu_end(); 26 + if (boot_cpu_has(X86_FEATURE_AVX) && 27 + boot_cpu_has(X86_FEATURE_OSXSAVE)) { 28 + xor_force(&xor_block_avx); 29 + } else if (IS_ENABLED(CONFIG_X86_64) || boot_cpu_has(X86_FEATURE_XMM)) { 30 + xor_register(&xor_block_sse); 31 + xor_register(&xor_block_sse_pf64); 32 + } else if (boot_cpu_has(X86_FEATURE_MMX)) { 33 + xor_register(&xor_block_pII_mmx); 34 + xor_register(&xor_block_p5_mmx); 35 + } else { 36 + xor_register(&xor_block_8regs); 37 + xor_register(&xor_block_8regs_p); 38 + xor_register(&xor_block_32regs); 39 + xor_register(&xor_block_32regs_p); 40 + } 109 41 } 110 - 111 - static void 112 - xor_sse_2_pf64(unsigned long bytes, unsigned long * __restrict p1, 113 - const unsigned long * __restrict p2) 114 - { 115 - unsigned long lines = bytes >> 8; 116 - 117 - kernel_fpu_begin(); 118 - 119 - asm volatile( 120 - #undef BLOCK 121 - #define BLOCK(i) \ 122 - BLK64(PF0, LD, i) \ 123 - BLK64(PF1, XO1, i) \ 124 - BLK64(NOP, ST, i) \ 125 - 126 - " .align 32 ;\n" 127 - " 1: ;\n" 128 - 129 - BLOCK(0) 130 - BLOCK(4) 131 - BLOCK(8) 132 - BLOCK(12) 133 - 134 - " add %[inc], %[p1] ;\n" 135 - " add %[inc], %[p2] ;\n" 136 - " dec %[cnt] ;\n" 137 - " jnz 1b ;\n" 138 - : [cnt] "+r" (lines), 139 - [p1] "+r" (p1), [p2] "+r" (p2) 140 - : [inc] XOR_CONSTANT_CONSTRAINT (256UL) 141 - : "memory"); 142 - 143 - kernel_fpu_end(); 144 - } 145 - 146 - static void 147 - xor_sse_3(unsigned long bytes, unsigned long * __restrict p1, 148 - const unsigned long * __restrict p2, 149 - const unsigned long * __restrict p3) 150 - { 151 - unsigned long lines = bytes >> 8; 152 - 153 - kernel_fpu_begin(); 154 - 155 - asm volatile( 156 - #undef BLOCK 157 - #define BLOCK(i) \ 158 - PF1(i) \ 159 - PF1(i + 2) \ 160 - LD(i, 0) \ 161 - LD(i + 1, 1) \ 162 - LD(i + 2, 2) \ 163 - LD(i + 3, 3) \ 164 - PF2(i) 
\ 165 - PF2(i + 2) \ 166 - PF0(i + 4) \ 167 - PF0(i + 6) \ 168 - XO1(i, 0) \ 169 - XO1(i + 1, 1) \ 170 - XO1(i + 2, 2) \ 171 - XO1(i + 3, 3) \ 172 - XO2(i, 0) \ 173 - XO2(i + 1, 1) \ 174 - XO2(i + 2, 2) \ 175 - XO2(i + 3, 3) \ 176 - ST(i, 0) \ 177 - ST(i + 1, 1) \ 178 - ST(i + 2, 2) \ 179 - ST(i + 3, 3) \ 180 - 181 - 182 - PF0(0) 183 - PF0(2) 184 - 185 - " .align 32 ;\n" 186 - " 1: ;\n" 187 - 188 - BLOCK(0) 189 - BLOCK(4) 190 - BLOCK(8) 191 - BLOCK(12) 192 - 193 - " add %[inc], %[p1] ;\n" 194 - " add %[inc], %[p2] ;\n" 195 - " add %[inc], %[p3] ;\n" 196 - " dec %[cnt] ;\n" 197 - " jnz 1b ;\n" 198 - : [cnt] "+r" (lines), 199 - [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3) 200 - : [inc] XOR_CONSTANT_CONSTRAINT (256UL) 201 - : "memory"); 202 - 203 - kernel_fpu_end(); 204 - } 205 - 206 - static void 207 - xor_sse_3_pf64(unsigned long bytes, unsigned long * __restrict p1, 208 - const unsigned long * __restrict p2, 209 - const unsigned long * __restrict p3) 210 - { 211 - unsigned long lines = bytes >> 8; 212 - 213 - kernel_fpu_begin(); 214 - 215 - asm volatile( 216 - #undef BLOCK 217 - #define BLOCK(i) \ 218 - BLK64(PF0, LD, i) \ 219 - BLK64(PF1, XO1, i) \ 220 - BLK64(PF2, XO2, i) \ 221 - BLK64(NOP, ST, i) \ 222 - 223 - " .align 32 ;\n" 224 - " 1: ;\n" 225 - 226 - BLOCK(0) 227 - BLOCK(4) 228 - BLOCK(8) 229 - BLOCK(12) 230 - 231 - " add %[inc], %[p1] ;\n" 232 - " add %[inc], %[p2] ;\n" 233 - " add %[inc], %[p3] ;\n" 234 - " dec %[cnt] ;\n" 235 - " jnz 1b ;\n" 236 - : [cnt] "+r" (lines), 237 - [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3) 238 - : [inc] XOR_CONSTANT_CONSTRAINT (256UL) 239 - : "memory"); 240 - 241 - kernel_fpu_end(); 242 - } 243 - 244 - static void 245 - xor_sse_4(unsigned long bytes, unsigned long * __restrict p1, 246 - const unsigned long * __restrict p2, 247 - const unsigned long * __restrict p3, 248 - const unsigned long * __restrict p4) 249 - { 250 - unsigned long lines = bytes >> 8; 251 - 252 - kernel_fpu_begin(); 253 - 254 - asm volatile( 255 - 
#undef BLOCK 256 - #define BLOCK(i) \ 257 - PF1(i) \ 258 - PF1(i + 2) \ 259 - LD(i, 0) \ 260 - LD(i + 1, 1) \ 261 - LD(i + 2, 2) \ 262 - LD(i + 3, 3) \ 263 - PF2(i) \ 264 - PF2(i + 2) \ 265 - XO1(i, 0) \ 266 - XO1(i + 1, 1) \ 267 - XO1(i + 2, 2) \ 268 - XO1(i + 3, 3) \ 269 - PF3(i) \ 270 - PF3(i + 2) \ 271 - PF0(i + 4) \ 272 - PF0(i + 6) \ 273 - XO2(i, 0) \ 274 - XO2(i + 1, 1) \ 275 - XO2(i + 2, 2) \ 276 - XO2(i + 3, 3) \ 277 - XO3(i, 0) \ 278 - XO3(i + 1, 1) \ 279 - XO3(i + 2, 2) \ 280 - XO3(i + 3, 3) \ 281 - ST(i, 0) \ 282 - ST(i + 1, 1) \ 283 - ST(i + 2, 2) \ 284 - ST(i + 3, 3) \ 285 - 286 - 287 - PF0(0) 288 - PF0(2) 289 - 290 - " .align 32 ;\n" 291 - " 1: ;\n" 292 - 293 - BLOCK(0) 294 - BLOCK(4) 295 - BLOCK(8) 296 - BLOCK(12) 297 - 298 - " add %[inc], %[p1] ;\n" 299 - " add %[inc], %[p2] ;\n" 300 - " add %[inc], %[p3] ;\n" 301 - " add %[inc], %[p4] ;\n" 302 - " dec %[cnt] ;\n" 303 - " jnz 1b ;\n" 304 - : [cnt] "+r" (lines), [p1] "+r" (p1), 305 - [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4) 306 - : [inc] XOR_CONSTANT_CONSTRAINT (256UL) 307 - : "memory"); 308 - 309 - kernel_fpu_end(); 310 - } 311 - 312 - static void 313 - xor_sse_4_pf64(unsigned long bytes, unsigned long * __restrict p1, 314 - const unsigned long * __restrict p2, 315 - const unsigned long * __restrict p3, 316 - const unsigned long * __restrict p4) 317 - { 318 - unsigned long lines = bytes >> 8; 319 - 320 - kernel_fpu_begin(); 321 - 322 - asm volatile( 323 - #undef BLOCK 324 - #define BLOCK(i) \ 325 - BLK64(PF0, LD, i) \ 326 - BLK64(PF1, XO1, i) \ 327 - BLK64(PF2, XO2, i) \ 328 - BLK64(PF3, XO3, i) \ 329 - BLK64(NOP, ST, i) \ 330 - 331 - " .align 32 ;\n" 332 - " 1: ;\n" 333 - 334 - BLOCK(0) 335 - BLOCK(4) 336 - BLOCK(8) 337 - BLOCK(12) 338 - 339 - " add %[inc], %[p1] ;\n" 340 - " add %[inc], %[p2] ;\n" 341 - " add %[inc], %[p3] ;\n" 342 - " add %[inc], %[p4] ;\n" 343 - " dec %[cnt] ;\n" 344 - " jnz 1b ;\n" 345 - : [cnt] "+r" (lines), [p1] "+r" (p1), 346 - [p2] "+r" (p2), [p3] "+r" (p3), [p4] 
"+r" (p4) 347 - : [inc] XOR_CONSTANT_CONSTRAINT (256UL) 348 - : "memory"); 349 - 350 - kernel_fpu_end(); 351 - } 352 - 353 - static void 354 - xor_sse_5(unsigned long bytes, unsigned long * __restrict p1, 355 - const unsigned long * __restrict p2, 356 - const unsigned long * __restrict p3, 357 - const unsigned long * __restrict p4, 358 - const unsigned long * __restrict p5) 359 - { 360 - unsigned long lines = bytes >> 8; 361 - 362 - kernel_fpu_begin(); 363 - 364 - asm volatile( 365 - #undef BLOCK 366 - #define BLOCK(i) \ 367 - PF1(i) \ 368 - PF1(i + 2) \ 369 - LD(i, 0) \ 370 - LD(i + 1, 1) \ 371 - LD(i + 2, 2) \ 372 - LD(i + 3, 3) \ 373 - PF2(i) \ 374 - PF2(i + 2) \ 375 - XO1(i, 0) \ 376 - XO1(i + 1, 1) \ 377 - XO1(i + 2, 2) \ 378 - XO1(i + 3, 3) \ 379 - PF3(i) \ 380 - PF3(i + 2) \ 381 - XO2(i, 0) \ 382 - XO2(i + 1, 1) \ 383 - XO2(i + 2, 2) \ 384 - XO2(i + 3, 3) \ 385 - PF4(i) \ 386 - PF4(i + 2) \ 387 - PF0(i + 4) \ 388 - PF0(i + 6) \ 389 - XO3(i, 0) \ 390 - XO3(i + 1, 1) \ 391 - XO3(i + 2, 2) \ 392 - XO3(i + 3, 3) \ 393 - XO4(i, 0) \ 394 - XO4(i + 1, 1) \ 395 - XO4(i + 2, 2) \ 396 - XO4(i + 3, 3) \ 397 - ST(i, 0) \ 398 - ST(i + 1, 1) \ 399 - ST(i + 2, 2) \ 400 - ST(i + 3, 3) \ 401 - 402 - 403 - PF0(0) 404 - PF0(2) 405 - 406 - " .align 32 ;\n" 407 - " 1: ;\n" 408 - 409 - BLOCK(0) 410 - BLOCK(4) 411 - BLOCK(8) 412 - BLOCK(12) 413 - 414 - " add %[inc], %[p1] ;\n" 415 - " add %[inc], %[p2] ;\n" 416 - " add %[inc], %[p3] ;\n" 417 - " add %[inc], %[p4] ;\n" 418 - " add %[inc], %[p5] ;\n" 419 - " dec %[cnt] ;\n" 420 - " jnz 1b ;\n" 421 - : [cnt] "+r" (lines), [p1] "+r" (p1), [p2] "+r" (p2), 422 - [p3] "+r" (p3), [p4] "+r" (p4), [p5] "+r" (p5) 423 - : [inc] XOR_CONSTANT_CONSTRAINT (256UL) 424 - : "memory"); 425 - 426 - kernel_fpu_end(); 427 - } 428 - 429 - static void 430 - xor_sse_5_pf64(unsigned long bytes, unsigned long * __restrict p1, 431 - const unsigned long * __restrict p2, 432 - const unsigned long * __restrict p3, 433 - const unsigned long * __restrict p4, 434 - 
const unsigned long * __restrict p5) 435 - { 436 - unsigned long lines = bytes >> 8; 437 - 438 - kernel_fpu_begin(); 439 - 440 - asm volatile( 441 - #undef BLOCK 442 - #define BLOCK(i) \ 443 - BLK64(PF0, LD, i) \ 444 - BLK64(PF1, XO1, i) \ 445 - BLK64(PF2, XO2, i) \ 446 - BLK64(PF3, XO3, i) \ 447 - BLK64(PF4, XO4, i) \ 448 - BLK64(NOP, ST, i) \ 449 - 450 - " .align 32 ;\n" 451 - " 1: ;\n" 452 - 453 - BLOCK(0) 454 - BLOCK(4) 455 - BLOCK(8) 456 - BLOCK(12) 457 - 458 - " add %[inc], %[p1] ;\n" 459 - " add %[inc], %[p2] ;\n" 460 - " add %[inc], %[p3] ;\n" 461 - " add %[inc], %[p4] ;\n" 462 - " add %[inc], %[p5] ;\n" 463 - " dec %[cnt] ;\n" 464 - " jnz 1b ;\n" 465 - : [cnt] "+r" (lines), [p1] "+r" (p1), [p2] "+r" (p2), 466 - [p3] "+r" (p3), [p4] "+r" (p4), [p5] "+r" (p5) 467 - : [inc] XOR_CONSTANT_CONSTRAINT (256UL) 468 - : "memory"); 469 - 470 - kernel_fpu_end(); 471 - } 472 - 473 - static struct xor_block_template xor_block_sse_pf64 = { 474 - .name = "prefetch64-sse", 475 - .do_2 = xor_sse_2_pf64, 476 - .do_3 = xor_sse_3_pf64, 477 - .do_4 = xor_sse_4_pf64, 478 - .do_5 = xor_sse_5_pf64, 479 - }; 480 - 481 - #undef LD 482 - #undef XO1 483 - #undef XO2 484 - #undef XO3 485 - #undef XO4 486 - #undef ST 487 - #undef NOP 488 - #undef BLK64 489 - #undef BLOCK 490 - 491 - #undef XOR_CONSTANT_CONSTRAINT 492 - 493 - #ifdef CONFIG_X86_32 494 - # include <asm/xor_32.h> 495 - #else 496 - # include <asm/xor_64.h> 497 - #endif 498 42 499 43 #endif /* _ASM_X86_XOR_H */

+8 -52

arch/x86/include/asm/xor_32.h lib/raid/xor/x86/xor-mmx.c

··· 1 - /* SPDX-License-Identifier: GPL-2.0-or-later */ 2 - #ifndef _ASM_X86_XOR_32_H 3 - #define _ASM_X86_XOR_32_H 4 - 1 + // SPDX-License-Identifier: GPL-2.0-or-later 5 2 /* 6 - * Optimized RAID-5 checksumming functions for MMX. 7 - */ 8 - 9 - /* 10 - * High-speed RAID5 checksumming functions utilizing MMX instructions. 3 + * Optimized XOR parity functions for MMX. 4 + * 11 5 * Copyright (C) 1998 Ingo Molnar. 12 6 */ 7 + #include <linux/raid/xor_impl.h> 8 + #include <asm/fpu/api.h> 9 + #include <asm/xor.h> 13 10 14 11 #define LD(x, y) " movq 8*("#x")(%1), %%mm"#y" ;\n" 15 12 #define ST(x, y) " movq %%mm"#y", 8*("#x")(%1) ;\n" ··· 14 17 #define XO2(x, y) " pxor 8*("#x")(%3), %%mm"#y" ;\n" 15 18 #define XO3(x, y) " pxor 8*("#x")(%4), %%mm"#y" ;\n" 16 19 #define XO4(x, y) " pxor 8*("#x")(%5), %%mm"#y" ;\n" 17 - 18 - #include <asm/fpu/api.h> 19 20 20 21 static void 21 22 xor_pII_mmx_2(unsigned long bytes, unsigned long * __restrict p1, ··· 514 519 kernel_fpu_end(); 515 520 } 516 521 517 - static struct xor_block_template xor_block_pII_mmx = { 522 + struct xor_block_template xor_block_pII_mmx = { 518 523 .name = "pII_mmx", 519 524 .do_2 = xor_pII_mmx_2, 520 525 .do_3 = xor_pII_mmx_3, ··· 522 527 .do_5 = xor_pII_mmx_5, 523 528 }; 524 529 525 - static struct xor_block_template xor_block_p5_mmx = { 530 + struct xor_block_template xor_block_p5_mmx = { 526 531 .name = "p5_mmx", 527 532 .do_2 = xor_p5_mmx_2, 528 533 .do_3 = xor_p5_mmx_3, 529 534 .do_4 = xor_p5_mmx_4, 530 535 .do_5 = xor_p5_mmx_5, 531 536 }; 532 - 533 - static struct xor_block_template xor_block_pIII_sse = { 534 - .name = "pIII_sse", 535 - .do_2 = xor_sse_2, 536 - .do_3 = xor_sse_3, 537 - .do_4 = xor_sse_4, 538 - .do_5 = xor_sse_5, 539 - }; 540 - 541 - /* Also try the AVX routines */ 542 - #include <asm/xor_avx.h> 543 - 544 - /* Also try the generic routines. */ 545 - #include <asm-generic/xor.h> 546 - 547 - /* We force the use of the SSE xor block because it can write around L2. 
548 - We may also be able to load into the L1 only depending on how the cpu 549 - deals with a load to a line that is being prefetched. */ 550 - #define arch_xor_init arch_xor_init 551 - static __always_inline void __init arch_xor_init(void) 552 - { 553 - if (boot_cpu_has(X86_FEATURE_AVX) && 554 - boot_cpu_has(X86_FEATURE_OSXSAVE)) { 555 - xor_force(&xor_block_avx); 556 - } else if (boot_cpu_has(X86_FEATURE_XMM)) { 557 - xor_register(&xor_block_pIII_sse); 558 - xor_register(&xor_block_sse_pf64); 559 - } else if (boot_cpu_has(X86_FEATURE_MMX)) { 560 - xor_register(&xor_block_pII_mmx); 561 - xor_register(&xor_block_p5_mmx); 562 - } else { 563 - xor_register(&xor_block_8regs); 564 - xor_register(&xor_block_8regs_p); 565 - xor_register(&xor_block_32regs); 566 - xor_register(&xor_block_32regs_p); 567 - } 568 - } 569 - 570 - #endif /* _ASM_X86_XOR_32_H */

-32

arch/x86/include/asm/xor_64.h

··· 1 - /* SPDX-License-Identifier: GPL-2.0 */ 2 - #ifndef _ASM_X86_XOR_64_H 3 - #define _ASM_X86_XOR_64_H 4 - 5 - static struct xor_block_template xor_block_sse = { 6 - .name = "generic_sse", 7 - .do_2 = xor_sse_2, 8 - .do_3 = xor_sse_3, 9 - .do_4 = xor_sse_4, 10 - .do_5 = xor_sse_5, 11 - }; 12 - 13 - 14 - /* Also try the AVX routines */ 15 - #include <asm/xor_avx.h> 16 - 17 - /* We force the use of the SSE xor block because it can write around L2. 18 - We may also be able to load into the L1 only depending on how the cpu 19 - deals with a load to a line that is being prefetched. */ 20 - #define arch_xor_init arch_xor_init 21 - static __always_inline void __init arch_xor_init(void) 22 - { 23 - if (boot_cpu_has(X86_FEATURE_AVX) && 24 - boot_cpu_has(X86_FEATURE_OSXSAVE)) { 25 - xor_force(&xor_block_avx); 26 - } else { 27 - xor_register(&xor_block_sse_pf64); 28 - xor_register(&xor_block_sse); 29 - } 30 - } 31 - 32 - #endif /* _ASM_X86_XOR_64_H */

+5 -9

arch/x86/include/asm/xor_avx.h lib/raid/xor/x86/xor-avx.c

··· 1 - /* SPDX-License-Identifier: GPL-2.0-only */ 2 - #ifndef _ASM_X86_XOR_AVX_H 3 - #define _ASM_X86_XOR_AVX_H 4 - 1 + // SPDX-License-Identifier: GPL-2.0-only 5 2 /* 6 - * Optimized RAID-5 checksumming functions for AVX 3 + * Optimized XOR parity functions for AVX 7 4 * 8 5 * Copyright (C) 2012 Intel Corporation 9 6 * Author: Jim Kukunas <james.t.kukunas@linux.intel.com> 10 7 * 11 8 * Based on Ingo Molnar and Zach Brown's respective MMX and SSE routines 12 9 */ 13 - 14 10 #include <linux/compiler.h> 11 + #include <linux/raid/xor_impl.h> 15 12 #include <asm/fpu/api.h> 13 + #include <asm/xor.h> 16 14 17 15 #define BLOCK4(i) \ 18 16 BLOCK(32 * i, 0) \ ··· 156 158 kernel_fpu_end(); 157 159 } 158 160 159 - static struct xor_block_template xor_block_avx = { 161 + struct xor_block_template xor_block_avx = { 160 162 .name = "avx", 161 163 .do_2 = xor_avx_2, 162 164 .do_3 = xor_avx_3, 163 165 .do_4 = xor_avx_4, 164 166 .do_5 = xor_avx_5, 165 167 }; 166 - 167 - #endif

+2

lib/raid/xor/Makefile

··· 21 21 xor-$(CONFIG_SPARC32) += sparc/xor-sparc32.o 22 22 xor-$(CONFIG_SPARC64) += sparc/xor-sparc64.o sparc/xor-sparc64-glue.o 23 23 xor-$(CONFIG_S390) += s390/xor.o 24 + xor-$(CONFIG_X86_32) += x86/xor-avx.o x86/xor-sse.o x86/xor-mmx.o 25 + xor-$(CONFIG_X86_64) += x86/xor-avx.o x86/xor-sse.o 24 26 25 27 26 28 CFLAGS_arm/xor-neon.o += $(CC_FLAGS_FPU)

+476

lib/raid/xor/x86/xor-sse.c

··· 1 + // SPDX-License-Identifier: GPL-2.0-or-later 2 + /* 3 + * Optimized XOR parity functions for SSE. 4 + * 5 + * Cache avoiding checksumming functions utilizing KNI instructions 6 + * Copyright (C) 1999 Zach Brown (with obvious credit due Ingo) 7 + * 8 + * Based on 9 + * High-speed RAID5 checksumming functions utilizing SSE instructions. 10 + * Copyright (C) 1998 Ingo Molnar. 11 + * 12 + * x86-64 changes / gcc fixes from Andi Kleen. 13 + * Copyright 2002 Andi Kleen, SuSE Labs. 14 + */ 15 + #include <linux/raid/xor_impl.h> 16 + #include <asm/fpu/api.h> 17 + #include <asm/xor.h> 18 + 19 + #ifdef CONFIG_X86_32 20 + /* reduce register pressure */ 21 + # define XOR_CONSTANT_CONSTRAINT "i" 22 + #else 23 + # define XOR_CONSTANT_CONSTRAINT "re" 24 + #endif 25 + 26 + #define OFFS(x) "16*("#x")" 27 + #define PF_OFFS(x) "256+16*("#x")" 28 + #define PF0(x) " prefetchnta "PF_OFFS(x)"(%[p1]) ;\n" 29 + #define LD(x, y) " movaps "OFFS(x)"(%[p1]), %%xmm"#y" ;\n" 30 + #define ST(x, y) " movaps %%xmm"#y", "OFFS(x)"(%[p1]) ;\n" 31 + #define PF1(x) " prefetchnta "PF_OFFS(x)"(%[p2]) ;\n" 32 + #define PF2(x) " prefetchnta "PF_OFFS(x)"(%[p3]) ;\n" 33 + #define PF3(x) " prefetchnta "PF_OFFS(x)"(%[p4]) ;\n" 34 + #define PF4(x) " prefetchnta "PF_OFFS(x)"(%[p5]) ;\n" 35 + #define XO1(x, y) " xorps "OFFS(x)"(%[p2]), %%xmm"#y" ;\n" 36 + #define XO2(x, y) " xorps "OFFS(x)"(%[p3]), %%xmm"#y" ;\n" 37 + #define XO3(x, y) " xorps "OFFS(x)"(%[p4]), %%xmm"#y" ;\n" 38 + #define XO4(x, y) " xorps "OFFS(x)"(%[p5]), %%xmm"#y" ;\n" 39 + #define NOP(x) 40 + 41 + #define BLK64(pf, op, i) \ 42 + pf(i) \ 43 + op(i, 0) \ 44 + op(i + 1, 1) \ 45 + op(i + 2, 2) \ 46 + op(i + 3, 3) 47 + 48 + static void 49 + xor_sse_2(unsigned long bytes, unsigned long * __restrict p1, 50 + const unsigned long * __restrict p2) 51 + { 52 + unsigned long lines = bytes >> 8; 53 + 54 + kernel_fpu_begin(); 55 + 56 + asm volatile( 57 + #undef BLOCK 58 + #define BLOCK(i) \ 59 + LD(i, 0) \ 60 + LD(i + 1, 1) \ 61 + PF1(i) \ 62 + PF1(i 
+ 2) \ 63 + LD(i + 2, 2) \ 64 + LD(i + 3, 3) \ 65 + PF0(i + 4) \ 66 + PF0(i + 6) \ 67 + XO1(i, 0) \ 68 + XO1(i + 1, 1) \ 69 + XO1(i + 2, 2) \ 70 + XO1(i + 3, 3) \ 71 + ST(i, 0) \ 72 + ST(i + 1, 1) \ 73 + ST(i + 2, 2) \ 74 + ST(i + 3, 3) \ 75 + 76 + 77 + PF0(0) 78 + PF0(2) 79 + 80 + " .align 32 ;\n" 81 + " 1: ;\n" 82 + 83 + BLOCK(0) 84 + BLOCK(4) 85 + BLOCK(8) 86 + BLOCK(12) 87 + 88 + " add %[inc], %[p1] ;\n" 89 + " add %[inc], %[p2] ;\n" 90 + " dec %[cnt] ;\n" 91 + " jnz 1b ;\n" 92 + : [cnt] "+r" (lines), 93 + [p1] "+r" (p1), [p2] "+r" (p2) 94 + : [inc] XOR_CONSTANT_CONSTRAINT (256UL) 95 + : "memory"); 96 + 97 + kernel_fpu_end(); 98 + } 99 + 100 + static void 101 + xor_sse_2_pf64(unsigned long bytes, unsigned long * __restrict p1, 102 + const unsigned long * __restrict p2) 103 + { 104 + unsigned long lines = bytes >> 8; 105 + 106 + kernel_fpu_begin(); 107 + 108 + asm volatile( 109 + #undef BLOCK 110 + #define BLOCK(i) \ 111 + BLK64(PF0, LD, i) \ 112 + BLK64(PF1, XO1, i) \ 113 + BLK64(NOP, ST, i) \ 114 + 115 + " .align 32 ;\n" 116 + " 1: ;\n" 117 + 118 + BLOCK(0) 119 + BLOCK(4) 120 + BLOCK(8) 121 + BLOCK(12) 122 + 123 + " add %[inc], %[p1] ;\n" 124 + " add %[inc], %[p2] ;\n" 125 + " dec %[cnt] ;\n" 126 + " jnz 1b ;\n" 127 + : [cnt] "+r" (lines), 128 + [p1] "+r" (p1), [p2] "+r" (p2) 129 + : [inc] XOR_CONSTANT_CONSTRAINT (256UL) 130 + : "memory"); 131 + 132 + kernel_fpu_end(); 133 + } 134 + 135 + static void 136 + xor_sse_3(unsigned long bytes, unsigned long * __restrict p1, 137 + const unsigned long * __restrict p2, 138 + const unsigned long * __restrict p3) 139 + { 140 + unsigned long lines = bytes >> 8; 141 + 142 + kernel_fpu_begin(); 143 + 144 + asm volatile( 145 + #undef BLOCK 146 + #define BLOCK(i) \ 147 + PF1(i) \ 148 + PF1(i + 2) \ 149 + LD(i, 0) \ 150 + LD(i + 1, 1) \ 151 + LD(i + 2, 2) \ 152 + LD(i + 3, 3) \ 153 + PF2(i) \ 154 + PF2(i + 2) \ 155 + PF0(i + 4) \ 156 + PF0(i + 6) \ 157 + XO1(i, 0) \ 158 + XO1(i + 1, 1) \ 159 + XO1(i + 2, 2) \ 160 + XO1(i + 3, 
3) \ 161 + XO2(i, 0) \ 162 + XO2(i + 1, 1) \ 163 + XO2(i + 2, 2) \ 164 + XO2(i + 3, 3) \ 165 + ST(i, 0) \ 166 + ST(i + 1, 1) \ 167 + ST(i + 2, 2) \ 168 + ST(i + 3, 3) \ 169 + 170 + 171 + PF0(0) 172 + PF0(2) 173 + 174 + " .align 32 ;\n" 175 + " 1: ;\n" 176 + 177 + BLOCK(0) 178 + BLOCK(4) 179 + BLOCK(8) 180 + BLOCK(12) 181 + 182 + " add %[inc], %[p1] ;\n" 183 + " add %[inc], %[p2] ;\n" 184 + " add %[inc], %[p3] ;\n" 185 + " dec %[cnt] ;\n" 186 + " jnz 1b ;\n" 187 + : [cnt] "+r" (lines), 188 + [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3) 189 + : [inc] XOR_CONSTANT_CONSTRAINT (256UL) 190 + : "memory"); 191 + 192 + kernel_fpu_end(); 193 + } 194 + 195 + static void 196 + xor_sse_3_pf64(unsigned long bytes, unsigned long * __restrict p1, 197 + const unsigned long * __restrict p2, 198 + const unsigned long * __restrict p3) 199 + { 200 + unsigned long lines = bytes >> 8; 201 + 202 + kernel_fpu_begin(); 203 + 204 + asm volatile( 205 + #undef BLOCK 206 + #define BLOCK(i) \ 207 + BLK64(PF0, LD, i) \ 208 + BLK64(PF1, XO1, i) \ 209 + BLK64(PF2, XO2, i) \ 210 + BLK64(NOP, ST, i) \ 211 + 212 + " .align 32 ;\n" 213 + " 1: ;\n" 214 + 215 + BLOCK(0) 216 + BLOCK(4) 217 + BLOCK(8) 218 + BLOCK(12) 219 + 220 + " add %[inc], %[p1] ;\n" 221 + " add %[inc], %[p2] ;\n" 222 + " add %[inc], %[p3] ;\n" 223 + " dec %[cnt] ;\n" 224 + " jnz 1b ;\n" 225 + : [cnt] "+r" (lines), 226 + [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3) 227 + : [inc] XOR_CONSTANT_CONSTRAINT (256UL) 228 + : "memory"); 229 + 230 + kernel_fpu_end(); 231 + } 232 + 233 + static void 234 + xor_sse_4(unsigned long bytes, unsigned long * __restrict p1, 235 + const unsigned long * __restrict p2, 236 + const unsigned long * __restrict p3, 237 + const unsigned long * __restrict p4) 238 + { 239 + unsigned long lines = bytes >> 8; 240 + 241 + kernel_fpu_begin(); 242 + 243 + asm volatile( 244 + #undef BLOCK 245 + #define BLOCK(i) \ 246 + PF1(i) \ 247 + PF1(i + 2) \ 248 + LD(i, 0) \ 249 + LD(i + 1, 1) \ 250 + LD(i + 2, 2) \ 251 + LD(i 
+ 3, 3) \ 252 + PF2(i) \ 253 + PF2(i + 2) \ 254 + XO1(i, 0) \ 255 + XO1(i + 1, 1) \ 256 + XO1(i + 2, 2) \ 257 + XO1(i + 3, 3) \ 258 + PF3(i) \ 259 + PF3(i + 2) \ 260 + PF0(i + 4) \ 261 + PF0(i + 6) \ 262 + XO2(i, 0) \ 263 + XO2(i + 1, 1) \ 264 + XO2(i + 2, 2) \ 265 + XO2(i + 3, 3) \ 266 + XO3(i, 0) \ 267 + XO3(i + 1, 1) \ 268 + XO3(i + 2, 2) \ 269 + XO3(i + 3, 3) \ 270 + ST(i, 0) \ 271 + ST(i + 1, 1) \ 272 + ST(i + 2, 2) \ 273 + ST(i + 3, 3) \ 274 + 275 + 276 + PF0(0) 277 + PF0(2) 278 + 279 + " .align 32 ;\n" 280 + " 1: ;\n" 281 + 282 + BLOCK(0) 283 + BLOCK(4) 284 + BLOCK(8) 285 + BLOCK(12) 286 + 287 + " add %[inc], %[p1] ;\n" 288 + " add %[inc], %[p2] ;\n" 289 + " add %[inc], %[p3] ;\n" 290 + " add %[inc], %[p4] ;\n" 291 + " dec %[cnt] ;\n" 292 + " jnz 1b ;\n" 293 + : [cnt] "+r" (lines), [p1] "+r" (p1), 294 + [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4) 295 + : [inc] XOR_CONSTANT_CONSTRAINT (256UL) 296 + : "memory"); 297 + 298 + kernel_fpu_end(); 299 + } 300 + 301 + static void 302 + xor_sse_4_pf64(unsigned long bytes, unsigned long * __restrict p1, 303 + const unsigned long * __restrict p2, 304 + const unsigned long * __restrict p3, 305 + const unsigned long * __restrict p4) 306 + { 307 + unsigned long lines = bytes >> 8; 308 + 309 + kernel_fpu_begin(); 310 + 311 + asm volatile( 312 + #undef BLOCK 313 + #define BLOCK(i) \ 314 + BLK64(PF0, LD, i) \ 315 + BLK64(PF1, XO1, i) \ 316 + BLK64(PF2, XO2, i) \ 317 + BLK64(PF3, XO3, i) \ 318 + BLK64(NOP, ST, i) \ 319 + 320 + " .align 32 ;\n" 321 + " 1: ;\n" 322 + 323 + BLOCK(0) 324 + BLOCK(4) 325 + BLOCK(8) 326 + BLOCK(12) 327 + 328 + " add %[inc], %[p1] ;\n" 329 + " add %[inc], %[p2] ;\n" 330 + " add %[inc], %[p3] ;\n" 331 + " add %[inc], %[p4] ;\n" 332 + " dec %[cnt] ;\n" 333 + " jnz 1b ;\n" 334 + : [cnt] "+r" (lines), [p1] "+r" (p1), 335 + [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4) 336 + : [inc] XOR_CONSTANT_CONSTRAINT (256UL) 337 + : "memory"); 338 + 339 + kernel_fpu_end(); 340 + } 341 + 342 + static void 343 + 
xor_sse_5(unsigned long bytes, unsigned long * __restrict p1, 344 + const unsigned long * __restrict p2, 345 + const unsigned long * __restrict p3, 346 + const unsigned long * __restrict p4, 347 + const unsigned long * __restrict p5) 348 + { 349 + unsigned long lines = bytes >> 8; 350 + 351 + kernel_fpu_begin(); 352 + 353 + asm volatile( 354 + #undef BLOCK 355 + #define BLOCK(i) \ 356 + PF1(i) \ 357 + PF1(i + 2) \ 358 + LD(i, 0) \ 359 + LD(i + 1, 1) \ 360 + LD(i + 2, 2) \ 361 + LD(i + 3, 3) \ 362 + PF2(i) \ 363 + PF2(i + 2) \ 364 + XO1(i, 0) \ 365 + XO1(i + 1, 1) \ 366 + XO1(i + 2, 2) \ 367 + XO1(i + 3, 3) \ 368 + PF3(i) \ 369 + PF3(i + 2) \ 370 + XO2(i, 0) \ 371 + XO2(i + 1, 1) \ 372 + XO2(i + 2, 2) \ 373 + XO2(i + 3, 3) \ 374 + PF4(i) \ 375 + PF4(i + 2) \ 376 + PF0(i + 4) \ 377 + PF0(i + 6) \ 378 + XO3(i, 0) \ 379 + XO3(i + 1, 1) \ 380 + XO3(i + 2, 2) \ 381 + XO3(i + 3, 3) \ 382 + XO4(i, 0) \ 383 + XO4(i + 1, 1) \ 384 + XO4(i + 2, 2) \ 385 + XO4(i + 3, 3) \ 386 + ST(i, 0) \ 387 + ST(i + 1, 1) \ 388 + ST(i + 2, 2) \ 389 + ST(i + 3, 3) \ 390 + 391 + 392 + PF0(0) 393 + PF0(2) 394 + 395 + " .align 32 ;\n" 396 + " 1: ;\n" 397 + 398 + BLOCK(0) 399 + BLOCK(4) 400 + BLOCK(8) 401 + BLOCK(12) 402 + 403 + " add %[inc], %[p1] ;\n" 404 + " add %[inc], %[p2] ;\n" 405 + " add %[inc], %[p3] ;\n" 406 + " add %[inc], %[p4] ;\n" 407 + " add %[inc], %[p5] ;\n" 408 + " dec %[cnt] ;\n" 409 + " jnz 1b ;\n" 410 + : [cnt] "+r" (lines), [p1] "+r" (p1), [p2] "+r" (p2), 411 + [p3] "+r" (p3), [p4] "+r" (p4), [p5] "+r" (p5) 412 + : [inc] XOR_CONSTANT_CONSTRAINT (256UL) 413 + : "memory"); 414 + 415 + kernel_fpu_end(); 416 + } 417 + 418 + static void 419 + xor_sse_5_pf64(unsigned long bytes, unsigned long * __restrict p1, 420 + const unsigned long * __restrict p2, 421 + const unsigned long * __restrict p3, 422 + const unsigned long * __restrict p4, 423 + const unsigned long * __restrict p5) 424 + { 425 + unsigned long lines = bytes >> 8; 426 + 427 + kernel_fpu_begin(); 428 + 429 + asm volatile( 
430 + #undef BLOCK 431 + #define BLOCK(i) \ 432 + BLK64(PF0, LD, i) \ 433 + BLK64(PF1, XO1, i) \ 434 + BLK64(PF2, XO2, i) \ 435 + BLK64(PF3, XO3, i) \ 436 + BLK64(PF4, XO4, i) \ 437 + BLK64(NOP, ST, i) \ 438 + 439 + " .align 32 ;\n" 440 + " 1: ;\n" 441 + 442 + BLOCK(0) 443 + BLOCK(4) 444 + BLOCK(8) 445 + BLOCK(12) 446 + 447 + " add %[inc], %[p1] ;\n" 448 + " add %[inc], %[p2] ;\n" 449 + " add %[inc], %[p3] ;\n" 450 + " add %[inc], %[p4] ;\n" 451 + " add %[inc], %[p5] ;\n" 452 + " dec %[cnt] ;\n" 453 + " jnz 1b ;\n" 454 + : [cnt] "+r" (lines), [p1] "+r" (p1), [p2] "+r" (p2), 455 + [p3] "+r" (p3), [p4] "+r" (p4), [p5] "+r" (p5) 456 + : [inc] XOR_CONSTANT_CONSTRAINT (256UL) 457 + : "memory"); 458 + 459 + kernel_fpu_end(); 460 + } 461 + 462 + struct xor_block_template xor_block_sse = { 463 + .name = "sse", 464 + .do_2 = xor_sse_2, 465 + .do_3 = xor_sse_3, 466 + .do_4 = xor_sse_4, 467 + .do_5 = xor_sse_5, 468 + }; 469 + 470 + struct xor_block_template xor_block_sse_pf64 = { 471 + .name = "prefetch64-sse", 472 + .do_2 = xor_sse_2_pf64, 473 + .do_3 = xor_sse_3_pf64, 474 + .do_4 = xor_sse_4_pf64, 475 + .do_5 = xor_sse_5_pf64, 476 + };

Configure Feed

Configure Feed