Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

Merge tag 'fsverity-for-linus' of git://git.kernel.org/pub/scm/fs/fsverity/linux

Pull interleaved SHA-256 hashing support from Eric Biggers:
"Optimize fsverity with 2-way interleaved hashing

Add support for 2-way interleaved SHA-256 hashing to lib/crypto/, and
make fsverity use it for faster file data verification. This improves
fsverity performance on many x86_64 and arm64 processors.

Later, I plan to make dm-verity use this too."

* tag 'fsverity-for-linus' of git://git.kernel.org/pub/scm/fs/fsverity/linux:
fsverity: Use 2-way interleaved SHA-256 hashing when supported
fsverity: Remove inode parameter from fsverity_hash_block()
lib/crypto: tests: Add tests and benchmark for sha256_finup_2x()
lib/crypto: x86/sha256: Add support for 2-way interleaved hashing
lib/crypto: arm64/sha256: Add support for 2-way interleaved hashing
lib/crypto: sha256: Add support for 2-way interleaved hashing

+1147 -56
+5 -7
fs/verity/enable.c
··· 19 19 }; 20 20 21 21 /* Hash a block, writing the result to the next level's pending block buffer. */ 22 - static int hash_one_block(struct inode *inode, 23 - const struct merkle_tree_params *params, 22 + static int hash_one_block(const struct merkle_tree_params *params, 24 23 struct block_buffer *cur) 25 24 { 26 25 struct block_buffer *next = cur + 1; ··· 35 36 /* Zero-pad the block if it's shorter than the block size. */ 36 37 memset(&cur->data[cur->filled], 0, params->block_size - cur->filled); 37 38 38 - fsverity_hash_block(params, inode, cur->data, 39 - &next->data[next->filled]); 39 + fsverity_hash_block(params, cur->data, &next->data[next->filled]); 40 40 next->filled += params->digest_size; 41 41 cur->filled = 0; 42 42 return 0; ··· 121 123 fsverity_err(inode, "Short read of file data"); 122 124 goto out; 123 125 } 124 - err = hash_one_block(inode, params, &buffers[-1]); 126 + err = hash_one_block(params, &buffers[-1]); 125 127 if (err) 126 128 goto out; 127 129 for (level = 0; level < num_levels; level++) { ··· 132 134 } 133 135 /* Next block at @level is full */ 134 136 135 - err = hash_one_block(inode, params, &buffers[level]); 137 + err = hash_one_block(params, &buffers[level]); 136 138 if (err) 137 139 goto out; 138 140 err = write_merkle_tree_block(inode, ··· 152 154 /* Finish all nonempty pending tree blocks. */ 153 155 for (level = 0; level < num_levels; level++) { 154 156 if (buffers[level].filled != 0) { 155 - err = hash_one_block(inode, params, &buffers[level]); 157 + err = hash_one_block(params, &buffers[level]); 156 158 if (err) 157 159 goto out; 158 160 err = write_merkle_tree_block(inode,
+1 -1
fs/verity/fsverity_private.h
··· 90 90 fsverity_prepare_hash_state(const struct fsverity_hash_alg *alg, 91 91 const u8 *salt, size_t salt_size); 92 92 void fsverity_hash_block(const struct merkle_tree_params *params, 93 - const struct inode *inode, const void *data, u8 *out); 93 + const void *data, u8 *out); 94 94 void fsverity_hash_buffer(const struct fsverity_hash_alg *alg, 95 95 const void *data, size_t size, u8 *out); 96 96 void __init fsverity_check_hash_algs(void);
+1 -2
fs/verity/hash_algs.c
··· 94 94 /** 95 95 * fsverity_hash_block() - hash a single data or hash block 96 96 * @params: the Merkle tree's parameters 97 - * @inode: inode for which the hashing is being done 98 97 * @data: virtual address of a buffer containing the block to hash 99 98 * @out: output digest, size 'params->digest_size' bytes 100 99 * ··· 101 102 * in the Merkle tree parameters. 102 103 */ 103 104 void fsverity_hash_block(const struct merkle_tree_params *params, 104 - const struct inode *inode, const void *data, u8 *out) 105 + const void *data, u8 *out) 105 106 { 106 107 union fsverity_hash_ctx ctx; 107 108
+140 -35
fs/verity/verify.c
··· 10 10 #include <linux/bio.h> 11 11 #include <linux/export.h> 12 12 13 + #define FS_VERITY_MAX_PENDING_BLOCKS 2 14 + 15 + struct fsverity_pending_block { 16 + const void *data; 17 + u64 pos; 18 + u8 real_hash[FS_VERITY_MAX_DIGEST_SIZE]; 19 + }; 20 + 21 + struct fsverity_verification_context { 22 + struct inode *inode; 23 + struct fsverity_info *vi; 24 + unsigned long max_ra_pages; 25 + 26 + /* 27 + * This is the queue of data blocks that are pending verification. When 28 + * the crypto layer supports interleaved hashing, we allow multiple 29 + * blocks to be queued up in order to utilize it. This can improve 30 + * performance significantly vs. sequential hashing of each block. 31 + */ 32 + int num_pending; 33 + int max_pending; 34 + struct fsverity_pending_block 35 + pending_blocks[FS_VERITY_MAX_PENDING_BLOCKS]; 36 + }; 37 + 13 38 static struct workqueue_struct *fsverity_read_workqueue; 14 39 15 40 /* ··· 104 79 } 105 80 106 81 /* 107 - * Verify a single data block against the file's Merkle tree. 82 + * Verify the hash of a single data block against the file's Merkle tree. 108 83 * 109 84 * In principle, we need to verify the entire path to the root node. However, 110 85 * for efficiency the filesystem may cache the hash blocks. Therefore we need ··· 113 88 * 114 89 * Return: %true if the data block is valid, else %false. 
115 90 */ 116 - static bool 117 - verify_data_block(struct inode *inode, struct fsverity_info *vi, 118 - const void *data, u64 data_pos, unsigned long max_ra_pages) 91 + static bool verify_data_block(struct inode *inode, struct fsverity_info *vi, 92 + const struct fsverity_pending_block *dblock, 93 + unsigned long max_ra_pages) 119 94 { 95 + const u64 data_pos = dblock->pos; 120 96 const struct merkle_tree_params *params = &vi->tree_params; 121 97 const unsigned int hsize = params->digest_size; 122 98 int level; ··· 141 115 */ 142 116 u64 hidx = data_pos >> params->log_blocksize; 143 117 144 - /* Up to 1 + FS_VERITY_MAX_LEVELS pages may be mapped at once */ 145 - BUILD_BUG_ON(1 + FS_VERITY_MAX_LEVELS > KM_MAX_IDX); 118 + /* 119 + * Up to FS_VERITY_MAX_PENDING_BLOCKS + FS_VERITY_MAX_LEVELS pages may 120 + * be mapped at once. 121 + */ 122 + static_assert(FS_VERITY_MAX_PENDING_BLOCKS + FS_VERITY_MAX_LEVELS <= 123 + KM_MAX_IDX); 146 124 147 125 if (unlikely(data_pos >= inode->i_size)) { 148 126 /* ··· 157 127 * any part past EOF should be all zeroes. Therefore, we need 158 128 * to verify that any data blocks fully past EOF are all zeroes. 159 129 */ 160 - if (memchr_inv(data, 0, params->block_size)) { 130 + if (memchr_inv(dblock->data, 0, params->block_size)) { 161 131 fsverity_err(inode, 162 132 "FILE CORRUPTED! Data past EOF is not zeroed"); 163 133 return false; ··· 232 202 unsigned long hblock_idx = hblocks[level - 1].index; 233 203 unsigned int hoffset = hblocks[level - 1].hoffset; 234 204 235 - fsverity_hash_block(params, inode, haddr, real_hash); 205 + fsverity_hash_block(params, haddr, real_hash); 236 206 if (memcmp(want_hash, real_hash, hsize) != 0) 237 207 goto corrupted; 238 208 /* ··· 250 220 put_page(hpage); 251 221 } 252 222 253 - /* Finally, verify the data block. */ 254 - fsverity_hash_block(params, inode, data, real_hash); 255 - if (memcmp(want_hash, real_hash, hsize) != 0) 223 + /* Finally, verify the hash of the data block. 
*/ 224 + if (memcmp(want_hash, dblock->real_hash, hsize) != 0) 256 225 goto corrupted; 257 226 return true; 258 227 259 228 corrupted: 260 - fsverity_err(inode, 261 - "FILE CORRUPTED! pos=%llu, level=%d, want_hash=%s:%*phN, real_hash=%s:%*phN", 262 - data_pos, level - 1, 263 - params->hash_alg->name, hsize, want_hash, 264 - params->hash_alg->name, hsize, real_hash); 229 + fsverity_err( 230 + inode, 231 + "FILE CORRUPTED! pos=%llu, level=%d, want_hash=%s:%*phN, real_hash=%s:%*phN", 232 + data_pos, level - 1, params->hash_alg->name, hsize, want_hash, 233 + params->hash_alg->name, hsize, 234 + level == 0 ? dblock->real_hash : real_hash); 265 235 error: 266 236 for (; level > 0; level--) { 267 237 kunmap_local(hblocks[level - 1].addr); ··· 270 240 return false; 271 241 } 272 242 273 - static bool 274 - verify_data_blocks(struct folio *data_folio, size_t len, size_t offset, 275 - unsigned long max_ra_pages) 243 + static void 244 + fsverity_init_verification_context(struct fsverity_verification_context *ctx, 245 + struct inode *inode, 246 + unsigned long max_ra_pages) 276 247 { 277 - struct inode *inode = data_folio->mapping->host; 278 248 struct fsverity_info *vi = *fsverity_info_addr(inode); 279 - const unsigned int block_size = vi->tree_params.block_size; 249 + 250 + ctx->inode = inode; 251 + ctx->vi = vi; 252 + ctx->max_ra_pages = max_ra_pages; 253 + ctx->num_pending = 0; 254 + if (vi->tree_params.hash_alg->algo_id == HASH_ALGO_SHA256 && 255 + sha256_finup_2x_is_optimized()) 256 + ctx->max_pending = 2; 257 + else 258 + ctx->max_pending = 1; 259 + } 260 + 261 + static void 262 + fsverity_clear_pending_blocks(struct fsverity_verification_context *ctx) 263 + { 264 + int i; 265 + 266 + for (i = ctx->num_pending - 1; i >= 0; i--) { 267 + kunmap_local(ctx->pending_blocks[i].data); 268 + ctx->pending_blocks[i].data = NULL; 269 + } 270 + ctx->num_pending = 0; 271 + } 272 + 273 + static bool 274 + fsverity_verify_pending_blocks(struct fsverity_verification_context *ctx) 275 + 
{ 276 + struct fsverity_info *vi = ctx->vi; 277 + const struct merkle_tree_params *params = &vi->tree_params; 278 + int i; 279 + 280 + if (ctx->num_pending == 2) { 281 + /* num_pending == 2 implies that the algorithm is SHA-256 */ 282 + sha256_finup_2x(params->hashstate ? &params->hashstate->sha256 : 283 + NULL, 284 + ctx->pending_blocks[0].data, 285 + ctx->pending_blocks[1].data, params->block_size, 286 + ctx->pending_blocks[0].real_hash, 287 + ctx->pending_blocks[1].real_hash); 288 + } else { 289 + for (i = 0; i < ctx->num_pending; i++) 290 + fsverity_hash_block(params, ctx->pending_blocks[i].data, 291 + ctx->pending_blocks[i].real_hash); 292 + } 293 + 294 + for (i = 0; i < ctx->num_pending; i++) { 295 + if (!verify_data_block(ctx->inode, vi, &ctx->pending_blocks[i], 296 + ctx->max_ra_pages)) 297 + return false; 298 + } 299 + fsverity_clear_pending_blocks(ctx); 300 + return true; 301 + } 302 + 303 + static bool fsverity_add_data_blocks(struct fsverity_verification_context *ctx, 304 + struct folio *data_folio, size_t len, 305 + size_t offset) 306 + { 307 + struct fsverity_info *vi = ctx->vi; 308 + const struct merkle_tree_params *params = &vi->tree_params; 309 + const unsigned int block_size = params->block_size; 280 310 u64 pos = (u64)data_folio->index << PAGE_SHIFT; 281 311 282 312 if (WARN_ON_ONCE(len <= 0 || !IS_ALIGNED(len | offset, block_size))) ··· 345 255 folio_test_uptodate(data_folio))) 346 256 return false; 347 257 do { 348 - void *data; 349 - bool valid; 350 - 351 - data = kmap_local_folio(data_folio, offset); 352 - valid = verify_data_block(inode, vi, data, pos + offset, 353 - max_ra_pages); 354 - kunmap_local(data); 355 - if (!valid) 258 + ctx->pending_blocks[ctx->num_pending].data = 259 + kmap_local_folio(data_folio, offset); 260 + ctx->pending_blocks[ctx->num_pending].pos = pos + offset; 261 + if (++ctx->num_pending == ctx->max_pending && 262 + !fsverity_verify_pending_blocks(ctx)) 356 263 return false; 357 264 offset += block_size; 358 265 len -= 
block_size; ··· 371 284 */ 372 285 bool fsverity_verify_blocks(struct folio *folio, size_t len, size_t offset) 373 286 { 374 - return verify_data_blocks(folio, len, offset, 0); 287 + struct fsverity_verification_context ctx; 288 + 289 + fsverity_init_verification_context(&ctx, folio->mapping->host, 0); 290 + 291 + if (fsverity_add_data_blocks(&ctx, folio, len, offset) && 292 + fsverity_verify_pending_blocks(&ctx)) 293 + return true; 294 + fsverity_clear_pending_blocks(&ctx); 295 + return false; 375 296 } 376 297 EXPORT_SYMBOL_GPL(fsverity_verify_blocks); 377 298 ··· 400 305 */ 401 306 void fsverity_verify_bio(struct bio *bio) 402 307 { 308 + struct inode *inode = bio_first_folio_all(bio)->mapping->host; 309 + struct fsverity_verification_context ctx; 403 310 struct folio_iter fi; 404 311 unsigned long max_ra_pages = 0; 405 312 ··· 418 321 max_ra_pages = bio->bi_iter.bi_size >> (PAGE_SHIFT + 2); 419 322 } 420 323 324 + fsverity_init_verification_context(&ctx, inode, max_ra_pages); 325 + 421 326 bio_for_each_folio_all(fi, bio) { 422 - if (!verify_data_blocks(fi.folio, fi.length, fi.offset, 423 - max_ra_pages)) { 424 - bio->bi_status = BLK_STS_IOERR; 425 - break; 426 - } 327 + if (!fsverity_add_data_blocks(&ctx, fi.folio, fi.length, 328 + fi.offset)) 329 + goto ioerr; 427 330 } 331 + 332 + if (!fsverity_verify_pending_blocks(&ctx)) 333 + goto ioerr; 334 + return; 335 + 336 + ioerr: 337 + fsverity_clear_pending_blocks(&ctx); 338 + bio->bi_status = BLK_STS_IOERR; 428 339 } 429 340 EXPORT_SYMBOL_GPL(fsverity_verify_bio); 430 341 #endif /* CONFIG_BLOCK */
+28
include/crypto/sha2.h
··· 376 376 void sha256(const u8 *data, size_t len, u8 out[SHA256_DIGEST_SIZE]); 377 377 378 378 /** 379 + * sha256_finup_2x() - Compute two SHA-256 digests from a common initial 380 + * context. On some CPUs, this is faster than sequentially 381 + * computing each digest. 382 + * @ctx: an optional initial context, which may have already processed data. If 383 + * NULL, a default initial context is used (equivalent to sha256_init()). 384 + * @data1: data for the first message 385 + * @data2: data for the second message 386 + * @len: the length of each of @data1 and @data2, in bytes 387 + * @out1: (output) the first SHA-256 message digest 388 + * @out2: (output) the second SHA-256 message digest 389 + * 390 + * Context: Any context. 391 + */ 392 + void sha256_finup_2x(const struct sha256_ctx *ctx, const u8 *data1, 393 + const u8 *data2, size_t len, u8 out1[SHA256_DIGEST_SIZE], 394 + u8 out2[SHA256_DIGEST_SIZE]); 395 + 396 + /** 397 + * sha256_finup_2x_is_optimized() - Check if sha256_finup_2x() is using a real 398 + * interleaved implementation, as opposed to a 399 + * sequential fallback 400 + * @return: true if optimized 401 + * 402 + * Context: Any context. 403 + */ 404 + bool sha256_finup_2x_is_optimized(void); 405 + 406 + /** 379 407 * struct hmac_sha256_key - Prepared key for HMAC-SHA256 380 408 * @key: private 381 409 */
+278 -6
lib/crypto/arm64/sha256-ce.S
··· 70 70 .word 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208 71 71 .word 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2 72 72 73 + .macro load_round_constants tmp 74 + adr_l \tmp, .Lsha2_rcon 75 + ld1 { v0.4s- v3.4s}, [\tmp], #64 76 + ld1 { v4.4s- v7.4s}, [\tmp], #64 77 + ld1 { v8.4s-v11.4s}, [\tmp], #64 78 + ld1 {v12.4s-v15.4s}, [\tmp] 79 + .endm 80 + 73 81 /* 74 82 * size_t __sha256_ce_transform(struct sha256_block_state *state, 75 83 * const u8 *data, size_t nblocks); 76 84 */ 77 85 .text 78 86 SYM_FUNC_START(__sha256_ce_transform) 79 - /* load round constants */ 80 - adr_l x8, .Lsha2_rcon 81 - ld1 { v0.4s- v3.4s}, [x8], #64 82 - ld1 { v4.4s- v7.4s}, [x8], #64 83 - ld1 { v8.4s-v11.4s}, [x8], #64 84 - ld1 {v12.4s-v15.4s}, [x8] 87 + 88 + load_round_constants x8 85 89 86 90 /* load state */ 87 91 ld1 {dgav.4s, dgbv.4s}, [x0] ··· 138 134 mov x0, x2 139 135 ret 140 136 SYM_FUNC_END(__sha256_ce_transform) 137 + 138 + .unreq dga 139 + .unreq dgav 140 + .unreq dgb 141 + .unreq dgbv 142 + .unreq t0 143 + .unreq t1 144 + .unreq dg0q 145 + .unreq dg0v 146 + .unreq dg1q 147 + .unreq dg1v 148 + .unreq dg2q 149 + .unreq dg2v 150 + 151 + // parameters for sha256_ce_finup2x() 152 + ctx .req x0 153 + data1 .req x1 154 + data2 .req x2 155 + len .req w3 156 + out1 .req x4 157 + out2 .req x5 158 + 159 + // other scalar variables 160 + count .req x6 161 + final_step .req w7 162 + 163 + // x8-x9 are used as temporaries. 164 + 165 + // v0-v15 are used to cache the SHA-256 round constants. 166 + // v16-v19 are used for the message schedule for the first message. 167 + // v20-v23 are used for the message schedule for the second message. 168 + // v24-v31 are used for the state and temporaries as given below. 169 + // *_a are for the first message and *_b for the second. 
170 + state0_a_q .req q24 171 + state0_a .req v24 172 + state1_a_q .req q25 173 + state1_a .req v25 174 + state0_b_q .req q26 175 + state0_b .req v26 176 + state1_b_q .req q27 177 + state1_b .req v27 178 + t0_a .req v28 179 + t0_b .req v29 180 + t1_a_q .req q30 181 + t1_a .req v30 182 + t1_b_q .req q31 183 + t1_b .req v31 184 + 185 + #define OFFSETOF_BYTECOUNT 32 // offsetof(struct __sha256_ctx, bytecount) 186 + #define OFFSETOF_BUF 40 // offsetof(struct __sha256_ctx, buf) 187 + // offsetof(struct __sha256_ctx, state) is assumed to be 0. 188 + 189 + // Do 4 rounds of SHA-256 for each of two messages (interleaved). m0_a 190 + // and m0_b contain the current 4 message schedule words for the first 191 + // and second message respectively. 192 + // 193 + // If not all the message schedule words have been computed yet, then 194 + // this also computes 4 more message schedule words for each message. 195 + // m1_a-m3_a contain the next 3 groups of 4 message schedule words for 196 + // the first message, and likewise m1_b-m3_b for the second. After 197 + // consuming the current value of m0_a, this macro computes the group 198 + // after m3_a and writes it to m0_a, and likewise for *_b. This means 199 + // that the next (m0_a, m1_a, m2_a, m3_a) is the current (m1_a, m2_a, 200 + // m3_a, m0_a), and likewise for *_b, so the caller must cycle through 201 + // the registers accordingly. 
202 + .macro do_4rounds_2x i, k, m0_a, m1_a, m2_a, m3_a, \ 203 + m0_b, m1_b, m2_b, m3_b 204 + add t0_a\().4s, \m0_a\().4s, \k\().4s 205 + add t0_b\().4s, \m0_b\().4s, \k\().4s 206 + .if \i < 48 207 + sha256su0 \m0_a\().4s, \m1_a\().4s 208 + sha256su0 \m0_b\().4s, \m1_b\().4s 209 + sha256su1 \m0_a\().4s, \m2_a\().4s, \m3_a\().4s 210 + sha256su1 \m0_b\().4s, \m2_b\().4s, \m3_b\().4s 211 + .endif 212 + mov t1_a.16b, state0_a.16b 213 + mov t1_b.16b, state0_b.16b 214 + sha256h state0_a_q, state1_a_q, t0_a\().4s 215 + sha256h state0_b_q, state1_b_q, t0_b\().4s 216 + sha256h2 state1_a_q, t1_a_q, t0_a\().4s 217 + sha256h2 state1_b_q, t1_b_q, t0_b\().4s 218 + .endm 219 + 220 + .macro do_16rounds_2x i, k0, k1, k2, k3 221 + do_4rounds_2x \i + 0, \k0, v16, v17, v18, v19, v20, v21, v22, v23 222 + do_4rounds_2x \i + 4, \k1, v17, v18, v19, v16, v21, v22, v23, v20 223 + do_4rounds_2x \i + 8, \k2, v18, v19, v16, v17, v22, v23, v20, v21 224 + do_4rounds_2x \i + 12, \k3, v19, v16, v17, v18, v23, v20, v21, v22 225 + .endm 226 + 227 + // 228 + // void sha256_ce_finup2x(const struct __sha256_ctx *ctx, 229 + // const u8 *data1, const u8 *data2, int len, 230 + // u8 out1[SHA256_DIGEST_SIZE], 231 + // u8 out2[SHA256_DIGEST_SIZE]); 232 + // 233 + // This function computes the SHA-256 digests of two messages |data1| and 234 + // |data2| that are both |len| bytes long, starting from the initial context 235 + // |ctx|. |len| must be at least SHA256_BLOCK_SIZE. 236 + // 237 + // The instructions for the two SHA-256 operations are interleaved. On many 238 + // CPUs, this is almost twice as fast as hashing each message individually due 239 + // to taking better advantage of the CPU's SHA-256 and SIMD throughput. 240 + // 241 + SYM_FUNC_START(sha256_ce_finup2x) 242 + sub sp, sp, #128 243 + mov final_step, #0 244 + load_round_constants x8 245 + 246 + // Load the initial state from ctx->state. 247 + ld1 {state0_a.4s-state1_a.4s}, [ctx] 248 + 249 + // Load ctx->bytecount. 
Take the mod 64 of it to get the number of 250 + // bytes that are buffered in ctx->buf. Also save it in a register with 251 + // len added to it. 252 + ldr x8, [ctx, #OFFSETOF_BYTECOUNT] 253 + add count, x8, len, sxtw 254 + and x8, x8, #63 255 + cbz x8, .Lfinup2x_enter_loop // No bytes buffered? 256 + 257 + // x8 bytes (1 to 63) are currently buffered in ctx->buf. Load them 258 + // followed by the first 64 - x8 bytes of data. Since len >= 64, we 259 + // just load 64 bytes from each of ctx->buf, data1, and data2 260 + // unconditionally and rearrange the data as needed. 261 + add x9, ctx, #OFFSETOF_BUF 262 + ld1 {v16.16b-v19.16b}, [x9] 263 + st1 {v16.16b-v19.16b}, [sp] 264 + 265 + ld1 {v16.16b-v19.16b}, [data1], #64 266 + add x9, sp, x8 267 + st1 {v16.16b-v19.16b}, [x9] 268 + ld1 {v16.4s-v19.4s}, [sp] 269 + 270 + ld1 {v20.16b-v23.16b}, [data2], #64 271 + st1 {v20.16b-v23.16b}, [x9] 272 + ld1 {v20.4s-v23.4s}, [sp] 273 + 274 + sub len, len, #64 275 + sub data1, data1, x8 276 + sub data2, data2, x8 277 + add len, len, w8 278 + mov state0_b.16b, state0_a.16b 279 + mov state1_b.16b, state1_a.16b 280 + b .Lfinup2x_loop_have_data 281 + 282 + .Lfinup2x_enter_loop: 283 + sub len, len, #64 284 + mov state0_b.16b, state0_a.16b 285 + mov state1_b.16b, state1_a.16b 286 + .Lfinup2x_loop: 287 + // Load the next two data blocks. 288 + ld1 {v16.4s-v19.4s}, [data1], #64 289 + ld1 {v20.4s-v23.4s}, [data2], #64 290 + .Lfinup2x_loop_have_data: 291 + // Convert the words of the data blocks from big endian. 292 + CPU_LE( rev32 v16.16b, v16.16b ) 293 + CPU_LE( rev32 v17.16b, v17.16b ) 294 + CPU_LE( rev32 v18.16b, v18.16b ) 295 + CPU_LE( rev32 v19.16b, v19.16b ) 296 + CPU_LE( rev32 v20.16b, v20.16b ) 297 + CPU_LE( rev32 v21.16b, v21.16b ) 298 + CPU_LE( rev32 v22.16b, v22.16b ) 299 + CPU_LE( rev32 v23.16b, v23.16b ) 300 + .Lfinup2x_loop_have_bswapped_data: 301 + 302 + // Save the original state for each block. 
303 + st1 {state0_a.4s-state1_b.4s}, [sp] 304 + 305 + // Do the SHA-256 rounds on each block. 306 + do_16rounds_2x 0, v0, v1, v2, v3 307 + do_16rounds_2x 16, v4, v5, v6, v7 308 + do_16rounds_2x 32, v8, v9, v10, v11 309 + do_16rounds_2x 48, v12, v13, v14, v15 310 + 311 + // Add the original state for each block. 312 + ld1 {v16.4s-v19.4s}, [sp] 313 + add state0_a.4s, state0_a.4s, v16.4s 314 + add state1_a.4s, state1_a.4s, v17.4s 315 + add state0_b.4s, state0_b.4s, v18.4s 316 + add state1_b.4s, state1_b.4s, v19.4s 317 + 318 + // Update len and loop back if more blocks remain. 319 + sub len, len, #64 320 + tbz len, #31, .Lfinup2x_loop // len >= 0? 321 + 322 + // Check if any final blocks need to be handled. 323 + // final_step = 2: all done 324 + // final_step = 1: need to do count-only padding block 325 + // final_step = 0: need to do the block with 0x80 padding byte 326 + tbnz final_step, #1, .Lfinup2x_done 327 + tbnz final_step, #0, .Lfinup2x_finalize_countonly 328 + add len, len, #64 329 + cbz len, .Lfinup2x_finalize_blockaligned 330 + 331 + // Not block-aligned; 1 <= len <= 63 data bytes remain. Pad the block. 332 + // To do this, write the padding starting with the 0x80 byte to 333 + // &sp[64]. Then for each message, copy the last 64 data bytes to sp 334 + // and load from &sp[64 - len] to get the needed padding block. This 335 + // code relies on the data buffers being >= 64 bytes in length. 336 + sub w8, len, #64 // w8 = len - 64 337 + add data1, data1, w8, sxtw // data1 += len - 64 338 + add data2, data2, w8, sxtw // data2 += len - 64 339 + CPU_LE( mov x9, #0x80 ) 340 + CPU_LE( fmov d16, x9 ) 341 + CPU_BE( movi v16.16b, #0 ) 342 + CPU_BE( mov x9, #0x8000000000000000 ) 343 + CPU_BE( mov v16.d[1], x9 ) 344 + movi v17.16b, #0 345 + stp q16, q17, [sp, #64] 346 + stp q17, q17, [sp, #96] 347 + sub x9, sp, w8, sxtw // x9 = &sp[64 - len] 348 + cmp len, #56 349 + b.ge 1f // will count spill into its own block? 
350 + lsl count, count, #3 351 + CPU_LE( rev count, count ) 352 + str count, [x9, #56] 353 + mov final_step, #2 // won't need count-only block 354 + b 2f 355 + 1: 356 + mov final_step, #1 // will need count-only block 357 + 2: 358 + ld1 {v16.16b-v19.16b}, [data1] 359 + st1 {v16.16b-v19.16b}, [sp] 360 + ld1 {v16.4s-v19.4s}, [x9] 361 + ld1 {v20.16b-v23.16b}, [data2] 362 + st1 {v20.16b-v23.16b}, [sp] 363 + ld1 {v20.4s-v23.4s}, [x9] 364 + b .Lfinup2x_loop_have_data 365 + 366 + // Prepare a padding block, either: 367 + // 368 + // {0x80, 0, 0, 0, ..., count (as __be64)} 369 + // This is for a block aligned message. 370 + // 371 + // { 0, 0, 0, 0, ..., count (as __be64)} 372 + // This is for a message whose length mod 64 is >= 56. 373 + // 374 + // Pre-swap the endianness of the words. 375 + .Lfinup2x_finalize_countonly: 376 + movi v16.2d, #0 377 + b 1f 378 + .Lfinup2x_finalize_blockaligned: 379 + mov x8, #0x80000000 380 + fmov d16, x8 381 + 1: 382 + movi v17.2d, #0 383 + movi v18.2d, #0 384 + ror count, count, #29 // ror(lsl(count, 3), 32) 385 + mov v19.d[0], xzr 386 + mov v19.d[1], count 387 + mov v20.16b, v16.16b 388 + movi v21.2d, #0 389 + movi v22.2d, #0 390 + mov v23.16b, v19.16b 391 + mov final_step, #2 392 + b .Lfinup2x_loop_have_bswapped_data 393 + 394 + .Lfinup2x_done: 395 + // Write the two digests with all bytes in the correct order. 396 + CPU_LE( rev32 state0_a.16b, state0_a.16b ) 397 + CPU_LE( rev32 state1_a.16b, state1_a.16b ) 398 + CPU_LE( rev32 state0_b.16b, state0_b.16b ) 399 + CPU_LE( rev32 state1_b.16b, state1_b.16b ) 400 + st1 {state0_a.4s-state1_a.4s}, [out1] 401 + st1 {state0_b.4s-state1_b.4s}, [out2] 402 + add sp, sp, #128 403 + ret 404 + SYM_FUNC_END(sha256_ce_finup2x)
+37
lib/crypto/arm64/sha256.h
··· 44 44 } 45 45 } 46 46 47 + static_assert(offsetof(struct __sha256_ctx, state) == 0); 48 + static_assert(offsetof(struct __sha256_ctx, bytecount) == 32); 49 + static_assert(offsetof(struct __sha256_ctx, buf) == 40); 50 + asmlinkage void sha256_ce_finup2x(const struct __sha256_ctx *ctx, 51 + const u8 *data1, const u8 *data2, int len, 52 + u8 out1[SHA256_DIGEST_SIZE], 53 + u8 out2[SHA256_DIGEST_SIZE]); 54 + 55 + #define sha256_finup_2x_arch sha256_finup_2x_arch 56 + static bool sha256_finup_2x_arch(const struct __sha256_ctx *ctx, 57 + const u8 *data1, const u8 *data2, size_t len, 58 + u8 out1[SHA256_DIGEST_SIZE], 59 + u8 out2[SHA256_DIGEST_SIZE]) 60 + { 61 + /* 62 + * The assembly requires len >= SHA256_BLOCK_SIZE && len <= INT_MAX. 63 + * Further limit len to 65536 to avoid spending too long with preemption 64 + * disabled. (Of course, in practice len is nearly always 4096 anyway.) 65 + */ 66 + if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && 67 + static_branch_likely(&have_ce) && len >= SHA256_BLOCK_SIZE && 68 + len <= 65536 && likely(may_use_simd())) { 69 + kernel_neon_begin(); 70 + sha256_ce_finup2x(ctx, data1, data2, len, out1, out2); 71 + kernel_neon_end(); 72 + kmsan_unpoison_memory(out1, SHA256_DIGEST_SIZE); 73 + kmsan_unpoison_memory(out2, SHA256_DIGEST_SIZE); 74 + return true; 75 + } 76 + return false; 77 + } 78 + 79 + static bool sha256_finup_2x_is_optimized_arch(void) 80 + { 81 + return static_key_enabled(&have_ce); 82 + } 83 + 47 84 #ifdef CONFIG_KERNEL_MODE_NEON 48 85 #define sha256_mod_init_arch sha256_mod_init_arch 49 86 static void sha256_mod_init_arch(void)
+66 -5
lib/crypto/sha256.c
··· 25 25 }, 26 26 }; 27 27 28 - static const struct sha256_block_state sha256_iv = { 29 - .h = { 30 - SHA256_H0, SHA256_H1, SHA256_H2, SHA256_H3, 31 - SHA256_H4, SHA256_H5, SHA256_H6, SHA256_H7, 28 + static const struct sha256_ctx initial_sha256_ctx = { 29 + .ctx = { 30 + .state = { 31 + .h = { 32 + SHA256_H0, SHA256_H1, SHA256_H2, SHA256_H3, 33 + SHA256_H4, SHA256_H5, SHA256_H6, SHA256_H7, 34 + }, 35 + }, 36 + .bytecount = 0, 32 37 }, 33 38 }; 39 + 40 + #define sha256_iv (initial_sha256_ctx.ctx.state) 34 41 35 42 static const u32 sha256_K[64] = { 36 43 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 0x3956c25b, 0x59f111f1, ··· 268 261 } 269 262 EXPORT_SYMBOL(sha256); 270 263 271 - /* pre-boot environment (as indicated by __DISABLE_EXPORTS) doesn't need HMAC */ 264 + /* 265 + * Pre-boot environment (as indicated by __DISABLE_EXPORTS being defined) 266 + * doesn't need either HMAC support or interleaved hashing support 267 + */ 272 268 #ifndef __DISABLE_EXPORTS 269 + 270 + #ifndef sha256_finup_2x_arch 271 + static bool sha256_finup_2x_arch(const struct __sha256_ctx *ctx, 272 + const u8 *data1, const u8 *data2, size_t len, 273 + u8 out1[SHA256_DIGEST_SIZE], 274 + u8 out2[SHA256_DIGEST_SIZE]) 275 + { 276 + return false; 277 + } 278 + static bool sha256_finup_2x_is_optimized_arch(void) 279 + { 280 + return false; 281 + } 282 + #endif 283 + 284 + /* Sequential fallback implementation of sha256_finup_2x() */ 285 + static noinline_for_stack void sha256_finup_2x_sequential( 286 + const struct __sha256_ctx *ctx, const u8 *data1, const u8 *data2, 287 + size_t len, u8 out1[SHA256_DIGEST_SIZE], u8 out2[SHA256_DIGEST_SIZE]) 288 + { 289 + struct __sha256_ctx mut_ctx; 290 + 291 + mut_ctx = *ctx; 292 + __sha256_update(&mut_ctx, data1, len); 293 + __sha256_final(&mut_ctx, out1, SHA256_DIGEST_SIZE); 294 + 295 + mut_ctx = *ctx; 296 + __sha256_update(&mut_ctx, data2, len); 297 + __sha256_final(&mut_ctx, out2, SHA256_DIGEST_SIZE); 298 + } 299 + 300 + void sha256_finup_2x(const struct 
sha256_ctx *ctx, const u8 *data1, 301 + const u8 *data2, size_t len, u8 out1[SHA256_DIGEST_SIZE], 302 + u8 out2[SHA256_DIGEST_SIZE]) 303 + { 304 + if (ctx == NULL) 305 + ctx = &initial_sha256_ctx; 306 + 307 + if (likely(sha256_finup_2x_arch(&ctx->ctx, data1, data2, len, out1, 308 + out2))) 309 + return; 310 + sha256_finup_2x_sequential(&ctx->ctx, data1, data2, len, out1, out2); 311 + } 312 + EXPORT_SYMBOL_GPL(sha256_finup_2x); 313 + 314 + bool sha256_finup_2x_is_optimized(void) 315 + { 316 + return sha256_finup_2x_is_optimized_arch(); 317 + } 318 + EXPORT_SYMBOL_GPL(sha256_finup_2x_is_optimized); 319 + 273 320 static void __hmac_sha256_preparekey(struct sha256_block_state *istate, 274 321 struct sha256_block_state *ostate, 275 322 const u8 *raw_key, size_t raw_key_len,
+184
lib/crypto/tests/sha256_kunit.c
··· 5 5 #include <crypto/sha2.h> 6 6 #include "sha256-testvecs.h" 7 7 8 + /* Generate the HASH_KUNIT_CASES using hash-test-template.h. */ 8 9 #define HASH sha256 9 10 #define HASH_CTX sha256_ctx 10 11 #define HASH_SIZE SHA256_DIGEST_SIZE ··· 22 21 #define HMAC_USINGRAWKEY hmac_sha256_usingrawkey 23 22 #include "hash-test-template.h" 24 23 /* Cleanup callback that alloc_guarded_buf() registers with kunit_add_action_or_reset(); it receives the base pointer returned by vmalloc() and frees it. */ 24 + static void free_guarded_buf(void *buf) 25 + { 26 + vfree(buf); 27 + } 28 + 29 + /* 30 + * Allocate a KUnit-managed buffer that has length @len bytes immediately 31 + * followed by an unmapped page, and assert that the allocation succeeds. 32 + */ 33 + static void *alloc_guarded_buf(struct kunit *test, size_t len) 34 + { 35 + size_t full_len = round_up(len, PAGE_SIZE); /* allocation size: @len rounded up to a multiple of PAGE_SIZE */ 36 + void *buf = vmalloc(full_len); 37 + 38 + KUNIT_ASSERT_NOT_NULL(test, buf); 39 + KUNIT_ASSERT_EQ(test, 0, 40 + kunit_add_action_or_reset(test, free_guarded_buf, buf)); /* Return the last @len bytes of the allocation, so the returned buffer ends exactly where the allocation ends. The cleanup action above was registered with the base pointer, not with the pointer returned here. */ 41 + return buf + full_len - len; 42 + } 43 + 44 + /* 45 + * Test for sha256_finup_2x(). Specifically, choose various data lengths and 46 + * salt lengths, and for each one, verify that sha256_finup_2x() produces the 47 + * same results as sha256_update() and sha256_final(). 48 + * 49 + * Use guarded buffers for all inputs and outputs to reliably detect any 50 + * out-of-bounds reads or writes, even if they occur in assembly code. 
51 + */ 52 + static void test_sha256_finup_2x(struct kunit *test) 53 + { 54 + const size_t max_data_len = 16384; 55 + u8 *data1_buf, *data2_buf, *hash1, *hash2; 56 + u8 expected_hash1[SHA256_DIGEST_SIZE]; 57 + u8 expected_hash2[SHA256_DIGEST_SIZE]; 58 + u8 salt[SHA256_BLOCK_SIZE]; 59 + struct sha256_ctx *ctx; 60 + 61 + data1_buf = alloc_guarded_buf(test, max_data_len); 62 + data2_buf = alloc_guarded_buf(test, max_data_len); 63 + hash1 = alloc_guarded_buf(test, SHA256_DIGEST_SIZE); 64 + hash2 = alloc_guarded_buf(test, SHA256_DIGEST_SIZE); 65 + ctx = alloc_guarded_buf(test, sizeof(*ctx)); 66 + 67 + rand_bytes(data1_buf, max_data_len); 68 + rand_bytes(data2_buf, max_data_len); 69 + rand_bytes(salt, sizeof(salt)); 70 + 71 + for (size_t i = 0; i < 500; i++) { 72 + size_t salt_len = rand_length(sizeof(salt)); 73 + size_t data_len = rand_length(max_data_len); /* Use the trailing data_len bytes of each guarded buffer, so that each message ends exactly where its buffer ends. */ 74 + const u8 *data1 = data1_buf + max_data_len - data_len; 75 + const u8 *data2 = data2_buf + max_data_len - data_len; 76 + struct sha256_ctx orig_ctx; 77 + 78 + sha256_init(ctx); 79 + sha256_update(ctx, salt, salt_len); 80 + orig_ctx = *ctx; /* Snapshot of the salted context: compared against ctx after the call, then reused as the context for the data2 reference hash. */ 81 + 82 + sha256_finup_2x(ctx, data1, data2, data_len, hash1, hash2); 83 + KUNIT_ASSERT_MEMEQ_MSG( 84 + test, ctx, &orig_ctx, sizeof(*ctx), 85 + "sha256_finup_2x() modified its ctx argument"); /* Reference digests, computed one message at a time with sha256_update() and sha256_final(). 
*/ 86 + 87 + sha256_update(ctx, data1, data_len); 88 + sha256_final(ctx, expected_hash1); 89 + sha256_update(&orig_ctx, data2, data_len); 90 + sha256_final(&orig_ctx, expected_hash2); 91 + KUNIT_ASSERT_MEMEQ_MSG( 92 + test, hash1, expected_hash1, SHA256_DIGEST_SIZE, 93 + "Wrong hash1 with salt_len=%zu data_len=%zu", salt_len, 94 + data_len); 95 + KUNIT_ASSERT_MEMEQ_MSG( 96 + test, hash2, expected_hash2, SHA256_DIGEST_SIZE, 97 + "Wrong hash2 with salt_len=%zu data_len=%zu", salt_len, 98 + data_len); 99 + } 100 + } 101 + 102 + /* Test sha256_finup_2x() with ctx == NULL */ 103 + static void test_sha256_finup_2x_defaultctx(struct kunit *test) 104 + { 105 + const size_t data_len = 128; 106 + 
struct sha256_ctx ctx; 107 + u8 hash1_a[SHA256_DIGEST_SIZE]; 108 + u8 hash2_a[SHA256_DIGEST_SIZE]; 109 + u8 hash1_b[SHA256_DIGEST_SIZE]; 110 + u8 hash2_b[SHA256_DIGEST_SIZE]; 111 + 112 + rand_bytes(test_buf, 2 * data_len); 113 + /* "_a" digests: computed from an explicitly initialized context that has absorbed no data. */ 114 + sha256_init(&ctx); 115 + sha256_finup_2x(&ctx, test_buf, &test_buf[data_len], data_len, hash1_a, 116 + hash2_a); 117 + /* "_b" digests: the same two messages with ctx == NULL; the assertions below require them to equal the "_a" digests. */ 118 + sha256_finup_2x(NULL, test_buf, &test_buf[data_len], data_len, hash1_b, 119 + hash2_b); 120 + 121 + KUNIT_ASSERT_MEMEQ(test, hash1_a, hash1_b, SHA256_DIGEST_SIZE); 122 + KUNIT_ASSERT_MEMEQ(test, hash2_a, hash2_b, SHA256_DIGEST_SIZE); 123 + } 124 + 125 + /* 126 + * Test that sha256_finup_2x() and sha256_update/final() produce consistent 127 + * results with total message lengths that require more than 32 bits. 128 + */ 129 + static void test_sha256_finup_2x_hugelen(struct kunit *test) 130 + { 131 + const size_t data_len = 4 * SHA256_BLOCK_SIZE; 132 + struct sha256_ctx ctx = {}; 133 + u8 expected_hash[SHA256_DIGEST_SIZE]; 134 + u8 hash[SHA256_DIGEST_SIZE]; 135 + 136 + rand_bytes(test_buf, data_len); 137 + for (size_t align = 0; align < SHA256_BLOCK_SIZE; align++) { 138 + sha256_init(&ctx); /* Overwrite the byte count with a value that does not fit in 32 bits. The constant is a multiple of 256, so adding align only changes its low bits. */ 139 + ctx.ctx.bytecount = 0x123456789abcd00 + align; 140 + 141 + sha256_finup_2x(&ctx, test_buf, test_buf, data_len, hash, hash); /* The same message and the same output buffer are passed for both messages. 
*/ 142 + 143 + sha256_update(&ctx, test_buf, data_len); 144 + sha256_final(&ctx, expected_hash); 145 + 146 + KUNIT_ASSERT_MEMEQ(test, hash, expected_hash, 147 + SHA256_DIGEST_SIZE); 148 + } 149 + } 150 + 151 + /* Benchmark for sha256_finup_2x() */ 152 + static void benchmark_sha256_finup_2x(struct kunit *test) 153 + { 154 + /* 155 + * Try a few different salt lengths, since sha256_finup_2x() performance 156 + * may vary slightly for the same data_len depending on how many bytes 157 + * were already processed in the initial context. 
158 + */ 159 + static const size_t salt_lens_to_test[] = { 0, 32, 64 }; 160 + const size_t data_len = 4096; 161 + const size_t num_iters = 4096; 162 + struct sha256_ctx ctx; 163 + u8 hash1[SHA256_DIGEST_SIZE]; 164 + u8 hash2[SHA256_DIGEST_SIZE]; 165 + 166 + if (!IS_ENABLED(CONFIG_CRYPTO_LIB_BENCHMARK)) 167 + kunit_skip(test, "not enabled"); 168 + if (!sha256_finup_2x_is_optimized()) 169 + kunit_skip(test, "not relevant"); 170 + 171 + rand_bytes(test_buf, data_len * 2); 172 + 173 + /* Warm-up */ 174 + for (size_t i = 0; i < num_iters; i++) 175 + sha256_finup_2x(NULL, &test_buf[0], &test_buf[data_len], 176 + data_len, hash1, hash2); 177 + 178 + for (size_t i = 0; i < ARRAY_SIZE(salt_lens_to_test); i++) { 179 + size_t salt_len = salt_lens_to_test[i]; 180 + u64 t0, t1; 181 + 182 + /* 183 + * Prepare the initial context. The time to process the salt is 184 + * not measured; we're just interested in sha256_finup_2x(). 185 + */ 186 + sha256_init(&ctx); 187 + sha256_update(&ctx, test_buf, salt_len); 188 + /* Only the loop between the two ktime_get_ns() reads is timed; it runs between preempt_disable() and preempt_enable(). */ 189 + preempt_disable(); 190 + t0 = ktime_get_ns(); 191 + for (size_t j = 0; j < num_iters; j++) 192 + sha256_finup_2x(&ctx, &test_buf[0], &test_buf[data_len], 193 + data_len, hash1, hash2); 194 + t1 = ktime_get_ns(); 195 + preempt_enable(); /* Throughput: each call hashes 2 * data_len bytes; bytes * 1000 / nanoseconds = bytes per microsecond = MB/s. The "?: 1" guards against dividing by a zero elapsed time. 
*/ 196 + kunit_info(test, "data_len=%zu salt_len=%zu: %llu MB/s", 197 + data_len, salt_len, 198 + div64_u64((u64)data_len * 2 * num_iters * 1000, 199 + t1 - t0 ?: 1)); 200 + } 201 + } 202 + 25 203 static struct kunit_case hash_test_cases[] = { 26 204 HASH_KUNIT_CASES, 205 + KUNIT_CASE(test_sha256_finup_2x), 206 + KUNIT_CASE(test_sha256_finup_2x_defaultctx), 207 + KUNIT_CASE(test_sha256_finup_2x_hugelen), 27 208 KUNIT_CASE(benchmark_hash), 209 + KUNIT_CASE(benchmark_sha256_finup_2x), 28 210 {}, 29 211 }; 30 212
+368
lib/crypto/x86/sha256-ni-asm.S
··· 165 165 RET 166 166 SYM_FUNC_END(sha256_ni_transform) 167 167 /* Drop the register aliases that were in effect up to this point (presumably defined for sha256_ni_transform() above); some of the same names (SHA256CONSTANTS, MSG, SHUF_MASK) are defined again below for sha256_ni_finup2x(). */ 168 + #undef DIGEST_PTR 169 + #undef DATA_PTR 170 + #undef NUM_BLKS 171 + #undef SHA256CONSTANTS 172 + #undef MSG 173 + #undef STATE0 174 + #undef STATE1 175 + #undef MSG0 176 + #undef MSG1 177 + #undef MSG2 178 + #undef MSG3 179 + #undef TMP 180 + #undef SHUF_MASK 181 + #undef ABEF_SAVE 182 + #undef CDGH_SAVE 183 + 184 + // parameters for sha256_ni_finup2x() 185 + #define CTX %rdi 186 + #define DATA1 %rsi 187 + #define DATA2 %rdx 188 + #define LEN %ecx 189 + #define LEN8 %cl 190 + #define LEN64 %rcx 191 + #define OUT1 %r8 192 + #define OUT2 %r9 193 + 194 + // other scalar variables 195 + #define SHA256CONSTANTS %rax 196 + #define COUNT %r10 197 + #define COUNT32 %r10d 198 + #define FINAL_STEP %r11d 199 + 200 + // rbx is used as a temporary. 201 + 202 + #define MSG %xmm0 // sha256rnds2 implicit operand /* In the aliases below, the _A registers hold the first message (loaded from DATA1, digest stored to OUT1) and the _B registers hold the second message (DATA2, OUT2). */ 203 + #define STATE0_A %xmm1 204 + #define STATE1_A %xmm2 205 + #define STATE0_B %xmm3 206 + #define STATE1_B %xmm4 207 + #define TMP_A %xmm5 208 + #define TMP_B %xmm6 209 + #define MSG0_A %xmm7 210 + #define MSG1_A %xmm8 211 + #define MSG2_A %xmm9 212 + #define MSG3_A %xmm10 213 + #define MSG0_B %xmm11 214 + #define MSG1_B %xmm12 215 + #define MSG2_B %xmm13 216 + #define MSG3_B %xmm14 217 + #define SHUF_MASK %xmm15 218 + 219 + #define OFFSETOF_STATE 0 // offsetof(struct __sha256_ctx, state) 220 + #define OFFSETOF_BYTECOUNT 32 // offsetof(struct __sha256_ctx, bytecount) 221 + #define OFFSETOF_BUF 40 // offsetof(struct __sha256_ctx, buf) 222 + 223 + // Do 4 rounds of SHA-256 for each of two messages (interleaved). m0_a and m0_b 224 + // contain the current 4 message schedule words for the first and second message 225 + // respectively. 226 + // 227 + // If not all the message schedule words have been computed yet, then this also 228 + // computes 4 more message schedule words for each message. 
m1_a-m3_a contain 229 + // the next 3 groups of 4 message schedule words for the first message, and 230 + // likewise m1_b-m3_b for the second. After consuming the current value of 231 + // m0_a, this macro computes the group after m3_a and writes it to m0_a, and 232 + // likewise for *_b. This means that the next (m0_a, m1_a, m2_a, m3_a) is the 233 + // current (m1_a, m2_a, m3_a, m0_a), and likewise for *_b, so the caller must 234 + // cycle through the registers accordingly. 235 + .macro do_4rounds_2x i, m0_a, m1_a, m2_a, m3_a, m0_b, m1_b, m2_b, m3_b 236 + movdqa (\i-32)*4(SHA256CONSTANTS), TMP_A /* the 4 round constants for this group of rounds; the -32 cancels the 32-word bias applied when SHA256CONSTANTS is set up as K256+32*4 */ 237 + movdqa TMP_A, TMP_B 238 + paddd \m0_a, TMP_A 239 + paddd \m0_b, TMP_B /* TMP_A and TMP_B = message schedule words + round constants, one copy per message */ 240 + .if \i < 48 241 + sha256msg1 \m1_a, \m0_a 242 + sha256msg1 \m1_b, \m0_b 243 + .endif 244 + movdqa TMP_A, MSG 245 + sha256rnds2 STATE0_A, STATE1_A 246 + movdqa TMP_B, MSG 247 + sha256rnds2 STATE0_B, STATE1_B 248 + pshufd $0x0E, TMP_A, MSG /* shuffle 0x0E: move the upper two dwords of TMP_A into the low half of MSG for the second pair of rounds */ 249 + sha256rnds2 STATE1_A, STATE0_A 250 + pshufd $0x0E, TMP_B, MSG 251 + sha256rnds2 STATE1_B, STATE0_B 252 + .if \i < 48 253 + movdqa \m3_a, TMP_A 254 + movdqa \m3_b, TMP_B 255 + palignr $4, \m2_a, TMP_A 256 + palignr $4, \m2_b, TMP_B 257 + paddd TMP_A, \m0_a 258 + paddd TMP_B, \m0_b 259 + sha256msg2 \m3_a, \m0_a 260 + sha256msg2 \m3_b, \m0_b 261 + .endif 262 + .endm 263 + 264 + // 265 + // void sha256_ni_finup2x(const struct __sha256_ctx *ctx, 266 + // const u8 *data1, const u8 *data2, int len, 267 + // u8 out1[SHA256_DIGEST_SIZE], 268 + // u8 out2[SHA256_DIGEST_SIZE]); 269 + // 270 + // This function computes the SHA-256 digests of two messages |data1| and 271 + // |data2| that are both |len| bytes long, starting from the initial context 272 + // |ctx|. |len| must be at least SHA256_BLOCK_SIZE. 273 + // 274 + // The instructions for the two SHA-256 operations are interleaved. On many 275 + // CPUs, this is almost twice as fast as hashing each message individually due 276 + // to taking better advantage of the CPU's SHA-256 and SIMD throughput. 
277 + // 278 + SYM_FUNC_START(sha256_ni_finup2x) 279 + // Allocate 128 bytes of stack space, 16-byte aligned. 280 + push %rbx 281 + push %rbp 282 + mov %rsp, %rbp 283 + sub $128, %rsp 284 + and $~15, %rsp /* the pre-allocation %rsp stays in %rbp and is restored from it before returning */ 285 + 286 + // Load the shuffle mask for swapping the endianness of 32-bit words. 287 + movdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), SHUF_MASK 288 + 289 + // Set up pointer to the round constants. 290 + lea K256+32*4(%rip), SHA256CONSTANTS /* points 32 words into K256; do_4rounds_2x addresses it with displacement (\i-32)*4 */ 291 + 292 + // Initially we're not processing the final blocks. 293 + xor FINAL_STEP, FINAL_STEP 294 + 295 + // Load the initial state from ctx->state. 296 + movdqu OFFSETOF_STATE+0*16(CTX), STATE0_A // DCBA 297 + movdqu OFFSETOF_STATE+1*16(CTX), STATE1_A // HGFE 298 + movdqa STATE0_A, TMP_A 299 + punpcklqdq STATE1_A, STATE0_A // FEBA 300 + punpckhqdq TMP_A, STATE1_A // DCHG 301 + pshufd $0x1B, STATE0_A, STATE0_A // ABEF 302 + pshufd $0xB1, STATE1_A, STATE1_A // CDGH 303 + 304 + // Load ctx->bytecount. Take the mod 64 of it to get the number of 305 + // bytes that are buffered in ctx->buf. Also save it in a register with 306 + // LEN added to it. 307 + mov LEN, LEN /* 32-bit move of %ecx to itself: clears the upper half of %rcx so that LEN64 can be used in the lea below */ 308 + mov OFFSETOF_BYTECOUNT(CTX), %rbx 309 + lea (%rbx, LEN64, 1), COUNT 310 + and $63, %ebx /* 32-bit operation, so all of %rbx now holds the buffered byte count and can be used as an index */ 311 + jz .Lfinup2x_enter_loop // No bytes buffered? 312 + 313 + // %ebx bytes (1 to 63) are currently buffered in ctx->buf. Load them 314 + // followed by the first 64 - %ebx bytes of data. Since LEN >= 64, we 315 + // just load 64 bytes from each of ctx->buf, DATA1, and DATA2 316 + // unconditionally and rearrange the data as needed. 
317 + /* Copy the 64 bytes of ctx->buf to the bottom of the stack area. */ 318 + movdqu OFFSETOF_BUF+0*16(CTX), MSG0_A 319 + movdqu OFFSETOF_BUF+1*16(CTX), MSG1_A 320 + movdqu OFFSETOF_BUF+2*16(CTX), MSG2_A 321 + movdqu OFFSETOF_BUF+3*16(CTX), MSG3_A 322 + movdqa MSG0_A, 0*16(%rsp) 323 + movdqa MSG1_A, 1*16(%rsp) 324 + movdqa MSG2_A, 2*16(%rsp) 325 + movdqa MSG3_A, 3*16(%rsp) 326 + /* First message: overlay the first 64 bytes of data1 at offset %rbx, then reload the first 64 bytes of the stack area as the first block. */ 327 + movdqu 0*16(DATA1), MSG0_A 328 + movdqu 1*16(DATA1), MSG1_A 329 + movdqu 2*16(DATA1), MSG2_A 330 + movdqu 3*16(DATA1), MSG3_A 331 + movdqu MSG0_A, 0*16(%rsp,%rbx) 332 + movdqu MSG1_A, 1*16(%rsp,%rbx) 333 + movdqu MSG2_A, 2*16(%rsp,%rbx) 334 + movdqu MSG3_A, 3*16(%rsp,%rbx) 335 + movdqa 0*16(%rsp), MSG0_A 336 + movdqa 1*16(%rsp), MSG1_A 337 + movdqa 2*16(%rsp), MSG2_A 338 + movdqa 3*16(%rsp), MSG3_A 339 + /* Second message: same steps with data2; the buffered bytes below offset %rbx were not overwritten by the stores above and are reused. */ 340 + movdqu 0*16(DATA2), MSG0_B 341 + movdqu 1*16(DATA2), MSG1_B 342 + movdqu 2*16(DATA2), MSG2_B 343 + movdqu 3*16(DATA2), MSG3_B 344 + movdqu MSG0_B, 0*16(%rsp,%rbx) 345 + movdqu MSG1_B, 1*16(%rsp,%rbx) 346 + movdqu MSG2_B, 2*16(%rsp,%rbx) 347 + movdqu MSG3_B, 3*16(%rsp,%rbx) 348 + movdqa 0*16(%rsp), MSG0_B 349 + movdqa 1*16(%rsp), MSG1_B 350 + movdqa 2*16(%rsp), MSG2_B 351 + movdqa 3*16(%rsp), MSG3_B 352 + 353 + sub $64, %rbx // rbx = buffered - 64 354 + sub %rbx, DATA1 // DATA1 += 64 - buffered 355 + sub %rbx, DATA2 // DATA2 += 64 - buffered 356 + add %ebx, LEN // LEN += buffered - 64 357 + movdqa STATE0_A, STATE0_B 358 + movdqa STATE1_A, STATE1_B /* both messages start from the same state loaded from ctx */ 359 + jmp .Lfinup2x_loop_have_data 360 + 361 + .Lfinup2x_enter_loop: 362 + sub $64, LEN /* from here on LEN = number of data bytes that will be left once the block about to be processed is consumed */ 363 + movdqa STATE0_A, STATE0_B 364 + movdqa STATE1_A, STATE1_B 365 + .Lfinup2x_loop: 366 + // Load the next two data blocks. 
367 + movdqu 0*16(DATA1), MSG0_A 368 + movdqu 0*16(DATA2), MSG0_B 369 + movdqu 1*16(DATA1), MSG1_A 370 + movdqu 1*16(DATA2), MSG1_B 371 + movdqu 2*16(DATA1), MSG2_A 372 + movdqu 2*16(DATA2), MSG2_B 373 + movdqu 3*16(DATA1), MSG3_A 374 + movdqu 3*16(DATA2), MSG3_B 375 + add $64, DATA1 376 + add $64, DATA2 377 + .Lfinup2x_loop_have_data: 378 + // Convert the words of the data blocks from big endian. 379 + pshufb SHUF_MASK, MSG0_A 380 + pshufb SHUF_MASK, MSG0_B 381 + pshufb SHUF_MASK, MSG1_A 382 + pshufb SHUF_MASK, MSG1_B 383 + pshufb SHUF_MASK, MSG2_A 384 + pshufb SHUF_MASK, MSG2_B 385 + pshufb SHUF_MASK, MSG3_A 386 + pshufb SHUF_MASK, MSG3_B 387 + .Lfinup2x_loop_have_bswapped_data: /* entry point used by the padding-only blocks built further down, whose words are prepared already swapped */ 388 + 389 + // Save the original state for each block. 390 + movdqa STATE0_A, 0*16(%rsp) 391 + movdqa STATE0_B, 1*16(%rsp) 392 + movdqa STATE1_A, 2*16(%rsp) 393 + movdqa STATE1_B, 3*16(%rsp) 394 + 395 + // Do the SHA-256 rounds on each block. /* 4 .irp iterations x 4 macro calls x 4 rounds = 64 rounds; the MSG registers rotate by one position per call as the macro requires */ 396 + .irp i, 0, 16, 32, 48 397 + do_4rounds_2x (\i + 0), MSG0_A, MSG1_A, MSG2_A, MSG3_A, \ 398 + MSG0_B, MSG1_B, MSG2_B, MSG3_B 399 + do_4rounds_2x (\i + 4), MSG1_A, MSG2_A, MSG3_A, MSG0_A, \ 400 + MSG1_B, MSG2_B, MSG3_B, MSG0_B 401 + do_4rounds_2x (\i + 8), MSG2_A, MSG3_A, MSG0_A, MSG1_A, \ 402 + MSG2_B, MSG3_B, MSG0_B, MSG1_B 403 + do_4rounds_2x (\i + 12), MSG3_A, MSG0_A, MSG1_A, MSG2_A, \ 404 + MSG3_B, MSG0_B, MSG1_B, MSG2_B 405 + .endr 406 + 407 + // Add the original state for each block. 408 + paddd 0*16(%rsp), STATE0_A 409 + paddd 1*16(%rsp), STATE0_B 410 + paddd 2*16(%rsp), STATE1_A 411 + paddd 3*16(%rsp), STATE1_B 412 + 413 + // Update LEN and loop back if more blocks remain. 414 + sub $64, LEN /* a non-negative result means another full 64-byte block of data is available */ 415 + jge .Lfinup2x_loop 416 + 417 + // Check if any final blocks need to be handled. 
418 + // FINAL_STEP = 2: all done 419 + // FINAL_STEP = 1: need to do count-only padding block 420 + // FINAL_STEP = 0: need to do the block with 0x80 padding byte 421 + cmp $1, FINAL_STEP 422 + jg .Lfinup2x_done 423 + je .Lfinup2x_finalize_countonly 424 + add $64, LEN /* undo the last "sub $64": LEN = number of leftover data bytes, 0 to 63 */ 425 + jz .Lfinup2x_finalize_blockaligned 426 + 427 + // Not block-aligned; 1 <= LEN <= 63 data bytes remain. Pad the block. 428 + // To do this, write the padding starting with the 0x80 byte to 429 + // &sp[64]. Then for each message, copy the last 64 data bytes to sp 430 + // and load from &sp[64 - LEN] to get the needed padding block. This 431 + // code relies on the data buffers being >= 64 bytes in length. 432 + mov $64, %ebx 433 + sub LEN, %ebx // ebx = 64 - LEN 434 + sub %rbx, DATA1 // DATA1 -= 64 - LEN 435 + sub %rbx, DATA2 // DATA2 -= 64 - LEN 436 + mov $0x80, FINAL_STEP // using FINAL_STEP as a temporary 437 + movd FINAL_STEP, MSG0_A 438 + pxor MSG1_A, MSG1_A 439 + movdqa MSG0_A, 4*16(%rsp) 440 + movdqa MSG1_A, 5*16(%rsp) 441 + movdqa MSG1_A, 6*16(%rsp) 442 + movdqa MSG1_A, 7*16(%rsp) /* sp[64..127] now holds 0x80 followed by 63 zero bytes */ 443 + cmp $56, LEN 444 + jge 1f // will COUNT spill into its own block? 
445 + shl $3, COUNT 446 + bswap COUNT /* COUNT = total message length in bits, byte-swapped to big endian */ 447 + mov COUNT, 56(%rsp,%rbx) /* offset 56 from (%rsp,%rbx): the last 8 bytes of the 64-byte block that is loaded from (%rsp,%rbx) below */ 448 + mov $2, FINAL_STEP // won't need count-only block 449 + jmp 2f 450 + 1: 451 + mov $1, FINAL_STEP // will need count-only block 452 + 2: 453 + movdqu 0*16(DATA1), MSG0_A 454 + movdqu 1*16(DATA1), MSG1_A 455 + movdqu 2*16(DATA1), MSG2_A 456 + movdqu 3*16(DATA1), MSG3_A 457 + movdqa MSG0_A, 0*16(%rsp) 458 + movdqa MSG1_A, 1*16(%rsp) 459 + movdqa MSG2_A, 2*16(%rsp) 460 + movdqa MSG3_A, 3*16(%rsp) 461 + movdqu 0*16(%rsp,%rbx), MSG0_A 462 + movdqu 1*16(%rsp,%rbx), MSG1_A 463 + movdqu 2*16(%rsp,%rbx), MSG2_A 464 + movdqu 3*16(%rsp,%rbx), MSG3_A 465 + 466 + movdqu 0*16(DATA2), MSG0_B 467 + movdqu 1*16(DATA2), MSG1_B 468 + movdqu 2*16(DATA2), MSG2_B 469 + movdqu 3*16(DATA2), MSG3_B 470 + movdqa MSG0_B, 0*16(%rsp) 471 + movdqa MSG1_B, 1*16(%rsp) 472 + movdqa MSG2_B, 2*16(%rsp) 473 + movdqa MSG3_B, 3*16(%rsp) 474 + movdqu 0*16(%rsp,%rbx), MSG0_B 475 + movdqu 1*16(%rsp,%rbx), MSG1_B 476 + movdqu 2*16(%rsp,%rbx), MSG2_B 477 + movdqu 3*16(%rsp,%rbx), MSG3_B 478 + jmp .Lfinup2x_loop_have_data 479 + 480 + // Prepare a padding block, either: 481 + // 482 + // {0x80, 0, 0, 0, ..., count (as __be64)} 483 + // This is for a block aligned message. 484 + // 485 + // { 0, 0, 0, 0, ..., count (as __be64)} 486 + // This is for a message whose length mod 64 is >= 56. 487 + // 488 + // Pre-swap the endianness of the words. 489 + .Lfinup2x_finalize_countonly: 490 + pxor MSG0_A, MSG0_A 491 + jmp 1f 492 + 493 + .Lfinup2x_finalize_blockaligned: 494 + mov $0x80000000, %ebx 495 + movd %ebx, MSG0_A /* first word = 0x80000000, i.e. the 0x80 padding byte in already-swapped word form */ 496 + 1: 497 + pxor MSG1_A, MSG1_A 498 + pxor MSG2_A, MSG2_A 499 + ror $29, COUNT /* rotate right by 29 == rotate left by 35: the byte count times 8 (valid while its top 3 bits are clear) with the two 32-bit halves exchanged, matching the pre-swapped word layout */ 500 + movq COUNT, MSG3_A 501 + pslldq $8, MSG3_A /* shift it into the upper 8 bytes of MSG3_A, the last two words of the block */ 502 + movdqa MSG0_A, MSG0_B 503 + pxor MSG1_B, MSG1_B 504 + pxor MSG2_B, MSG2_B 505 + movdqa MSG3_A, MSG3_B 506 + mov $2, FINAL_STEP 507 + jmp .Lfinup2x_loop_have_bswapped_data 508 + 509 + .Lfinup2x_done: 510 + // Write the two digests with all bytes in the correct order. 
511 + movdqa STATE0_A, TMP_A 512 + movdqa STATE0_B, TMP_B 513 + punpcklqdq STATE1_A, STATE0_A // GHEF 514 + punpcklqdq STATE1_B, STATE0_B 515 + punpckhqdq TMP_A, STATE1_A // ABCD 516 + punpckhqdq TMP_B, STATE1_B 517 + pshufd $0xB1, STATE0_A, STATE0_A // HGFE 518 + pshufd $0xB1, STATE0_B, STATE0_B 519 + pshufd $0x1B, STATE1_A, STATE1_A // DCBA 520 + pshufd $0x1B, STATE1_B, STATE1_B /* next: byte-swap each 32-bit word with the same mask that was used on the input blocks */ 521 + pshufb SHUF_MASK, STATE0_A 522 + pshufb SHUF_MASK, STATE0_B 523 + pshufb SHUF_MASK, STATE1_A 524 + pshufb SHUF_MASK, STATE1_B 525 + movdqu STATE0_A, 1*16(OUT1) 526 + movdqu STATE0_B, 1*16(OUT2) 527 + movdqu STATE1_A, 0*16(OUT1) 528 + movdqu STATE1_B, 0*16(OUT2) 529 + /* Release the stack area and restore the saved %rbp and %rbx. */ 530 + mov %rbp, %rsp 531 + pop %rbp 532 + pop %rbx 533 + RET 534 + SYM_FUNC_END(sha256_ni_finup2x) 535 + 168 536 .section .rodata.cst256.K256, "aM", @progbits, 256 169 537 .align 64 170 538 K256:
+39
lib/crypto/x86/sha256.h
··· 7 7 #include <asm/fpu/api.h> 8 8 #include <linux/static_call.h> 9 9 /* Static key that sha256_mod_init_arch() enables when the boot CPU has X86_FEATURE_SHA_NI; it gates the 2-way SHA-NI code below. */ 10 + static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_sha_ni); 11 + 10 12 DEFINE_STATIC_CALL(sha256_blocks_x86, sha256_blocks_generic); 11 13 12 14 #define DEFINE_X86_SHA256_FN(c_fn, asm_fn) \ ··· 37 35 static_call(sha256_blocks_x86)(state, data, nblocks); 38 36 } 39 37 /* These offsets are hard-coded as OFFSETOF_STATE, OFFSETOF_BYTECOUNT and OFFSETOF_BUF in sha256-ni-asm.S; fail the build if struct __sha256_ctx stops matching them. */ 38 + static_assert(offsetof(struct __sha256_ctx, state) == 0); 39 + static_assert(offsetof(struct __sha256_ctx, bytecount) == 32); 40 + static_assert(offsetof(struct __sha256_ctx, buf) == 40); 41 + asmlinkage void sha256_ni_finup2x(const struct __sha256_ctx *ctx, 42 + const u8 *data1, const u8 *data2, int len, 43 + u8 out1[SHA256_DIGEST_SIZE], 44 + u8 out2[SHA256_DIGEST_SIZE]); 45 + /* Attempt the SHA-NI 2-way hash of data1 and data2 (both len bytes long, starting from ctx). Returns true after the assembly has run and written out1 and out2. Returns false, without calling the assembly, when the have_sha_ni key is off, len is outside [SHA256_BLOCK_SIZE, 65536], or irq_fpu_usable() is false. NOTE(review): the self-referential #define presumably lets the including code test with #ifdef whether this arch hook exists - confirm against the includer. */ 46 + #define sha256_finup_2x_arch sha256_finup_2x_arch 47 + static bool sha256_finup_2x_arch(const struct __sha256_ctx *ctx, 48 + const u8 *data1, const u8 *data2, size_t len, 49 + u8 out1[SHA256_DIGEST_SIZE], 50 + u8 out2[SHA256_DIGEST_SIZE]) 51 + { 52 + /* 53 + * The assembly requires len >= SHA256_BLOCK_SIZE && len <= INT_MAX. 54 + * Further limit len to 65536 to avoid spending too long with preemption 55 + * disabled. (Of course, in practice len is nearly always 4096 anyway.) 
56 + */ 57 + if (static_branch_likely(&have_sha_ni) && len >= SHA256_BLOCK_SIZE && 58 + len <= 65536 && likely(irq_fpu_usable())) { 59 + kernel_fpu_begin(); 60 + sha256_ni_finup2x(ctx, data1, data2, len, out1, out2); /* len is at most 65536 here, so passing the size_t as the assembly's int parameter loses nothing */ 61 + kernel_fpu_end(); /* The digests were written by assembly code; mark them as initialized for KMSAN (presumably because KMSAN cannot observe stores made by the assembly - confirm). */ 62 + kmsan_unpoison_memory(out1, SHA256_DIGEST_SIZE); 63 + kmsan_unpoison_memory(out2, SHA256_DIGEST_SIZE); 64 + return true; 65 + } 66 + return false; 67 + } 68 + /* Returns whether the have_sha_ni static key is enabled, which is the first condition sha256_finup_2x_arch() checks before using the assembly. */ 69 + static bool sha256_finup_2x_is_optimized_arch(void) 70 + { 71 + return static_key_enabled(&have_sha_ni); 72 + } 73 + 40 74 #define sha256_mod_init_arch sha256_mod_init_arch 41 75 static void sha256_mod_init_arch(void) 42 76 { 43 77 if (boot_cpu_has(X86_FEATURE_SHA_NI)) { 44 78 static_call_update(sha256_blocks_x86, sha256_blocks_ni); 79 + static_branch_enable(&have_sha_ni); 45 80 } else if (cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, 46 81 NULL) && 47 82 boot_cpu_has(X86_FEATURE_AVX)) {