Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git

Merge branches 'sched-urgent-for-linus', 'perf-urgent-for-linus' and 'x86-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

* 'sched-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
sched/accounting, proc: Fix /proc/stat interrupts sum

* 'perf-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
tracepoints/module: Fix disabling tracepoints with taint CRAP or OOT
x86/kprobes: Add arch/x86/tools/insn_sanity to .gitignore
x86/kprobes: Fix typo transferred from Intel manual

* 'x86-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
x86, syscall: Need __ARCH_WANT_SYS_IPC for 32 bits
x86, tsc: Fix SMI induced variation in quick_pit_calibrate()
x86, opcode: ANDN and Group 17 in x86-opcode-map.txt
x86/kconfig: Move the ZONE_DMA entry under a menu
x86/UV2: Add accounting for BAU strong nacks
x86/UV2: Ack BAU interrupt earlier
x86/UV2: Remove stale no-resources test for UV2 BAU
x86/UV2: Work around BAU bug
x86/UV2: Fix BAU destination timeout initialization
x86/UV2: Fix new UV2 hardware by using native UV2 broadcast mode
x86: Get rid of dubious one-bit signed bitfield

+444 -104
+1
arch/x86/.gitignore
···
 boot/compressed/vmlinux
 tools/test_get_len
+tools/insn_sanity
···
+10 -10
arch/x86/Kconfig
···
 config MMU
 	def_bool y
 
-config ZONE_DMA
-	bool "DMA memory allocation support" if EXPERT
-	default y
-	help
-	  DMA memory allocation support allows devices with less than 32-bit
-	  addressing to allocate within the first 16MB of address space.
-	  Disable if no such devices will be used.
-
-	  If unsure, say Y.
-
 config SBUS
 	bool
···
 source "kernel/Kconfig.freezer"
 
 menu "Processor type and features"
+
+config ZONE_DMA
+	bool "DMA memory allocation support" if EXPERT
+	default y
+	help
+	  DMA memory allocation support allows devices with less than 32-bit
+	  addressing to allocate within the first 16MB of address space.
+	  Disable if no such devices will be used.
+
+	  If unsure, say Y.
 
 source "kernel/time/Kconfig"
···
+1
arch/x86/include/asm/unistd.h
···
 # include <asm/unistd_32.h>
 # define __ARCH_WANT_IPC_PARSE_VERSION
 # define __ARCH_WANT_STAT64
+# define __ARCH_WANT_SYS_IPC
 # define __ARCH_WANT_SYS_OLD_MMAP
 # define __ARCH_WANT_SYS_OLD_SELECT
···
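
Background for the one-line fix: on 32-bit x86 the C library funnels the System V IPC calls through the single ipc(2) multiplexer, which the kernel only builds when __ARCH_WANT_SYS_IPC is defined. A hypothetical user-space probe (not part of the patch) that shows the failure mode on an affected kernel:

/* ipc_probe.c - hypothetical check; on a kernel missing sys_ipc,
 * shmget() fails with ENOSYS instead of creating a segment. */
#include <stdio.h>
#include <sys/ipc.h>
#include <sys/shm.h>

int main(void)
{
	int id = shmget(IPC_PRIVATE, 4096, IPC_CREAT | 0600);

	if (id < 0) {
		perror("shmget");	/* "Function not implemented" when ipc(2) is absent */
		return 1;
	}
	shmctl(id, IPC_RMID, NULL);	/* clean up the test segment */
	return 0;
}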
+99 -8
arch/x86/include/asm/uv/uv_bau.h
···
  * UV2: Bit 19 selects between
  * (0): 10 microsecond timebase and
  * (1): 80 microseconds
- * we're using 655us, similar to UV1: 65 units of 10us
+ * we're using 560us, similar to UV1: 65 units of 10us
  */
 #define UV1_INTD_SOFT_ACK_TIMEOUT_PERIOD (9UL)
 #define UV2_INTD_SOFT_ACK_TIMEOUT_PERIOD (15UL)
···
 #define FLUSH_RETRY_TIMEOUT	2
 #define FLUSH_GIVEUP		3
 #define FLUSH_COMPLETE		4
+#define FLUSH_RETRY_BUSYBUG	5
 
 /*
  * tuning the action when the numalink network is extremely delayed
···
 /*
- * Message header: 16 bytes (128 bits) (bytes 0x30-0x3f of descriptor)
+ * UV1 Message header: 16 bytes (128 bits) (bytes 0x30-0x3f of descriptor)
  * see table 4.2.3.0.1 in broacast_assist spec.
  */
-struct bau_msg_header {
+struct uv1_bau_msg_header {
 	unsigned int dest_subnodeid:6;	/* must be 0x10, for the LB */
 	/* bits 5:0 */
 	unsigned int base_dest_nasid:15; /* nasid of the first bit */
···
 };
 
 /*
+ * UV2 Message header: 16 bytes (128 bits) (bytes 0x30-0x3f of descriptor)
+ * see figure 9-2 of harp_sys.pdf
+ */
+struct uv2_bau_msg_header {
+	unsigned int base_dest_nasid:15; /* nasid of the first bit */
+	/* bits 14:0 */			 /* in uvhub map */
+	unsigned int dest_subnodeid:5;	 /* must be 0x10, for the LB */
+	/* bits 19:15 */
+	unsigned int rsvd_1:1;		 /* must be zero */
+	/* bit 20 */
+	/* Address bits 59:21 */
+	/* bits 25:2 of address (44:21) are payload */
+	/* these next 24 bits become bytes 12-14 of msg */
+	/* bits 28:21 land in byte 12 */
+	unsigned int replied_to:1;	 /* sent as 0 by the source to
+					    byte 12 */
+	/* bit 21 */
+	unsigned int msg_type:3;	 /* software type of the
+					    message */
+	/* bits 24:22 */
+	unsigned int canceled:1;	 /* message canceled, resource
+					    is to be freed*/
+	/* bit 25 */
+	unsigned int payload_1:3;	 /* not currently used */
+	/* bits 28:26 */
+
+	/* bits 36:29 land in byte 13 */
+	unsigned int payload_2a:3;	 /* not currently used */
+	unsigned int payload_2b:5;	 /* not currently used */
+	/* bits 36:29 */
+
+	/* bits 44:37 land in byte 14 */
+	unsigned int payload_3:8;	 /* not currently used */
+	/* bits 44:37 */
+
+	unsigned int rsvd_2:7;		 /* reserved */
+	/* bits 51:45 */
+	unsigned int swack_flag:1;	 /* software acknowledge flag */
+	/* bit 52 */
+	unsigned int rsvd_3a:3;		 /* must be zero */
+	unsigned int rsvd_3b:8;		 /* must be zero */
+	unsigned int rsvd_3c:8;		 /* must be zero */
+	unsigned int rsvd_3d:3;		 /* must be zero */
+	/* bits 74:53 */
+	unsigned int fairness:3;	 /* usually zero */
+	/* bits 77:75 */
+
+	unsigned int sequence:16;	 /* message sequence number */
+	/* bits 93:78  Suppl_A */
+	unsigned int chaining:1;	 /* next descriptor is part of
+					    this activation*/
+	/* bit 94 */
+	unsigned int multilevel:1;	 /* multi-level multicast
+					    format */
+	/* bit 95 */
+	unsigned int rsvd_4:24;		 /* ordered / source node /
+					    source subnode / aging
+					    must be zero */
+	/* bits 119:96 */
+	unsigned int command:8;		 /* message type */
+	/* bits 127:120 */
+};
+
+/*
  * The activation descriptor:
  * The format of the message to send, plus all accompanying control
  * Should be 64 bytes
  */
 struct bau_desc {
-	struct pnmask distribution;
+	struct pnmask			distribution;
 	/*
 	 * message template, consisting of header and payload:
 	 */
-	struct bau_msg_header header;
-	struct bau_msg_payload payload;
+	union bau_msg_header {
+		struct uv1_bau_msg_header	uv1_hdr;
+		struct uv2_bau_msg_header	uv2_hdr;
+	} header;
+
+	struct bau_msg_payload			payload;
 };
-/*
+/* UV1:
  *   -payload--    ---------header------
  *   bytes 0-11    bits 41-56  bits 58-81
+ *       A           B  (2)      C (3)
+ *
+ *            A/B/C are moved to:
+ *       A            C          B
+ *   bytes 0-11  bytes 12-14  bytes 16-17  (byte 15 filled in by hw as vector)
+ *   ------------payload queue-----------
+ */
+/* UV2:
+ *   -payload--    ---------header------
+ *   bytes 0-11    bits 70-78  bits 21-44
  *       A           B  (2)      C (3)
  *
···
 struct msg_desc {
 	struct bau_pq_entry	*msg;
 	int			msg_slot;
-	int			swack_slot;
 	struct bau_pq_entry	*queue_first;
 	struct bau_pq_entry	*queue_last;
 };
···
 						requests */
 	unsigned long	s_stimeout;		/* source side timeouts */
 	unsigned long	s_dtimeout;		/* destination side timeouts */
+	unsigned long	s_strongnacks;		/* number of strong nack's */
 	unsigned long	s_time;			/* time spent in sending side */
 	unsigned long	s_retriesok;		/* successful retries */
 	unsigned long	s_ntargcpu;		/* total number of cpu's
···
 	unsigned long	s_retry_messages;	/* retry broadcasts */
 	unsigned long	s_bau_reenabled;	/* for bau enable/disable */
 	unsigned long	s_bau_disabled;		/* for bau enable/disable */
+	unsigned long	s_uv2_wars;		/* uv2 workaround, perm. busy */
+	unsigned long	s_uv2_wars_hw;		/* uv2 workaround, hiwater */
+	unsigned long	s_uv2_war_waits;	/* uv2 workaround, long waits */
 	/* destination statistics */
 	unsigned long	d_alltlb;		/* times all tlb's on this
 						   cpu were flushed */
···
 	short			osnode;
 	short			uvhub_cpu;
 	short			uvhub;
+	short			uvhub_version;
 	short			cpus_in_socket;
 	short			cpus_in_uvhub;
 	short			partition_base_pnode;
+	short			using_desc;	/* an index, like uvhub_cpu */
+	unsigned int		inuse_map;
 	unsigned short		message_number;
 	unsigned short		uvhub_quiesce;
 	short			socket_acknowledge_count[DEST_Q_SIZE];
···
 	int			cong_response_us;
 	int			cong_reps;
 	int			cong_period;
+	unsigned long		clocks_per_100_usec;
 	cycles_t		period_time;
 	long			period_requests;
 	struct hub_and_pnode	*thp;
···
 static inline void write_mmr_sw_ack(unsigned long mr)
 {
 	uv_write_local_mmr(UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS, mr);
+}
+
+static inline void write_gmmr_sw_ack(int pnode, unsigned long mr)
+{
+	write_gmmr(pnode, UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS, mr);
 }
 
 static inline unsigned long read_mmr_sw_ack(void)
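
A note on why the new union is safe (an editor's sketch, not part of the patch): both generation-specific headers are documented above as 16 bytes (128 bits), so overlaying them adds no size. A compile-time check one could keep next to the definitions, assuming a C11 compiler with GCC-style x86 bitfield packing (in-kernel code would use BUILD_BUG_ON instead):

/* Sketch: the union bau_msg_header overlay only works if both headers
 * really pack to 128 bits; a bitfield slip would silently shift every
 * later field. Hypothetical check, assumes uv_bau.h is in scope. */
_Static_assert(sizeof(struct uv1_bau_msg_header) == 16,
	       "UV1 BAU header must be 16 bytes (128 bits)");
_Static_assert(sizeof(struct uv2_bau_msg_header) == 16,
	       "UV2 BAU header must be 16 bytes (128 bits)");
_Static_assert(sizeof(union bau_msg_header) == 16,
	       "the union must not pad beyond either header");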
+6 -8
arch/x86/kernel/tsc.c
···
 static inline int pit_expect_msb(unsigned char val, u64 *tscp, unsigned long *deltap)
 {
 	int count;
-	u64 tsc = 0;
+	u64 tsc = 0, prev_tsc = 0;
 
 	for (count = 0; count < 50000; count++) {
 		if (!pit_verify_msb(val))
 			break;
+		prev_tsc = tsc;
 		tsc = get_cycles();
 	}
-	*deltap = get_cycles() - tsc;
+	*deltap = get_cycles() - prev_tsc;
 	*tscp = tsc;
 
 	/*
···
  * How many MSB values do we want to see? We aim for
  * a maximum error rate of 500ppm (in practice the
  * real error is much smaller), but refuse to spend
- * more than 25ms on it.
+ * more than 50ms on it.
  */
-#define MAX_QUICK_PIT_MS 25
+#define MAX_QUICK_PIT_MS 50
 #define MAX_QUICK_PIT_ITERATIONS (MAX_QUICK_PIT_MS * PIT_TICK_RATE / 1000 / 256)
 
 static unsigned long quick_pit_calibrate(void)
···
 	 *
 	 * As a result, we can depend on there not being
 	 * any odd delays anywhere, and the TSC reads are
-	 * reliable (within the error). We also adjust the
-	 * delta to the middle of the error bars, just
-	 * because it looks nicer.
+	 * reliable (within the error).
 	 *
 	 * kHz = ticks / time-in-seconds / 1000;
 	 * kHz = (t2 - t1) / (I * 256 / PIT_TICK_RATE) / 1000
 	 * kHz = ((t2 - t1) * PIT_TICK_RATE) / (I * 256 * 1000)
 	 */
-	delta += (long)(d2 - d1)/2;
 	delta *= PIT_TICK_RATE;
 	do_div(delta, i*256*1000);
 	printk("Fast TSC calibration using PIT\n");
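
For scale, a sketch of what the relaxed bound allows (assumes the canonical 1.193182 MHz i8253 PIT clock; the constant itself is not shown in this hunk):

/* Each successful pit_expect_msb() round spans one 256-tick rollover of
 * the PIT countdown, so the iteration cap is just the number of
 * rollovers that fit in MAX_QUICK_PIT_MS. Assumes the canonical
 * PIT_TICK_RATE; not part of the diff. */
#include <stdio.h>

#define PIT_TICK_RATE		1193182UL	/* i8253 input clock, Hz */
#define MAX_QUICK_PIT_MS	50
#define MAX_QUICK_PIT_ITERATIONS (MAX_QUICK_PIT_MS * PIT_TICK_RATE / 1000 / 256)

int main(void)
{
	/* 50 * 1193182 / 1000 / 256 = 233 MSB rollovers available */
	printf("max iterations: %lu\n", MAX_QUICK_PIT_ITERATIONS);
	return 0;
}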
+5 -3
arch/x86/lib/x86-opcode-map.txt
···
 ac: LODS/B AL,Xb
 ad: LODS/W/D/Q rAX,Xv
 ae: SCAS/B AL,Yb
-af: SCAS/W/D/Q rAX,Xv
+# Note: The May 2011 Intel manual shows Xv for the second parameter of the
+# next instruction but Yv is correct
+af: SCAS/W/D/Q rAX,Yv
 # 0xb0 - 0xbf
 b0: MOV AL/R8L,Ib
 b1: MOV CL/R9L,Ib
···
 df: VAESDECLAST Vdq,Hdq,Wdq (66),(v1)
 f0: MOVBE Gy,My | MOVBE Gw,Mw (66) | CRC32 Gd,Eb (F2)
 f1: MOVBE My,Gy | MOVBE Mw,Gw (66) | CRC32 Gd,Ey (F2)
-f3: ANDN Gy,By,Ey (v)
-f4: Grp17 (1A)
+f2: ANDN Gy,By,Ey (v)
+f3: Grp17 (1A)
 f5: BZHI Gy,Ey,By (v) | PEXT Gy,By,Ey (F3),(v) | PDEP Gy,By,Ey (F2),(v)
 f6: MULX By,Gy,rDX,Ey (F2),(v)
 f7: BEXTR Gy,Ey,By (v) | SHLX Gy,Ey,By (66),(v) | SARX Gy,Ey,By (F3),(v) | SHRX Gy,Ey,By (F2),(v)
+316 -72
arch/x86/platform/uv/tlb_uv.c
···
  * clear of the Timeout bit (as well) will free the resource. No reply will
  * be sent (the hardware will only do one reply per message).
  */
-static void reply_to_message(struct msg_desc *mdp, struct bau_control *bcp)
+static void reply_to_message(struct msg_desc *mdp, struct bau_control *bcp,
+						int do_acknowledge)
 {
 	unsigned long dw;
 	struct bau_pq_entry *msg;
 
 	msg = mdp->msg;
-	if (!msg->canceled) {
+	if (!msg->canceled && do_acknowledge) {
 		dw = (msg->swack_vec << UV_SW_ACK_NPENDING) | msg->swack_vec;
 		write_mmr_sw_ack(dw);
 	}
···
 	if (mmr & (msg_res << UV_SW_ACK_NPENDING)) {
 		unsigned long mr;
 		/*
-		 * is the resource timed out?
-		 * make everyone ignore the cancelled message.
+		 * Is the resource timed out?
+		 * Make everyone ignore the cancelled message.
 		 */
 		msg2->canceled = 1;
 		stat->d_canceled++;
···
  * Do all the things a cpu should do for a TLB shootdown message.
  * Other cpu's may come here at the same time for this message.
  */
-static void bau_process_message(struct msg_desc *mdp,
-					struct bau_control *bcp)
+static void bau_process_message(struct msg_desc *mdp, struct bau_control *bcp,
+						int do_acknowledge)
 {
 	short socket_ack_count = 0;
 	short *sp;
···
 		if (msg_ack_count == bcp->cpus_in_uvhub) {
 			/*
 			 * All cpus in uvhub saw it; reply
+			 * (unless we are in the UV2 workaround)
 			 */
-			reply_to_message(mdp, bcp);
+			reply_to_message(mdp, bcp, do_acknowledge);
 		}
 	}
···
 /*
  * UV2 has an extra bit of status in the ACTIVATION_STATUS_2 register.
  */
-static unsigned long uv2_read_status(unsigned long offset, int rshft, int cpu)
+static unsigned long uv2_read_status(unsigned long offset, int rshft, int desc)
 {
 	unsigned long descriptor_status;
 	unsigned long descriptor_status2;
 
 	descriptor_status = ((read_lmmr(offset) >> rshft) & UV_ACT_STATUS_MASK);
-	descriptor_status2 = (read_mmr_uv2_status() >> cpu) & 0x1UL;
+	descriptor_status2 = (read_mmr_uv2_status() >> desc) & 0x1UL;
 	descriptor_status = (descriptor_status << 1) | descriptor_status2;
 	return descriptor_status;
+}
+
+/*
+ * Return whether the status of the descriptor that is normally used for this
+ * cpu (the one indexed by its hub-relative cpu number) is busy.
+ * The status of the original 32 descriptors is always reflected in the 64
+ * bits of UVH_LB_BAU_SB_ACTIVATION_STATUS_0.
+ * The bit provided by the activation_status_2 register is irrelevant to
+ * the status if it is only being tested for busy or not busy.
+ */
+int normal_busy(struct bau_control *bcp)
+{
+	int cpu = bcp->uvhub_cpu;
+	int mmr_offset;
+	int right_shift;
+
+	mmr_offset = UVH_LB_BAU_SB_ACTIVATION_STATUS_0;
+	right_shift = cpu * UV_ACT_STATUS_SIZE;
+	return (((((read_lmmr(mmr_offset) >> right_shift) &
+				UV_ACT_STATUS_MASK)) << 1) == UV2H_DESC_BUSY);
+}
+
+/*
+ * Entered when a bau descriptor has gone into a permanent busy wait because
+ * of a hardware bug.
+ * Workaround the bug.
+ */
+int handle_uv2_busy(struct bau_control *bcp)
+{
+	int busy_one = bcp->using_desc;
+	int normal = bcp->uvhub_cpu;
+	int selected = -1;
+	int i;
+	unsigned long descriptor_status;
+	unsigned long status;
+	int mmr_offset;
+	struct bau_desc *bau_desc_old;
+	struct bau_desc *bau_desc_new;
+	struct bau_control *hmaster = bcp->uvhub_master;
+	struct ptc_stats *stat = bcp->statp;
+	cycles_t ttm;
+
+	stat->s_uv2_wars++;
+	spin_lock(&hmaster->uvhub_lock);
+	/* try for the original first */
+	if (busy_one != normal) {
+		if (!normal_busy(bcp))
+			selected = normal;
+	}
+	if (selected < 0) {
+		/* can't use the normal, select an alternate */
+		mmr_offset = UVH_LB_BAU_SB_ACTIVATION_STATUS_1;
+		descriptor_status = read_lmmr(mmr_offset);
+
+		/* scan available descriptors 32-63 */
+		for (i = 0; i < UV_CPUS_PER_AS; i++) {
+			if ((hmaster->inuse_map & (1 << i)) == 0) {
+				status = ((descriptor_status >>
+						(i * UV_ACT_STATUS_SIZE)) &
+						UV_ACT_STATUS_MASK) << 1;
+				if (status != UV2H_DESC_BUSY) {
+					selected = i + UV_CPUS_PER_AS;
+					break;
+				}
+			}
+		}
+	}
+
+	if (busy_one != normal)
+		/* mark the busy alternate as not in-use */
+		hmaster->inuse_map &= ~(1 << (busy_one - UV_CPUS_PER_AS));
+
+	if (selected >= 0) {
+		/* switch to the selected descriptor */
+		if (selected != normal) {
+			/* set the selected alternate as in-use */
+			hmaster->inuse_map |=
+					(1 << (selected - UV_CPUS_PER_AS));
+			if (selected > stat->s_uv2_wars_hw)
+				stat->s_uv2_wars_hw = selected;
+		}
+		bau_desc_old = bcp->descriptor_base;
+		bau_desc_old += (ITEMS_PER_DESC * busy_one);
+		bcp->using_desc = selected;
+		bau_desc_new = bcp->descriptor_base;
+		bau_desc_new += (ITEMS_PER_DESC * selected);
+		*bau_desc_new = *bau_desc_old;
+	} else {
+		/*
+		 * All are busy. Wait for the normal one for this cpu to
+		 * free up.
+		 */
+		stat->s_uv2_war_waits++;
+		spin_unlock(&hmaster->uvhub_lock);
+		ttm = get_cycles();
+		do {
+			cpu_relax();
+		} while (normal_busy(bcp));
+		spin_lock(&hmaster->uvhub_lock);
+		/* switch to the original descriptor */
+		bcp->using_desc = normal;
+		bau_desc_old = bcp->descriptor_base;
+		bau_desc_old += (ITEMS_PER_DESC * bcp->using_desc);
+		bcp->using_desc = (ITEMS_PER_DESC * normal);
+		bau_desc_new = bcp->descriptor_base;
+		bau_desc_new += (ITEMS_PER_DESC * normal);
+		*bau_desc_new = *bau_desc_old; /* copy the entire descriptor */
+	}
+	spin_unlock(&hmaster->uvhub_lock);
+	return FLUSH_RETRY_BUSYBUG;
 }
 
 static int uv2_wait_completion(struct bau_desc *bau_desc,
···
 {
 	unsigned long descriptor_stat;
 	cycles_t ttm;
-	int cpu = bcp->uvhub_cpu;
+	int desc = bcp->using_desc;
+	long busy_reps = 0;
 	struct ptc_stats *stat = bcp->statp;
 
-	descriptor_stat = uv2_read_status(mmr_offset, right_shift, cpu);
+	descriptor_stat = uv2_read_status(mmr_offset, right_shift, desc);
 
 	/* spin on the status MMR, waiting for it to go idle */
 	while (descriptor_stat != UV2H_DESC_IDLE) {
···
 		 * our message and its state will stay IDLE.
 		 */
 		if ((descriptor_stat == UV2H_DESC_SOURCE_TIMEOUT) ||
-		    (descriptor_stat == UV2H_DESC_DEST_STRONG_NACK) ||
 		    (descriptor_stat == UV2H_DESC_DEST_PUT_ERR)) {
 			stat->s_stimeout++;
 			return FLUSH_GIVEUP;
+		} else if (descriptor_stat == UV2H_DESC_DEST_STRONG_NACK) {
+			stat->s_strongnacks++;
+			bcp->conseccompletes = 0;
+			return FLUSH_GIVEUP;
 		} else if (descriptor_stat == UV2H_DESC_DEST_TIMEOUT) {
 			stat->s_dtimeout++;
-			ttm = get_cycles();
-			/*
-			 * Our retries may be blocked by all destination
-			 * swack resources being consumed, and a timeout
-			 * pending. In that case hardware returns the
-			 * ERROR that looks like a destination timeout.
-			 */
-			if (cycles_2_us(ttm - bcp->send_message) < timeout_us) {
-				bcp->conseccompletes = 0;
-				return FLUSH_RETRY_PLUGGED;
-			}
 			bcp->conseccompletes = 0;
 			return FLUSH_RETRY_TIMEOUT;
 		} else {
+			busy_reps++;
+			if (busy_reps > 1000000) {
+				/* not to hammer on the clock */
+				busy_reps = 0;
+				ttm = get_cycles();
+				if ((ttm - bcp->send_message) >
+						(bcp->clocks_per_100_usec)) {
+					return handle_uv2_busy(bcp);
+				}
+			}
 			/*
 			 * descriptor_stat is still BUSY
 			 */
 			cpu_relax();
 		}
-		descriptor_stat = uv2_read_status(mmr_offset, right_shift, cpu);
+		descriptor_stat = uv2_read_status(mmr_offset, right_shift,
+								desc);
 	}
 	bcp->conseccompletes++;
 	return FLUSH_COMPLETE;
···
 {
 	int right_shift;
 	unsigned long mmr_offset;
-	int cpu = bcp->uvhub_cpu;
+	int desc = bcp->using_desc;
 
-	if (cpu < UV_CPUS_PER_AS) {
+	if (desc < UV_CPUS_PER_AS) {
 		mmr_offset = UVH_LB_BAU_SB_ACTIVATION_STATUS_0;
-		right_shift = cpu * UV_ACT_STATUS_SIZE;
+		right_shift = desc * UV_ACT_STATUS_SIZE;
 	} else {
 		mmr_offset = UVH_LB_BAU_SB_ACTIVATION_STATUS_1;
-		right_shift = ((cpu - UV_CPUS_PER_AS) * UV_ACT_STATUS_SIZE);
+		right_shift = ((desc - UV_CPUS_PER_AS) * UV_ACT_STATUS_SIZE);
 	}
 
-	if (is_uv1_hub())
+	if (bcp->uvhub_version == 1)
 		return uv1_wait_completion(bau_desc, mmr_offset, right_shift,
 								bcp, try);
 	else
···
  * Returns 1 if it gives up entirely and the original cpu mask is to be
  * returned to the kernel.
  */
-int uv_flush_send_and_wait(struct bau_desc *bau_desc,
-			struct cpumask *flush_mask, struct bau_control *bcp)
+int uv_flush_send_and_wait(struct cpumask *flush_mask, struct bau_control *bcp)
 {
 	int seq_number = 0;
 	int completion_stat = 0;
+	int uv1 = 0;
 	long try = 0;
 	unsigned long index;
 	cycles_t time1;
 	cycles_t time2;
 	struct ptc_stats *stat = bcp->statp;
 	struct bau_control *hmaster = bcp->uvhub_master;
+	struct uv1_bau_msg_header *uv1_hdr = NULL;
+	struct uv2_bau_msg_header *uv2_hdr = NULL;
+	struct bau_desc *bau_desc;
 
-	if (is_uv1_hub())
+	if (bcp->uvhub_version == 1)
 		uv1_throttle(hmaster, stat);
 
 	while (hmaster->uvhub_quiesce)
···
 
 	time1 = get_cycles();
 	do {
-		if (try == 0) {
-			bau_desc->header.msg_type = MSG_REGULAR;
+		bau_desc = bcp->descriptor_base;
+		bau_desc += (ITEMS_PER_DESC * bcp->using_desc);
+		if (bcp->uvhub_version == 1) {
+			uv1 = 1;
+			uv1_hdr = &bau_desc->header.uv1_hdr;
+		} else
+			uv2_hdr = &bau_desc->header.uv2_hdr;
+		if ((try == 0) || (completion_stat == FLUSH_RETRY_BUSYBUG)) {
+			if (uv1)
+				uv1_hdr->msg_type = MSG_REGULAR;
+			else
+				uv2_hdr->msg_type = MSG_REGULAR;
 			seq_number = bcp->message_number++;
 		} else {
-			bau_desc->header.msg_type = MSG_RETRY;
+			if (uv1)
+				uv1_hdr->msg_type = MSG_RETRY;
+			else
+				uv2_hdr->msg_type = MSG_RETRY;
 			stat->s_retry_messages++;
 		}
 
-		bau_desc->header.sequence = seq_number;
-		index = (1UL << AS_PUSH_SHIFT) | bcp->uvhub_cpu;
+		if (uv1)
+			uv1_hdr->sequence = seq_number;
+		else
+			uv2_hdr->sequence = seq_number;
+		index = (1UL << AS_PUSH_SHIFT) | bcp->using_desc;
 		bcp->send_message = get_cycles();
 
 		write_mmr_activation(index);
 
 		try++;
 		completion_stat = wait_completion(bau_desc, bcp, try);
+		/* UV2: wait_completion() may change the bcp->using_desc */
 
 		handle_cmplt(completion_stat, bau_desc, bcp, hmaster, stat);
···
 		}
 		cpu_relax();
 	} while ((completion_stat == FLUSH_RETRY_PLUGGED) ||
+			(completion_stat == FLUSH_RETRY_BUSYBUG) ||
 			(completion_stat == FLUSH_RETRY_TIMEOUT));
 
 	time2 = get_cycles();
···
 	record_send_stats(time1, time2, bcp, stat, completion_stat, try);
 
 	if (completion_stat == FLUSH_GIVEUP)
+		/* FLUSH_GIVEUP will fall back to using IPI's for tlb flush */
 		return 1;
 	return 0;
 }
···
 		stat->s_ntargself++;
 
 	bau_desc = bcp->descriptor_base;
-	bau_desc += ITEMS_PER_DESC * bcp->uvhub_cpu;
+	bau_desc += (ITEMS_PER_DESC * bcp->using_desc);
 	bau_uvhubs_clear(&bau_desc->distribution, UV_DISTRIBUTION_SIZE);
 	if (set_distrib_bits(flush_mask, bcp, bau_desc, &locals, &remotes))
 		return NULL;
···
 	 * uv_flush_send_and_wait returns 0 if all cpu's were messaged,
 	 * or 1 if it gave up and the original cpumask should be returned.
 	 */
-	if (!uv_flush_send_and_wait(bau_desc, flush_mask, bcp))
+	if (!uv_flush_send_and_wait(flush_mask, bcp))
 		return NULL;
 	else
 		return cpumask;
+}
+
+/*
+ * Search the message queue for any 'other' message with the same software
+ * acknowledge resource bit vector.
+ */
+struct bau_pq_entry *find_another_by_swack(struct bau_pq_entry *msg,
+			struct bau_control *bcp, unsigned char swack_vec)
+{
+	struct bau_pq_entry *msg_next = msg + 1;
+
+	if (msg_next > bcp->queue_last)
+		msg_next = bcp->queue_first;
+	while ((msg_next->swack_vec != 0) && (msg_next != msg)) {
+		if (msg_next->swack_vec == swack_vec)
+			return msg_next;
+		msg_next++;
+		if (msg_next > bcp->queue_last)
+			msg_next = bcp->queue_first;
+	}
+	return NULL;
+}
+
+/*
+ * UV2 needs to work around a bug in which an arriving message has not
+ * set a bit in the UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE register.
+ * Such a message must be ignored.
+ */
+void process_uv2_message(struct msg_desc *mdp, struct bau_control *bcp)
+{
+	unsigned long mmr_image;
+	unsigned char swack_vec;
+	struct bau_pq_entry *msg = mdp->msg;
+	struct bau_pq_entry *other_msg;
+
+	mmr_image = read_mmr_sw_ack();
+	swack_vec = msg->swack_vec;
+
+	if ((swack_vec & mmr_image) == 0) {
+		/*
+		 * This message was assigned a swack resource, but no
+		 * reserved acknowlegment is pending.
+		 * The bug has prevented this message from setting the MMR.
+		 * And no other message has used the same sw_ack resource.
+		 * Do the requested shootdown but do not reply to the msg.
+		 * (the 0 means make no acknowledge)
+		 */
+		bau_process_message(mdp, bcp, 0);
+		return;
+	}
+
+	/*
+	 * Some message has set the MMR 'pending' bit; it might have been
+	 * another message.  Look for that message.
+	 */
+	other_msg = find_another_by_swack(msg, bcp, msg->swack_vec);
+	if (other_msg) {
+		/* There is another.  Do not ack the current one. */
+		bau_process_message(mdp, bcp, 0);
+		/*
+		 * Let the natural processing of that message acknowledge
+		 * it. Don't get the processing of sw_ack's out of order.
+		 */
+		return;
+	}
+
+	/*
+	 * There is no other message using this sw_ack, so it is safe to
+	 * acknowledge it.
+	 */
+	bau_process_message(mdp, bcp, 1);
+
+	return;
 }
 
 /*
···
 	struct ptc_stats *stat;
 	struct msg_desc msgdesc;
 
+	ack_APIC_irq();
 	time_start = get_cycles();
 
 	bcp = &per_cpu(bau_control, smp_processor_id());
···
 		count++;
 
 		msgdesc.msg_slot = msg - msgdesc.queue_first;
-		msgdesc.swack_slot = ffs(msg->swack_vec) - 1;
 		msgdesc.msg = msg;
-		bau_process_message(&msgdesc, bcp);
+		if (bcp->uvhub_version == 2)
+			process_uv2_message(&msgdesc, bcp);
+		else
+			bau_process_message(&msgdesc, bcp, 1);
 
 		msg++;
 		if (msg > msgdesc.queue_last)
···
 		stat->d_nomsg++;
 	else if (count > 1)
 		stat->d_multmsg++;
-
-	ack_APIC_irq();
 }
 
 /*
···
 	 */
 	mmr_image |= (1L << SOFTACK_MSHIFT);
 	if (is_uv2_hub()) {
-		mmr_image |= (1L << UV2_LEG_SHFT);
+		mmr_image &= ~(1L << UV2_LEG_SHFT);
 		mmr_image |= (1L << UV2_EXT_SHFT);
 	}
 	write_mmr_misc_control(pnode, mmr_image);
···
 		seq_printf(file,
 			"remotehub numuvhubs numuvhubs16 numuvhubs8 ");
 		seq_printf(file,
-			"numuvhubs4 numuvhubs2 numuvhubs1 dto retries rok ");
+			"numuvhubs4 numuvhubs2 numuvhubs1 dto snacks retries rok ");
 		seq_printf(file,
 			"resetp resett giveup sto bz throt swack recv rtime ");
 		seq_printf(file,
 			"all one mult none retry canc nocan reset rcan ");
 		seq_printf(file,
-			"disable enable\n");
+			"disable enable wars warshw warwaits\n");
 	}
 	if (cpu < num_possible_cpus() && cpu_online(cpu)) {
 		stat = &per_cpu(ptcstats, cpu);
···
 			stat->s_ntargremotes, stat->s_ntargcpu,
 			stat->s_ntarglocaluvhub, stat->s_ntargremoteuvhub,
 			stat->s_ntarguvhub, stat->s_ntarguvhub16);
-		seq_printf(file, "%ld %ld %ld %ld %ld ",
+		seq_printf(file, "%ld %ld %ld %ld %ld %ld ",
 			stat->s_ntarguvhub8, stat->s_ntarguvhub4,
 			stat->s_ntarguvhub2, stat->s_ntarguvhub1,
-			stat->s_dtimeout);
+			stat->s_dtimeout, stat->s_strongnacks);
 		seq_printf(file, "%ld %ld %ld %ld %ld %ld %ld %ld ",
 			stat->s_retry_messages, stat->s_retriesok,
 			stat->s_resets_plug, stat->s_resets_timeout,
···
 			stat->d_nomsg, stat->d_retries, stat->d_canceled,
 			stat->d_nocanceled, stat->d_resets,
 			stat->d_rcanceled);
-		seq_printf(file, "%ld %ld\n",
-			stat->s_bau_disabled, stat->s_bau_reenabled);
+		seq_printf(file, "%ld %ld %ld %ld %ld\n",
+			stat->s_bau_disabled, stat->s_bau_reenabled,
+			stat->s_uv2_wars, stat->s_uv2_wars_hw,
+			stat->s_uv2_war_waits);
 	}
 	return 0;
 }
···
 {
 	int i;
 	int cpu;
+	int uv1 = 0;
 	unsigned long gpa;
 	unsigned long m;
 	unsigned long n;
 	size_t dsize;
 	struct bau_desc *bau_desc;
 	struct bau_desc *bd2;
+	struct uv1_bau_msg_header *uv1_hdr;
+	struct uv2_bau_msg_header *uv2_hdr;
 	struct bau_control *bcp;
 
 	/*
···
 	gpa = uv_gpa(bau_desc);
 	n = uv_gpa_to_gnode(gpa);
 	m = uv_gpa_to_offset(gpa);
+	if (is_uv1_hub())
+		uv1 = 1;
 
 	/* the 14-bit pnode */
 	write_mmr_descriptor_base(pnode, (n << UV_DESC_PSHIFT | m));
···
 	 */
 	for (i = 0, bd2 = bau_desc; i < (ADP_SZ * ITEMS_PER_DESC); i++, bd2++) {
 		memset(bd2, 0, sizeof(struct bau_desc));
-		bd2->header.swack_flag = 1;
-		/*
-		 * The base_dest_nasid set in the message header is the nasid
-		 * of the first uvhub in the partition. The bit map will
-		 * indicate destination pnode numbers relative to that base.
-		 * They may not be consecutive if nasid striding is being used.
-		 */
-		bd2->header.base_dest_nasid = UV_PNODE_TO_NASID(base_pnode);
-		bd2->header.dest_subnodeid = UV_LB_SUBNODEID;
-		bd2->header.command = UV_NET_ENDPOINT_INTD;
-		bd2->header.int_both = 1;
-		/*
-		 * all others need to be set to zero:
-		 * fairness chaining multilevel count replied_to
-		 */
+		if (uv1) {
+			uv1_hdr = &bd2->header.uv1_hdr;
+			uv1_hdr->swack_flag = 1;
+			/*
+			 * The base_dest_nasid set in the message header
+			 * is the nasid of the first uvhub in the partition.
+			 * The bit map will indicate destination pnode numbers
+			 * relative to that base. They may not be consecutive
+			 * if nasid striding is being used.
+			 */
+			uv1_hdr->base_dest_nasid =
+						UV_PNODE_TO_NASID(base_pnode);
+			uv1_hdr->dest_subnodeid = UV_LB_SUBNODEID;
+			uv1_hdr->command = UV_NET_ENDPOINT_INTD;
+			uv1_hdr->int_both = 1;
+			/*
+			 * all others need to be set to zero:
+			 * fairness chaining multilevel count replied_to
+			 */
+		} else {
+			uv2_hdr = &bd2->header.uv2_hdr;
+			uv2_hdr->swack_flag = 1;
+			uv2_hdr->base_dest_nasid =
+						UV_PNODE_TO_NASID(base_pnode);
+			uv2_hdr->dest_subnodeid = UV_LB_SUBNODEID;
+			uv2_hdr->command = UV_NET_ENDPOINT_INTD;
+		}
 	}
 	for_each_present_cpu(cpu) {
 		if (pnode != uv_blade_to_pnode(uv_cpu_to_blade_id(cpu)))
···
 	write_mmr_payload_first(pnode, pn_first);
 	write_mmr_payload_tail(pnode, first);
 	write_mmr_payload_last(pnode, last);
+	write_gmmr_sw_ack(pnode, 0xffffUL);
 
 	/* in effect, all msg_type's are set to MSG_NOOP */
 	memset(pqp, 0, sizeof(struct bau_pq_entry) * DEST_Q_SIZE);
···
 		ts_ns = base * mult1 * mult2;
 		ret = ts_ns / 1000;
 	} else {
-		/* 4 bits  0/1 for 10/80us, 3 bits of multiplier */
-		mmr_image = uv_read_local_mmr(UVH_AGING_PRESCALE_SEL);
+		/* 4 bits  0/1 for 10/80us base, 3 bits of multiplier */
+		mmr_image = uv_read_local_mmr(UVH_LB_BAU_MISC_CONTROL);
 		mmr_image = (mmr_image & UV_SA_MASK) >> UV_SA_SHFT;
 		if (mmr_image & (1L << UV2_ACK_UNITS_SHFT))
-			mult1 = 80;
+			base = 80;
 		else
-			mult1 = 10;
-		base = mmr_image & UV2_ACK_MASK;
+			base = 10;
+		mult1 = mmr_image & UV2_ACK_MASK;
 		ret = mult1 * base;
 	}
 	return ret;
···
 		bcp->cong_response_us = congested_respns_us;
 		bcp->cong_reps = congested_reps;
 		bcp->cong_period = congested_period;
+		bcp->clocks_per_100_usec = usec_2_cycles(100);
 	}
 }
···
 		bcp->cpus_in_socket = sdp->num_cpus;
 		bcp->socket_master = *smasterp;
 		bcp->uvhub = bdp->uvhub;
+		if (is_uv1_hub())
+			bcp->uvhub_version = 1;
+		else if (is_uv2_hub())
+			bcp->uvhub_version = 2;
+		else {
+			printk(KERN_EMERG "uvhub version not 1 or 2\n");
+			return 1;
+		}
 		bcp->uvhub_master = *hmasterp;
 		bcp->uvhub_cpu = uv_cpu_hub_info(cpu)->blade_processor_id;
+		bcp->using_desc = bcp->uvhub_cpu;
 		if (bcp->uvhub_cpu >= MAX_CPUS_PER_UVHUB) {
 			printk(KERN_EMERG "%d cpus per uvhub invalid\n",
 				bcp->uvhub_cpu);
···
 		uv_base_pnode = uv_blade_to_pnode(uvhub);
 	}
 
+	enable_timeouts();
+
 	if (init_per_cpu(nuvhubs, uv_base_pnode)) {
 		nobau = 1;
 		return 0;
···
 		if (uv_blade_nr_possible_cpus(uvhub))
 			init_uvhub(uvhub, vector, uv_base_pnode);
 
-	enable_timeouts();
 	alloc_intr_gate(vector, uv_bau_message_intr1);
 
 	for_each_possible_blade(uvhub) {
···
 			val = 1L << 63;
 			write_gmmr_activation(pnode, val);
 			mmr = 1; /* should be 1 to broadcast to both sockets */
-			write_mmr_data_broadcast(pnode, mmr);
+			if (!is_uv1_hub())
+				write_mmr_data_broadcast(pnode, mmr);
 		}
 	}
···
+2
fs/proc/stat.c
···
 		steal += kcpustat_cpu(i).cpustat[CPUTIME_STEAL];
 		guest += kcpustat_cpu(i).cpustat[CPUTIME_GUEST];
 		guest_nice += kcpustat_cpu(i).cpustat[CPUTIME_GUEST_NICE];
+		sum += kstat_cpu_irqs_sum(i);
+		sum += arch_irq_stat_cpu(i);
 
 		for (j = 0; j < NR_SOFTIRQS; j++) {
 			unsigned int softirq_stat = kstat_softirqs_cpu(j, i);
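
What the two added lines restore: the grand total at the start of the /proc/stat "intr" line must fold in each cpu's numbered-irq count plus its arch-specific interrupts. A toy user-space model of that accounting (the arrays stand in for kstat_cpu_irqs_sum() and arch_irq_stat_cpu() and are hypothetical):

/* Toy model of the "intr" total: sum every cpu's per-irq counts plus
 * its arch interrupts, which is exactly what the fix re-adds. */
#include <stdio.h>

#define NR_CPUS	4
#define NR_IRQS	16

static unsigned long irq_count[NR_CPUS][NR_IRQS];	/* per-cpu, per-irq */
static unsigned long arch_irq[NR_CPUS];			/* e.g. apic timer */

static unsigned long intr_total(void)
{
	unsigned long sum = 0;
	int cpu, irq;

	for (cpu = 0; cpu < NR_CPUS; cpu++) {
		for (irq = 0; irq < NR_IRQS; irq++)
			sum += irq_count[cpu][irq];	/* kstat_cpu_irqs_sum(cpu) */
		sum += arch_irq[cpu];			/* arch_irq_stat_cpu(cpu) */
	}
	return sum;
}

int main(void)
{
	irq_count[0][1] = 42;	/* pretend cpu0 handled 42 irqs on line 1 */
	arch_irq[1] = 7;	/* and cpu1 took 7 arch-local interrupts */
	printf("intr %lu\n", intr_total());	/* prints: intr 49 */
	return 0;
}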
+4 -3
kernel/tracepoint.c
···
 	int ret = 0;
 
 	/*
-	 * We skip modules that tain the kernel, especially those with different
-	 * module header (for forced load), to make sure we don't cause a crash.
+	 * We skip modules that taint the kernel, especially those with different
+	 * module headers (for forced load), to make sure we don't cause a crash.
+	 * Staging and out-of-tree GPL modules are fine.
 	 */
-	if (mod->taints)
+	if (mod->taints & ~((1 << TAINT_OOT_MODULE) | (1 << TAINT_CRAP)))
 		return 0;
 	mutex_lock(&tracepoints_mutex);
 	tp_mod = kmalloc(sizeof(struct tp_module), GFP_KERNEL);
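
The behavioral change, in isolation: previously any module taint disabled that module's tracepoints; now only taints other than TAINT_CRAP (staging) and TAINT_OOT_MODULE (out-of-tree) do. A standalone sketch of the mask logic (the taint bit positions mirror that era's kernel headers but should be treated as illustrative):

/* Demo of the new tracepoint gate: CRAP and OOT taints are waived;
 * anything else, e.g. a forced module load, still skips tracepoints. */
#include <stdio.h>

#define TAINT_FORCED_MODULE	1	/* insmod -f */
#define TAINT_CRAP		10	/* staging driver */
#define TAINT_OOT_MODULE	12	/* out-of-tree build */

static int tracepoints_skipped(unsigned int taints)
{
	/* nonzero iff some taint other than CRAP/OOT is set */
	return taints & ~((1 << TAINT_OOT_MODULE) | (1 << TAINT_CRAP));
}

int main(void)
{
	printf("%d\n", tracepoints_skipped(0));				/* 0: clean module */
	printf("%d\n", tracepoints_skipped(1 << TAINT_OOT_MODULE));	/* 0: now allowed */
	printf("%d\n", tracepoints_skipped(1 << TAINT_CRAP));		/* 0: now allowed */
	printf("%d\n", tracepoints_skipped(1 << TAINT_FORCED_MODULE));	/* nonzero: still skipped */
	return 0;
}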