Merge branch 'upstream-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/jlbec/ocfs2

+1 -1

fs/ocfs2/alloc.c

··· 5699 5699 OCFS2_JOURNAL_ACCESS_WRITE); 5700 5700 if (ret) { 5701 5701 mlog_errno(ret); 5702 - goto out; 5702 + goto out_commit; 5703 5703 } 5704 5704 5705 5705 dquot_free_space_nodirty(inode,

+61 -8

fs/ocfs2/aops.c

··· 290 290 } 291 291 292 292 if (down_read_trylock(&oi->ip_alloc_sem) == 0) { 293 + /* 294 + * Unlock the page and cycle ip_alloc_sem so that we don't 295 + * busyloop waiting for ip_alloc_sem to unlock 296 + */ 293 297 ret = AOP_TRUNCATED_PAGE; 298 + unlock_page(page); 299 + unlock = 0; 300 + down_read(&oi->ip_alloc_sem); 301 + up_read(&oi->ip_alloc_sem); 294 302 goto out_inode_unlock; 295 303 } 296 304 ··· 571 563 { 572 564 struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode; 573 565 int level; 566 + wait_queue_head_t *wq = ocfs2_ioend_wq(inode); 574 567 575 568 /* this io's submitter should not have unlocked this before we could */ 576 569 BUG_ON(!ocfs2_iocb_is_rw_locked(iocb)); 577 570 578 571 if (ocfs2_iocb_is_sem_locked(iocb)) 579 572 ocfs2_iocb_clear_sem_locked(iocb); 573 + 574 + if (ocfs2_iocb_is_unaligned_aio(iocb)) { 575 + ocfs2_iocb_clear_unaligned_aio(iocb); 576 + 577 + if (atomic_dec_and_test(&OCFS2_I(inode)->ip_unaligned_aio) && 578 + waitqueue_active(wq)) { 579 + wake_up_all(wq); 580 + } 581 + } 580 582 581 583 ocfs2_iocb_clear_rw_locked(iocb); 582 584 ··· 881 863 struct page *w_target_page; 882 864 883 865 /* 866 + * w_target_locked is used for page_mkwrite path indicating no unlocking 867 + * against w_target_page in ocfs2_write_end_nolock. 868 + */ 869 + unsigned int w_target_locked:1; 870 + 871 + /* 884 872 * ocfs2_write_end() uses this to know what the real range to 885 873 * write in the target should be. 886 874 */ ··· 919 895 920 896 static void ocfs2_free_write_ctxt(struct ocfs2_write_ctxt *wc) 921 897 { 898 + int i; 899 + 900 + /* 901 + * w_target_locked is only set to true in the page_mkwrite() case. 902 + * The intent is to allow us to lock the target page from write_begin() 903 + * to write_end(). The caller must hold a ref on w_target_page. 
904 + */ 905 + if (wc->w_target_locked) { 906 + BUG_ON(!wc->w_target_page); 907 + for (i = 0; i < wc->w_num_pages; i++) { 908 + if (wc->w_target_page == wc->w_pages[i]) { 909 + wc->w_pages[i] = NULL; 910 + break; 911 + } 912 + } 913 + mark_page_accessed(wc->w_target_page); 914 + page_cache_release(wc->w_target_page); 915 + } 922 916 ocfs2_unlock_and_free_pages(wc->w_pages, wc->w_num_pages); 923 917 924 918 brelse(wc->w_di_bh); ··· 1174 1132 */ 1175 1133 lock_page(mmap_page); 1176 1134 1135 + /* Exit and let the caller retry */ 1177 1136 if (mmap_page->mapping != mapping) { 1137 + WARN_ON(mmap_page->mapping); 1178 1138 unlock_page(mmap_page); 1179 - /* 1180 - * Sanity check - the locking in 1181 - * ocfs2_pagemkwrite() should ensure 1182 - * that this code doesn't trigger. 1183 - */ 1184 - ret = -EINVAL; 1185 - mlog_errno(ret); 1139 + ret = -EAGAIN; 1186 1140 goto out; 1187 1141 } 1188 1142 1189 1143 page_cache_get(mmap_page); 1190 1144 wc->w_pages[i] = mmap_page; 1145 + wc->w_target_locked = true; 1191 1146 } else { 1192 1147 wc->w_pages[i] = find_or_create_page(mapping, index, 1193 1148 GFP_NOFS); ··· 1199 1160 wc->w_target_page = wc->w_pages[i]; 1200 1161 } 1201 1162 out: 1163 + if (ret) 1164 + wc->w_target_locked = false; 1202 1165 return ret; 1203 1166 } 1204 1167 ··· 1858 1817 */ 1859 1818 ret = ocfs2_grab_pages_for_write(mapping, wc, wc->w_cpos, pos, len, 1860 1819 cluster_of_pages, mmap_page); 1861 - if (ret) { 1820 + if (ret && ret != -EAGAIN) { 1862 1821 mlog_errno(ret); 1822 + goto out_quota; 1823 + } 1824 + 1825 + /* 1826 + * ocfs2_grab_pages_for_write() returns -EAGAIN if it could not lock 1827 + * the target page. In this case, we exit with no error and no target 1828 + * page. This will trigger the caller, page_mkwrite(), to re-try 1829 + * the operation. 1830 + */ 1831 + if (ret == -EAGAIN) { 1832 + BUG_ON(wc->w_target_page); 1833 + ret = 0; 1863 1834 goto out_quota; 1864 1835 } 1865 1836

+14

fs/ocfs2/aops.h

··· 78 78 OCFS2_IOCB_RW_LOCK = 0, 79 79 OCFS2_IOCB_RW_LOCK_LEVEL, 80 80 OCFS2_IOCB_SEM, 81 + OCFS2_IOCB_UNALIGNED_IO, 81 82 OCFS2_IOCB_NUM_LOCKS 82 83 }; 83 84 ··· 92 91 clear_bit(OCFS2_IOCB_SEM, (unsigned long *)&iocb->private) 93 92 #define ocfs2_iocb_is_sem_locked(iocb) \ 94 93 test_bit(OCFS2_IOCB_SEM, (unsigned long *)&iocb->private) 94 + 95 + #define ocfs2_iocb_set_unaligned_aio(iocb) \ 96 + set_bit(OCFS2_IOCB_UNALIGNED_IO, (unsigned long *)&iocb->private) 97 + #define ocfs2_iocb_clear_unaligned_aio(iocb) \ 98 + clear_bit(OCFS2_IOCB_UNALIGNED_IO, (unsigned long *)&iocb->private) 99 + #define ocfs2_iocb_is_unaligned_aio(iocb) \ 100 + test_bit(OCFS2_IOCB_UNALIGNED_IO, (unsigned long *)&iocb->private) 101 + 102 + #define OCFS2_IOEND_WQ_HASH_SZ 37 103 + #define ocfs2_ioend_wq(v) (&ocfs2__ioend_wq[((unsigned long)(v)) %\ 104 + OCFS2_IOEND_WQ_HASH_SZ]) 105 + extern wait_queue_head_t ocfs2__ioend_wq[OCFS2_IOEND_WQ_HASH_SZ]; 106 + 95 107 #endif /* OCFS2_FILE_H */

+123 -73

fs/ocfs2/cluster/heartbeat.c

··· 216 216 217 217 struct list_head hr_all_item; 218 218 unsigned hr_unclean_stop:1, 219 + hr_aborted_start:1, 219 220 hr_item_pinned:1, 220 221 hr_item_dropped:1; 221 222 ··· 254 253 * has reached a 'steady' state. This will be fixed when we have 255 254 * a more complete api that doesn't lead to this sort of fragility. */ 256 255 atomic_t hr_steady_iterations; 256 + 257 + /* terminate o2hb thread if it does not reach steady state 258 + * (hr_steady_iterations == 0) within hr_unsteady_iterations */ 259 + atomic_t hr_unsteady_iterations; 257 260 258 261 char hr_dev_name[BDEVNAME_SIZE]; 259 262 ··· 329 324 330 325 static void o2hb_arm_write_timeout(struct o2hb_region *reg) 331 326 { 327 + /* Arm writeout only after thread reaches steady state */ 328 + if (atomic_read(&reg->hr_steady_iterations) != 0) 329 + return; 330 + 332 331 mlog(ML_HEARTBEAT, "Queue write timeout for %u ms\n", 333 332 O2HB_MAX_WRITE_TIMEOUT_MS); 334 333 ··· 546 537 return read == computed; 547 538 } 548 539 549 - /* We want to make sure that nobody is heartbeating on top of us -- 550 - * this will help detect an invalid configuration. */ 551 - static void o2hb_check_last_timestamp(struct o2hb_region *reg) 540 + /* 541 + * Compare the slot data with what we wrote in the last iteration. 542 + * If the match fails, print an appropriate error message. This is to 543 + * detect errors like... another node hearting on the same slot, 544 + * flaky device that is losing writes, etc. 545 + * Returns 1 if check succeeds, 0 otherwise. 
546 + */ 547 + static int o2hb_check_own_slot(struct o2hb_region *reg) 552 548 { 553 549 struct o2hb_disk_slot *slot; 554 550 struct o2hb_disk_heartbeat_block *hb_block; ··· 562 548 slot = &reg->hr_slots[o2nm_this_node()]; 563 549 /* Don't check on our 1st timestamp */ 564 550 if (!slot->ds_last_time) 565 - return; 551 + return 0; 566 552 567 553 hb_block = slot->ds_raw_block; 568 554 if (le64_to_cpu(hb_block->hb_seq) == slot->ds_last_time && 569 555 le64_to_cpu(hb_block->hb_generation) == slot->ds_last_generation && 570 556 hb_block->hb_node == slot->ds_node_num) 571 - return; 557 + return 1; 572 558 573 559 #define ERRSTR1 "Another node is heartbeating on device" 574 560 #define ERRSTR2 "Heartbeat generation mismatch on device" ··· 588 574 (unsigned long long)slot->ds_last_time, hb_block->hb_node, 589 575 (unsigned long long)le64_to_cpu(hb_block->hb_generation), 590 576 (unsigned long long)le64_to_cpu(hb_block->hb_seq)); 577 + 578 + return 0; 591 579 } 592 580 593 581 static inline void o2hb_prepare_block(struct o2hb_region *reg, ··· 735 719 o2nm_node_put(node); 736 720 } 737 721 738 - static void o2hb_set_quorum_device(struct o2hb_region *reg, 739 - struct o2hb_disk_slot *slot) 722 + static void o2hb_set_quorum_device(struct o2hb_region *reg) 740 723 { 741 - assert_spin_locked(&o2hb_live_lock); 742 - 743 724 if (!o2hb_global_heartbeat_active()) 744 725 return; 745 726 746 - if (test_bit(reg->hr_region_num, o2hb_quorum_region_bitmap)) 727 + /* Prevent race with o2hb_heartbeat_group_drop_item() */ 728 + if (kthread_should_stop()) 747 729 return; 730 + 731 + /* Tag region as quorum only after thread reaches steady state */ 732 + if (atomic_read(&reg->hr_steady_iterations) != 0) 733 + return; 734 + 735 + spin_lock(&o2hb_live_lock); 736 + 737 + if (test_bit(reg->hr_region_num, o2hb_quorum_region_bitmap)) 738 + goto unlock; 748 739 749 740 /* 750 741 * A region can be added to the quorum only when it sees all ··· 760 737 */ 761 738 if (memcmp(reg->hr_live_node_bitmap, 
o2hb_live_node_bitmap, 762 739 sizeof(o2hb_live_node_bitmap))) 763 - return; 740 + goto unlock; 764 741 765 - if (slot->ds_changed_samples < O2HB_LIVE_THRESHOLD) 766 - return; 767 - 768 - printk(KERN_NOTICE "o2hb: Region %s is now a quorum device\n", 769 - config_item_name(&reg->hr_item)); 742 + printk(KERN_NOTICE "o2hb: Region %s (%s) is now a quorum device\n", 743 + config_item_name(&reg->hr_item), reg->hr_dev_name); 770 744 771 745 set_bit(reg->hr_region_num, o2hb_quorum_region_bitmap); 772 746 ··· 774 754 if (o2hb_pop_count(&o2hb_quorum_region_bitmap, 775 755 O2NM_MAX_REGIONS) > O2HB_PIN_CUT_OFF) 776 756 o2hb_region_unpin(NULL); 757 + unlock: 758 + spin_unlock(&o2hb_live_lock); 777 759 } 778 760 779 761 static int o2hb_check_slot(struct o2hb_region *reg, ··· 947 925 slot->ds_equal_samples = 0; 948 926 } 949 927 out: 950 - o2hb_set_quorum_device(reg, slot); 951 - 952 928 spin_unlock(&o2hb_live_lock); 953 929 954 930 o2hb_run_event_list(&event); ··· 977 957 978 958 static int o2hb_do_disk_heartbeat(struct o2hb_region *reg) 979 959 { 980 - int i, ret, highest_node, change = 0; 960 + int i, ret, highest_node; 961 + int membership_change = 0, own_slot_ok = 0; 981 962 unsigned long configured_nodes[BITS_TO_LONGS(O2NM_MAX_NODES)]; 982 963 unsigned long live_node_bitmap[BITS_TO_LONGS(O2NM_MAX_NODES)]; 983 964 struct o2hb_bio_wait_ctxt write_wc; ··· 987 966 sizeof(configured_nodes)); 988 967 if (ret) { 989 968 mlog_errno(ret); 990 - return ret; 969 + goto bail; 991 970 } 992 971 993 972 /* ··· 1003 982 1004 983 highest_node = o2hb_highest_node(configured_nodes, O2NM_MAX_NODES); 1005 984 if (highest_node >= O2NM_MAX_NODES) { 1006 - mlog(ML_NOTICE, "ocfs2_heartbeat: no configured nodes found!\n"); 1007 - return -EINVAL; 985 + mlog(ML_NOTICE, "o2hb: No configured nodes found!\n"); 986 + ret = -EINVAL; 987 + goto bail; 1008 988 } 1009 989 1010 990 /* No sense in reading the slots of nodes that don't exist ··· 1015 993 ret = o2hb_read_slots(reg, highest_node + 1); 1016 994 
if (ret < 0) { 1017 995 mlog_errno(ret); 1018 - return ret; 996 + goto bail; 1019 997 } 1020 998 1021 999 /* With an up to date view of the slots, we can check that no 1022 1000 * other node has been improperly configured to heartbeat in 1023 1001 * our slot. */ 1024 - o2hb_check_last_timestamp(reg); 1002 + own_slot_ok = o2hb_check_own_slot(reg); 1025 1003 1026 1004 /* fill in the proper info for our next heartbeat */ 1027 1005 o2hb_prepare_block(reg, reg->hr_generation); 1028 1006 1029 - /* And fire off the write. Note that we don't wait on this I/O 1030 - * until later. */ 1031 1007 ret = o2hb_issue_node_write(reg, &write_wc); 1032 1008 if (ret < 0) { 1033 1009 mlog_errno(ret); 1034 - return ret; 1010 + goto bail; 1035 1011 } 1036 1012 1037 1013 i = -1; 1038 1014 while((i = find_next_bit(configured_nodes, 1039 1015 O2NM_MAX_NODES, i + 1)) < O2NM_MAX_NODES) { 1040 - change |= o2hb_check_slot(reg, &reg->hr_slots[i]); 1016 + membership_change |= o2hb_check_slot(reg, &reg->hr_slots[i]); 1041 1017 } 1042 1018 1043 1019 /* ··· 1050 1030 * disk */ 1051 1031 mlog(ML_ERROR, "Write error %d on device \"%s\"\n", 1052 1032 write_wc.wc_error, reg->hr_dev_name); 1053 - return write_wc.wc_error; 1033 + ret = write_wc.wc_error; 1034 + goto bail; 1054 1035 } 1055 1036 1056 - o2hb_arm_write_timeout(reg); 1037 + /* Skip disarming the timeout if own slot has stale/bad data */ 1038 + if (own_slot_ok) { 1039 + o2hb_set_quorum_device(reg); 1040 + o2hb_arm_write_timeout(reg); 1041 + } 1057 1042 1043 + bail: 1058 1044 /* let the person who launched us know when things are steady */ 1059 - if (!change && (atomic_read(&reg->hr_steady_iterations) != 0)) { 1060 - if (atomic_dec_and_test(&reg->hr_steady_iterations)) 1061 - wake_up(&o2hb_steady_queue); 1045 + if (atomic_read(&reg->hr_steady_iterations) != 0) { 1046 + if (!ret && own_slot_ok && !membership_change) { 1047 + if (atomic_dec_and_test(&reg->hr_steady_iterations)) 1048 + wake_up(&o2hb_steady_queue); 1049 + } 1062 1050 } 1063 1051 
1064 - return 0; 1052 + if (atomic_read(&reg->hr_steady_iterations) != 0) { 1053 + if (atomic_dec_and_test(&reg->hr_unsteady_iterations)) { 1054 + printk(KERN_NOTICE "o2hb: Unable to stabilize " 1055 + "heartbeart on region %s (%s)\n", 1056 + config_item_name(&reg->hr_item), 1057 + reg->hr_dev_name); 1058 + atomic_set(&reg->hr_steady_iterations, 0); 1059 + reg->hr_aborted_start = 1; 1060 + wake_up(&o2hb_steady_queue); 1061 + ret = -EIO; 1062 + } 1063 + } 1064 + 1065 + return ret; 1065 1066 } 1066 1067 1067 1068 /* Subtract b from a, storing the result in a. a *must* have a larger ··· 1136 1095 /* Pin node */ 1137 1096 o2nm_depend_this_node(); 1138 1097 1139 - while (!kthread_should_stop() && !reg->hr_unclean_stop) { 1098 + while (!kthread_should_stop() && 1099 + !reg->hr_unclean_stop && !reg->hr_aborted_start) { 1140 1100 /* We track the time spent inside 1141 1101 * o2hb_do_disk_heartbeat so that we avoid more than 1142 1102 * hr_timeout_ms between disk writes. On busy systems ··· 1145 1103 * likely to time itself out. */ 1146 1104 do_gettimeofday(&before_hb); 1147 1105 1148 - i = 0; 1149 - do { 1150 - ret = o2hb_do_disk_heartbeat(reg); 1151 - } while (ret && ++i < 2); 1106 + ret = o2hb_do_disk_heartbeat(reg); 1152 1107 1153 1108 do_gettimeofday(&after_hb); 1154 1109 elapsed_msec = o2hb_elapsed_msecs(&before_hb, &after_hb); ··· 1156 1117 after_hb.tv_sec, (unsigned long) after_hb.tv_usec, 1157 1118 elapsed_msec); 1158 1119 1159 - if (elapsed_msec < reg->hr_timeout_ms) { 1120 + if (!kthread_should_stop() && 1121 + elapsed_msec < reg->hr_timeout_ms) { 1160 1122 /* the kthread api has blocked signals for us so no 1161 1123 * need to record the return value. */ 1162 1124 msleep_interruptible(reg->hr_timeout_ms - elapsed_msec); ··· 1174 1134 * to timeout on this region when we could just as easily 1175 1135 * write a clear generation - thus indicating to them that 1176 1136 * this node has left this region. 1177 - * 1178 - * XXX: Should we skip this on unclean_stop? 
*/ 1179 - o2hb_prepare_block(reg, 0); 1180 - ret = o2hb_issue_node_write(reg, &write_wc); 1181 - if (ret == 0) { 1182 - o2hb_wait_on_io(reg, &write_wc); 1183 - } else { 1184 - mlog_errno(ret); 1137 + */ 1138 + if (!reg->hr_unclean_stop && !reg->hr_aborted_start) { 1139 + o2hb_prepare_block(reg, 0); 1140 + ret = o2hb_issue_node_write(reg, &write_wc); 1141 + if (ret == 0) 1142 + o2hb_wait_on_io(reg, &write_wc); 1143 + else 1144 + mlog_errno(ret); 1185 1145 } 1186 1146 1187 1147 /* Unpin node */ 1188 1148 o2nm_undepend_this_node(); 1189 1149 1190 - mlog(ML_HEARTBEAT|ML_KTHREAD, "hb thread exiting\n"); 1150 + mlog(ML_HEARTBEAT|ML_KTHREAD, "o2hb thread exiting\n"); 1191 1151 1192 1152 return 0; 1193 1153 } ··· 1198 1158 struct o2hb_debug_buf *db = inode->i_private; 1199 1159 struct o2hb_region *reg; 1200 1160 unsigned long map[BITS_TO_LONGS(O2NM_MAX_NODES)]; 1161 + unsigned long lts; 1201 1162 char *buf = NULL; 1202 1163 int i = -1; 1203 1164 int out = 0; ··· 1235 1194 1236 1195 case O2HB_DB_TYPE_REGION_ELAPSED_TIME: 1237 1196 reg = (struct o2hb_region *)db->db_data; 1238 - out += snprintf(buf + out, PAGE_SIZE - out, "%u\n", 1239 - jiffies_to_msecs(jiffies - 1240 - reg->hr_last_timeout_start)); 1197 + lts = reg->hr_last_timeout_start; 1198 + /* If 0, it has never been set before */ 1199 + if (lts) 1200 + lts = jiffies_to_msecs(jiffies - lts); 1201 + out += snprintf(buf + out, PAGE_SIZE - out, "%lu\n", lts); 1241 1202 goto done; 1242 1203 1243 1204 case O2HB_DB_TYPE_REGION_PINNED: ··· 1468 1425 int i; 1469 1426 struct page *page; 1470 1427 struct o2hb_region *reg = to_o2hb_region(item); 1428 + 1429 + mlog(ML_HEARTBEAT, "hb region release (%s)\n", reg->hr_dev_name); 1471 1430 1472 1431 if (reg->hr_tmp_block) 1473 1432 kfree(reg->hr_tmp_block); ··· 1837 1792 live_threshold <<= 1; 1838 1793 spin_unlock(&o2hb_live_lock); 1839 1794 } 1840 - atomic_set(&reg->hr_steady_iterations, live_threshold + 1); 1795 + ++live_threshold; 1796 + atomic_set(&reg->hr_steady_iterations, 
live_threshold); 1797 + /* unsteady_iterations is double the steady_iterations */ 1798 + atomic_set(&reg->hr_unsteady_iterations, (live_threshold << 1)); 1841 1799 1842 1800 hb_task = kthread_run(o2hb_thread, reg, "o2hb-%s", 1843 1801 reg->hr_item.ci_name); ··· 1857 1809 ret = wait_event_interruptible(o2hb_steady_queue, 1858 1810 atomic_read(&reg->hr_steady_iterations) == 0); 1859 1811 if (ret) { 1860 - /* We got interrupted (hello ptrace!). Clean up */ 1861 - spin_lock(&o2hb_live_lock); 1862 - hb_task = reg->hr_task; 1863 - reg->hr_task = NULL; 1864 - spin_unlock(&o2hb_live_lock); 1812 + atomic_set(&reg->hr_steady_iterations, 0); 1813 + reg->hr_aborted_start = 1; 1814 + } 1865 1815 1866 - if (hb_task) 1867 - kthread_stop(hb_task); 1816 + if (reg->hr_aborted_start) { 1817 + ret = -EIO; 1868 1818 goto out; 1869 1819 } 1870 1820 ··· 1879 1833 ret = -EIO; 1880 1834 1881 1835 if (hb_task && o2hb_global_heartbeat_active()) 1882 - printk(KERN_NOTICE "o2hb: Heartbeat started on region %s\n", 1883 - config_item_name(&reg->hr_item)); 1836 + printk(KERN_NOTICE "o2hb: Heartbeat started on region %s (%s)\n", 1837 + config_item_name(&reg->hr_item), reg->hr_dev_name); 1884 1838 1885 1839 out: 1886 1840 if (filp) ··· 2138 2092 2139 2093 /* stop the thread when the user removes the region dir */ 2140 2094 spin_lock(&o2hb_live_lock); 2141 - if (o2hb_global_heartbeat_active()) { 2142 - clear_bit(reg->hr_region_num, o2hb_region_bitmap); 2143 - clear_bit(reg->hr_region_num, o2hb_live_region_bitmap); 2144 - if (test_bit(reg->hr_region_num, o2hb_quorum_region_bitmap)) 2145 - quorum_region = 1; 2146 - clear_bit(reg->hr_region_num, o2hb_quorum_region_bitmap); 2147 - } 2148 2095 hb_task = reg->hr_task; 2149 2096 reg->hr_task = NULL; 2150 2097 reg->hr_item_dropped = 1; ··· 2146 2107 if (hb_task) 2147 2108 kthread_stop(hb_task); 2148 2109 2110 + if (o2hb_global_heartbeat_active()) { 2111 + spin_lock(&o2hb_live_lock); 2112 + clear_bit(reg->hr_region_num, o2hb_region_bitmap); 2113 + 
clear_bit(reg->hr_region_num, o2hb_live_region_bitmap); 2114 + if (test_bit(reg->hr_region_num, o2hb_quorum_region_bitmap)) 2115 + quorum_region = 1; 2116 + clear_bit(reg->hr_region_num, o2hb_quorum_region_bitmap); 2117 + spin_unlock(&o2hb_live_lock); 2118 + printk(KERN_NOTICE "o2hb: Heartbeat %s on region %s (%s)\n", 2119 + ((atomic_read(&reg->hr_steady_iterations) == 0) ? 2120 + "stopped" : "start aborted"), config_item_name(item), 2121 + reg->hr_dev_name); 2122 + } 2123 + 2149 2124 /* 2150 2125 * If we're racing a dev_write(), we need to wake them. They will 2151 2126 * check reg->hr_task 2152 2127 */ 2153 2128 if (atomic_read(&reg->hr_steady_iterations) != 0) { 2129 + reg->hr_aborted_start = 1; 2154 2130 atomic_set(&reg->hr_steady_iterations, 0); 2155 2131 wake_up(&o2hb_steady_queue); 2156 2132 } 2157 - 2158 - if (o2hb_global_heartbeat_active()) 2159 - printk(KERN_NOTICE "o2hb: Heartbeat stopped on region %s\n", 2160 - config_item_name(&reg->hr_item)); 2161 2133 2162 2134 config_item_put(item); 2163 2135

+69 -33

fs/ocfs2/cluster/netdebug.c

··· 47 47 #define SC_DEBUG_NAME "sock_containers" 48 48 #define NST_DEBUG_NAME "send_tracking" 49 49 #define STATS_DEBUG_NAME "stats" 50 + #define NODES_DEBUG_NAME "connected_nodes" 50 51 51 52 #define SHOW_SOCK_CONTAINERS 0 52 53 #define SHOW_SOCK_STATS 1 ··· 56 55 static struct dentry *sc_dentry; 57 56 static struct dentry *nst_dentry; 58 57 static struct dentry *stats_dentry; 58 + static struct dentry *nodes_dentry; 59 59 60 60 static DEFINE_SPINLOCK(o2net_debug_lock); 61 61 ··· 493 491 .release = sc_fop_release, 494 492 }; 495 493 496 - int o2net_debugfs_init(void) 494 + static int o2net_fill_bitmap(char *buf, int len) 497 495 { 498 - o2net_dentry = debugfs_create_dir(O2NET_DEBUG_DIR, NULL); 499 - if (!o2net_dentry) { 500 - mlog_errno(-ENOMEM); 501 - goto bail; 502 - } 496 + unsigned long map[BITS_TO_LONGS(O2NM_MAX_NODES)]; 497 + int i = -1, out = 0; 503 498 504 - nst_dentry = debugfs_create_file(NST_DEBUG_NAME, S_IFREG|S_IRUSR, 505 - o2net_dentry, NULL, 506 - &nst_seq_fops); 507 - if (!nst_dentry) { 508 - mlog_errno(-ENOMEM); 509 - goto bail; 510 - } 499 + o2net_fill_node_map(map, sizeof(map)); 511 500 512 - sc_dentry = debugfs_create_file(SC_DEBUG_NAME, S_IFREG|S_IRUSR, 513 - o2net_dentry, NULL, 514 - &sc_seq_fops); 515 - if (!sc_dentry) { 516 - mlog_errno(-ENOMEM); 517 - goto bail; 518 - } 501 + while ((i = find_next_bit(map, O2NM_MAX_NODES, i + 1)) < O2NM_MAX_NODES) 502 + out += snprintf(buf + out, PAGE_SIZE - out, "%d ", i); 503 + out += snprintf(buf + out, PAGE_SIZE - out, "\n"); 519 504 520 - stats_dentry = debugfs_create_file(STATS_DEBUG_NAME, S_IFREG|S_IRUSR, 521 - o2net_dentry, NULL, 522 - &stats_seq_fops); 523 - if (!stats_dentry) { 524 - mlog_errno(-ENOMEM); 525 - goto bail; 526 - } 505 + return out; 506 + } 507 + 508 + static int nodes_fop_open(struct inode *inode, struct file *file) 509 + { 510 + char *buf; 511 + 512 + buf = kmalloc(PAGE_SIZE, GFP_KERNEL); 513 + if (!buf) 514 + return -ENOMEM; 515 + 516 + i_size_write(inode, o2net_fill_bitmap(buf, 
PAGE_SIZE)); 517 + 518 + file->private_data = buf; 527 519 528 520 return 0; 529 - bail: 530 - debugfs_remove(stats_dentry); 531 - debugfs_remove(sc_dentry); 532 - debugfs_remove(nst_dentry); 533 - debugfs_remove(o2net_dentry); 534 - return -ENOMEM; 535 521 } 522 + 523 + static int o2net_debug_release(struct inode *inode, struct file *file) 524 + { 525 + kfree(file->private_data); 526 + return 0; 527 + } 528 + 529 + static ssize_t o2net_debug_read(struct file *file, char __user *buf, 530 + size_t nbytes, loff_t *ppos) 531 + { 532 + return simple_read_from_buffer(buf, nbytes, ppos, file->private_data, 533 + i_size_read(file->f_mapping->host)); 534 + } 535 + 536 + static const struct file_operations nodes_fops = { 537 + .open = nodes_fop_open, 538 + .release = o2net_debug_release, 539 + .read = o2net_debug_read, 540 + .llseek = generic_file_llseek, 541 + }; 536 542 537 543 void o2net_debugfs_exit(void) 538 544 { 545 + debugfs_remove(nodes_dentry); 539 546 debugfs_remove(stats_dentry); 540 547 debugfs_remove(sc_dentry); 541 548 debugfs_remove(nst_dentry); 542 549 debugfs_remove(o2net_dentry); 550 + } 551 + 552 + int o2net_debugfs_init(void) 553 + { 554 + mode_t mode = S_IFREG|S_IRUSR; 555 + 556 + o2net_dentry = debugfs_create_dir(O2NET_DEBUG_DIR, NULL); 557 + if (o2net_dentry) 558 + nst_dentry = debugfs_create_file(NST_DEBUG_NAME, mode, 559 + o2net_dentry, NULL, &nst_seq_fops); 560 + if (nst_dentry) 561 + sc_dentry = debugfs_create_file(SC_DEBUG_NAME, mode, 562 + o2net_dentry, NULL, &sc_seq_fops); 563 + if (sc_dentry) 564 + stats_dentry = debugfs_create_file(STATS_DEBUG_NAME, mode, 565 + o2net_dentry, NULL, &stats_seq_fops); 566 + if (stats_dentry) 567 + nodes_dentry = debugfs_create_file(NODES_DEBUG_NAME, mode, 568 + o2net_dentry, NULL, &nodes_fops); 569 + if (nodes_dentry) 570 + return 0; 571 + 572 + o2net_debugfs_exit(); 573 + mlog_errno(-ENOMEM); 574 + return -ENOMEM; 543 575 } 544 576 545 577 #endif /* CONFIG_DEBUG_FS */

+72 -66

fs/ocfs2/cluster/tcp.c

··· 546 546 } 547 547 548 548 if (was_valid && !valid) { 549 - printk(KERN_NOTICE "o2net: no longer connected to " 549 + printk(KERN_NOTICE "o2net: No longer connected to " 550 550 SC_NODEF_FMT "\n", SC_NODEF_ARGS(old_sc)); 551 551 o2net_complete_nodes_nsw(nn); 552 552 } ··· 556 556 cancel_delayed_work(&nn->nn_connect_expired); 557 557 printk(KERN_NOTICE "o2net: %s " SC_NODEF_FMT "\n", 558 558 o2nm_this_node() > sc->sc_node->nd_num ? 559 - "connected to" : "accepted connection from", 559 + "Connected to" : "Accepted connection from", 560 560 SC_NODEF_ARGS(sc)); 561 561 } 562 562 ··· 644 644 o2net_sc_queue_work(sc, &sc->sc_connect_work); 645 645 break; 646 646 default: 647 - printk(KERN_INFO "o2net: connection to " SC_NODEF_FMT 647 + printk(KERN_INFO "o2net: Connection to " SC_NODEF_FMT 648 648 " shutdown, state %d\n", 649 649 SC_NODEF_ARGS(sc), sk->sk_state); 650 650 o2net_sc_queue_work(sc, &sc->sc_shutdown_work); ··· 1035 1035 return ret; 1036 1036 } 1037 1037 1038 + /* Get a map of all nodes to which this node is currently connected to */ 1039 + void o2net_fill_node_map(unsigned long *map, unsigned bytes) 1040 + { 1041 + struct o2net_sock_container *sc; 1042 + int node, ret; 1043 + 1044 + BUG_ON(bytes < (BITS_TO_LONGS(O2NM_MAX_NODES) * sizeof(unsigned long))); 1045 + 1046 + memset(map, 0, bytes); 1047 + for (node = 0; node < O2NM_MAX_NODES; ++node) { 1048 + o2net_tx_can_proceed(o2net_nn_from_num(node), &sc, &ret); 1049 + if (!ret) { 1050 + set_bit(node, map); 1051 + sc_put(sc); 1052 + } 1053 + } 1054 + } 1055 + EXPORT_SYMBOL_GPL(o2net_fill_node_map); 1056 + 1038 1057 int o2net_send_message_vec(u32 msg_type, u32 key, struct kvec *caller_vec, 1039 1058 size_t caller_veclen, u8 target_node, int *status) 1040 1059 { ··· 1304 1285 struct o2net_node *nn = o2net_nn_from_num(sc->sc_node->nd_num); 1305 1286 1306 1287 if (hand->protocol_version != cpu_to_be64(O2NET_PROTOCOL_VERSION)) { 1307 - mlog(ML_NOTICE, SC_NODEF_FMT " advertised net protocol " 1308 - "version %llu but 
%llu is required, disconnecting\n", 1309 - SC_NODEF_ARGS(sc), 1310 - (unsigned long long)be64_to_cpu(hand->protocol_version), 1311 - O2NET_PROTOCOL_VERSION); 1288 + printk(KERN_NOTICE "o2net: " SC_NODEF_FMT " Advertised net " 1289 + "protocol version %llu but %llu is required. " 1290 + "Disconnecting.\n", SC_NODEF_ARGS(sc), 1291 + (unsigned long long)be64_to_cpu(hand->protocol_version), 1292 + O2NET_PROTOCOL_VERSION); 1312 1293 1313 1294 /* don't bother reconnecting if its the wrong version. */ 1314 1295 o2net_ensure_shutdown(nn, sc, -ENOTCONN); ··· 1322 1303 */ 1323 1304 if (be32_to_cpu(hand->o2net_idle_timeout_ms) != 1324 1305 o2net_idle_timeout()) { 1325 - mlog(ML_NOTICE, SC_NODEF_FMT " uses a network idle timeout of " 1326 - "%u ms, but we use %u ms locally. disconnecting\n", 1327 - SC_NODEF_ARGS(sc), 1328 - be32_to_cpu(hand->o2net_idle_timeout_ms), 1329 - o2net_idle_timeout()); 1306 + printk(KERN_NOTICE "o2net: " SC_NODEF_FMT " uses a network " 1307 + "idle timeout of %u ms, but we use %u ms locally. " 1308 + "Disconnecting.\n", SC_NODEF_ARGS(sc), 1309 + be32_to_cpu(hand->o2net_idle_timeout_ms), 1310 + o2net_idle_timeout()); 1330 1311 o2net_ensure_shutdown(nn, sc, -ENOTCONN); 1331 1312 return -1; 1332 1313 } 1333 1314 1334 1315 if (be32_to_cpu(hand->o2net_keepalive_delay_ms) != 1335 1316 o2net_keepalive_delay()) { 1336 - mlog(ML_NOTICE, SC_NODEF_FMT " uses a keepalive delay of " 1337 - "%u ms, but we use %u ms locally. disconnecting\n", 1338 - SC_NODEF_ARGS(sc), 1339 - be32_to_cpu(hand->o2net_keepalive_delay_ms), 1340 - o2net_keepalive_delay()); 1317 + printk(KERN_NOTICE "o2net: " SC_NODEF_FMT " uses a keepalive " 1318 + "delay of %u ms, but we use %u ms locally. 
" 1319 + "Disconnecting.\n", SC_NODEF_ARGS(sc), 1320 + be32_to_cpu(hand->o2net_keepalive_delay_ms), 1321 + o2net_keepalive_delay()); 1341 1322 o2net_ensure_shutdown(nn, sc, -ENOTCONN); 1342 1323 return -1; 1343 1324 } 1344 1325 1345 1326 if (be32_to_cpu(hand->o2hb_heartbeat_timeout_ms) != 1346 1327 O2HB_MAX_WRITE_TIMEOUT_MS) { 1347 - mlog(ML_NOTICE, SC_NODEF_FMT " uses a heartbeat timeout of " 1348 - "%u ms, but we use %u ms locally. disconnecting\n", 1349 - SC_NODEF_ARGS(sc), 1350 - be32_to_cpu(hand->o2hb_heartbeat_timeout_ms), 1351 - O2HB_MAX_WRITE_TIMEOUT_MS); 1328 + printk(KERN_NOTICE "o2net: " SC_NODEF_FMT " uses a heartbeat " 1329 + "timeout of %u ms, but we use %u ms locally. " 1330 + "Disconnecting.\n", SC_NODEF_ARGS(sc), 1331 + be32_to_cpu(hand->o2hb_heartbeat_timeout_ms), 1332 + O2HB_MAX_WRITE_TIMEOUT_MS); 1352 1333 o2net_ensure_shutdown(nn, sc, -ENOTCONN); 1353 1334 return -1; 1354 1335 } ··· 1559 1540 { 1560 1541 struct o2net_sock_container *sc = (struct o2net_sock_container *)data; 1561 1542 struct o2net_node *nn = o2net_nn_from_num(sc->sc_node->nd_num); 1562 - 1563 1543 #ifdef CONFIG_DEBUG_FS 1564 - ktime_t now = ktime_get(); 1544 + unsigned long msecs = ktime_to_ms(ktime_get()) - 1545 + ktime_to_ms(sc->sc_tv_timer); 1546 + #else 1547 + unsigned long msecs = o2net_idle_timeout(); 1565 1548 #endif 1566 1549 1567 - printk(KERN_NOTICE "o2net: connection to " SC_NODEF_FMT " has been idle for %u.%u " 1568 - "seconds, shutting it down.\n", SC_NODEF_ARGS(sc), 1569 - o2net_idle_timeout() / 1000, 1570 - o2net_idle_timeout() % 1000); 1571 - 1572 - #ifdef CONFIG_DEBUG_FS 1573 - mlog(ML_NOTICE, "Here are some times that might help debug the " 1574 - "situation: (Timer: %lld, Now %lld, DataReady %lld, Advance %lld-%lld, " 1575 - "Key 0x%08x, Func %u, FuncTime %lld-%lld)\n", 1576 - (long long)ktime_to_us(sc->sc_tv_timer), (long long)ktime_to_us(now), 1577 - (long long)ktime_to_us(sc->sc_tv_data_ready), 1578 - (long long)ktime_to_us(sc->sc_tv_advance_start), 1579 - 
(long long)ktime_to_us(sc->sc_tv_advance_stop), 1580 - sc->sc_msg_key, sc->sc_msg_type, 1581 - (long long)ktime_to_us(sc->sc_tv_func_start), 1582 - (long long)ktime_to_us(sc->sc_tv_func_stop)); 1583 - #endif 1550 + printk(KERN_NOTICE "o2net: Connection to " SC_NODEF_FMT " has been " 1551 + "idle for %lu.%lu secs, shutting it down.\n", SC_NODEF_ARGS(sc), 1552 + msecs / 1000, msecs % 1000); 1584 1553 1585 1554 /* 1586 1555 * Initialize the nn_timeout so that the next connection attempt ··· 1701 1694 1702 1695 out: 1703 1696 if (ret) { 1704 - mlog(ML_NOTICE, "connect attempt to " SC_NODEF_FMT " failed " 1705 - "with errno %d\n", SC_NODEF_ARGS(sc), ret); 1697 + printk(KERN_NOTICE "o2net: Connect attempt to " SC_NODEF_FMT 1698 + " failed with errno %d\n", SC_NODEF_ARGS(sc), ret); 1706 1699 /* 0 err so that another will be queued and attempted 1707 1700 * from set_nn_state */ 1708 1701 if (sc) ··· 1725 1718 1726 1719 spin_lock(&nn->nn_lock); 1727 1720 if (!nn->nn_sc_valid) { 1728 - mlog(ML_ERROR, "no connection established with node %u after " 1729 - "%u.%u seconds, giving up and returning errors.\n", 1721 + printk(KERN_NOTICE "o2net: No connection established with " 1722 + "node %u after %u.%u seconds, giving up.\n", 1730 1723 o2net_num_from_nn(nn), 1731 1724 o2net_idle_timeout() / 1000, 1732 1725 o2net_idle_timeout() % 1000); ··· 1869 1862 1870 1863 node = o2nm_get_node_by_ip(sin.sin_addr.s_addr); 1871 1864 if (node == NULL) { 1872 - mlog(ML_NOTICE, "attempt to connect from unknown node at %pI4:%d\n", 1873 - &sin.sin_addr.s_addr, ntohs(sin.sin_port)); 1865 + printk(KERN_NOTICE "o2net: Attempt to connect from unknown " 1866 + "node at %pI4:%d\n", &sin.sin_addr.s_addr, 1867 + ntohs(sin.sin_port)); 1874 1868 ret = -EINVAL; 1875 1869 goto out; 1876 1870 } 1877 1871 1878 1872 if (o2nm_this_node() >= node->nd_num) { 1879 1873 local_node = o2nm_get_node_by_num(o2nm_this_node()); 1880 - mlog(ML_NOTICE, "unexpected connect attempt seen at node '%s' (" 1881 - "%u, %pI4:%d) from 
node '%s' (%u, %pI4:%d)\n", 1882 - local_node->nd_name, local_node->nd_num, 1883 - &(local_node->nd_ipv4_address), 1884 - ntohs(local_node->nd_ipv4_port), 1885 - node->nd_name, node->nd_num, &sin.sin_addr.s_addr, 1886 - ntohs(sin.sin_port)); 1874 + printk(KERN_NOTICE "o2net: Unexpected connect attempt seen " 1875 + "at node '%s' (%u, %pI4:%d) from node '%s' (%u, " 1876 + "%pI4:%d)\n", local_node->nd_name, local_node->nd_num, 1877 + &(local_node->nd_ipv4_address), 1878 + ntohs(local_node->nd_ipv4_port), node->nd_name, 1879 + node->nd_num, &sin.sin_addr.s_addr, ntohs(sin.sin_port)); 1887 1880 ret = -EINVAL; 1888 1881 goto out; 1889 1882 } ··· 1908 1901 ret = 0; 1909 1902 spin_unlock(&nn->nn_lock); 1910 1903 if (ret) { 1911 - mlog(ML_NOTICE, "attempt to connect from node '%s' at " 1912 - "%pI4:%d but it already has an open connection\n", 1913 - node->nd_name, &sin.sin_addr.s_addr, 1914 - ntohs(sin.sin_port)); 1904 + printk(KERN_NOTICE "o2net: Attempt to connect from node '%s' " 1905 + "at %pI4:%d but it already has an open connection\n", 1906 + node->nd_name, &sin.sin_addr.s_addr, 1907 + ntohs(sin.sin_port)); 1915 1908 goto out; 1916 1909 } 1917 1910 ··· 1991 1984 1992 1985 ret = sock_create(PF_INET, SOCK_STREAM, IPPROTO_TCP, &sock); 1993 1986 if (ret < 0) { 1994 - mlog(ML_ERROR, "unable to create socket, ret=%d\n", ret); 1987 + printk(KERN_ERR "o2net: Error %d while creating socket\n", ret); 1995 1988 goto out; 1996 1989 } 1997 1990 ··· 2008 2001 sock->sk->sk_reuse = 1; 2009 2002 ret = sock->ops->bind(sock, (struct sockaddr *)&sin, sizeof(sin)); 2010 2003 if (ret < 0) { 2011 - mlog(ML_ERROR, "unable to bind socket at %pI4:%u, " 2012 - "ret=%d\n", &addr, ntohs(port), ret); 2004 + printk(KERN_ERR "o2net: Error %d while binding socket at " 2005 + "%pI4:%u\n", ret, &addr, ntohs(port)); 2013 2006 goto out; 2014 2007 } 2015 2008 2016 2009 ret = sock->ops->listen(sock, 64); 2017 - if (ret < 0) { 2018 - mlog(ML_ERROR, "unable to listen on %pI4:%u, ret=%d\n", 2019 - &addr, 
ntohs(port), ret); 2020 - } 2010 + if (ret < 0) 2011 + printk(KERN_ERR "o2net: Error %d while listening on %pI4:%u\n", 2012 + ret, &addr, ntohs(port)); 2021 2013 2022 2014 out: 2023 2015 if (ret) {

+2

fs/ocfs2/cluster/tcp.h

··· 106 106 struct list_head *unreg_list); 107 107 void o2net_unregister_handler_list(struct list_head *list); 108 108 109 + void o2net_fill_node_map(unsigned long *map, unsigned bytes); 110 + 109 111 struct o2nm_node; 110 112 int o2net_register_hb_callbacks(void); 111 113 void o2net_unregister_hb_callbacks(void);

+1 -2

fs/ocfs2/dir.c

··· 1184 1184 if (pde) 1185 1185 le16_add_cpu(&pde->rec_len, 1186 1186 le16_to_cpu(de->rec_len)); 1187 - else 1188 - de->inode = 0; 1187 + de->inode = 0; 1189 1188 dir->i_version++; 1190 1189 ocfs2_journal_dirty(handle, bh); 1191 1190 goto bail;

+12 -44

fs/ocfs2/dlm/dlmcommon.h

··· 859 859 void dlm_wait_for_recovery(struct dlm_ctxt *dlm); 860 860 void dlm_kick_recovery_thread(struct dlm_ctxt *dlm); 861 861 int dlm_is_node_dead(struct dlm_ctxt *dlm, u8 node); 862 - int dlm_wait_for_node_death(struct dlm_ctxt *dlm, u8 node, int timeout); 863 - int dlm_wait_for_node_recovery(struct dlm_ctxt *dlm, u8 node, int timeout); 862 + void dlm_wait_for_node_death(struct dlm_ctxt *dlm, u8 node, int timeout); 863 + void dlm_wait_for_node_recovery(struct dlm_ctxt *dlm, u8 node, int timeout); 864 864 865 865 void dlm_put(struct dlm_ctxt *dlm); 866 866 struct dlm_ctxt *dlm_grab(struct dlm_ctxt *dlm); ··· 877 877 kref_get(&res->refs); 878 878 } 879 879 void dlm_lockres_put(struct dlm_lock_resource *res); 880 - void __dlm_unhash_lockres(struct dlm_lock_resource *res); 881 - void __dlm_insert_lockres(struct dlm_ctxt *dlm, 882 - struct dlm_lock_resource *res); 880 + void __dlm_unhash_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res); 881 + void __dlm_insert_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res); 883 882 struct dlm_lock_resource * __dlm_lookup_lockres_full(struct dlm_ctxt *dlm, 884 883 const char *name, 885 884 unsigned int len, ··· 901 902 const char *name, 902 903 unsigned int namelen); 903 904 904 - #define dlm_lockres_set_refmap_bit(bit,res) \ 905 - __dlm_lockres_set_refmap_bit(bit,res,__FILE__,__LINE__) 906 - #define dlm_lockres_clear_refmap_bit(bit,res) \ 907 - __dlm_lockres_clear_refmap_bit(bit,res,__FILE__,__LINE__) 905 + void dlm_lockres_set_refmap_bit(struct dlm_ctxt *dlm, 906 + struct dlm_lock_resource *res, int bit); 907 + void dlm_lockres_clear_refmap_bit(struct dlm_ctxt *dlm, 908 + struct dlm_lock_resource *res, int bit); 908 909 909 - static inline void __dlm_lockres_set_refmap_bit(int bit, 910 - struct dlm_lock_resource *res, 911 - const char *file, 912 - int line) 913 - { 914 - //printk("%s:%d:%.*s: setting bit %d\n", file, line, 915 - // res->lockname.len, res->lockname.name, bit); 916 - set_bit(bit, 
res->refmap); 917 - } 918 - 919 - static inline void __dlm_lockres_clear_refmap_bit(int bit, 920 - struct dlm_lock_resource *res, 921 - const char *file, 922 - int line) 923 - { 924 - //printk("%s:%d:%.*s: clearing bit %d\n", file, line, 925 - // res->lockname.len, res->lockname.name, bit); 926 - clear_bit(bit, res->refmap); 927 - } 928 - 929 - void __dlm_lockres_drop_inflight_ref(struct dlm_ctxt *dlm, 930 - struct dlm_lock_resource *res, 931 - const char *file, 932 - int line); 933 - void __dlm_lockres_grab_inflight_ref(struct dlm_ctxt *dlm, 934 - struct dlm_lock_resource *res, 935 - int new_lockres, 936 - const char *file, 937 - int line); 938 - #define dlm_lockres_drop_inflight_ref(d,r) \ 939 - __dlm_lockres_drop_inflight_ref(d,r,__FILE__,__LINE__) 940 - #define dlm_lockres_grab_inflight_ref(d,r) \ 941 - __dlm_lockres_grab_inflight_ref(d,r,0,__FILE__,__LINE__) 942 - #define dlm_lockres_grab_inflight_ref_new(d,r) \ 943 - __dlm_lockres_grab_inflight_ref(d,r,1,__FILE__,__LINE__) 910 + void dlm_lockres_drop_inflight_ref(struct dlm_ctxt *dlm, 911 + struct dlm_lock_resource *res); 912 + void dlm_lockres_grab_inflight_ref(struct dlm_ctxt *dlm, 913 + struct dlm_lock_resource *res); 944 914 945 915 void dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock); 946 916 void dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock);

+22 -22

fs/ocfs2/dlm/dlmdomain.c

··· 157 157 158 158 static void dlm_unregister_domain_handlers(struct dlm_ctxt *dlm); 159 159 160 - void __dlm_unhash_lockres(struct dlm_lock_resource *lockres) 160 + void __dlm_unhash_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res) 161 161 { 162 - if (!hlist_unhashed(&lockres->hash_node)) { 163 - hlist_del_init(&lockres->hash_node); 164 - dlm_lockres_put(lockres); 165 - } 162 + if (hlist_unhashed(&res->hash_node)) 163 + return; 164 + 165 + mlog(0, "%s: Unhash res %.*s\n", dlm->name, res->lockname.len, 166 + res->lockname.name); 167 + hlist_del_init(&res->hash_node); 168 + dlm_lockres_put(res); 166 169 } 167 170 168 - void __dlm_insert_lockres(struct dlm_ctxt *dlm, 169 - struct dlm_lock_resource *res) 171 + void __dlm_insert_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res) 170 172 { 171 173 struct hlist_head *bucket; 172 174 struct qstr *q; ··· 182 180 dlm_lockres_get(res); 183 181 184 182 hlist_add_head(&res->hash_node, bucket); 183 + 184 + mlog(0, "%s: Hash res %.*s\n", dlm->name, res->lockname.len, 185 + res->lockname.name); 185 186 } 186 187 187 188 struct dlm_lock_resource * __dlm_lookup_lockres_full(struct dlm_ctxt *dlm, ··· 544 539 545 540 static void __dlm_print_nodes(struct dlm_ctxt *dlm) 546 541 { 547 - int node = -1; 542 + int node = -1, num = 0; 548 543 549 544 assert_spin_locked(&dlm->spinlock); 550 545 551 - printk(KERN_NOTICE "o2dlm: Nodes in domain %s: ", dlm->name); 552 - 546 + printk("( "); 553 547 while ((node = find_next_bit(dlm->domain_map, O2NM_MAX_NODES, 554 548 node + 1)) < O2NM_MAX_NODES) { 555 549 printk("%d ", node); 550 + ++num; 556 551 } 557 - printk("\n"); 552 + printk(") %u nodes\n", num); 558 553 } 559 554 560 555 static int dlm_exit_domain_handler(struct o2net_msg *msg, u32 len, void *data, ··· 571 566 572 567 node = exit_msg->node_idx; 573 568 574 - printk(KERN_NOTICE "o2dlm: Node %u leaves domain %s\n", node, dlm->name); 575 - 576 569 spin_lock(&dlm->spinlock); 577 570 clear_bit(node, dlm->domain_map); 578 
571 clear_bit(node, dlm->exit_domain_map); 572 + printk(KERN_NOTICE "o2dlm: Node %u leaves domain %s ", node, dlm->name); 579 573 __dlm_print_nodes(dlm); 580 574 581 575 /* notify anything attached to the heartbeat events */ ··· 759 755 760 756 dlm_mark_domain_leaving(dlm); 761 757 dlm_leave_domain(dlm); 758 + printk(KERN_NOTICE "o2dlm: Leaving domain %s\n", dlm->name); 762 759 dlm_force_free_mles(dlm); 763 760 dlm_complete_dlm_shutdown(dlm); 764 761 } ··· 975 970 clear_bit(assert->node_idx, dlm->exit_domain_map); 976 971 __dlm_set_joining_node(dlm, DLM_LOCK_RES_OWNER_UNKNOWN); 977 972 978 - printk(KERN_NOTICE "o2dlm: Node %u joins domain %s\n", 973 + printk(KERN_NOTICE "o2dlm: Node %u joins domain %s ", 979 974 assert->node_idx, dlm->name); 980 975 __dlm_print_nodes(dlm); 981 976 ··· 1706 1701 bail: 1707 1702 spin_lock(&dlm->spinlock); 1708 1703 __dlm_set_joining_node(dlm, DLM_LOCK_RES_OWNER_UNKNOWN); 1709 - if (!status) 1704 + if (!status) { 1705 + printk(KERN_NOTICE "o2dlm: Joining domain %s ", dlm->name); 1710 1706 __dlm_print_nodes(dlm); 1707 + } 1711 1708 spin_unlock(&dlm->spinlock); 1712 1709 1713 1710 if (ctxt) { ··· 2135 2128 if (strlen(domain) >= O2NM_MAX_NAME_LEN) { 2136 2129 ret = -ENAMETOOLONG; 2137 2130 mlog(ML_ERROR, "domain name length too long\n"); 2138 - goto leave; 2139 - } 2140 - 2141 - if (!o2hb_check_local_node_heartbeating()) { 2142 - mlog(ML_ERROR, "the local node has not been configured, or is " 2143 - "not heartbeating\n"); 2144 - ret = -EPROTO; 2145 2131 goto leave; 2146 2132 } 2147 2133

+26 -28

fs/ocfs2/dlm/dlmlock.c

··· 183 183 kick_thread = 1; 184 184 } 185 185 } 186 - /* reduce the inflight count, this may result in the lockres 187 - * being purged below during calc_usage */ 188 - if (lock->ml.node == dlm->node_num) 189 - dlm_lockres_drop_inflight_ref(dlm, res); 190 186 191 187 spin_unlock(&res->spinlock); 192 188 wake_up(&res->wq); ··· 227 231 lock->ml.type, res->lockname.len, 228 232 res->lockname.name, flags); 229 233 234 + /* 235 + * Wait if resource is getting recovered, remastered, etc. 236 + * If the resource was remastered and new owner is self, then exit. 237 + */ 230 238 spin_lock(&res->spinlock); 231 - 232 - /* will exit this call with spinlock held */ 233 239 __dlm_wait_on_lockres(res); 240 + if (res->owner == dlm->node_num) { 241 + spin_unlock(&res->spinlock); 242 + return DLM_RECOVERING; 243 + } 234 244 res->state |= DLM_LOCK_RES_IN_PROGRESS; 235 245 236 246 /* add lock to local (secondary) queue */ ··· 321 319 tmpret = o2net_send_message(DLM_CREATE_LOCK_MSG, dlm->key, &create, 322 320 sizeof(create), res->owner, &status); 323 321 if (tmpret >= 0) { 324 - // successfully sent and received 325 - ret = status; // this is already a dlm_status 322 + ret = status; 326 323 if (ret == DLM_REJECTED) { 327 - mlog(ML_ERROR, "%s:%.*s: BUG. this is a stale lockres " 328 - "no longer owned by %u. that node is coming back " 329 - "up currently.\n", dlm->name, create.namelen, 324 + mlog(ML_ERROR, "%s: res %.*s, Stale lockres no longer " 325 + "owned by node %u. 
That node is coming back up " 326 + "currently.\n", dlm->name, create.namelen, 330 327 create.name, res->owner); 331 328 dlm_print_one_lock_resource(res); 332 329 BUG(); 333 330 } 334 331 } else { 335 - mlog(ML_ERROR, "Error %d when sending message %u (key 0x%x) to " 336 - "node %u\n", tmpret, DLM_CREATE_LOCK_MSG, dlm->key, 337 - res->owner); 338 - if (dlm_is_host_down(tmpret)) { 332 + mlog(ML_ERROR, "%s: res %.*s, Error %d send CREATE LOCK to " 333 + "node %u\n", dlm->name, create.namelen, create.name, 334 + tmpret, res->owner); 335 + if (dlm_is_host_down(tmpret)) 339 336 ret = DLM_RECOVERING; 340 - mlog(0, "node %u died so returning DLM_RECOVERING " 341 - "from lock message!\n", res->owner); 342 - } else { 337 + else 343 338 ret = dlm_err_to_dlm_status(tmpret); 344 - } 345 339 } 346 340 347 341 return ret; ··· 438 440 /* zero memory only if kernel-allocated */ 439 441 lksb = kzalloc(sizeof(*lksb), GFP_NOFS); 440 442 if (!lksb) { 441 - kfree(lock); 443 + kmem_cache_free(dlm_lock_cache, lock); 442 444 return NULL; 443 445 } 444 446 kernel_allocated = 1; ··· 716 718 717 719 if (status == DLM_RECOVERING || status == DLM_MIGRATING || 718 720 status == DLM_FORWARD) { 719 - mlog(0, "retrying lock with migration/" 720 - "recovery/in progress\n"); 721 721 msleep(100); 722 - /* no waiting for dlm_reco_thread */ 723 722 if (recovery) { 724 723 if (status != DLM_RECOVERING) 725 724 goto retry_lock; 726 - 727 - mlog(0, "%s: got RECOVERING " 728 - "for $RECOVERY lock, master " 729 - "was %u\n", dlm->name, 730 - res->owner); 731 725 /* wait to see the node go down, then 732 726 * drop down and allow the lockres to 733 727 * get cleaned up. need to remaster. 
*/ ··· 730 740 goto retry_lock; 731 741 } 732 742 } 743 + 744 + /* Inflight taken in dlm_get_lock_resource() is dropped here */ 745 + spin_lock(&res->spinlock); 746 + dlm_lockres_drop_inflight_ref(dlm, res); 747 + spin_unlock(&res->spinlock); 748 + 749 + dlm_lockres_calc_usage(dlm, res); 750 + dlm_kick_thread(dlm, res); 733 751 734 752 if (status != DLM_NORMAL) { 735 753 lock->lksb->flags &= ~DLM_LKSB_GET_LVB;

+91 -90

fs/ocfs2/dlm/dlmmaster.c

··· 631 631 return NULL; 632 632 } 633 633 634 - void __dlm_lockres_grab_inflight_ref(struct dlm_ctxt *dlm, 635 - struct dlm_lock_resource *res, 636 - int new_lockres, 637 - const char *file, 638 - int line) 634 + void dlm_lockres_set_refmap_bit(struct dlm_ctxt *dlm, 635 + struct dlm_lock_resource *res, int bit) 639 636 { 640 - if (!new_lockres) 641 - assert_spin_locked(&res->spinlock); 637 + assert_spin_locked(&res->spinlock); 642 638 643 - if (!test_bit(dlm->node_num, res->refmap)) { 644 - BUG_ON(res->inflight_locks != 0); 645 - dlm_lockres_set_refmap_bit(dlm->node_num, res); 646 - } 647 - res->inflight_locks++; 648 - mlog(0, "%s:%.*s: inflight++: now %u\n", 649 - dlm->name, res->lockname.len, res->lockname.name, 650 - res->inflight_locks); 639 + mlog(0, "res %.*s, set node %u, %ps()\n", res->lockname.len, 640 + res->lockname.name, bit, __builtin_return_address(0)); 641 + 642 + set_bit(bit, res->refmap); 651 643 } 652 644 653 - void __dlm_lockres_drop_inflight_ref(struct dlm_ctxt *dlm, 654 - struct dlm_lock_resource *res, 655 - const char *file, 656 - int line) 645 + void dlm_lockres_clear_refmap_bit(struct dlm_ctxt *dlm, 646 + struct dlm_lock_resource *res, int bit) 647 + { 648 + assert_spin_locked(&res->spinlock); 649 + 650 + mlog(0, "res %.*s, clr node %u, %ps()\n", res->lockname.len, 651 + res->lockname.name, bit, __builtin_return_address(0)); 652 + 653 + clear_bit(bit, res->refmap); 654 + } 655 + 656 + 657 + void dlm_lockres_grab_inflight_ref(struct dlm_ctxt *dlm, 658 + struct dlm_lock_resource *res) 659 + { 660 + assert_spin_locked(&res->spinlock); 661 + 662 + res->inflight_locks++; 663 + 664 + mlog(0, "%s: res %.*s, inflight++: now %u, %ps()\n", dlm->name, 665 + res->lockname.len, res->lockname.name, res->inflight_locks, 666 + __builtin_return_address(0)); 667 + } 668 + 669 + void dlm_lockres_drop_inflight_ref(struct dlm_ctxt *dlm, 670 + struct dlm_lock_resource *res) 657 671 { 658 672 assert_spin_locked(&res->spinlock); 659 673 660 674 
BUG_ON(res->inflight_locks == 0); 675 + 661 676 res->inflight_locks--; 662 - mlog(0, "%s:%.*s: inflight--: now %u\n", 663 - dlm->name, res->lockname.len, res->lockname.name, 664 - res->inflight_locks); 665 - if (res->inflight_locks == 0) 666 - dlm_lockres_clear_refmap_bit(dlm->node_num, res); 677 + 678 + mlog(0, "%s: res %.*s, inflight--: now %u, %ps()\n", dlm->name, 679 + res->lockname.len, res->lockname.name, res->inflight_locks, 680 + __builtin_return_address(0)); 681 + 667 682 wake_up(&res->wq); 668 683 } 669 684 ··· 712 697 unsigned int hash; 713 698 int tries = 0; 714 699 int bit, wait_on_recovery = 0; 715 - int drop_inflight_if_nonlocal = 0; 716 700 717 701 BUG_ON(!lockid); 718 702 ··· 723 709 spin_lock(&dlm->spinlock); 724 710 tmpres = __dlm_lookup_lockres_full(dlm, lockid, namelen, hash); 725 711 if (tmpres) { 726 - int dropping_ref = 0; 727 - 728 712 spin_unlock(&dlm->spinlock); 729 - 730 713 spin_lock(&tmpres->spinlock); 731 - /* We wait for the other thread that is mastering the resource */ 714 + /* Wait on the thread that is mastering the resource */ 732 715 if (tmpres->owner == DLM_LOCK_RES_OWNER_UNKNOWN) { 733 716 __dlm_wait_on_lockres(tmpres); 734 717 BUG_ON(tmpres->owner == DLM_LOCK_RES_OWNER_UNKNOWN); 735 - } 736 - 737 - if (tmpres->owner == dlm->node_num) { 738 - BUG_ON(tmpres->state & DLM_LOCK_RES_DROPPING_REF); 739 - dlm_lockres_grab_inflight_ref(dlm, tmpres); 740 - } else if (tmpres->state & DLM_LOCK_RES_DROPPING_REF) 741 - dropping_ref = 1; 742 - spin_unlock(&tmpres->spinlock); 743 - 744 - /* wait until done messaging the master, drop our ref to allow 745 - * the lockres to be purged, start over. 
*/ 746 - if (dropping_ref) { 747 - spin_lock(&tmpres->spinlock); 748 - __dlm_wait_on_lockres_flags(tmpres, DLM_LOCK_RES_DROPPING_REF); 749 718 spin_unlock(&tmpres->spinlock); 750 719 dlm_lockres_put(tmpres); 751 720 tmpres = NULL; 752 721 goto lookup; 753 722 } 754 723 755 - mlog(0, "found in hash!\n"); 724 + /* Wait on the resource purge to complete before continuing */ 725 + if (tmpres->state & DLM_LOCK_RES_DROPPING_REF) { 726 + BUG_ON(tmpres->owner == dlm->node_num); 727 + __dlm_wait_on_lockres_flags(tmpres, 728 + DLM_LOCK_RES_DROPPING_REF); 729 + spin_unlock(&tmpres->spinlock); 730 + dlm_lockres_put(tmpres); 731 + tmpres = NULL; 732 + goto lookup; 733 + } 734 + 735 + /* Grab inflight ref to pin the resource */ 736 + dlm_lockres_grab_inflight_ref(dlm, tmpres); 737 + 738 + spin_unlock(&tmpres->spinlock); 756 739 if (res) 757 740 dlm_lockres_put(res); 758 741 res = tmpres; ··· 840 829 * but they might own this lockres. wait on them. */ 841 830 bit = find_next_bit(dlm->recovery_map, O2NM_MAX_NODES, 0); 842 831 if (bit < O2NM_MAX_NODES) { 843 - mlog(ML_NOTICE, "%s:%.*s: at least one node (%d) to " 844 - "recover before lock mastery can begin\n", 832 + mlog(0, "%s: res %.*s, At least one node (%d) " 833 + "to recover before lock mastery can begin\n", 845 834 dlm->name, namelen, (char *)lockid, bit); 846 835 wait_on_recovery = 1; 847 836 } ··· 854 843 855 844 /* finally add the lockres to its hash bucket */ 856 845 __dlm_insert_lockres(dlm, res); 857 - /* since this lockres is new it doesn't not require the spinlock */ 858 - dlm_lockres_grab_inflight_ref_new(dlm, res); 859 846 860 - /* if this node does not become the master make sure to drop 861 - * this inflight reference below */ 862 - drop_inflight_if_nonlocal = 1; 847 + /* Grab inflight ref to pin the resource */ 848 + spin_lock(&res->spinlock); 849 + dlm_lockres_grab_inflight_ref(dlm, res); 850 + spin_unlock(&res->spinlock); 863 851 864 852 /* get an extra ref on the mle in case this is a BLOCK 865 853 * if so, 
the creator of the BLOCK may try to put the last ··· 874 864 * dlm spinlock would be detectable be a change on the mle, 875 865 * so we only need to clear out the recovery map once. */ 876 866 if (dlm_is_recovery_lock(lockid, namelen)) { 877 - mlog(ML_NOTICE, "%s: recovery map is not empty, but " 878 - "must master $RECOVERY lock now\n", dlm->name); 867 + mlog(0, "%s: Recovery map is not empty, but must " 868 + "master $RECOVERY lock now\n", dlm->name); 879 869 if (!dlm_pre_master_reco_lockres(dlm, res)) 880 870 wait_on_recovery = 0; 881 871 else { ··· 893 883 spin_lock(&dlm->spinlock); 894 884 bit = find_next_bit(dlm->recovery_map, O2NM_MAX_NODES, 0); 895 885 if (bit < O2NM_MAX_NODES) { 896 - mlog(ML_NOTICE, "%s:%.*s: at least one node (%d) to " 897 - "recover before lock mastery can begin\n", 886 + mlog(0, "%s: res %.*s, At least one node (%d) " 887 + "to recover before lock mastery can begin\n", 898 888 dlm->name, namelen, (char *)lockid, bit); 899 889 wait_on_recovery = 1; 900 890 } else ··· 923 913 * yet, keep going until it does. this is how the 924 914 * master will know that asserts are needed back to 925 915 * the lower nodes. 
*/ 926 - mlog(0, "%s:%.*s: requests only up to %u but master " 927 - "is %u, keep going\n", dlm->name, namelen, 916 + mlog(0, "%s: res %.*s, Requests only up to %u but " 917 + "master is %u, keep going\n", dlm->name, namelen, 928 918 lockid, nodenum, mle->master); 929 919 } 930 920 } ··· 934 924 ret = dlm_wait_for_lock_mastery(dlm, res, mle, &blocked); 935 925 if (ret < 0) { 936 926 wait_on_recovery = 1; 937 - mlog(0, "%s:%.*s: node map changed, redo the " 938 - "master request now, blocked=%d\n", 939 - dlm->name, res->lockname.len, 927 + mlog(0, "%s: res %.*s, Node map changed, redo the master " 928 + "request now, blocked=%d\n", dlm->name, res->lockname.len, 940 929 res->lockname.name, blocked); 941 930 if (++tries > 20) { 942 - mlog(ML_ERROR, "%s:%.*s: spinning on " 943 - "dlm_wait_for_lock_mastery, blocked=%d\n", 931 + mlog(ML_ERROR, "%s: res %.*s, Spinning on " 932 + "dlm_wait_for_lock_mastery, blocked = %d\n", 944 933 dlm->name, res->lockname.len, 945 934 res->lockname.name, blocked); 946 935 dlm_print_one_lock_resource(res); ··· 949 940 goto redo_request; 950 941 } 951 942 952 - mlog(0, "lockres mastered by %u\n", res->owner); 943 + mlog(0, "%s: res %.*s, Mastered by %u\n", dlm->name, res->lockname.len, 944 + res->lockname.name, res->owner); 953 945 /* make sure we never continue without this */ 954 946 BUG_ON(res->owner == O2NM_MAX_NODES); 955 947 ··· 962 952 963 953 wake_waiters: 964 954 spin_lock(&res->spinlock); 965 - if (res->owner != dlm->node_num && drop_inflight_if_nonlocal) 966 - dlm_lockres_drop_inflight_ref(dlm, res); 967 955 res->state &= ~DLM_LOCK_RES_IN_PROGRESS; 968 956 spin_unlock(&res->spinlock); 969 957 wake_up(&res->wq); ··· 1434 1426 } 1435 1427 1436 1428 if (res->owner == dlm->node_num) { 1437 - mlog(0, "%s:%.*s: setting bit %u in refmap\n", 1438 - dlm->name, namelen, name, request->node_idx); 1439 - dlm_lockres_set_refmap_bit(request->node_idx, res); 1429 + dlm_lockres_set_refmap_bit(dlm, res, request->node_idx); 1440 1430 
spin_unlock(&res->spinlock); 1441 1431 response = DLM_MASTER_RESP_YES; 1442 1432 if (mle) ··· 1499 1493 * go back and clean the mles on any 1500 1494 * other nodes */ 1501 1495 dispatch_assert = 1; 1502 - dlm_lockres_set_refmap_bit(request->node_idx, res); 1503 - mlog(0, "%s:%.*s: setting bit %u in refmap\n", 1504 - dlm->name, namelen, name, 1505 - request->node_idx); 1496 + dlm_lockres_set_refmap_bit(dlm, res, 1497 + request->node_idx); 1506 1498 } else 1507 1499 response = DLM_MASTER_RESP_NO; 1508 1500 } else { ··· 1706 1702 "lockres, set the bit in the refmap\n", 1707 1703 namelen, lockname, to); 1708 1704 spin_lock(&res->spinlock); 1709 - dlm_lockres_set_refmap_bit(to, res); 1705 + dlm_lockres_set_refmap_bit(dlm, res, to); 1710 1706 spin_unlock(&res->spinlock); 1711 1707 } 1712 1708 } ··· 2191 2187 namelen = res->lockname.len; 2192 2188 BUG_ON(namelen > O2NM_MAX_NAME_LEN); 2193 2189 2194 - mlog(0, "%s:%.*s: sending deref to %d\n", 2195 - dlm->name, namelen, lockname, res->owner); 2196 2190 memset(&deref, 0, sizeof(deref)); 2197 2191 deref.node_idx = dlm->node_num; 2198 2192 deref.namelen = namelen; ··· 2199 2197 ret = o2net_send_message(DLM_DEREF_LOCKRES_MSG, dlm->key, 2200 2198 &deref, sizeof(deref), res->owner, &r); 2201 2199 if (ret < 0) 2202 - mlog(ML_ERROR, "Error %d when sending message %u (key 0x%x) to " 2203 - "node %u\n", ret, DLM_DEREF_LOCKRES_MSG, dlm->key, 2204 - res->owner); 2200 + mlog(ML_ERROR, "%s: res %.*s, error %d send DEREF to node %u\n", 2201 + dlm->name, namelen, lockname, ret, res->owner); 2205 2202 else if (r < 0) { 2206 2203 /* BAD. other node says I did not have a ref. 
*/ 2207 - mlog(ML_ERROR,"while dropping ref on %s:%.*s " 2208 - "(master=%u) got %d.\n", dlm->name, namelen, 2209 - lockname, res->owner, r); 2204 + mlog(ML_ERROR, "%s: res %.*s, DEREF to node %u got %d\n", 2205 + dlm->name, namelen, lockname, res->owner, r); 2210 2206 dlm_print_one_lock_resource(res); 2211 2207 BUG(); 2212 2208 } ··· 2260 2260 else { 2261 2261 BUG_ON(res->state & DLM_LOCK_RES_DROPPING_REF); 2262 2262 if (test_bit(node, res->refmap)) { 2263 - dlm_lockres_clear_refmap_bit(node, res); 2263 + dlm_lockres_clear_refmap_bit(dlm, res, node); 2264 2264 cleared = 1; 2265 2265 } 2266 2266 } ··· 2320 2320 BUG_ON(res->state & DLM_LOCK_RES_DROPPING_REF); 2321 2321 if (test_bit(node, res->refmap)) { 2322 2322 __dlm_wait_on_lockres_flags(res, DLM_LOCK_RES_SETREF_INPROG); 2323 - dlm_lockres_clear_refmap_bit(node, res); 2323 + dlm_lockres_clear_refmap_bit(dlm, res, node); 2324 2324 cleared = 1; 2325 2325 } 2326 2326 spin_unlock(&res->spinlock); ··· 2802 2802 BUG_ON(!list_empty(&lock->bast_list)); 2803 2803 BUG_ON(lock->ast_pending); 2804 2804 BUG_ON(lock->bast_pending); 2805 - dlm_lockres_clear_refmap_bit(lock->ml.node, res); 2805 + dlm_lockres_clear_refmap_bit(dlm, res, 2806 + lock->ml.node); 2806 2807 list_del_init(&lock->list); 2807 2808 dlm_lock_put(lock); 2808 2809 /* In a normal unlock, we would have added a ··· 2824 2823 mlog(0, "%s:%.*s: node %u had a ref to this " 2825 2824 "migrating lockres, clearing\n", dlm->name, 2826 2825 res->lockname.len, res->lockname.name, bit); 2827 - dlm_lockres_clear_refmap_bit(bit, res); 2826 + dlm_lockres_clear_refmap_bit(dlm, res, bit); 2828 2827 } 2829 2828 bit++; 2830 2829 } ··· 2917 2916 &migrate, sizeof(migrate), nodenum, 2918 2917 &status); 2919 2918 if (ret < 0) { 2920 - mlog(ML_ERROR, "Error %d when sending message %u (key " 2921 - "0x%x) to node %u\n", ret, DLM_MIGRATE_REQUEST_MSG, 2922 - dlm->key, nodenum); 2919 + mlog(ML_ERROR, "%s: res %.*s, Error %d send " 2920 + "MIGRATE_REQUEST to node %u\n", dlm->name, 2921 + 
migrate.namelen, migrate.name, ret, nodenum); 2923 2922 if (!dlm_is_host_down(ret)) { 2924 2923 mlog(ML_ERROR, "unhandled error=%d!\n", ret); 2925 2924 BUG(); ··· 2938 2937 dlm->name, res->lockname.len, res->lockname.name, 2939 2938 nodenum); 2940 2939 spin_lock(&res->spinlock); 2941 - dlm_lockres_set_refmap_bit(nodenum, res); 2940 + dlm_lockres_set_refmap_bit(dlm, res, nodenum); 2942 2941 spin_unlock(&res->spinlock); 2943 2942 } 2944 2943 } ··· 3272 3271 * mastery reference here since old_master will briefly have 3273 3272 * a reference after the migration completes */ 3274 3273 spin_lock(&res->spinlock); 3275 - dlm_lockres_set_refmap_bit(old_master, res); 3274 + dlm_lockres_set_refmap_bit(dlm, res, old_master); 3276 3275 spin_unlock(&res->spinlock); 3277 3276 3278 3277 mlog(0, "now time to do a migrate request to other nodes\n");

+82 -82

fs/ocfs2/dlm/dlmrecovery.c

··· 362 362 } 363 363 364 364 365 - int dlm_wait_for_node_death(struct dlm_ctxt *dlm, u8 node, int timeout) 365 + void dlm_wait_for_node_death(struct dlm_ctxt *dlm, u8 node, int timeout) 366 366 { 367 - if (timeout) { 368 - mlog(ML_NOTICE, "%s: waiting %dms for notification of " 369 - "death of node %u\n", dlm->name, timeout, node); 367 + if (dlm_is_node_dead(dlm, node)) 368 + return; 369 + 370 + printk(KERN_NOTICE "o2dlm: Waiting on the death of node %u in " 371 + "domain %s\n", node, dlm->name); 372 + 373 + if (timeout) 370 374 wait_event_timeout(dlm->dlm_reco_thread_wq, 371 - dlm_is_node_dead(dlm, node), 372 - msecs_to_jiffies(timeout)); 373 - } else { 374 - mlog(ML_NOTICE, "%s: waiting indefinitely for notification " 375 - "of death of node %u\n", dlm->name, node); 375 + dlm_is_node_dead(dlm, node), 376 + msecs_to_jiffies(timeout)); 377 + else 376 378 wait_event(dlm->dlm_reco_thread_wq, 377 379 dlm_is_node_dead(dlm, node)); 378 - } 379 - /* for now, return 0 */ 380 - return 0; 381 380 } 382 381 383 - int dlm_wait_for_node_recovery(struct dlm_ctxt *dlm, u8 node, int timeout) 382 + void dlm_wait_for_node_recovery(struct dlm_ctxt *dlm, u8 node, int timeout) 384 383 { 385 - if (timeout) { 386 - mlog(0, "%s: waiting %dms for notification of " 387 - "recovery of node %u\n", dlm->name, timeout, node); 384 + if (dlm_is_node_recovered(dlm, node)) 385 + return; 386 + 387 + printk(KERN_NOTICE "o2dlm: Waiting on the recovery of node %u in " 388 + "domain %s\n", node, dlm->name); 389 + 390 + if (timeout) 388 391 wait_event_timeout(dlm->dlm_reco_thread_wq, 389 - dlm_is_node_recovered(dlm, node), 390 - msecs_to_jiffies(timeout)); 391 - } else { 392 - mlog(0, "%s: waiting indefinitely for notification " 393 - "of recovery of node %u\n", dlm->name, node); 392 + dlm_is_node_recovered(dlm, node), 393 + msecs_to_jiffies(timeout)); 394 + else 394 395 wait_event(dlm->dlm_reco_thread_wq, 395 396 dlm_is_node_recovered(dlm, node)); 396 - } 397 - /* for now, return 0 */ 398 - return 0; 
399 397 } 400 398 401 399 /* callers of the top-level api calls (dlmlock/dlmunlock) should ··· 428 430 { 429 431 spin_lock(&dlm->spinlock); 430 432 BUG_ON(dlm->reco.state & DLM_RECO_STATE_ACTIVE); 433 + printk(KERN_NOTICE "o2dlm: Begin recovery on domain %s for node %u\n", 434 + dlm->name, dlm->reco.dead_node); 431 435 dlm->reco.state |= DLM_RECO_STATE_ACTIVE; 432 436 spin_unlock(&dlm->spinlock); 433 437 } ··· 440 440 BUG_ON(!(dlm->reco.state & DLM_RECO_STATE_ACTIVE)); 441 441 dlm->reco.state &= ~DLM_RECO_STATE_ACTIVE; 442 442 spin_unlock(&dlm->spinlock); 443 + printk(KERN_NOTICE "o2dlm: End recovery on domain %s\n", dlm->name); 443 444 wake_up(&dlm->reco.event); 445 + } 446 + 447 + static void dlm_print_recovery_master(struct dlm_ctxt *dlm) 448 + { 449 + printk(KERN_NOTICE "o2dlm: Node %u (%s) is the Recovery Master for the " 450 + "dead node %u in domain %s\n", dlm->reco.new_master, 451 + (dlm->node_num == dlm->reco.new_master ? "me" : "he"), 452 + dlm->reco.dead_node, dlm->name); 444 453 } 445 454 446 455 static int dlm_do_recovery(struct dlm_ctxt *dlm) ··· 514 505 } 515 506 mlog(0, "another node will master this recovery session.\n"); 516 507 } 517 - mlog(0, "dlm=%s (%d), new_master=%u, this node=%u, dead_node=%u\n", 518 - dlm->name, task_pid_nr(dlm->dlm_reco_thread_task), dlm->reco.new_master, 519 - dlm->node_num, dlm->reco.dead_node); 508 + 509 + dlm_print_recovery_master(dlm); 520 510 521 511 /* it is safe to start everything back up here 522 512 * because all of the dead node's lock resources ··· 526 518 return 0; 527 519 528 520 master_here: 529 - mlog(ML_NOTICE, "(%d) Node %u is the Recovery Master for the Dead Node " 530 - "%u for Domain %s\n", task_pid_nr(dlm->dlm_reco_thread_task), 531 - dlm->node_num, dlm->reco.dead_node, dlm->name); 521 + dlm_print_recovery_master(dlm); 532 522 533 523 status = dlm_remaster_locks(dlm, dlm->reco.dead_node); 534 524 if (status < 0) { 535 525 /* we should never hit this anymore */ 536 - mlog(ML_ERROR, "error %d 
remastering locks for node %u, " 537 - "retrying.\n", status, dlm->reco.dead_node); 526 + mlog(ML_ERROR, "%s: Error %d remastering locks for node %u, " 527 + "retrying.\n", dlm->name, status, dlm->reco.dead_node); 538 528 /* yield a bit to allow any final network messages 539 529 * to get handled on remaining nodes */ 540 530 msleep(100); ··· 573 567 BUG_ON(ndata->state != DLM_RECO_NODE_DATA_INIT); 574 568 ndata->state = DLM_RECO_NODE_DATA_REQUESTING; 575 569 576 - mlog(0, "requesting lock info from node %u\n", 570 + mlog(0, "%s: Requesting lock info from node %u\n", dlm->name, 577 571 ndata->node_num); 578 572 579 573 if (ndata->node_num == dlm->node_num) { ··· 646 640 spin_unlock(&dlm_reco_state_lock); 647 641 } 648 642 649 - mlog(0, "done requesting all lock info\n"); 643 + mlog(0, "%s: Done requesting all lock info\n", dlm->name); 650 644 651 645 /* nodes should be sending reco data now 652 646 * just need to wait */ ··· 808 802 809 803 /* negative status is handled by caller */ 810 804 if (ret < 0) 811 - mlog(ML_ERROR, "Error %d when sending message %u (key " 812 - "0x%x) to node %u\n", ret, DLM_LOCK_REQUEST_MSG, 813 - dlm->key, request_from); 814 - 805 + mlog(ML_ERROR, "%s: Error %d send LOCK_REQUEST to node %u " 806 + "to recover dead node %u\n", dlm->name, ret, 807 + request_from, dead_node); 815 808 // return from here, then 816 809 // sleep until all received or error 817 810 return ret; ··· 961 956 ret = o2net_send_message(DLM_RECO_DATA_DONE_MSG, dlm->key, &done_msg, 962 957 sizeof(done_msg), send_to, &tmpret); 963 958 if (ret < 0) { 964 - mlog(ML_ERROR, "Error %d when sending message %u (key " 965 - "0x%x) to node %u\n", ret, DLM_RECO_DATA_DONE_MSG, 966 - dlm->key, send_to); 959 + mlog(ML_ERROR, "%s: Error %d send RECO_DATA_DONE to node %u " 960 + "to recover dead node %u\n", dlm->name, ret, send_to, 961 + dead_node); 967 962 if (!dlm_is_host_down(ret)) { 968 963 BUG(); 969 964 } ··· 1132 1127 if (ret < 0) { 1133 1128 /* XXX: negative status is not 
handled. 1134 1129 * this will end up killing this node. */ 1135 - mlog(ML_ERROR, "Error %d when sending message %u (key " 1136 - "0x%x) to node %u\n", ret, DLM_MIG_LOCKRES_MSG, 1137 - dlm->key, send_to); 1130 + mlog(ML_ERROR, "%s: res %.*s, Error %d send MIG_LOCKRES to " 1131 + "node %u (%s)\n", dlm->name, mres->lockname_len, 1132 + mres->lockname, ret, send_to, 1133 + (orig_flags & DLM_MRES_MIGRATION ? 1134 + "migration" : "recovery")); 1138 1135 } else { 1139 1136 /* might get an -ENOMEM back here */ 1140 1137 ret = status; ··· 1774 1767 dlm->name, mres->lockname_len, mres->lockname, 1775 1768 from); 1776 1769 spin_lock(&res->spinlock); 1777 - dlm_lockres_set_refmap_bit(from, res); 1770 + dlm_lockres_set_refmap_bit(dlm, res, from); 1778 1771 spin_unlock(&res->spinlock); 1779 1772 added++; 1780 1773 break; ··· 1972 1965 mlog(0, "%s:%.*s: added lock for node %u, " 1973 1966 "setting refmap bit\n", dlm->name, 1974 1967 res->lockname.len, res->lockname.name, ml->node); 1975 - dlm_lockres_set_refmap_bit(ml->node, res); 1968 + dlm_lockres_set_refmap_bit(dlm, res, ml->node); 1976 1969 added++; 1977 1970 } 1978 1971 spin_unlock(&res->spinlock); ··· 2091 2084 2092 2085 list_for_each_entry_safe(res, next, &dlm->reco.resources, recovering) { 2093 2086 if (res->owner == dead_node) { 2087 + mlog(0, "%s: res %.*s, Changing owner from %u to %u\n", 2088 + dlm->name, res->lockname.len, res->lockname.name, 2089 + res->owner, new_master); 2094 2090 list_del_init(&res->recovering); 2095 2091 spin_lock(&res->spinlock); 2096 2092 /* new_master has our reference from ··· 2115 2105 for (i = 0; i < DLM_HASH_BUCKETS; i++) { 2116 2106 bucket = dlm_lockres_hash(dlm, i); 2117 2107 hlist_for_each_entry(res, hash_iter, bucket, hash_node) { 2118 - if (res->state & DLM_LOCK_RES_RECOVERING) { 2119 - if (res->owner == dead_node) { 2120 - mlog(0, "(this=%u) res %.*s owner=%u " 2121 - "was not on recovering list, but " 2122 - "clearing state anyway\n", 2123 - dlm->node_num, res->lockname.len, 2124 
- res->lockname.name, new_master); 2125 - } else if (res->owner == dlm->node_num) { 2126 - mlog(0, "(this=%u) res %.*s owner=%u " 2127 - "was not on recovering list, " 2128 - "owner is THIS node, clearing\n", 2129 - dlm->node_num, res->lockname.len, 2130 - res->lockname.name, new_master); 2131 - } else 2132 - continue; 2108 + if (!(res->state & DLM_LOCK_RES_RECOVERING)) 2109 + continue; 2133 2110 2134 - if (!list_empty(&res->recovering)) { 2135 - mlog(0, "%s:%.*s: lockres was " 2136 - "marked RECOVERING, owner=%u\n", 2137 - dlm->name, res->lockname.len, 2138 - res->lockname.name, res->owner); 2139 - list_del_init(&res->recovering); 2140 - dlm_lockres_put(res); 2141 - } 2142 - spin_lock(&res->spinlock); 2143 - /* new_master has our reference from 2144 - * the lock state sent during recovery */ 2145 - dlm_change_lockres_owner(dlm, res, new_master); 2146 - res->state &= ~DLM_LOCK_RES_RECOVERING; 2147 - if (__dlm_lockres_has_locks(res)) 2148 - __dlm_dirty_lockres(dlm, res); 2149 - spin_unlock(&res->spinlock); 2150 - wake_up(&res->wq); 2111 + if (res->owner != dead_node && 2112 + res->owner != dlm->node_num) 2113 + continue; 2114 + 2115 + if (!list_empty(&res->recovering)) { 2116 + list_del_init(&res->recovering); 2117 + dlm_lockres_put(res); 2151 2118 } 2119 + 2120 + /* new_master has our reference from 2121 + * the lock state sent during recovery */ 2122 + mlog(0, "%s: res %.*s, Changing owner from %u to %u\n", 2123 + dlm->name, res->lockname.len, res->lockname.name, 2124 + res->owner, new_master); 2125 + spin_lock(&res->spinlock); 2126 + dlm_change_lockres_owner(dlm, res, new_master); 2127 + res->state &= ~DLM_LOCK_RES_RECOVERING; 2128 + if (__dlm_lockres_has_locks(res)) 2129 + __dlm_dirty_lockres(dlm, res); 2130 + spin_unlock(&res->spinlock); 2131 + wake_up(&res->wq); 2152 2132 } 2153 2133 } 2154 2134 } ··· 2252 2252 res->lockname.len, res->lockname.name, freed, dead_node); 2253 2253 __dlm_print_one_lock_resource(res); 2254 2254 } 2255 - 
dlm_lockres_clear_refmap_bit(dead_node, res); 2255 + dlm_lockres_clear_refmap_bit(dlm, res, dead_node); 2256 2256 } else if (test_bit(dead_node, res->refmap)) { 2257 2257 mlog(0, "%s:%.*s: dead node %u had a ref, but had " 2258 2258 "no locks and had not purged before dying\n", dlm->name, 2259 2259 res->lockname.len, res->lockname.name, dead_node); 2260 - dlm_lockres_clear_refmap_bit(dead_node, res); 2260 + dlm_lockres_clear_refmap_bit(dlm, res, dead_node); 2261 2261 } 2262 2262 2263 2263 /* do not kick thread yet */ ··· 2324 2324 dlm_revalidate_lvb(dlm, res, dead_node); 2325 2325 if (res->owner == dead_node) { 2326 2326 if (res->state & DLM_LOCK_RES_DROPPING_REF) { 2327 - mlog(ML_NOTICE, "Ignore %.*s for " 2327 + mlog(ML_NOTICE, "%s: res %.*s, Skip " 2328 2328 "recovery as it is being freed\n", 2329 - res->lockname.len, 2329 + dlm->name, res->lockname.len, 2330 2330 res->lockname.name); 2331 2331 } else 2332 2332 dlm_move_lockres_to_recovery_list(dlm,

+8 -8

fs/ocfs2/dlm/dlmthread.c

··· 94 94 { 95 95 int bit; 96 96 97 + assert_spin_locked(&res->spinlock); 98 + 97 99 if (__dlm_lockres_has_locks(res)) 100 + return 0; 101 + 102 + /* Locks are in the process of being created */ 103 + if (res->inflight_locks) 98 104 return 0; 99 105 100 106 if (!list_empty(&res->dirty) || res->state & DLM_LOCK_RES_DIRTY) ··· 109 103 if (res->state & DLM_LOCK_RES_RECOVERING) 110 104 return 0; 111 105 106 + /* Another node has this resource with this node as the master */ 112 107 bit = find_next_bit(res->refmap, O2NM_MAX_NODES, 0); 113 108 if (bit < O2NM_MAX_NODES) 114 109 return 0; 115 110 116 - /* 117 - * since the bit for dlm->node_num is not set, inflight_locks better 118 - * be zero 119 - */ 120 - BUG_ON(res->inflight_locks != 0); 121 111 return 1; 122 112 } 123 113 ··· 187 185 /* clear our bit from the master's refmap, ignore errors */ 188 186 ret = dlm_drop_lockres_ref(dlm, res); 189 187 if (ret < 0) { 190 - mlog(ML_ERROR, "%s: deref %.*s failed %d\n", dlm->name, 191 - res->lockname.len, res->lockname.name, ret); 192 188 if (!dlm_is_host_down(ret)) 193 189 BUG(); 194 190 } ··· 209 209 BUG(); 210 210 } 211 211 212 - __dlm_unhash_lockres(res); 212 + __dlm_unhash_lockres(dlm, res); 213 213 214 214 /* lockres is not in the hash now. drop the flag and wake up 215 215 * any processes waiting in dlm_get_lock_resource. */

+15 -6

fs/ocfs2/dlmglue.c

··· 1692 1692 mlog(0, "inode %llu take PRMODE open lock\n", 1693 1693 (unsigned long long)OCFS2_I(inode)->ip_blkno); 1694 1694 1695 - if (ocfs2_mount_local(osb)) 1695 + if (ocfs2_is_hard_readonly(osb) || ocfs2_mount_local(osb)) 1696 1696 goto out; 1697 1697 1698 1698 lockres = &OCFS2_I(inode)->ip_open_lockres; ··· 1717 1717 mlog(0, "inode %llu try to take %s open lock\n", 1718 1718 (unsigned long long)OCFS2_I(inode)->ip_blkno, 1719 1719 write ? "EXMODE" : "PRMODE"); 1720 + 1721 + if (ocfs2_is_hard_readonly(osb)) { 1722 + if (write) 1723 + status = -EROFS; 1724 + goto out; 1725 + } 1720 1726 1721 1727 if (ocfs2_mount_local(osb)) 1722 1728 goto out; ··· 2304 2298 if (ocfs2_is_hard_readonly(osb)) { 2305 2299 if (ex) 2306 2300 status = -EROFS; 2307 - goto bail; 2301 + goto getbh; 2308 2302 } 2309 2303 2310 2304 if (ocfs2_mount_local(osb)) ··· 2362 2356 mlog_errno(status); 2363 2357 goto bail; 2364 2358 } 2365 - 2359 + getbh: 2366 2360 if (ret_bh) { 2367 2361 status = ocfs2_assign_bh(inode, ret_bh, local_bh); 2368 2362 if (status < 0) { ··· 2634 2628 2635 2629 BUG_ON(!dl); 2636 2630 2637 - if (ocfs2_is_hard_readonly(osb)) 2638 - return -EROFS; 2631 + if (ocfs2_is_hard_readonly(osb)) { 2632 + if (ex) 2633 + return -EROFS; 2634 + return 0; 2635 + } 2639 2636 2640 2637 if (ocfs2_mount_local(osb)) 2641 2638 return 0; ··· 2656 2647 struct ocfs2_dentry_lock *dl = dentry->d_fsdata; 2657 2648 struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb); 2658 2649 2659 - if (!ocfs2_mount_local(osb)) 2650 + if (!ocfs2_is_hard_readonly(osb) && !ocfs2_mount_local(osb)) 2660 2651 ocfs2_cluster_unlock(osb, &dl->dl_lockres, level); 2661 2652 } 2662 2653

+96

fs/ocfs2/extent_map.c

··· 832 832 return ret; 833 833 } 834 834 835 + int ocfs2_seek_data_hole_offset(struct file *file, loff_t *offset, int origin) 836 + { 837 + struct inode *inode = file->f_mapping->host; 838 + int ret; 839 + unsigned int is_last = 0, is_data = 0; 840 + u16 cs_bits = OCFS2_SB(inode->i_sb)->s_clustersize_bits; 841 + u32 cpos, cend, clen, hole_size; 842 + u64 extoff, extlen; 843 + struct buffer_head *di_bh = NULL; 844 + struct ocfs2_extent_rec rec; 845 + 846 + BUG_ON(origin != SEEK_DATA && origin != SEEK_HOLE); 847 + 848 + ret = ocfs2_inode_lock(inode, &di_bh, 0); 849 + if (ret) { 850 + mlog_errno(ret); 851 + goto out; 852 + } 853 + 854 + down_read(&OCFS2_I(inode)->ip_alloc_sem); 855 + 856 + if (*offset >= inode->i_size) { 857 + ret = -ENXIO; 858 + goto out_unlock; 859 + } 860 + 861 + if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) { 862 + if (origin == SEEK_HOLE) 863 + *offset = inode->i_size; 864 + goto out_unlock; 865 + } 866 + 867 + clen = 0; 868 + cpos = *offset >> cs_bits; 869 + cend = ocfs2_clusters_for_bytes(inode->i_sb, inode->i_size); 870 + 871 + while (cpos < cend && !is_last) { 872 + ret = ocfs2_get_clusters_nocache(inode, di_bh, cpos, &hole_size, 873 + &rec, &is_last); 874 + if (ret) { 875 + mlog_errno(ret); 876 + goto out_unlock; 877 + } 878 + 879 + extoff = cpos; 880 + extoff <<= cs_bits; 881 + 882 + if (rec.e_blkno == 0ULL) { 883 + clen = hole_size; 884 + is_data = 0; 885 + } else { 886 + clen = le16_to_cpu(rec.e_leaf_clusters) - 887 + (cpos - le32_to_cpu(rec.e_cpos)); 888 + is_data = (rec.e_flags & OCFS2_EXT_UNWRITTEN) ? 
0 : 1; 889 + } 890 + 891 + if ((!is_data && origin == SEEK_HOLE) || 892 + (is_data && origin == SEEK_DATA)) { 893 + if (extoff > *offset) 894 + *offset = extoff; 895 + goto out_unlock; 896 + } 897 + 898 + if (!is_last) 899 + cpos += clen; 900 + } 901 + 902 + if (origin == SEEK_HOLE) { 903 + extoff = cpos; 904 + extoff <<= cs_bits; 905 + extlen = clen; 906 + extlen <<= cs_bits; 907 + 908 + if ((extoff + extlen) > inode->i_size) 909 + extlen = inode->i_size - extoff; 910 + extoff += extlen; 911 + if (extoff > *offset) 912 + *offset = extoff; 913 + goto out_unlock; 914 + } 915 + 916 + ret = -ENXIO; 917 + 918 + out_unlock: 919 + 920 + brelse(di_bh); 921 + 922 + up_read(&OCFS2_I(inode)->ip_alloc_sem); 923 + 924 + ocfs2_inode_unlock(inode, 0); 925 + out: 926 + if (ret && ret != -ENXIO) 927 + ret = -ENXIO; 928 + return ret; 929 + } 930 + 835 931 int ocfs2_read_virt_blocks(struct inode *inode, u64 v_block, int nr, 836 932 struct buffer_head *bhs[], int flags, 837 933 int (*validate)(struct super_block *sb,

+2

fs/ocfs2/extent_map.h

··· 53 53 int ocfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, 54 54 u64 map_start, u64 map_len); 55 55 56 + int ocfs2_seek_data_hole_offset(struct file *file, loff_t *offset, int origin); 57 + 56 58 int ocfs2_xattr_get_clusters(struct inode *inode, u32 v_cluster, 57 59 u32 *p_cluster, u32 *num_clusters, 58 60 struct ocfs2_extent_list *el,

+94 -2

fs/ocfs2/file.c

··· 1950 1950 if (ret < 0) 1951 1951 mlog_errno(ret); 1952 1952 1953 + if (file->f_flags & O_SYNC) 1954 + handle->h_sync = 1; 1955 + 1953 1956 ocfs2_commit_trans(osb, handle); 1954 1957 1955 1958 out_inode_unlock: ··· 2053 2050 } 2054 2051 out: 2055 2052 return ret; 2053 + } 2054 + 2055 + static void ocfs2_aiodio_wait(struct inode *inode) 2056 + { 2057 + wait_queue_head_t *wq = ocfs2_ioend_wq(inode); 2058 + 2059 + wait_event(*wq, (atomic_read(&OCFS2_I(inode)->ip_unaligned_aio) == 0)); 2060 + } 2061 + 2062 + static int ocfs2_is_io_unaligned(struct inode *inode, size_t count, loff_t pos) 2063 + { 2064 + int blockmask = inode->i_sb->s_blocksize - 1; 2065 + loff_t final_size = pos + count; 2066 + 2067 + if ((pos & blockmask) || (final_size & blockmask)) 2068 + return 1; 2069 + return 0; 2056 2070 } 2057 2071 2058 2072 static int ocfs2_prepare_inode_for_refcount(struct inode *inode, ··· 2250 2230 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 2251 2231 int full_coherency = !(osb->s_mount_opt & 2252 2232 OCFS2_MOUNT_COHERENCY_BUFFERED); 2233 + int unaligned_dio = 0; 2253 2234 2254 2235 trace_ocfs2_file_aio_write(inode, file, file->f_path.dentry, 2255 2236 (unsigned long long)OCFS2_I(inode)->ip_blkno, ··· 2318 2297 goto out; 2319 2298 } 2320 2299 2300 + if (direct_io && !is_sync_kiocb(iocb)) 2301 + unaligned_dio = ocfs2_is_io_unaligned(inode, iocb->ki_left, 2302 + *ppos); 2303 + 2321 2304 /* 2322 2305 * We can't complete the direct I/O as requested, fall back to 2323 2306 * buffered I/O. ··· 2334 2309 2335 2310 direct_io = 0; 2336 2311 goto relock; 2312 + } 2313 + 2314 + if (unaligned_dio) { 2315 + /* 2316 + * Wait on previous unaligned aio to complete before 2317 + * proceeding. 
2318 + */ 2319 + ocfs2_aiodio_wait(inode); 2320 + 2321 + /* Mark the iocb as needing a decrement in ocfs2_dio_end_io */ 2322 + atomic_inc(&OCFS2_I(inode)->ip_unaligned_aio); 2323 + ocfs2_iocb_set_unaligned_aio(iocb); 2337 2324 } 2338 2325 2339 2326 /* ··· 2419 2382 if ((ret == -EIOCBQUEUED) || (!ocfs2_iocb_is_rw_locked(iocb))) { 2420 2383 rw_level = -1; 2421 2384 have_alloc_sem = 0; 2385 + unaligned_dio = 0; 2422 2386 } 2387 + 2388 + if (unaligned_dio) 2389 + atomic_dec(&OCFS2_I(inode)->ip_unaligned_aio); 2423 2390 2424 2391 out: 2425 2392 if (rw_level != -1) ··· 2632 2591 return ret; 2633 2592 } 2634 2593 2594 + /* Refer generic_file_llseek_unlocked() */ 2595 + static loff_t ocfs2_file_llseek(struct file *file, loff_t offset, int origin) 2596 + { 2597 + struct inode *inode = file->f_mapping->host; 2598 + int ret = 0; 2599 + 2600 + mutex_lock(&inode->i_mutex); 2601 + 2602 + switch (origin) { 2603 + case SEEK_SET: 2604 + break; 2605 + case SEEK_END: 2606 + offset += inode->i_size; 2607 + break; 2608 + case SEEK_CUR: 2609 + if (offset == 0) { 2610 + offset = file->f_pos; 2611 + goto out; 2612 + } 2613 + offset += file->f_pos; 2614 + break; 2615 + case SEEK_DATA: 2616 + case SEEK_HOLE: 2617 + ret = ocfs2_seek_data_hole_offset(file, &offset, origin); 2618 + if (ret) 2619 + goto out; 2620 + break; 2621 + default: 2622 + ret = -EINVAL; 2623 + goto out; 2624 + } 2625 + 2626 + if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET)) 2627 + ret = -EINVAL; 2628 + if (!ret && offset > inode->i_sb->s_maxbytes) 2629 + ret = -EINVAL; 2630 + if (ret) 2631 + goto out; 2632 + 2633 + if (offset != file->f_pos) { 2634 + file->f_pos = offset; 2635 + file->f_version = 0; 2636 + } 2637 + 2638 + out: 2639 + mutex_unlock(&inode->i_mutex); 2640 + if (ret) 2641 + return ret; 2642 + return offset; 2643 + } 2644 + 2635 2645 const struct inode_operations ocfs2_file_iops = { 2636 2646 .setattr = ocfs2_setattr, 2637 2647 .getattr = ocfs2_getattr, ··· 2707 2615 * ocfs2_fops_no_plocks and 
ocfs2_dops_no_plocks! 2708 2616 */ 2709 2617 const struct file_operations ocfs2_fops = { 2710 - .llseek = generic_file_llseek, 2618 + .llseek = ocfs2_file_llseek, 2711 2619 .read = do_sync_read, 2712 2620 .write = do_sync_write, 2713 2621 .mmap = ocfs2_mmap, ··· 2755 2663 * the cluster. 2756 2664 */ 2757 2665 const struct file_operations ocfs2_fops_no_plocks = { 2758 - .llseek = generic_file_llseek, 2666 + .llseek = ocfs2_file_llseek, 2759 2667 .read = do_sync_read, 2760 2668 .write = do_sync_write, 2761 2669 .mmap = ocfs2_mmap,

+1 -1

fs/ocfs2/inode.c

··· 951 951 trace_ocfs2_cleanup_delete_inode( 952 952 (unsigned long long)OCFS2_I(inode)->ip_blkno, sync_data); 953 953 if (sync_data) 954 - write_inode_now(inode, 1); 954 + filemap_write_and_wait(inode->i_mapping); 955 955 truncate_inode_pages(&inode->i_data, 0); 956 956 } 957 957

+3

fs/ocfs2/inode.h

··· 43 43 /* protects extended attribute changes on this inode */ 44 44 struct rw_semaphore ip_xattr_sem; 45 45 46 + /* Number of outstanding AIO's which are not page aligned */ 47 + atomic_t ip_unaligned_aio; 48 + 46 49 /* These fields are protected by ip_lock */ 47 50 spinlock_t ip_lock; 48 51 u32 ip_open_count;

+6 -5

fs/ocfs2/ioctl.c

··· 122 122 if ((oldflags & OCFS2_IMMUTABLE_FL) || ((flags ^ oldflags) & 123 123 (OCFS2_APPEND_FL | OCFS2_IMMUTABLE_FL))) { 124 124 if (!capable(CAP_LINUX_IMMUTABLE)) 125 - goto bail_unlock; 125 + goto bail_commit; 126 126 } 127 127 128 128 ocfs2_inode->ip_attr = flags; ··· 132 132 if (status < 0) 133 133 mlog_errno(status); 134 134 135 + bail_commit: 135 136 ocfs2_commit_trans(osb, handle); 136 137 bail_unlock: 137 138 ocfs2_inode_unlock(inode, 1); ··· 382 381 if (!oifi) { 383 382 status = -ENOMEM; 384 383 mlog_errno(status); 385 - goto bail; 384 + goto out_err; 386 385 } 387 386 388 387 if (o2info_from_user(*oifi, req)) ··· 432 431 o2info_set_request_error(&oifi->ifi_req, req); 433 432 434 433 kfree(oifi); 435 - 434 + out_err: 436 435 return status; 437 436 } 438 437 ··· 667 666 if (!oiff) { 668 667 status = -ENOMEM; 669 668 mlog_errno(status); 670 - goto bail; 669 + goto out_err; 671 670 } 672 671 673 672 if (o2info_from_user(*oiff, req)) ··· 717 716 o2info_set_request_error(&oiff->iff_req, req); 718 717 719 718 kfree(oiff); 720 - 719 + out_err: 721 720 return status; 722 721 } 723 722

+20 -3

fs/ocfs2/journal.c

··· 1544 1544 /* we need to run complete recovery for offline orphan slots */ 1545 1545 ocfs2_replay_map_set_state(osb, REPLAY_NEEDED); 1546 1546 1547 - mlog(ML_NOTICE, "Recovering node %d from slot %d on device (%u,%u)\n", 1548 - node_num, slot_num, 1549 - MAJOR(osb->sb->s_dev), MINOR(osb->sb->s_dev)); 1547 + printk(KERN_NOTICE "ocfs2: Begin replay journal (node %d, slot %d) on "\ 1548 + "device (%u,%u)\n", node_num, slot_num, MAJOR(osb->sb->s_dev), 1549 + MINOR(osb->sb->s_dev)); 1550 1550 1551 1551 OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters); 1552 1552 ··· 1601 1601 1602 1602 jbd2_journal_destroy(journal); 1603 1603 1604 + printk(KERN_NOTICE "ocfs2: End replay journal (node %d, slot %d) on "\ 1605 + "device (%u,%u)\n", node_num, slot_num, MAJOR(osb->sb->s_dev), 1606 + MINOR(osb->sb->s_dev)); 1604 1607 done: 1605 1608 /* drop the lock on this nodes journal */ 1606 1609 if (got_lock) ··· 1810 1807 * ocfs2_queue_orphan_scan calls ocfs2_queue_recovery_completion for 1811 1808 * every slot, queuing a recovery of the slot on the ocfs2_wq thread. This 1812 1809 * is done to catch any orphans that are left over in orphan directories. 1810 + * 1811 + * It scans all slots, even ones that are in use. It does so to handle the 1812 + * case described below: 1813 + * 1814 + * Node 1 has an inode it was using. The dentry went away due to memory 1815 + * pressure. Node 1 closes the inode, but it's on the free list. The node 1816 + * has the open lock. 1817 + * Node 2 unlinks the inode. It grabs the dentry lock to notify others, 1818 + * but node 1 has no dentry and doesn't get the message. It trylocks the 1819 + * open lock, sees that another node has a PR, and does nothing. 1820 + * Later node 2 runs its orphan dir. It igets the inode, trylocks the 1821 + * open lock, sees the PR still, and does nothing. 1822 + * Basically, we have to trigger an orphan iput on node 1. The only way 1823 + * for this to happen is if node 1 runs node 2's orphan dir. 
1813 1824 * 1814 1825 * ocfs2_queue_orphan_scan gets called every ORPHAN_SCAN_SCHEDULE_TIMEOUT 1815 1826 * seconds. It gets an EX lock on os_lockres and checks sequence number

+3 -2

fs/ocfs2/journal.h

··· 441 441 #define OCFS2_SIMPLE_DIR_EXTEND_CREDITS (2) 442 442 443 443 /* file update (nlink, etc) + directory mtime/ctime + dir entry block + quota 444 - * update on dir + index leaf + dx root update for free list */ 444 + * update on dir + index leaf + dx root update for free list + 445 + * previous dirblock update in the free list */ 445 446 static inline int ocfs2_link_credits(struct super_block *sb) 446 447 { 447 - return 2*OCFS2_INODE_UPDATE_CREDITS + 3 + 448 + return 2*OCFS2_INODE_UPDATE_CREDITS + 4 + 448 449 ocfs2_quota_trans_credits(sb); 449 450 } 450 451

+24 -29

fs/ocfs2/mmap.c

··· 61 61 static int __ocfs2_page_mkwrite(struct file *file, struct buffer_head *di_bh, 62 62 struct page *page) 63 63 { 64 - int ret; 64 + int ret = VM_FAULT_NOPAGE; 65 65 struct inode *inode = file->f_path.dentry->d_inode; 66 66 struct address_space *mapping = inode->i_mapping; 67 67 loff_t pos = page_offset(page); ··· 71 71 void *fsdata; 72 72 loff_t size = i_size_read(inode); 73 73 74 - /* 75 - * Another node might have truncated while we were waiting on 76 - * cluster locks. 77 - * We don't check size == 0 before the shift. This is borrowed 78 - * from do_generic_file_read. 79 - */ 80 74 last_index = (size - 1) >> PAGE_CACHE_SHIFT; 81 - if (unlikely(!size || page->index > last_index)) { 82 - ret = -EINVAL; 83 - goto out; 84 - } 85 75 86 76 /* 87 - * The i_size check above doesn't catch the case where nodes 88 - * truncated and then re-extended the file. We'll re-check the 89 - * page mapping after taking the page lock inside of 90 - * ocfs2_write_begin_nolock(). 77 + * There are cases that lead to the page no longer belongs to the 78 + * mapping. 79 + * 1) pagecache truncates locally due to memory pressure. 80 + * 2) pagecache truncates when another is taking EX lock against 81 + * inode lock. see ocfs2_data_convert_worker. 82 + * 83 + * The i_size check doesn't catch the case where nodes truncated and 84 + * then re-extended the file. We'll re-check the page mapping after 85 + * taking the page lock inside of ocfs2_write_begin_nolock(). 86 + * 87 + * Let VM retry with these cases. 91 88 */ 92 - if (!PageUptodate(page) || page->mapping != inode->i_mapping) { 93 - /* 94 - * the page has been umapped in ocfs2_data_downconvert_worker. 95 - * So return 0 here and let VFS retry. 
96 - */ 97 - ret = 0; 89 + if ((page->mapping != inode->i_mapping) || 90 + (!PageUptodate(page)) || 91 + (page_offset(page) >= size)) 98 92 goto out; 99 - } 100 93 101 94 /* 102 95 * Call ocfs2_write_begin() and ocfs2_write_end() to take ··· 109 116 if (ret) { 110 117 if (ret != -ENOSPC) 111 118 mlog_errno(ret); 119 + if (ret == -ENOMEM) 120 + ret = VM_FAULT_OOM; 121 + else 122 + ret = VM_FAULT_SIGBUS; 112 123 goto out; 113 124 } 114 125 115 - ret = ocfs2_write_end_nolock(mapping, pos, len, len, locked_page, 116 - fsdata); 117 - if (ret < 0) { 118 - mlog_errno(ret); 126 + if (!locked_page) { 127 + ret = VM_FAULT_NOPAGE; 119 128 goto out; 120 129 } 130 + ret = ocfs2_write_end_nolock(mapping, pos, len, len, locked_page, 131 + fsdata); 121 132 BUG_ON(ret != len); 122 - ret = 0; 133 + ret = VM_FAULT_LOCKED; 123 134 out: 124 135 return ret; 125 136 } ··· 165 168 166 169 out: 167 170 ocfs2_unblock_signals(&oldset); 168 - if (ret) 169 - ret = VM_FAULT_SIGBUS; 170 171 return ret; 171 172 } 172 173

+1 -1

fs/ocfs2/move_extents.c

··· 745 745 */ 746 746 ocfs2_probe_alloc_group(inode, gd_bh, &goal_bit, len, move_max_hop, 747 747 new_phys_cpos); 748 - if (!new_phys_cpos) { 748 + if (!*new_phys_cpos) { 749 749 ret = -ENOSPC; 750 750 goto out_commit; 751 751 }

+49 -2

fs/ocfs2/ocfs2.h

··· 836 836 837 837 static inline void _ocfs2_set_bit(unsigned int bit, unsigned long *bitmap) 838 838 { 839 - __test_and_set_bit_le(bit, bitmap); 839 + __set_bit_le(bit, bitmap); 840 840 } 841 841 #define ocfs2_set_bit(bit, addr) _ocfs2_set_bit((bit), (unsigned long *)(addr)) 842 842 843 843 static inline void _ocfs2_clear_bit(unsigned int bit, unsigned long *bitmap) 844 844 { 845 - __test_and_clear_bit_le(bit, bitmap); 845 + __clear_bit_le(bit, bitmap); 846 846 } 847 847 #define ocfs2_clear_bit(bit, addr) _ocfs2_clear_bit((bit), (unsigned long *)(addr)) 848 848 849 849 #define ocfs2_test_bit test_bit_le 850 850 #define ocfs2_find_next_zero_bit find_next_zero_bit_le 851 851 #define ocfs2_find_next_bit find_next_bit_le 852 + 853 + static inline void *correct_addr_and_bit_unaligned(int *bit, void *addr) 854 + { 855 + #if BITS_PER_LONG == 64 856 + *bit += ((unsigned long) addr & 7UL) << 3; 857 + addr = (void *) ((unsigned long) addr & ~7UL); 858 + #elif BITS_PER_LONG == 32 859 + *bit += ((unsigned long) addr & 3UL) << 3; 860 + addr = (void *) ((unsigned long) addr & ~3UL); 861 + #else 862 + #error "how many bits you are?!" 
863 + #endif 864 + return addr; 865 + } 866 + 867 + static inline void ocfs2_set_bit_unaligned(int bit, void *bitmap) 868 + { 869 + bitmap = correct_addr_and_bit_unaligned(&bit, bitmap); 870 + ocfs2_set_bit(bit, bitmap); 871 + } 872 + 873 + static inline void ocfs2_clear_bit_unaligned(int bit, void *bitmap) 874 + { 875 + bitmap = correct_addr_and_bit_unaligned(&bit, bitmap); 876 + ocfs2_clear_bit(bit, bitmap); 877 + } 878 + 879 + static inline int ocfs2_test_bit_unaligned(int bit, void *bitmap) 880 + { 881 + bitmap = correct_addr_and_bit_unaligned(&bit, bitmap); 882 + return ocfs2_test_bit(bit, bitmap); 883 + } 884 + 885 + static inline int ocfs2_find_next_zero_bit_unaligned(void *bitmap, int max, 886 + int start) 887 + { 888 + int fix = 0, ret, tmpmax; 889 + bitmap = correct_addr_and_bit_unaligned(&fix, bitmap); 890 + tmpmax = max + fix; 891 + start += fix; 892 + 893 + ret = ocfs2_find_next_zero_bit(bitmap, tmpmax, start) - fix; 894 + if (ret > max) 895 + return max; 896 + return ret; 897 + } 898 + 852 899 #endif /* OCFS2_H */ 853 900

+14 -9

fs/ocfs2/quota_local.c

··· 404 404 int status = 0; 405 405 struct ocfs2_quota_recovery *rec; 406 406 407 - mlog(ML_NOTICE, "Beginning quota recovery in slot %u\n", slot_num); 407 + printk(KERN_NOTICE "ocfs2: Beginning quota recovery on device (%s) for " 408 + "slot %u\n", osb->dev_str, slot_num); 409 + 408 410 rec = ocfs2_alloc_quota_recovery(); 409 411 if (!rec) 410 412 return ERR_PTR(-ENOMEM); ··· 551 549 goto out_commit; 552 550 } 553 551 lock_buffer(qbh); 554 - WARN_ON(!ocfs2_test_bit(bit, dchunk->dqc_bitmap)); 555 - ocfs2_clear_bit(bit, dchunk->dqc_bitmap); 552 + WARN_ON(!ocfs2_test_bit_unaligned(bit, dchunk->dqc_bitmap)); 553 + ocfs2_clear_bit_unaligned(bit, dchunk->dqc_bitmap); 556 554 le32_add_cpu(&dchunk->dqc_free, 1); 557 555 unlock_buffer(qbh); 558 556 ocfs2_journal_dirty(handle, qbh); ··· 598 596 struct inode *lqinode; 599 597 unsigned int flags; 600 598 601 - mlog(ML_NOTICE, "Finishing quota recovery in slot %u\n", slot_num); 599 + printk(KERN_NOTICE "ocfs2: Finishing quota recovery on device (%s) for " 600 + "slot %u\n", osb->dev_str, slot_num); 601 + 602 602 mutex_lock(&sb_dqopt(sb)->dqonoff_mutex); 603 603 for (type = 0; type < MAXQUOTAS; type++) { 604 604 if (list_empty(&(rec->r_list[type]))) ··· 616 612 /* Someone else is holding the lock? Then he must be 617 613 * doing the recovery. Just skip the file... */ 618 614 if (status == -EAGAIN) { 619 - mlog(ML_NOTICE, "skipping quota recovery for slot %d " 620 - "because quota file is locked.\n", slot_num); 615 + printk(KERN_NOTICE "ocfs2: Skipping quota recovery on " 616 + "device (%s) for slot %d because quota file is " 617 + "locked.\n", osb->dev_str, slot_num); 621 618 status = 0; 622 619 goto out_put; 623 620 } else if (status < 0) { ··· 949 944 * ol_quota_entries_per_block(sb); 950 945 } 951 946 952 - found = ocfs2_find_next_zero_bit(dchunk->dqc_bitmap, len, 0); 947 + found = ocfs2_find_next_zero_bit_unaligned(dchunk->dqc_bitmap, len, 0); 953 948 /* We failed? 
*/ 954 949 if (found == len) { 955 950 mlog(ML_ERROR, "Did not find empty entry in chunk %d with %u" ··· 1213 1208 struct ocfs2_local_disk_chunk *dchunk; 1214 1209 1215 1210 dchunk = (struct ocfs2_local_disk_chunk *)bh->b_data; 1216 - ocfs2_set_bit(*offset, dchunk->dqc_bitmap); 1211 + ocfs2_set_bit_unaligned(*offset, dchunk->dqc_bitmap); 1217 1212 le32_add_cpu(&dchunk->dqc_free, -1); 1218 1213 } 1219 1214 ··· 1294 1289 (od->dq_chunk->qc_headerbh->b_data); 1295 1290 /* Mark structure as freed */ 1296 1291 lock_buffer(od->dq_chunk->qc_headerbh); 1297 - ocfs2_clear_bit(offset, dchunk->dqc_bitmap); 1292 + ocfs2_clear_bit_unaligned(offset, dchunk->dqc_bitmap); 1298 1293 le32_add_cpu(&dchunk->dqc_free, 1); 1299 1294 unlock_buffer(od->dq_chunk->qc_headerbh); 1300 1295 ocfs2_journal_dirty(handle, od->dq_chunk->qc_headerbh);

+2 -2

fs/ocfs2/slot_map.c

··· 493 493 goto bail; 494 494 } 495 495 } else 496 - mlog(ML_NOTICE, "slot %d is already allocated to this node!\n", 497 - slot); 496 + printk(KERN_INFO "ocfs2: Slot %d on device (%s) was already " 497 + "allocated to this node!\n", slot, osb->dev_str); 498 498 499 499 ocfs2_set_slot(si, slot, osb->node_num); 500 500 osb->slot_num = slot;

+63 -8

fs/ocfs2/stack_o2cb.c

··· 28 28 #include "cluster/masklog.h" 29 29 #include "cluster/nodemanager.h" 30 30 #include "cluster/heartbeat.h" 31 + #include "cluster/tcp.h" 31 32 32 33 #include "stackglue.h" 33 34 ··· 257 256 } 258 257 259 258 /* 259 + * Check if this node is heartbeating and is connected to all other 260 + * heartbeating nodes. 261 + */ 262 + static int o2cb_cluster_check(void) 263 + { 264 + u8 node_num; 265 + int i; 266 + unsigned long hbmap[BITS_TO_LONGS(O2NM_MAX_NODES)]; 267 + unsigned long netmap[BITS_TO_LONGS(O2NM_MAX_NODES)]; 268 + 269 + node_num = o2nm_this_node(); 270 + if (node_num == O2NM_MAX_NODES) { 271 + printk(KERN_ERR "o2cb: This node has not been configured.\n"); 272 + return -EINVAL; 273 + } 274 + 275 + /* 276 + * o2dlm expects o2net sockets to be created. If not, then 277 + * dlm_join_domain() fails with a stack of errors which are both cryptic 278 + * and incomplete. The idea here is to detect upfront whether we have 279 + * managed to connect to all nodes or not. If not, then list the nodes 280 + * to allow the user to check the configuration (incorrect IP, firewall, 281 + * etc.) Yes, this is racy. But its not the end of the world. 282 + */ 283 + #define O2CB_MAP_STABILIZE_COUNT 60 284 + for (i = 0; i < O2CB_MAP_STABILIZE_COUNT; ++i) { 285 + o2hb_fill_node_map(hbmap, sizeof(hbmap)); 286 + if (!test_bit(node_num, hbmap)) { 287 + printk(KERN_ERR "o2cb: %s heartbeat has not been " 288 + "started.\n", (o2hb_global_heartbeat_active() ? 
289 + "Global" : "Local")); 290 + return -EINVAL; 291 + } 292 + o2net_fill_node_map(netmap, sizeof(netmap)); 293 + /* Force set the current node to allow easy compare */ 294 + set_bit(node_num, netmap); 295 + if (!memcmp(hbmap, netmap, sizeof(hbmap))) 296 + return 0; 297 + if (i < O2CB_MAP_STABILIZE_COUNT) 298 + msleep(1000); 299 + } 300 + 301 + printk(KERN_ERR "o2cb: This node could not connect to nodes:"); 302 + i = -1; 303 + while ((i = find_next_bit(hbmap, O2NM_MAX_NODES, 304 + i + 1)) < O2NM_MAX_NODES) { 305 + if (!test_bit(i, netmap)) 306 + printk(" %u", i); 307 + } 308 + printk(".\n"); 309 + 310 + return -ENOTCONN; 311 + } 312 + 313 + /* 260 314 * Called from the dlm when it's about to evict a node. This is how the 261 315 * classic stack signals node death. 262 316 */ ··· 319 263 { 320 264 struct ocfs2_cluster_connection *conn = data; 321 265 322 - mlog(ML_NOTICE, "o2dlm has evicted node %d from group %.*s\n", 323 - node_num, conn->cc_namelen, conn->cc_name); 266 + printk(KERN_NOTICE "o2cb: o2dlm has evicted node %d from domain %.*s\n", 267 + node_num, conn->cc_namelen, conn->cc_name); 324 268 325 269 conn->cc_recovery_handler(node_num, conn->cc_recovery_data); 326 270 } ··· 336 280 BUG_ON(conn == NULL); 337 281 BUG_ON(conn->cc_proto == NULL); 338 282 339 - /* for now we only have one cluster/node, make sure we see it 340 - * in the heartbeat universe */ 341 - if (!o2hb_check_local_node_heartbeating()) { 342 - if (o2hb_global_heartbeat_active()) 343 - mlog(ML_ERROR, "Global heartbeat not started\n"); 344 - rc = -EINVAL; 283 + /* Ensure cluster stack is up and all nodes are connected */ 284 + rc = o2cb_cluster_check(); 285 + if (rc) { 286 + printk(KERN_ERR "o2cb: Cluster check failed. Fix errors " 287 + "before retrying.\n"); 345 288 goto out; 346 289 } 347 290

+16 -9

fs/ocfs2/super.c

··· 54 54 #include "ocfs1_fs_compat.h" 55 55 56 56 #include "alloc.h" 57 + #include "aops.h" 57 58 #include "blockcheck.h" 58 59 #include "dlmglue.h" 59 60 #include "export.h" ··· 1108 1107 1109 1108 ocfs2_set_ro_flag(osb, 1); 1110 1109 1111 - printk(KERN_NOTICE "Readonly device detected. No cluster " 1112 - "services will be utilized for this mount. Recovery " 1113 - "will be skipped.\n"); 1110 + printk(KERN_NOTICE "ocfs2: Readonly device (%s) detected. " 1111 + "Cluster services will not be used for this mount. " 1112 + "Recovery will be skipped.\n", osb->dev_str); 1114 1113 } 1115 1114 1116 1115 if (!ocfs2_is_hard_readonly(osb)) { ··· 1617 1616 return 0; 1618 1617 } 1619 1618 1619 + wait_queue_head_t ocfs2__ioend_wq[OCFS2_IOEND_WQ_HASH_SZ]; 1620 + 1620 1621 static int __init ocfs2_init(void) 1621 1622 { 1622 - int status; 1623 + int status, i; 1623 1624 1624 1625 ocfs2_print_version(); 1626 + 1627 + for (i = 0; i < OCFS2_IOEND_WQ_HASH_SZ; i++) 1628 + init_waitqueue_head(&ocfs2__ioend_wq[i]); 1625 1629 1626 1630 status = init_ocfs2_uptodate_cache(); 1627 1631 if (status < 0) { ··· 1766 1760 ocfs2_extent_map_init(&oi->vfs_inode); 1767 1761 INIT_LIST_HEAD(&oi->ip_io_markers); 1768 1762 oi->ip_dir_start_lookup = 0; 1769 - 1763 + atomic_set(&oi->ip_unaligned_aio, 0); 1770 1764 init_rwsem(&oi->ip_alloc_sem); 1771 1765 init_rwsem(&oi->ip_xattr_sem); 1772 1766 mutex_init(&oi->ip_io_mutex); ··· 1980 1974 * If we failed before we got a uuid_str yet, we can't stop 1981 1975 * heartbeat. Otherwise, do it. 
1982 1976 */ 1983 - if (!mnt_err && !ocfs2_mount_local(osb) && osb->uuid_str) 1977 + if (!mnt_err && !ocfs2_mount_local(osb) && osb->uuid_str && 1978 + !ocfs2_is_hard_readonly(osb)) 1984 1979 hangup_needed = 1; 1985 1980 1986 1981 if (osb->cconn) ··· 2360 2353 mlog_errno(status); 2361 2354 goto bail; 2362 2355 } 2363 - cleancache_init_shared_fs((char *)&uuid_net_key, sb); 2356 + cleancache_init_shared_fs((char *)&di->id2.i_super.s_uuid, sb); 2364 2357 2365 2358 bail: 2366 2359 return status; ··· 2469 2462 goto finally; 2470 2463 } 2471 2464 } else { 2472 - mlog(ML_NOTICE, "File system was not unmounted cleanly, " 2473 - "recovering volume.\n"); 2465 + printk(KERN_NOTICE "ocfs2: File system on device (%s) was not " 2466 + "unmounted cleanly, recovering it.\n", osb->dev_str); 2474 2467 } 2475 2468 2476 2469 local = ocfs2_mount_local(osb);

+6 -4

fs/ocfs2/xattr.c

··· 2376 2376 } 2377 2377 2378 2378 ret = ocfs2_xattr_value_truncate(inode, vb, 0, &ctxt); 2379 - if (ret < 0) { 2380 - mlog_errno(ret); 2381 - break; 2382 - } 2383 2379 2384 2380 ocfs2_commit_trans(osb, ctxt.handle); 2385 2381 if (ctxt.meta_ac) { 2386 2382 ocfs2_free_alloc_context(ctxt.meta_ac); 2387 2383 ctxt.meta_ac = NULL; 2388 2384 } 2385 + 2386 + if (ret < 0) { 2387 + mlog_errno(ret); 2388 + break; 2389 + } 2390 + 2389 2391 } 2390 2392 2391 2393 if (ctxt.meta_ac)

Configure Feed

Configure Feed