@recaptime-dev's working patches + fork for Phorge, a community fork of Phabricator. (Upstream dev and stable branches are at upstream/main and upstream/stable respectively.) hq.recaptime.dev/wiki/Phorge
phorge phabricator
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

Implement `bin/repository thaw` for unfreezing cluster repositories

Summary:
Ref T10751. Add support tooling for manually prying your way out of trouble if disaster strikes.

Refine documentation, try to refer to devices as "devices" more consistently instead of sometimes calling them "nodes".

Test Plan: Promoted and demoted repository devices with `bin/repository thaw`.

Reviewers: chad

Reviewed By: chad

Maniphest Tasks: T10751

Differential Revision: https://secure.phabricator.com/D15768

+311 -52
+2
src/__phutil_library_map__.php
··· 3183 3183 'PhabricatorRepositoryManagementPullWorkflow' => 'applications/repository/management/PhabricatorRepositoryManagementPullWorkflow.php', 3184 3184 'PhabricatorRepositoryManagementRefsWorkflow' => 'applications/repository/management/PhabricatorRepositoryManagementRefsWorkflow.php', 3185 3185 'PhabricatorRepositoryManagementReparseWorkflow' => 'applications/repository/management/PhabricatorRepositoryManagementReparseWorkflow.php', 3186 + 'PhabricatorRepositoryManagementThawWorkflow' => 'applications/repository/management/PhabricatorRepositoryManagementThawWorkflow.php', 3186 3187 'PhabricatorRepositoryManagementUpdateWorkflow' => 'applications/repository/management/PhabricatorRepositoryManagementUpdateWorkflow.php', 3187 3188 'PhabricatorRepositoryManagementWorkflow' => 'applications/repository/management/PhabricatorRepositoryManagementWorkflow.php', 3188 3189 'PhabricatorRepositoryMercurialCommitChangeParserWorker' => 'applications/repository/worker/commitchangeparser/PhabricatorRepositoryMercurialCommitChangeParserWorker.php', ··· 7834 7835 'PhabricatorRepositoryManagementPullWorkflow' => 'PhabricatorRepositoryManagementWorkflow', 7835 7836 'PhabricatorRepositoryManagementRefsWorkflow' => 'PhabricatorRepositoryManagementWorkflow', 7836 7837 'PhabricatorRepositoryManagementReparseWorkflow' => 'PhabricatorRepositoryManagementWorkflow', 7838 + 'PhabricatorRepositoryManagementThawWorkflow' => 'PhabricatorRepositoryManagementWorkflow', 7837 7839 'PhabricatorRepositoryManagementUpdateWorkflow' => 'PhabricatorRepositoryManagementWorkflow', 7838 7840 'PhabricatorRepositoryManagementWorkflow' => 'PhabricatorManagementWorkflow', 7839 7841 'PhabricatorRepositoryMercurialCommitChangeParserWorker' => 'PhabricatorRepositoryCommitChangeParserWorker',
+186
src/applications/repository/management/PhabricatorRepositoryManagementThawWorkflow.php
<?php

/**
 * Implements `bin/repository thaw`: promotes or demotes one Almanac device
 * for one or more cluster repositories.
 *
 * Exactly one of `--promote` or `--demote` must be given. Unless `--force` is
 * set, the operator is asked to confirm before any change is made, because
 * either operation can discard repository data.
 */
final class PhabricatorRepositoryManagementThawWorkflow
  extends PhabricatorRepositoryManagementWorkflow {

  /**
   * Declare the workflow name, synopsis and command-line arguments.
   */
  protected function didConstruct() {
    $this
      ->setName('thaw')
      ->setExamples('**thaw** [options] __repository__ ...')
      ->setSynopsis(
        pht(
          'Resolve issues with frozen cluster repositories. Very advanced '.
          'and dangerous.'))
      ->setArguments(
        array(
          array(
            'name' => 'demote',
            'param' => 'device',
            'help' => pht(
              'Demote a device, discarding local changes. Clears stuck '.
              'write locks and recovers from lost leaders.'),
          ),
          array(
            'name' => 'promote',
            'param' => 'device',
            'help' => pht(
              'Promote a device, discarding changes on other devices. '.
              'Resolves ambiguous leadership and recovers from demotion '.
              'mistakes.'),
          ),
          array(
            'name' => 'force',
            'help' => pht('Run operations without asking for confirmation.'),
          ),
          array(
            'name' => 'repositories',
            'wildcard' => true,
          ),
        ));
  }

  /**
   * Validate arguments, confirm the risk with the operator, then promote or
   * demote the named device for each listed repository while holding that
   * repository's write lock.
   *
   * @param PhutilArgumentParser Parsed command-line arguments.
   * @return int Exit code; `0` when every repository was processed.
   * @throws PhutilArgumentUsageException On bad arguments, an unknown device,
   *   a declined confirmation, or a repository which can not be thawed.
   */
  public function execute(PhutilArgumentParser $args) {
    $viewer = $this->getViewer();

    $repositories = $this->loadRepositories($args, 'repositories');
    if (!$repositories) {
      throw new PhutilArgumentUsageException(
        pht('Specify one or more repositories to thaw.'));
    }

    $promote = $args->getArg('promote');
    $demote = $args->getArg('demote');

    if (!$promote && !$demote) {
      throw new PhutilArgumentUsageException(
        pht('You must choose a device to --promote or --demote.'));
    }

    if ($promote && $demote) {
      throw new PhutilArgumentUsageException(
        pht('Specify either --promote or --demote, but not both.'));
    }

    // Exactly one of the two flags is set at this point.
    $device_name = nonempty($promote, $demote);

    $device = id(new AlmanacDeviceQuery())
      ->setViewer($viewer)
      ->withNames(array($device_name))
      ->executeOne();
    if (!$device) {
      throw new PhutilArgumentUsageException(
        pht('No device "%s" exists.', $device_name));
    }

    if ($promote) {
      $risk_message = pht(
        'Promoting a device can cause the loss of any repository data which '.
        'only exists on other devices. The version of the repository on the '.
        'promoted device will become authoritative.');
    } else {
      $risk_message = pht(
        'Demoting a device can cause the loss of any repository data which '.
        'only exists on the demoted device. The version of the repository '.
        'on some other device will become authoritative.');
    }

    echo tsprintf(
      "**<bg:red> %s </bg>** %s\n",
      pht('DATA AT RISK'),
      $risk_message);

    $is_force = $args->getArg('force');
    $prompt = pht('Accept the possibility of permanent data loss?');
    if (!$is_force && !phutil_console_confirm($prompt)) {
      throw new PhutilArgumentUsageException(
        pht('User aborted the workflow.'));
    }

    foreach ($repositories as $repository) {
      $write_lock = PhabricatorRepositoryWorkingCopyVersion::getWriteLock(
        $repository->getPHID());

      echo tsprintf(
        "%s\n",
        pht(
          'Waiting to acquire write lock for "%s"...',
          $repository->getDisplayName()));

      $write_lock->lock(phutil_units('5 minutes in seconds'));

      // Always release the lock before propagating a failure. "Throwable" is
      // caught separately so PHP 7+ "Error"s also release it; on older
      // runtimes that clause simply never matches.
      $caught = null;
      try {
        $this->thawRepository($repository, $device, (bool)$promote);
      } catch (Exception $ex) {
        $caught = $ex;
      } catch (Throwable $ex) {
        $caught = $ex;
      }

      $write_lock->unlock();

      if ($caught) {
        throw $caught;
      }
    }

    return 0;
  }

  /**
   * Promote or demote a device for a single repository.
   *
   * The caller is expected to hold the repository write lock.
   *
   * @param object Repository to operate on, as returned by
   *   @{method:loadRepositories}.
   * @param object Device loaded by the `AlmanacDeviceQuery`.
   * @param bool True to promote the device, false to demote it.
   * @return void
   * @throws PhutilArgumentUsageException If the repository has no Almanac
   *   service, the device has no active binding to it, or promotion is
   *   requested while bound devices already have version records.
   */
  private function thawRepository($repository, $device, $is_promote) {
    $repository_phid = $repository->getPHID();
    $device_phid = $device->getPHID();

    $service = $repository->loadAlmanacService();
    if (!$service) {
      throw new PhutilArgumentUsageException(
        pht(
          'Repository "%s" is not a cluster repository: it is not '.
          'bound to an Almanac service.',
          $repository->getDisplayName()));
    }

    $bindings = $service->getActiveBindings();
    $bindings = mpull($bindings, null, 'getDevicePHID');
    if (empty($bindings[$device_phid])) {
      throw new PhutilArgumentUsageException(
        pht(
          'Repository "%s" has no active binding to device "%s". Only '.
          'actively bound devices can be promoted or demoted.',
          $repository->getDisplayName(),
          $device->getName()));
    }

    // Only consider version records for devices which are still actively
    // bound to the service.
    $versions = PhabricatorRepositoryWorkingCopyVersion::loadVersions(
      $repository_phid);
    $versions = mpull($versions, null, 'getDevicePHID');
    $versions = array_select_keys($versions, array_keys($bindings));

    if ($is_promote) {
      // If any bound device still has a version record, leadership is not
      // ambiguous and promotion is refused.
      if ($versions) {
        throw new PhutilArgumentUsageException(
          pht(
            'Unable to promote "%s" for repository "%s": the leaders for '.
            'this cluster are not ambiguous.',
            $device->getName(),
            $repository->getDisplayName()));
      }

      PhabricatorRepositoryWorkingCopyVersion::updateVersion(
        $repository_phid,
        $device_phid,
        0);

      echo tsprintf(
        "%s\n",
        pht(
          'Promoted "%s" to become a leader for "%s".',
          $device->getName(),
          $repository->getDisplayName()));
    } else {
      PhabricatorRepositoryWorkingCopyVersion::demoteDevice(
        $repository_phid,
        $device_phid);

      echo tsprintf(
        "%s\n",
        pht(
          'Demoted "%s" from leadership of repository "%s".',
          $device->getName(),
          $repository->getDisplayName()));
    }
  }

}
+5 -4
src/applications/repository/storage/PhabricatorRepository.php
··· 2397 2397 continue; 2398 2398 } 2399 2399 2400 - // TODO: This should provide more help so users can resolve the issue. 2401 2400 throw new Exception( 2402 2401 pht( 2403 - 'An incomplete write was previously performed to this repository; '. 2404 - 'refusing new writes.')); 2402 + 'An previous write to this repository was interrupted; refusing '. 2403 + 'new writes. This issue resolves operator intervention to resolve, '. 2404 + 'see "Write Interruptions" in the "Cluster: Repositories" in the '. 2405 + 'documentation for instructions.')); 2405 2406 } 2406 2407 2407 2408 try { ··· 2566 2567 ->setPath($path); 2567 2568 } 2568 2569 2569 - private function loadAlmanacService() { 2570 + public function loadAlmanacService() { 2570 2571 $service_phid = $this->getAlmanacServicePHID(); 2571 2572 if (!$service_phid) { 2572 2573 // No service, so this is a local repository.
+20
src/applications/repository/storage/PhabricatorRepositoryWorkingCopyVersion.php
··· 132 132 $repository_phid, 133 133 $device_phid, 134 134 $new_version) { 135 + 135 136 $version = new self(); 136 137 $conn_w = $version->establishConnection('w'); 137 138 $table = $version->getTableName(); ··· 151 152 0); 152 153 } 153 154 155 + 156 + /** 157 + * Explicitly demote a device. 158 + */ 159 + public static function demoteDevice( 160 + $repository_phid, 161 + $device_phid) { 162 + 163 + $version = new self(); 164 + $conn_w = $version->establishConnection('w'); 165 + $table = $version->getTableName(); 166 + 167 + queryfx( 168 + $conn_w, 169 + 'DELETE FROM %T WHERE repositoryPHID = %s AND devicePHID = %s', 170 + $table, 171 + $repository_phid, 172 + $device_phid); 173 + } 154 174 155 175 }
+98 -48
src/docs/user/cluster/cluster_repositories.diviner
··· 98 98 Monitoring Replication 99 99 ====================== 100 100 101 - You can review the current status of a repository on cluster nodes in 101 + You can review the current status of a repository on cluster devices in 102 102 {nav Diffusion > (Repository) > Manage Repository > Cluster Configuration}. 103 103 104 104 This screen shows all the configured devices which are hosting the repository ··· 106 106 107 107 **Version**: When a repository is mutated by a push, Phabricator increases 108 108 an internal version number for the repository. This column shows which version 109 - is on disk on the corresponding node. 109 + is on disk on the corresponding device. 110 110 111 - After a change is pushed, the node which received the change will have a larger 112 - version number than the other nodes. The change should be passively replicated 113 - to the remaining nodes after a brief period of time, although this can take 114 - a while if the change was large or the network connection between nodes is 115 - slow or unreliable. 111 + After a change is pushed, the device which received the change will have a 112 + larger version number than the other devices. The change should be passively 113 + replicated to the remaining devices after a brief period of time, although this 114 + can take a while if the change was large or the network connection between 115 + devices is slow or unreliable. 116 116 117 117 You can click the version number to see the corresponding push logs for that 118 118 change. The logs contain details about what was changed, and can help you 119 119 identify if replication is slow because a change is large or for some other 120 120 reason. 121 121 122 - **Writing**: This shows that the node is currently holding a write lock. This 122 + **Writing**: This shows that the device is currently holding a write lock. This 123 123 normally means that it is actively receiving a push, but can also mean that 124 124 there was a write interruption. 
See "Write Interruptions" below for details. 125 125 ··· 131 131 currently held, this shows when the lock was acquired. 132 132 133 133 134 + Cluster Failure Modes 135 + ===================== 136 + 137 + There are three major cluster failure modes: 138 + 139 + - **Write Interruptions**: A write started but did not complete, leaving 140 + the disk state and cluster state out of sync. 141 + - **Loss of Leaders**: None of the devices with the most up-to-date data 142 + are reachable. 143 + - **Ambiguous Leaders**: The internal state of the repository is unclear. 144 + 145 + Phabricator can detect these issues, and responds by freezing the repository 146 + (usually preventing all reads and writes) until the issue is resolved. These 147 + conditions are normally rare and very little data is at risk, but Phabricator 148 + errs on the side of caution and requires decisions which may result in data 149 + loss to be confirmed by a human. 150 + 151 + The next sections cover these failure modes and appropriate responses in 152 + more detail. In general, you will respond to these issues by assessing the 153 + situation and then possibly choosing to discard some data. 154 + 155 + 134 156 Write Interruptions 135 157 =================== 136 158 137 159 A repository cluster can be put into an inconsistent state by an interruption 138 - in a brief window during and immediately after a write. 160 + in a brief window during and immediately after a write. This looks like this: 161 + 162 + - A change is pushed to a server. 163 + - The server acquires a write lock and begins writing the change. 164 + - During or immediately after the write, lightning strikes the server 165 + and destroys it. 
139 166 140 167 Phabricator can not commit changes to a working copy (stored on disk) and to 141 - the global state (stored in a database) atomically, so there is a narrow window 142 - between committing these two different states when some tragedy (like a 143 - lightning strike) can befall a server, leaving the global and local views of 144 - the repository state possibly divergent. 168 + the global state (stored in a database) atomically, so there is necessarily a 169 + narrow window between committing these two different states when some tragedy 170 + can befall a server, leaving the global and local views of the repository state 171 + possibly divergent. 145 172 146 173 In these cases, Phabricator fails into a frozen state where further writes 147 - are not permitted until the failure is investigated and resolved. 174 + are not permitted until the failure is investigated and resolved. When a 175 + repository is frozen in this way it remains readable. 148 176 149 177 You can use the monitoring console to review the state of a frozen repository 150 - with a held write lock. The **Writing** column will show which node is holding 151 - the lock, and whoever is named in the **Last Writer** column may be able to 152 - help you figure out what happened by providing more information about what they 153 - were doing and what they observed. 178 + with a held write lock. The **Writing** column will show which device is 179 + holding the lock, and whoever is named in the **Last Writer** column may be 180 + able to help you figure out what happened by providing more information about 181 + what they were doing and what they observed. 154 182 155 - Because the push was not acknowledged, it is normally safe to demote the node: 156 - the user should have received an error anyway, and should not expect their push 157 - to have worked. 
However, data is technically at risk and you may want to 158 - investigate further and try to understand the issue in more detail before 183 + Because the push was not acknowledged, it is normally safe to resolve this 184 + issue by demoting the device. Demoting the device will undo any changes 185 + committed by the push, and they will be lost forever. 186 + 187 + However, the user should have received an error anyway, and should not expect 188 + their push to have worked. Still, data is technically at risk and you may want 189 + to investigate further and try to understand the issue in more detail before 159 190 continuing. 160 191 161 192 There is no way to explicitly keep the write, but if it was committed to disk 162 - you can recover it manually from the working copy on the device and then push 163 - it again. 193 + you can recover it manually from the working copy on the device (for example, 194 + by using `git format-patch`) and then push it again after recovering. 164 195 165 - If you demote the node, the in-process write will be thrown away, even if it 166 - was complete on disk. To demote the node and release the write lock, run this 196 + If you demote the device, the in-process write will be thrown away, even if it 197 + was complete on disk. To demote the device and release the write lock, run this 167 198 command: 168 199 169 200 ``` 170 - phabricator/ $ ./bin/repository thaw rXYZ --demote repo002.corp.net 201 + phabricator/ $ ./bin/repository thaw <repository> --demote <device> 171 202 ``` 172 203 173 204 {icon exclamation-triangle, color="yellow"} Any committed but unacknowledged ··· 181 212 cluster which have the most up-to-date copy of a repository. This looks like 182 213 this: 183 214 184 - - There is a cluster setup with two nodes, X and Y. 215 + - There is a cluster setup with two devices, X and Y. 185 216 - A new change is pushed to server X. 
186 217 - Before the change can propagate to server Y, lightning strikes server X 187 218 and destroys it. 188 219 189 - Here, all of the "leader" nodes with the most up-to-date copy of the repository 190 - have been lost. Phabricator will refuse to serve this repository because it 191 - can not serve it consistently, and can not accept writes without data loss. 220 + Here, all of the "leader" devices with the most up-to-date copy of the 221 + repository have been lost. Phabricator will freeze the repository and refuse to 222 + serve requests because it can not serve it consistently, and can not accept new 223 + writes without data loss. 192 224 193 225 The most straightforward way to resolve this issue is to restore any leader to 194 - service. The change will be able to replicate to other nodes once a leader 226 + service. The change will be able to replicate to other devices once a leader 195 227 comes back online. 196 228 197 229 If you are unable to restore a leader or unsure that you can restore one ··· 201 233 202 234 If you are comfortable discarding these changes, you can instruct Phabricator 203 235 that it can forget about the leaders in two ways: disable the service bindings 204 - to all of the leader nodes so they are no longer part of the cluster, or 205 - use `bin/repository thaw` to `--demote` the leaders explicitly. 236 + to all of the leader devices so they are no longer part of the cluster, or use 237 + `bin/repository thaw` to `--demote` the leaders explicitly. 206 238 207 239 If you do this, **you will lose data**. Either action will discard any changes 208 - on the affected leaders which have not replicated to other nodes in the cluster. 240 + on the affected leaders which have not replicated to other devices in the 241 + cluster. 209 242 210 - To demote a device, run this command: 243 + To remove a device from the cluster, disable all of the bindings to it 244 + in Almanac, using the web UI. 
245 + 246 + {icon exclamation-triangle, color="red"} Any data which is only present on 247 + the disabled device will be lost. 248 + 249 + To demote a device without removing it from the cluster, run this command: 211 250 212 251 ``` 213 252 phabricator/ $ ./bin/repository thaw rXYZ --demote repo002.corp.net ··· 220 259 Ambiguous Leaders 221 260 ================= 222 261 223 - Repository clusters can also freeze if the leader nodes are ambiguous. This 262 + Repository clusters can also freeze if the leader devices are ambiguous. This 224 263 can happen if you replace an entire cluster with new devices suddenly, or 225 - make a mistake with the `--demote` flag. 264 + make a mistake with the `--demote` flag. This generally arises from some kind 265 + of operator error, like this: 226 266 227 - When Phabricator can not tell which node in a cluster is a leader, it freezes 228 - the cluster because it is possible that some nodes have less data and others 267 + - Someone accidentally uses `bin/repository thaw ... --demote` to demote 268 + every device in a cluster. 269 + - Someone accidentally deletes all the version information for a repository 270 + from the database by making a mistake with a `DELETE` or `UPDATE` query. 271 + - Someone accidentally disables all of the devices in a cluster, then adds 272 + entirely new ones before repositories can propagate. 273 + 274 + When Phabricator can not tell which device in a cluster is a leader, it freezes 275 + the cluster because it is possible that some devices have less data and others 229 276 have more, and if it chooses a leader arbitrarily it may destroy some data 230 277 which you would prefer to retain. 231 278 232 - To resolve this, you need to tell Phabricator which node has the most 233 - up-to-date data and promote that node to become a leader. If you do this, 234 - **you may lose data** if you promote the wrong node, and some other node 235 - really had more up-to-date data. 
If you want to double check, you can examine 236 - the working copies on disk before promoting, by connecting to the machines and 237 - using commands like `git log` to inspect state. 279 + To resolve this, you need to tell Phabricator which device has the most 280 + up-to-date data and promote that device to become a leader. If you know all 281 + devices have the same data, you are free to promote any device. 238 282 239 - Once you have identified a node which has data you're happy with, use 240 - `bin/repository thaw` to `--promote` the device: 283 + If you promote a device, **you may lose data** if you promote the wrong device 284 + and some other device really had more up-to-date data. If you want to double 285 + check, you can examine the working copies on disk before promoting by 286 + connecting to the machines and using commands like `git log` to inspect state. 287 + 288 + Once you have identified a device which has data you're happy with, use 289 + `bin/repository thaw` to `--promote` the device. The data on the chosen 290 + device will become authoritative: 241 291 242 292 ``` 243 293 phabricator/ $ ./bin/repository thaw rXYZ --promote repo002.corp.net