@recaptime-dev's working patches + fork for Phorge, a community fork of Phabricator. (Upstream dev and stable branches are at upstream/main and upstream/stable respectively.) hq.recaptime.dev/wiki/Phorge
phorge phabricator
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

On Git cluster read failure, retry safe requests

Summary:
Depends on D20775. Ref T13286. When a Git read request fails against a cluster and there are other nodes we could safely try, try more nodes.

We DO NOT retry the request if:

- the client read anything;
- the client wrote anything;
- or we've already retried several times.

Although //some// requests where bytes went over the wire in either direction may be safe to retry, they're rare in practice under Git, and we'd need to puzzle out what state we can safely emit.

Since most types of failure result in an outright connection failure and this catches all of them, it's likely to almost always be sufficient in practice.

Test Plan:
- Started a cluster with one up node and one down node, pulled it.
- Half the time, hit the up node and got a clean pull.
- Half the time, hit the down node and got a connection failure followed by a retry and a clean pull.
- Forced `$err = 1` so even successful attempts would retry.
- On hitting the up node, got a "failure" and a decline to retry (bytes already written).
- On hitting the down node, got a failure and a real retry.
- (Note that, in both cases, "git pull" exits "0" after the valid wire transaction takes place, even though the remote exited non-zero. If the server gave Git everything it asked for, it doesn't seem to care if the server then exited with an error code.)

Maniphest Tasks: T13286

Differential Revision: https://secure.phabricator.com/D20776

+163 -38
+18
src/applications/diffusion/ssh/DiffusionGitSSHWorkflow.php
··· 8 8 private $protocolLog; 9 9 10 10 private $wireProtocol; 11 + private $ioBytesRead = 0; 12 + private $ioBytesWritten = 0; 11 13 12 14 protected function writeError($message) { 13 15 // Git assumes we'll add our own newlines. ··· 98 100 PhabricatorSSHPassthruCommand $command, 99 101 $message) { 100 102 103 + $this->ioBytesWritten += strlen($message); 104 + 101 105 $log = $this->getProtocolLog(); 102 106 if ($log) { 103 107 $log->didWriteBytes($message); ··· 125 129 $message = $protocol->willReadBytes($message); 126 130 } 127 131 132 + // Note that bytes aren't counted until they're emittted by the protocol 133 + // layer. This means the underlying command might emit bytes, but if they 134 + // are buffered by the protocol layer they won't count as read bytes yet. 135 + 136 + $this->ioBytesRead += strlen($message); 137 + 128 138 return $message; 139 + } 140 + 141 + final protected function getIOBytesRead() { 142 + return $this->ioBytesRead; 143 + } 144 + 145 + final protected function getIOBytesWritten() { 146 + return $this->ioBytesWritten; 129 147 } 130 148 131 149 }
+145 -38
src/applications/diffusion/ssh/DiffusionGitUploadPackSSHWorkflow.php
··· 1 1 <?php 2 2 3 - final class DiffusionGitUploadPackSSHWorkflow extends DiffusionGitSSHWorkflow { 3 + final class DiffusionGitUploadPackSSHWorkflow 4 + extends DiffusionGitSSHWorkflow { 5 + 6 + private $requestAttempts = 0; 7 + private $requestFailures = 0; 4 8 5 9 protected function didConstruct() { 6 10 $this->setName('git-upload-pack'); ··· 14 18 } 15 19 16 20 protected function executeRepositoryOperations() { 21 + $is_proxy = $this->shouldProxy(); 22 + if ($is_proxy) { 23 + return $this->executeRepositoryProxyOperations(); 24 + } 25 + 26 + $viewer = $this->getSSHUser(); 17 27 $repository = $this->getRepository(); 18 - $viewer = $this->getSSHUser(); 19 28 $device = AlmanacKeys::getLiveDevice(); 20 29 21 30 $skip_sync = $this->shouldSkipReadSynchronization(); 22 - $is_proxy = $this->shouldProxy(); 23 31 24 - if ($is_proxy) { 25 - $command = $this->getProxyCommand(false); 32 + $command = csprintf('git-upload-pack -- %s', $repository->getLocalPath()); 33 + if (!$skip_sync) { 34 + $cluster_engine = id(new DiffusionRepositoryClusterEngine()) 35 + ->setViewer($viewer) 36 + ->setRepository($repository) 37 + ->setLog($this) 38 + ->synchronizeWorkingCopyBeforeRead(); 26 39 27 40 if ($device) { 28 41 $this->writeClusterEngineLogMessage( 29 42 pht( 30 - "# Fetch received by \"%s\", forwarding to cluster host.\n", 43 + "# Cleared to fetch on cluster host \"%s\".\n", 31 44 $device->getName())); 32 45 } 33 - } else { 34 - $command = csprintf('git-upload-pack -- %s', $repository->getLocalPath()); 35 - if (!$skip_sync) { 36 - $cluster_engine = id(new DiffusionRepositoryClusterEngine()) 37 - ->setViewer($viewer) 38 - ->setRepository($repository) 39 - ->setLog($this) 40 - ->synchronizeWorkingCopyBeforeRead(); 46 + } 41 47 42 - if ($device) { 43 - $this->writeClusterEngineLogMessage( 44 - pht( 45 - "# Cleared to fetch on cluster host \"%s\".\n", 46 - $device->getName())); 47 - } 48 - } 49 - } 50 48 $command = PhabricatorDaemon::sudoCommandAsDaemonUser($command); 51 49 52 50 $pull_event = $this->newPullEvent(); ··· 60 58 $log->didStartSession($command); 61 59 } 62 60 63 - if (!$is_proxy) { 64 - if (PhabricatorEnv::getEnvConfig('phabricator.show-prototypes')) { 65 - $protocol = new DiffusionGitUploadPackWireProtocol(); 66 - if ($log) { 67 - $protocol->setProtocolLog($log); 68 - } 69 - $this->setWireProtocol($protocol); 61 + if (PhabricatorEnv::getEnvConfig('phabricator.show-prototypes')) { 62 + $protocol = new DiffusionGitUploadPackWireProtocol(); 63 + if ($log) { 64 + $protocol->setProtocolLog($log); 70 65 } 66 + $this->setWireProtocol($protocol); 71 67 } 72 68 73 69 $err = $this->newPassthruCommand() ··· 89 85 ->setResultCode(0); 90 86 } 91 87 92 - // TODO: Currently, when proxying, we do not write a log on the proxy. 93 - // Perhaps we should write a "proxy log". This is not very useful for 94 - // statistics or auditing, but could be useful for diagnostics. Marking 95 - // the proxy logs as proxied (and recording devicePHID on all logs) would 96 - // make differentiating between these use cases easier. 97 - 98 - if (!$is_proxy) { 99 - $pull_event->save(); 100 - } 88 + $pull_event->save(); 101 89 102 90 if (!$err) { 103 91 $this->waitForGitClient(); 104 92 } 105 93 106 94 return $err; 95 + } 96 + 97 + private function executeRepositoryProxyOperations() { 98 + $device = AlmanacKeys::getLiveDevice(); 99 + $for_write = false; 100 + 101 + $refs = $this->getAlmanacServiceRefs($for_write); 102 + $err = 1; 103 + 104 + while (true) { 105 + $ref = head($refs); 106 + 107 + $command = $this->getProxyCommandForServiceRef($ref); 108 + 109 + if ($device) { 110 + $this->writeClusterEngineLogMessage( 111 + pht( 112 + "# Fetch received by \"%s\", forwarding to cluster host \"%s\".\n", 113 + $device->getName(), 114 + $ref->getDeviceName())); 115 + } 116 + 117 + $command = PhabricatorDaemon::sudoCommandAsDaemonUser($command); 118 + 119 + $future = id(new ExecFuture('%C', $command)) 120 + ->setEnv($this->getEnvironment()); 121 + 122 + $this->didBeginRequest(); 123 + 124 + $err = $this->newPassthruCommand() 125 + ->setIOChannel($this->getIOChannel()) 126 + ->setCommandChannelFromExecFuture($future) 127 + ->execute(); 128 + 129 + $err = 1; 130 + 131 + // TODO: Currently, when proxying, we do not write an event log on the 132 + // proxy. Perhaps we should write a "proxy log". This is not very useful 133 + // for statistics or auditing, but could be useful for diagnostics. 134 + // Marking the proxy logs as proxied (and recording devicePHID on all 135 + // logs) would make differentiating between these use cases easier. 136 + 137 + if (!$err) { 138 + $this->waitForGitClient(); 139 + return $err; 140 + } 141 + 142 + // Throw away this service: the request failed and we're treating the 143 + // failure as persistent, so we don't want to retry another request to 144 + // the same host. 145 + array_shift($refs); 146 + 147 + // Check if we have more services we can try. If we do, we'll make an 148 + // effort to fall back to them below. If not, we can't do anything to 149 + // recover so just bail out. 150 + if (!$refs) { 151 + return $err; 152 + } 153 + 154 + $should_retry = $this->shouldRetryRequest(); 155 + if (!$should_retry) { 156 + return $err; 157 + } 158 + 159 + // If we haven't bailed out yet, we'll retry the request with the next 160 + // service. 161 + } 162 + 163 + throw new Exception(pht('Reached an unreachable place.')); 164 + } 165 + 166 + private function didBeginRequest() { 167 + $this->requestAttempts++; 168 + return $this; 169 + } 170 + 171 + private function shouldRetryRequest() { 172 + $this->requestFailures++; 173 + 174 + if ($this->requestFailures > $this->requestAttempts) { 175 + throw new Exception( 176 + pht( 177 + "Workflow has recorded more failures than attempts; there is a ". 178 + "missing call to \"didBeginRequest()\".\n")); 179 + } 180 + 181 + $max_failures = 3; 182 + if ($this->requestFailures >= $max_failures) { 183 + $this->writeClusterEngineLogMessage( 184 + pht( 185 + "# Reached maximum number of retry attempts, giving up.\n")); 186 + return false; 187 + } 188 + 189 + $read_len = $this->getIOBytesRead(); 190 + if ($read_len) { 191 + $this->writeClusterEngineLogMessage( 192 + pht( 193 + "# Client already read from service (%s bytes), unable to retry.\n", 194 + new PhutilNumber($read_len))); 195 + return false; 196 + } 197 + 198 + $write_len = $this->getIOBytesWritten(); 199 + if ($write_len) { 200 + $this->writeClusterEngineLogMessage( 201 + pht( 202 + "# Client already wrote to service (%s bytes), unable to retry.\n", 203 + new PhutilNumber($write_len))); 204 + return false; 205 + } 206 + 207 + $this->writeClusterEngineLogMessage( 208 + pht( 209 + "# Service request failed, retrying (making attempt %s of %s).\n", 210 + new PhutilNumber($this->requestAttempts + 1), 211 + new PhutilNumber($max_failures))); 212 + 213 + return true; 107 214 } 108 215 109 216 }