@recaptime-dev's working patches + fork for Phorge, a community fork of Phabricator. (Upstream dev and stable branches are at upstream/main and upstream/stable respectively.) hq.recaptime.dev/wiki/Phorge
phorge phabricator
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

Automatically degrade to read-only mode when unable to connect to the master

Summary:
Ref T4571. If we fail to connect to the master, automatically try to degrade into a temporary read-only mode ("UNREACHABLE") for the remainder of the request, if possible.

If the request was something like "load the homepage", that'll work fine. If it was something like "submit a comment", there's nothing we can do and we just have to fail.

Detecting this condition imposes a performance penalty: every request checks the connection and gives the database a long time to respond, since we don't want to drop writes unless we have to. So the degraded mode works, but it's really slow, and may perpetuate the problem if the root issue is load-related.

This lays the groundwork for improving this case by degrading further into a "SEVERED" mode which will persist across requests. In the future, if several requests in a short period of time fail, we'll sever the database host and refuse to try to connect to it for a little while, connecting directly to replicas instead (basically, we're "health checking" the master, like a load balancer would health check a web application server). This will give us a better (much faster) degraded mode in a major service disruption, and reduce load on the master if the root cause is load-related, giving it a better chance of recovering on its own.

Test Plan:
- Disabled master in config by changing the host/username, got degraded automatically to UNREACHABLE mode immediately.
- Faked full SEVERED mode, requests hit replicas and put me in the mode properly.
- Made stuff work, hit some good pages.
- Hit some non-cluster pages.

Reviewers: chad

Reviewed By: chad

Maniphest Tasks: T4571

Differential Revision: https://secure.phabricator.com/D15674

+209 -35
+4
src/__phutil_library_map__.php
··· 1989 1989 'PhabricatorClusterDatabasesConfigOptionType' => 'infrastructure/cluster/PhabricatorClusterDatabasesConfigOptionType.php', 1990 1990 'PhabricatorClusterException' => 'infrastructure/cluster/PhabricatorClusterException.php', 1991 1991 'PhabricatorClusterExceptionHandler' => 'infrastructure/cluster/PhabricatorClusterExceptionHandler.php', 1992 + 'PhabricatorClusterImpossibleWriteException' => 'infrastructure/cluster/PhabricatorClusterImpossibleWriteException.php', 1992 1993 'PhabricatorClusterImproperWriteException' => 'infrastructure/cluster/PhabricatorClusterImproperWriteException.php', 1994 + 'PhabricatorClusterStrandedException' => 'infrastructure/cluster/PhabricatorClusterStrandedException.php', 1993 1995 'PhabricatorColumnProxyInterface' => 'applications/project/interface/PhabricatorColumnProxyInterface.php', 1994 1996 'PhabricatorColumnsEditField' => 'applications/transactions/editfield/PhabricatorColumnsEditField.php', 1995 1997 'PhabricatorCommentEditEngineExtension' => 'applications/transactions/engineextension/PhabricatorCommentEditEngineExtension.php', ··· 6402 6404 'PhabricatorClusterDatabasesConfigOptionType' => 'PhabricatorConfigJSONOptionType', 6403 6405 'PhabricatorClusterException' => 'Exception', 6404 6406 'PhabricatorClusterExceptionHandler' => 'PhabricatorRequestExceptionHandler', 6407 + 'PhabricatorClusterImpossibleWriteException' => 'PhabricatorClusterException', 6405 6408 'PhabricatorClusterImproperWriteException' => 'PhabricatorClusterException', 6409 + 'PhabricatorClusterStrandedException' => 'PhabricatorClusterException', 6406 6410 'PhabricatorColumnsEditField' => 'PhabricatorPHIDListEditField', 6407 6411 'PhabricatorCommentEditEngineExtension' => 'PhabricatorEditEngineExtension', 6408 6412 'PhabricatorCommentEditField' => 'PhabricatorEditField',
+63
src/applications/system/controller/PhabricatorSystemReadOnlyController.php
··· 8 8 } 9 9 10 10 public function handleRequest(AphrontRequest $request) { 11 + $viewer = $this->getViewer(); 11 12 $reason = $request->getURIData('reason'); 12 13 13 14 $body = array(); ··· 48 49 phutil_tag('tt', array(), 'cluster.databases')); 49 50 $button = pht('Wait Patiently'); 50 51 break; 52 + case PhabricatorEnv::READONLY_UNREACHABLE: 53 + $title = pht('Unable to Reach Master'); 54 + $body[] = pht( 55 + 'Phabricator was unable to connect to the writable ("master") '. 56 + 'database while handling this request, and automatically degraded '. 57 + 'into read-only mode.'); 58 + $body[] = pht( 59 + 'This may happen if there is a temporary network anomaly on the '. 60 + 'server side, like cosmic radiation or spooky ghosts. If this '. 61 + 'failure was caused by a transient service interruption, '. 62 + 'Phabricator will recover momentarily.'); 63 + $body[] = pht( 64 + 'This may also indicate that a more serious failure has occurred. '. 65 + 'If this interruption does not resolve on its own, Phabricator '. 66 + 'will soon detect the persistent disruption and degrade into '. 67 + 'read-only mode until the issue is resolved.'); 68 + $button = pht('Quite Unsettling'); 69 + break; 70 + case PhabricatorEnv::READONLY_SEVERED: 71 + $title = pht('Severed From Master'); 72 + $body[] = pht( 73 + 'Phabricator has consistently been unable to reach the writable '. 74 + '("master") database while processing recent requests.'); 75 + $body[] = pht( 76 + 'This likely indicates a severe misconfiguration or major service '. 77 + 'interruption.'); 78 + $body[] = pht( 79 + 'Phabricator will periodically retry the connection and recover '. 80 + 'once service is restored. Most causes of persistent service '. 81 + 'interruption will require administrative intervention in order '. 82 + 'to restore service.'); 83 + $body[] = pht( 84 + 'Although this may be the result of a misconfiguration or '. 85 + 'operational error, this is also the state you reach if a '. 
86 + 'meteor recently obliterated a datacenter.'); 87 + $button = pht('Panic!'); 88 + break; 51 89 default: 52 90 return new Aphront404Response(); 53 91 } 54 92 93 + switch ($reason) { 94 + case PhabricatorEnv::READONLY_UNREACHABLE: 95 + case PhabricatorEnv::READONLY_SEVERED: 96 + $body[] = pht( 97 + 'This request was served from a replica database. Replica '. 98 + 'databases may lag behind the master, so very recent activity '. 99 + 'may not be reflected in the UI. This data will be restored if '. 100 + 'the master database is restored, but may have been lost if the '. 101 + 'master database has been reduced to a pile of ash.'); 102 + break; 103 + } 104 + 55 105 $body[] = pht( 56 106 'In read-only mode you can read existing information, but you will not '. 57 107 'be able to edit objects or create new objects until this mode is '. 58 108 'disabled.'); 109 + 110 + if ($viewer->getIsAdmin()) { 111 + $body[] = pht( 112 + 'As an administrator, you can review status information from the '. 113 + '%s control panel. This may provide more information about the '. 114 + 'current state of affairs.', 115 + phutil_tag( 116 + 'a', 117 + array( 118 + 'href' => '/config/cluster/databases/', 119 + ), 120 + pht('Cluster Database Status'))); 121 + } 59 122 60 123 $dialog = $this->newDialog() 61 124 ->setTitle($title)
+5 -1
src/infrastructure/cluster/PhabricatorClusterExceptionHandler.php
··· 25 25 26 26 $title = $ex->getExceptionTitle(); 27 27 28 - return id(new AphrontDialogView()) 28 + $dialog = id(new AphrontDialogView()) 29 29 ->setTitle($title) 30 30 ->setUser($viewer) 31 31 ->appendParagraph($ex->getMessage()) 32 32 ->addCancelButton('/', pht('Proceed With Caution')); 33 + 34 + return id(new AphrontDialogResponse()) 35 + ->setDialog($dialog) 36 + ->setHTTPResponseCode(500); 33 37 } 34 38 35 39 }
+10
src/infrastructure/cluster/PhabricatorClusterImpossibleWriteException.php
··· 1 + <?php 2 + 3 + final class PhabricatorClusterImpossibleWriteException 4 + extends PhabricatorClusterException { 5 + 6 + public function getExceptionTitle() { 7 + return pht('Impossible Cluster Write'); 8 + } 9 + 10 + }
+10
src/infrastructure/cluster/PhabricatorClusterStrandedException.php
··· 1 + <?php 2 + 3 + final class PhabricatorClusterStrandedException 4 + extends PhabricatorClusterException { 5 + 6 + public function getExceptionTitle() { 7 + return pht('Unable to Reach Any Database'); 8 + } 9 + 10 + }
+71 -29
src/infrastructure/cluster/PhabricatorDatabaseRef.php
··· 13 13 const REPLICATION_REPLICA_NONE = 'replica-none'; 14 14 const REPLICATION_SLOW = 'replica-slow'; 15 15 16 + const KEY_REFS = 'cluster.db.refs'; 17 + 16 18 private $host; 17 19 private $port; 18 20 private $user; ··· 27 29 private $replicaStatus; 28 30 private $replicaMessage; 29 31 private $replicaDelay; 32 + 33 + private $didFailToConnect; 30 34 31 35 public function setHost($host) { 32 36 $this->host = $host; ··· 190 194 ); 191 195 } 192 196 193 - public static function loadAll() { 197 + public static function getLiveRefs() { 198 + $cache = PhabricatorCaches::getRequestCache(); 199 + 200 + $refs = $cache->getKey(self::KEY_REFS); 201 + if (!$refs) { 202 + $refs = self::newRefs(); 203 + $cache->setKey(self::KEY_REFS, $refs); 204 + } 205 + 206 + return $refs; 207 + } 208 + 209 + public static function newRefs() { 194 210 $refs = array(); 195 211 196 212 $default_port = PhabricatorEnv::getEnvConfig('mysql.port'); ··· 232 248 } 233 249 234 250 public static function queryAll() { 235 - $refs = self::loadAll(); 251 + $refs = self::newRefs(); 236 252 237 253 foreach ($refs as $ref) { 238 254 if ($ref->getDisabled()) { ··· 242 258 $conn = $ref->newManagementConnection(); 243 259 244 260 $t_start = microtime(true); 261 + $replica_status = false; 245 262 try { 246 263 $replica_status = queryfx_one($conn, 'SHOW SLAVE STATUS'); 247 264 $ref->setConnectionStatus(self::STATUS_OKAY); ··· 269 286 $t_end = microtime(true); 270 287 $ref->setConnectionLatency($t_end - $t_start); 271 288 272 - $is_replica = (bool)$replica_status; 273 - if ($ref->getIsMaster() && $is_replica) { 274 - $ref->setReplicaStatus(self::REPLICATION_MASTER_REPLICA); 275 - $ref->setReplicaMessage( 276 - pht( 277 - 'This host has a "master" role, but is replicating data from '. 
278 - 'another host ("%s")!', 279 - idx($replica_status, 'Master_Host'))); 280 - } else if (!$ref->getIsMaster() && !$is_replica) { 281 - $ref->setReplicaStatus(self::REPLICATION_REPLICA_NONE); 282 - $ref->setReplicaMessage( 283 - pht( 284 - 'This host has a "replica" role, but is not replicating data '. 285 - 'from a master (no output from "SHOW SLAVE STATUS").')); 286 - } else { 287 - $ref->setReplicaStatus(self::REPLICATION_OKAY); 288 - } 289 - 290 - if ($is_replica) { 291 - $latency = (int)idx($replica_status, 'Seconds_Behind_Master'); 292 - $ref->setReplicaDelay($latency); 293 - if ($latency > 30) { 294 - $ref->setReplicaStatus(self::REPLICATION_SLOW); 289 + if ($replica_status !== false) { 290 + $is_replica = (bool)$replica_status; 291 + if ($ref->getIsMaster() && $is_replica) { 292 + $ref->setReplicaStatus(self::REPLICATION_MASTER_REPLICA); 295 293 $ref->setReplicaMessage( 296 294 pht( 297 - 'This replica is lagging far behind the master. Data is at '. 298 - 'risk!')); 295 + 'This host has a "master" role, but is replicating data from '. 296 + 'another host ("%s")!', 297 + idx($replica_status, 'Master_Host'))); 298 + } else if (!$ref->getIsMaster() && !$is_replica) { 299 + $ref->setReplicaStatus(self::REPLICATION_REPLICA_NONE); 300 + $ref->setReplicaMessage( 301 + pht( 302 + 'This host has a "replica" role, but is not replicating data '. 303 + 'from a master (no output from "SHOW SLAVE STATUS").')); 304 + } else { 305 + $ref->setReplicaStatus(self::REPLICATION_OKAY); 306 + } 307 + 308 + if ($is_replica) { 309 + $latency = (int)idx($replica_status, 'Seconds_Behind_Master'); 310 + $ref->setReplicaDelay($latency); 311 + if ($latency > 30) { 312 + $ref->setReplicaStatus(self::REPLICATION_SLOW); 313 + $ref->setReplicaMessage( 314 + pht( 315 + 'This replica is lagging far behind the master. Data is at '. 
316 + 'risk!')); 317 + } 299 318 } 300 319 } 301 320 } ··· 318 337 )); 319 338 } 320 339 340 + public function isSevered() { 341 + return $this->didFailToConnect; 342 + } 343 + 344 + public function isReachable(AphrontDatabaseConnection $connection) { 345 + if ($this->isSevered()) { 346 + return false; 347 + } 348 + 349 + try { 350 + $connection->openConnection(); 351 + $reachable = true; 352 + } catch (Exception $ex) { 353 + $reachable = false; 354 + } 355 + 356 + if (!$reachable) { 357 + $this->didFailToConnect = true; 358 + } 359 + 360 + return $reachable; 361 + } 362 + 321 363 public static function getMasterDatabaseRef() { 322 - $refs = self::loadAll(); 364 + $refs = self::getLiveRefs(); 323 365 324 366 if (!$refs) { 325 367 $conf = PhabricatorEnv::newObjectFromConfig( ··· 348 390 } 349 391 350 392 public static function getReplicaDatabaseRef() { 351 - $refs = self::loadAll(); 393 + $refs = self::getLiveRefs(); 352 394 353 395 if (!$refs) { 354 396 return null;
+10
src/infrastructure/env/PhabricatorEnv.php
··· 60 60 private static $readOnlyReason; 61 61 62 62 const READONLY_CONFIG = 'config'; 63 + const READONLY_UNREACHABLE = 'unreachable'; 64 + const READONLY_SEVERED = 'severed'; 63 65 const READONLY_MASTERLESS = 'masterless'; 64 66 65 67 /** ··· 217 219 $master = PhabricatorDatabaseRef::getMasterDatabaseRef(); 218 220 if (!$master) { 219 221 self::setReadOnly(true, self::READONLY_MASTERLESS); 222 + } else if ($master->isSevered()) { 223 + self::setReadOnly(true, self::READONLY_SEVERED); 220 224 } 221 225 222 226 try { ··· 468 472 return pht( 469 473 'Phabricator is in read-only mode (no writable database '. 470 474 'is configured).'); 475 + case self::READONLY_UNREACHABLE: 476 + return pht( 477 + 'Phabricator is in read-only mode (unreachable master).'); 478 + case self::READONLY_SEVERED: 479 + return pht( 480 + 'Phabricator is in read-only mode (major interruption).'); 471 481 } 472 482 473 483 return pht('Phabricator is in read-only mode.');
+36 -5
src/infrastructure/storage/lisk/PhabricatorLiskDAO.php
··· 60 60 $this->raiseImproperWrite($database); 61 61 } 62 62 63 - $refs = PhabricatorDatabaseRef::loadAll(); 64 - if ($refs) { 63 + $is_cluster = (bool)PhabricatorEnv::getEnvConfig('cluster.databases'); 64 + if ($is_cluster) { 65 65 $connection = $this->newClusterConnection($database, $mode); 66 66 } else { 67 67 $connection = $this->newBasicConnection($database, $mode, $namespace); ··· 99 99 100 100 private function newClusterConnection($database, $mode) { 101 101 $master = PhabricatorDatabaseRef::getMasterDatabaseRef(); 102 - if ($master) { 103 - return $master->newApplicationConnection($database); 102 + 103 + if ($master && !$master->isSevered()) { 104 + $connection = $master->newApplicationConnection($database); 105 + if ($master->isReachable($connection)) { 106 + return $connection; 107 + } else { 108 + if ($mode == 'w') { 109 + $this->raiseImpossibleWrite($database); 110 + } 111 + PhabricatorEnv::setReadOnly( 112 + true, 113 + PhabricatorEnv::READONLY_UNREACHABLE); 114 + } 104 115 } 105 116 106 117 $replica = PhabricatorDatabaseRef::getReplicaDatabaseRef(); ··· 111 122 112 123 $connection = $replica->newApplicationConnection($database); 113 124 $connection->setReadOnly(true); 125 + if ($replica->isReachable($connection)) { 126 + return $connection; 127 + } 114 128 115 - return $connection; 129 + $this->raiseUnreachable($database); 116 130 } 117 131 118 132 private function raiseImproperWrite($database) { ··· 121 135 'Unable to establish a write-mode connection (to application '. 122 136 'database "%s") because Phabricator is in read-only mode. Whatever '. 123 137 'you are trying to do does not function correctly in read-only mode.', 138 + $database)); 139 + } 140 + 141 + private function raiseImpossibleWrite($database) { 142 + throw new PhabricatorClusterImpossibleWriteException( 143 + pht( 144 + 'Unable to connect to master database ("%s"). This is a severe '. 
145 + 'failure; your request did not complete.', 146 + $database)); 147 + } 148 + 149 + private function raiseUnreachable($database) { 150 + throw new PhabricatorClusterStrandedException( 151 + pht( 152 + 'Unable to establish a connection to ANY database host '. 153 + '(while trying "%s"). All masters and replicas are completely '. 154 + 'unreachable.', 124 155 $database)); 125 156 } 126 157