@recaptime-dev's working patches + fork for Phorge, a community fork of Phabricator. (Upstream dev and stable branches are at upstream/main and upstream/stable respectively.) hq.recaptime.dev/wiki/Phorge
phorge phabricator
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

Automatically sever databases after prolonged unreachability

Summary:
Ref T4571. When a database goes down briefly, we fall back to replicas.

However, this fallback is slow (not good for users) and keeps sending a lot of traffic to the master (might be bad if the root cause is load-related).

Keep track of recent connections and fully degrade into "severed" mode if we see a sequence of failures over a reasonable period of time. In this mode, we send much less traffic to the master (faster for users; less load for the database).

We do still send a little bit of traffic, and if the master recovers we'll recover back into normal mode after seeing several connections in a row succeed.

This is similar to what most load balancers do when pulling web servers in and out of pools.

For now, the specific numbers are:

- We do at most one health check every 3 seconds.
- If 5 checks in a row fail or succeed, we sever or un-sever the database (so it takes about 15 seconds to switch modes).
- If the database is currently marked unhealthy, we reduce timeouts and retries when connecting to it.

Test Plan:
- Configured a bad `master`.
- Browsed around for a bit, initially saw "unreachable master" errors.
- After about 15 seconds, saw "major interruption" errors instead.
- Fixed the config for `master`.
- Browsed around for a while longer.
- After about 15 seconds, things recovered.
- Used "Cluster Databases" console to keep an eye on health checks: it now shows how many recent health checks were good:

{F1213397}

Reviewers: chad

Reviewed By: chad

Maniphest Tasks: T4571

Differential Revision: https://secure.phabricator.com/D15677

+278 -6
+2
src/__phutil_library_map__.php
··· 2241 2241 'PhabricatorDashboardViewController' => 'applications/dashboard/controller/PhabricatorDashboardViewController.php', 2242 2242 'PhabricatorDataCacheSpec' => 'applications/cache/spec/PhabricatorDataCacheSpec.php', 2243 2243 'PhabricatorDataNotAttachedException' => 'infrastructure/storage/lisk/PhabricatorDataNotAttachedException.php', 2244 + 'PhabricatorDatabaseHealthRecord' => 'infrastructure/cluster/PhabricatorDatabaseHealthRecord.php', 2244 2245 'PhabricatorDatabaseRef' => 'infrastructure/cluster/PhabricatorDatabaseRef.php', 2245 2246 'PhabricatorDatabaseSetupCheck' => 'applications/config/check/PhabricatorDatabaseSetupCheck.php', 2246 2247 'PhabricatorDatasourceEditField' => 'applications/transactions/editfield/PhabricatorDatasourceEditField.php', ··· 6697 6698 'PhabricatorDashboardViewController' => 'PhabricatorDashboardController', 6698 6699 'PhabricatorDataCacheSpec' => 'PhabricatorCacheSpec', 6699 6700 'PhabricatorDataNotAttachedException' => 'Exception', 6701 + 'PhabricatorDatabaseHealthRecord' => 'Phobject', 6700 6702 'PhabricatorDatabaseRef' => 'Phobject', 6701 6703 'PhabricatorDatabaseSetupCheck' => 'PhabricatorSetupCheck', 6702 6704 'PhabricatorDatasourceEditField' => 'PhabricatorTokenizerEditField',
+5
src/applications/cache/PhabricatorCaches.php
··· 174 174 * @task setup 175 175 */ 176 176 private static function buildSetupCaches() { 177 + // If this is the CLI, just build a setup cache. 178 + if (php_sapi_name() == 'cli') { 179 + return array(); 180 + } 181 + 177 182 // In most cases, we should have APC. This is an ideal cache for our 178 183 // purposes -- it's fast and empties on server restart. 179 184 $apc = new PhutilAPCKeyValueCache();
+27
src/applications/config/controller/PhabricatorConfigClusterDatabasesController.php
··· 115 115 $replica_label, 116 116 ); 117 117 118 + $health = $database->getHealthRecord(); 119 + $health_up = $health->getUpEventCount(); 120 + $health_down = $health->getDownEventCount(); 121 + 122 + if ($health->getIsHealthy()) { 123 + $health_icon = id(new PHUIIconView()) 124 + ->setIcon('fa-plus green'); 125 + } else { 126 + $health_icon = id(new PHUIIconView()) 127 + ->setIcon('fa-times red'); 128 + } 129 + 130 + $health_count = pht( 131 + '%s / %s', 132 + new PhutilNumber($health_up), 133 + new PhutilNumber($health_up + $health_down)); 134 + 135 + $health_status = array( 136 + $health_icon, 137 + ' ', 138 + $health_count, 139 + ); 140 + 118 141 $messages = array(); 119 142 120 143 $conn_message = $database->getConnectionMessage(); ··· 136 159 $database->getUser(), 137 160 $connection, 138 161 $replication, 162 + $health_status, 139 163 $messages, 140 164 ); 141 165 } 142 166 167 + 143 168 $table = id(new AphrontTableView($rows)) 144 169 ->setNoDataString( 145 170 pht('Phabricator is not configured in cluster mode.')) ··· 151 176 pht('User'), 152 177 pht('Connection'), 153 178 pht('Replication'), 179 + pht('Health'), 154 180 pht('Messages'), 155 181 )) 156 182 ->setColumnClasses( 157 183 array( 184 + null, 158 185 null, 159 186 null, 160 187 null,
+185
src/infrastructure/cluster/PhabricatorDatabaseHealthRecord.php
<?php

/**
 * Rolling health record for a single cluster database.
 *
 * The record is kept in the setup cache under a key derived from the
 * database host and port. It stores the current up/down state, the time a
 * health check was last scheduled, and a short log of recent check results.
 * The state only flips after @{method:getRequiredEventCount} consecutive
 * checks agree.
 */
final class PhabricatorDatabaseHealthRecord
  extends Phobject {

  private $ref;
  private $shouldCheck;
  private $isHealthy;
  private $upEventCount;
  private $downEventCount;

  /**
   * Load the health record for a database ref.
   *
   * Constructing the record reads the cached state and, if enough time has
   * passed since the last scheduled check, claims the next health check for
   * this request (see @{method:getShouldCheck}).
   *
   * @param PhabricatorDatabaseRef Database to track health for.
   */
  public function __construct(PhabricatorDatabaseRef $ref) {
    $this->ref = $ref;
    $this->readState();
  }


  /**
   * Is the database currently healthy?
   *
   * @return bool True if the database is marked healthy.
   */
  public function getIsHealthy() {
    return $this->isHealthy;
  }


  /**
   * Should this request check database health?
   *
   * @return bool True if this request should perform a health check.
   */
  public function getShouldCheck() {
    return $this->shouldCheck;
  }


  /**
   * How many recent health checks were successful?
   *
   * @return int Number of "up" events in the recent log.
   */
  public function getUpEventCount() {
    return $this->upEventCount;
  }


  /**
   * How many recent health checks failed?
   *
   * @return int Number of "down" events in the recent log.
   */
  public function getDownEventCount() {
    return $this->downEventCount;
  }


  /**
   * Number of failures or successes we need to see in a row before we change
   * the state.
   *
   * @return int Required number of consecutive matching events.
   */
  public function getRequiredEventCount() {
    return 5;
  }


  /**
   * Seconds to wait between health checks.
   *
   * @return int Minimum number of seconds between checks.
   */
  public function getHealthCheckFrequency() {
    return 3;
  }


  /**
   * Record the result of a health check.
   *
   * Appends the result to the log, trims the log to the most recent
   * @{method:getRequiredEventCount} events, and flips the stored state if
   * every remaining event agrees. If another event was already logged within
   * the check frequency window, this result is discarded.
   *
   * @param bool True if the database was reachable.
   * @return this
   */
  public function didHealthCheck($result) {
    $now = microtime(true);
    $check_frequency = $this->getHealthCheckFrequency();
    $event_count = $this->getRequiredEventCount();

    $record = $this->readHealthRecord();

    $log = $record['log'];
    foreach ($log as $event) {
      $when = idx($event, 'timestamp');

      // If the log already has another nearby event, just ignore this one.
      // We raced with another process and our result can just be thrown away.
      if (($now - $when) <= $check_frequency) {
        // The check for this window is already done, so stop asking this
        // request to check again, and adopt the state the other process
        // wrote instead of keeping our stale in-memory copy.
        $this->isHealthy = $record['up'];
        $this->shouldCheck = false;
        $this->updateStatistics($record);
        return $this;
      }
    }

    $log[] = array(
      'timestamp' => $now,
      'up' => $result,
    );

    // Throw away older events which are now obsolete.
    $record['log'] = array_slice($log, -$event_count);

    // Recount up/down events from the trimmed log.
    $this->updateStatistics($record);

    // If all of the events are the same, change the state.
    if ($this->upEventCount == $event_count) {
      $record['up'] = true;
    } else if ($this->downEventCount == $event_count) {
      $record['up'] = false;
    }

    $this->writeHealthRecord($record);

    $this->isHealthy = $record['up'];
    $this->shouldCheck = false;

    return $this;
  }


  /**
   * Load the cached state and decide whether this request should run a
   * health check.
   *
   * When at least @{method:getHealthCheckFrequency} seconds have passed since
   * the last scheduled check, the `lastCheck` timestamp is advanced and
   * written back before the check runs.
   *
   * @return void
   */
  private function readState() {
    $now = microtime(true);
    $check_frequency = $this->getHealthCheckFrequency();

    $record = $this->readHealthRecord();

    $last_check = $record['lastCheck'];

    if (($now - $last_check) >= $check_frequency) {
      $record['lastCheck'] = $now;
      $this->writeHealthRecord($record);
      $this->shouldCheck = true;
    } else {
      $this->shouldCheck = false;
    }

    $this->isHealthy = $record['up'];
    $this->updateStatistics($record);
  }

  /**
   * Recompute the up/down event counters from a record's log.
   *
   * @param map<string, wild> Health record with a `log` list of events.
   * @return void
   */
  private function updateStatistics(array $record) {
    $this->upEventCount = 0;
    $this->downEventCount = 0;
    foreach ($record['log'] as $event) {
      if ($event['up']) {
        $this->upEventCount++;
      } else {
        $this->downEventCount++;
      }
    }
  }

  /**
   * Build the cache key for this database, derived from its host and port.
   *
   * @return string Cache key.
   */
  private function getHealthRecordCacheKey() {
    $ref = $this->ref;

    $host = $ref->getHost();
    $port = $ref->getPort();

    return "cluster.db.health({$host}, {$port})";
  }

  /**
   * Read the health record from the setup cache.
   *
   * If nothing usable is cached, returns a default record which marks the
   * database as up with no check history. Keys missing from a cached record
   * are filled in from the same defaults, so callers can always read `up`,
   * `lastCheck` and `log`.
   *
   * @return map<string, wild> Health record.
   */
  private function readHealthRecord() {
    $cache = PhabricatorCaches::getSetupCache();
    $cache_key = $this->getHealthRecordCacheKey();
    $health_record = $cache->getKey($cache_key);

    $defaults = array(
      'up' => true,
      'lastCheck' => 0,
      'log' => array(),
    );

    if (!is_array($health_record)) {
      return $defaults;
    }

    // Array union keeps cached values and only fills in missing keys.
    return $health_record + $defaults;
  }

  /**
   * Write the health record to the setup cache.
   *
   * @param map<string, wild> Health record to store.
   * @return void
   */
  private function writeHealthRecord(array $record) {
    $cache = PhabricatorCaches::getSetupCache();
    $cache_key = $this->getHealthRecordCacheKey();
    $cache->setKey($cache_key, $record);
  }

}
+55 -5
src/infrastructure/cluster/PhabricatorDatabaseRef.php
··· 30 30 private $replicaMessage; 31 31 private $replicaDelay; 32 32 33 + private $healthRecord; 33 34 private $didFailToConnect; 34 35 35 36 public function setHost($host) { ··· 326 327 return $this->newConnection( 327 328 array( 328 329 'retries' => 0, 329 - 'timeout' => 3, 330 + 'timeout' => 2, 330 331 )); 331 332 } 332 333 ··· 338 339 } 339 340 340 341 public function isSevered() { 341 - return $this->didFailToConnect; 342 + if ($this->didFailToConnect) { 343 + return true; 344 + } 345 + 346 + $record = $this->getHealthRecord(); 347 + $is_healthy = $record->getIsHealthy(); 348 + if (!$is_healthy) { 349 + return true; 350 + } 351 + 352 + return false; 342 353 } 343 354 344 355 public function isReachable(AphrontDatabaseConnection $connection) { 345 - if ($this->isSevered()) { 356 + $record = $this->getHealthRecord(); 357 + $should_check = $record->getShouldCheck(); 358 + 359 + if ($this->isSevered() && !$should_check) { 346 360 return false; 347 361 } 348 362 ··· 351 365 $reachable = true; 352 366 } catch (Exception $ex) { 353 367 $reachable = false; 368 + } 369 + 370 + if ($should_check) { 371 + $record->didHealthCheck($reachable); 354 372 } 355 373 356 374 if (!$reachable) { ··· 360 378 return $reachable; 361 379 } 362 380 381 + public function checkHealth() { 382 + $health = $this->getHealthRecord(); 383 + 384 + $should_check = $health->getShouldCheck(); 385 + if ($should_check) { 386 + // This does an implicit health update. 
387 + $connection = $this->newManagementConnection(); 388 + $this->isReachable($connection); 389 + } 390 + 391 + return $this; 392 + } 393 + 394 + public function getHealthRecord() { 395 + if (!$this->healthRecord) { 396 + $this->healthRecord = new PhabricatorDatabaseHealthRecord($this); 397 + } 398 + return $this->healthRecord; 399 + } 400 + 363 401 public static function getMasterDatabaseRef() { 364 402 $refs = self::getLiveRefs(); 365 403 ··· 415 453 } 416 454 417 455 private function newConnection(array $options) { 456 + // If we believe the database is unhealthy, don't spend as much time 457 + // trying to connect to it, since it's likely to continue to fail and 458 + // hammering it can only make the problem worse. 459 + $record = $this->getHealthRecord(); 460 + if ($record->getIsHealthy()) { 461 + $default_retries = 3; 462 + $default_timeout = 10; 463 + } else { 464 + $default_retries = 0; 465 + $default_timeout = 2; 466 + } 467 + 418 468 $spec = $options + array( 419 469 'user' => $this->getUser(), 420 470 'pass' => $this->getPass(), 421 471 'host' => $this->getHost(), 422 472 'port' => $this->getPort(), 423 473 'database' => null, 424 - 'retries' => 3, 425 - 'timeout' => 15, 474 + 'retries' => $default_retries, 475 + 'timeout' => $default_timeout, 426 476 ); 427 477 428 478 return PhabricatorEnv::newObjectFromConfig(
+4 -1
src/infrastructure/env/PhabricatorEnv.php
··· 220 220 if (!$master) { 221 221 self::setReadOnly(true, self::READONLY_MASTERLESS); 222 222 } else if ($master->isSevered()) { 223 - self::setReadOnly(true, self::READONLY_SEVERED); 223 + $master->checkHealth(); 224 + if ($master->isSevered()) { 225 + self::setReadOnly(true, self::READONLY_SEVERED); 226 + } 224 227 } 225 228 226 229 try {