import binomialTest from '@stdlib/stats-binomial-test';
import _ from 'lodash';
import { RuleAlarmStatus } from '../moderationConfigService/index.js';

const { sum } = _;

/**
 * Determine whether we should flag the rule as being 'in alarm', meaning that
 * its most-recent pass rate looks anomalous.
 *
 * I'm not exactly sure what the math should be here, but it seems like there
 * are two ways to view a rule's pass rate.
 *
 * One is to imagine that there's some static probability of a rule passing, so
 * the number of passes you'd expect to observe would be given by a binomial
 * distribution, which you're sampling (historically and in the latest period).
 * Then, you can look at how many passes were in the latest period and ask (in
 * various ways) if that's consistent w/ the pass rate observed historically.
 *
 * The second view is to see a rule's pass rate as a variable that changes over
 * time, with an independent "seasonal" component (i.e., different cohorts of
 * users regularly come onto the platform at different times of day/days of the
 * week) and a "trend" component (e.g., the pass rate may be going down over
 * time as bad users get deleted). Then, you forecast the next period's pass
 * rate, with a confidence interval on the forecast, and use the error between
 * the forecast and the actual pass rate to detect anomalies.
 *
 * The second approach feels more accurate, but also obviously more complicated.
 * The statistics are beyond me, and I couldn't find any off-the-shelf packages
 * in JS for decomposing the historical data into these trend + seasonal +
 * residual components (they're all in Python or R).
 *
 * My hacky approach was gonna be to check if the difference between this hour's
 * pass rate and last hour's pass rate was abnormally large (by estimating a
 * normal distribution for the rule's hour-by-hour _pass rate change_, from the
 * samples), and then mark the rule as in alarm if the pass rate went up a lot.
 * The idea was that, by comparing the current hour to the last hour (when
 * finding the pass rate change), we'd be assuming that the best estimate for
 * this hour's pass rate -- if things aren't anomalous -- is last hour's pass
 * rate, which thereby accounts for "seasonal" effects that we might miss if we
 * used some longer-run average as the starting point for estimating what this
 * period's pass rate "should be" (a la the 'static pass rate' model).
 *
 * But one issue is that the last hour's pass rate could also be abnormally low
 * by chance, which we wouldn't have alerted on (since we only alert if it looks
 * like the pass rate is unusually _high_), so then simple regression to the
 * mean in the next period would result in a big difference that could trigger
 * an alert. Relatedly, some rules, especially at some hours, really don't run
 * that many times, so using the hour-by-hour pass rates is not very reliable/
 * accurate (doubly so if we start onboarding smaller users), even though pass
 * rates are very nice in their ability to abstract away changes in the
 * absolute amount of usage at each hour, which we don't care about (and does
 * change a lot). Still, many rules don't pass at all in a given hour, which
 * totally messes this approach up.
 *
 * There's also a bigger issue with looking at the pass rate as a time series,
 * which is that, if we go into alert mode when the pass rate jumps, we don't
 * want to then automatically go out of alert mode while the anomaly is still
 * occurring (i.e., we don't want to too-quickly 'learn' the anomalous value as
 * the 'correct' value, which could happen if we use the last value as the
 * basis for our prediction of the next value, and the anomaly lasts for over
 * an hour).
 *
 * So, for now, I'm sticking with the more naive option (1), but, to kinda-sorta
 * try to capture part of this seasonality idea, while avoiding the trap of an
 * anomalous most-recent hour(s), I'm very-slightly weighting more recent data
 * more heavily when calculating the historical pass rate. This is kinda like
 * "exponential smoothing" in time series analysis.
 *
 * @param stats An array where each item represents one time period, with newer
 * time periods first. Within an item, the keys represent how many passes and
 * rule executions there were in that time period.
 * @param confidence A value between 0 and 1 that represents how confident we
 * have to be before we consider the rule in alarm.
 */
export default function getRuleAlarmStatus(
  stats: { passes: number; runs: number }[],
  confidence = 0.995,
): RuleAlarmStatus {
  const [lastPeriod, ...priorPeriods] = stats;

  // alpha is the exponential smoothing factor (i.e., the weights exponentially
  // decay such that each period gets (1 - alpha) times the weight of the next,
  // more-recent period). alpha is quite low because, again, we don't want to
  // learn an anomaly as the new 'real' value, and because any 'seasonal'
  // changes should be small (so we don't actually need to give much more
  // weight to recent data). A .02 smoothing factor gives each prior period 98%
  // as much weight as the next (i.e., more-recent) period.
  //
  // NB: We're throwing away the _number of weighted passes and runs_, and just
  // looking at the weighted _pass rate_. If we ever want to do a statistical
  // test that relies on the absolute number of (weighted) prior runs to derive
  // some uncertainty bounds, we'll need to scale the weights to sum to 1 first.
  const alpha = 0.02;
  const weights = priorPeriods.map((_, i) => (1 - alpha) ** i);
  const historicalWeightedPassRate =
    sum(priorPeriods.map((it, i) => it.passes * weights[i])) /
    sum(priorPeriods.map((it, i) => it.runs * weights[i]));

  // We've got a few options for the kind of test we can do here:
  //
  // 1) we could just assume that historicalWeightedPassRate is the underlying
  //    pass rate for the rule (even though it's actually just an estimate,
  //    which might be off if we don't have much historical data), and then do
  //    an exact binomial test against lastPeriod's sample.
  //
  // 2) we can find the difference in pass rates between lastPeriod and the
  //    historical data, and assume that that difference is normally
  //    distributed. this would account for the sample size of the historical
  //    data. (See https://www.itl.nist.gov/div898/handbook/prc/section3/prc33.htm)
  //    the issue is that our binomial distributions are hella skewed (because
  //    the pass rate for some rules is very, very close to zero), so assuming
  //    the pass rate difference is normally distributed isn't strictly right
  //    iiuc. there is (luckily) an off-the-shelf heuristic for when it's good
  //    enough, which is when there's at least 10 passes in both samples. (A
  //    rough sketch of this test is at the bottom of this file.)
  //
  // 3) we can run one of the exact tests that work around the fact that our
  //    historical proportion is just an estimate, using different approaches.
  //    see https://stats.stackexchange.com/a/551617/277172. the issue here is
  //    that these are complicated and/or slow.
  //
  // For now, I just stick with the simplest option (1), but return
  // INSUFFICIENT_DATA if it looks like we might not have enough historical
  // data. The threshold for "enough" is hard to find because, if we want our
  // estimated pass rate to be off by no more than 10%, 95% of the time, the
  // number of raw runs we'd need could vary _wildly_ based on the rule's true
  // pass rate. Our rule pass rates seem to be as low as 1 in 100,000, which
  // would require 250,000 runs to have a reliable estimate, which is gonna be
  // too many for smaller users and is unnecessary for rules that pass much
  // more often (e.g., a rule that passed 5% of the time, which is the highest
  // pass rate I can imagine, would only need ~8,000 runs, and ~4,000 if we
  // only care about our estimate being too high, since we're only alerting
  // when the threshold is exceeded, not when the pass rate is anomalously
  // low). However, if we say that the rule must have been run at least 4,000
  // times, and passed more than twice or been run more than 125,000 times,
  // then this covers both extremes, I think. So, we require that + at least 24
  // periods of data, to hopefully capture some seasonality.
  const priorPasses = sum(priorPeriods.map((it) => it.passes));
  const priorRuns = sum(priorPeriods.map((it) => it.runs));
  const hasAdequateData =
    priorPeriods.length >= 24 &&
    priorRuns >= 4000 &&
    (priorPasses > 2 || priorRuns > 125000);

  if (!hasAdequateData) {
    return RuleAlarmStatus.INSUFFICIENT_DATA;
  }

  // Even with our heuristics above, there are some cases where we're
  // incorrectly flagging things as anomalous because, when the sample sizes
  // are big (as w/ our large users), even very small errors in the estimate
  // used for the pass rate, or small seasonal effects, can lead us to log an
  // anomaly incorrectly.
  //
  // For example: a rule has run almost 2 million times and has a weighted pass
  // rate of 0.017752. Now, in the latest period, if the rule runs many times
  // (say 8,000+), that's a big enough sample size that we'd expect the pass
  // rate to be pretty close to .017752. Specifically, with 8,000 runs, the
  // expected value is 8000 * .017752 = 142 passes, but 99.5% of the time,
  // we'll have 171 passes (pass rate: .021375) or fewer. In real life, we'll
  // semi-routinely get 172-182 passes, and this won't represent anything
  // anomalous. Instead, it just means that our pass rate estimate was still a
  // little off or some seasonal thing is going on in this hour. So, to prevent
  // these kinds of false positives, we make sure that the pass rate in this
  // period is at least 25% higher than the historical weighted pass rate.
  if (lastPeriod.passes / lastPeriod.runs < historicalWeightedPassRate * 1.25) {
    return RuleAlarmStatus.OK;
  }

  const testResults = binomialTest(lastPeriod.passes, lastPeriod.runs, {
    p: historicalWeightedPassRate,
    alternative: 'greater',
    alpha: 1 - confidence,
  });

  // It's an anomaly if we rejected the null hypothesis.
  return testResults.rejected ? RuleAlarmStatus.ALARM : RuleAlarmStatus.OK;
}
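
// An illustrative example of how a caller might use this module. The import
// path and every number below are made up: 25 hourly periods, a historical
// pass rate of roughly 0.1% over ~200k runs, and a latest period whose pass
// rate is several times that. With this data the function should return
// ALARM; drop one of the prior periods (leaving fewer than 24) and it would
// return INSUFFICIENT_DATA instead.
//
//   import getRuleAlarmStatus from './getRuleAlarmStatus.js';
//   import { RuleAlarmStatus } from '../moderationConfigService/index.js';
//
//   const stats = [
//     { passes: 60, runs: 9000 }, // newest period first
//     ...Array.from({ length: 24 }, () => ({ passes: 9, runs: 8500 })),
//   ];
//
//   if (getRuleAlarmStatus(stats) === RuleAlarmStatus.ALARM) {
//     // page whoever owns the rule, flip the rule's status, etc.
//   }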
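
// A rough, hedged sketch of option (2) above -- the normal-approximation test
// for the difference between two proportions from the NIST handbook -- in
// case we ever want to account for the uncertainty in the historical
// estimate. It is NOT used by getRuleAlarmStatus; the function name and the
// zCritical parameter are assumptions made purely for illustration (the
// caller would supply the one-sided normal quantile for their confidence
// level, e.g., ~2.576 for 99.5%). Per the heuristic mentioned above, this
// approximation is only reasonable when both samples have at least 10 passes.
export function passRateJumpLooksSignificant(
  lastPeriod: { passes: number; runs: number },
  priorPeriodsMerged: { passes: number; runs: number },
  zCritical = 2.576,
): boolean {
  const lastRate = lastPeriod.passes / lastPeriod.runs;
  const priorRate = priorPeriodsMerged.passes / priorPeriodsMerged.runs;

  // Pooled pass rate under the null hypothesis that both samples share the
  // same underlying pass rate.
  const pooled =
    (lastPeriod.passes + priorPeriodsMerged.passes) /
    (lastPeriod.runs + priorPeriodsMerged.runs);
  const standardError = Math.sqrt(
    pooled * (1 - pooled) * (1 / lastPeriod.runs + 1 / priorPeriodsMerged.runs),
  );

  // One-sided z test: we only care when the latest pass rate is unusually high.
  const z = (lastRate - priorRate) / standardError;
  return z > zCritical;
}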