a collection of lightweight TypeScript packages for AT Protocol, the protocol powering Bluesky
atproto bluesky typescript npm
101
fork

Configure Feed

Select the types of activity you want to include in your feed.

chore(util-text): add benchmark for fast-path grapheme counting

Mary b1e49ec0 625335ed

+311
+311
packages/misc/util-text/lib/index.bench.ts
import { bench, do_not_optimize, run, summary } from 'mitata';

/**
 * Hand-unrolled scan (the original author labels this the "current
 * implementation"). Returns `true` when every UTF-16 code unit of `text` is
 * at most 0x7f and none of them is a carriage return (0x0d).
 *
 * The code shape is deliberately left as-is: this function is the thing being
 * measured, so restructuring it would change what the benchmark reports.
 */
const isAsciiWithoutCr_loop = (text: string): boolean => {
	const len = text.length;
	let idx = 0;

	// main loop: four code units per iteration
	while (idx + 3 < len) {
		const a = text.charCodeAt(idx);
		const b = text.charCodeAt(idx + 1);
		const c = text.charCodeAt(idx + 2);
		const d = text.charCodeAt(idx + 3);

		// OR-ing the four units lets a single comparison detect any unit above
		// 0x7f; the CR test still has to be done per unit.
		if ((a | b | c | d) > 0x7f || a === 0x0d || b === 0x0d || c === 0x0d || d === 0x0d) {
			return false;
		}

		idx += 4;
	}

	// tail: the remaining zero to three code units
	while (idx < len) {
		const code = text.charCodeAt(idx);
		if (code > 0x7f || code === 0x0d) {
			return false;
		}

		idx++;
	}

	return true;
};

/** Matches any code unit outside 0x00-0x7f, or a carriage return. */
const RE_NON_ASCII_OR_CR = /[^\x00-\x7f]|\r/;

/** Regex-based counterpart of {@link isAsciiWithoutCr_loop}. */
const isAsciiWithoutCr_regex = (text: string): boolean => {
	return !RE_NON_ASCII_OR_CR.test(text);
};

// -- benchmark inputs --------------------------------------------------------

const SHORT_ASCII = 'hello world';
const MEDIUM_ASCII = 'The quick brown fox jumps over the lazy dog. '.repeat(10);
const LONG_ASCII = 'abcdefghijklmnopqrstuvwxyz0123456789 '.repeat(100);
const SHORT_UNICODE = 'hello \u{1F600} world';
const MEDIUM_UNICODE = 'The quick brown fox \u{1F600} jumps over the lazy dog. '.repeat(10);
const EARLY_FAIL = '\u{1F600}' + 'a'.repeat(1000);
const LATE_FAIL = 'a'.repeat(1000) + '\u{1F600}';
const CR_EARLY = '\r' + 'a'.repeat(1000);
const CR_LATE = 'a'.repeat(1000) + '\r';
const EMPTY = '';

// -- sanity check ------------------------------------------------------------
// Before anything is registered, make sure both implementations return the
// same answer for every input; abort with the offending input's name if not.
Object.entries({
	SHORT_ASCII,
	MEDIUM_ASCII,
	LONG_ASCII,
	SHORT_UNICODE,
	MEDIUM_UNICODE,
	EARLY_FAIL,
	LATE_FAIL,
	CR_EARLY,
	CR_LATE,
	EMPTY,
}).forEach(([name, input]) => {
	const fromLoop = isAsciiWithoutCr_loop(input);
	const fromRegex = isAsciiWithoutCr_regex(input);

	if (fromLoop !== fromRegex) {
		throw new Error(`mismatch on ${name}: loop=${fromLoop}, regex=${fromRegex}`);
	}
});

// -- benchmark registration --------------------------------------------------

/**
 * Registers one `summary` group holding a `loop: <label>` and a
 * `regex: <label>` benchmark, both fed the same `input`.
 *
 * Each benchmark body calls its implementation directly (not through a
 * function parameter) so the measured call sites only ever see one callee.
 */
const registerPair = (label: string, input: string): void => {
	summary(() => {
		bench(`loop: ${label}`, function* () {
			yield {
				[0]() {
					return input;
				},
				bench(text: string) {
					return do_not_optimize(isAsciiWithoutCr_loop(text));
				},
			};
		});

		bench(`regex: ${label}`, function* () {
			yield {
				[0]() {
					return input;
				},
				bench(text: string) {
					return do_not_optimize(isAsciiWithoutCr_regex(text));
				},
			};
		});
	});
};

// Cases whose benchmark name carries the input length, in registration order.
const SIZED_CASES: ReadonlyArray<readonly [label: string, input: string]> = [
	['short ascii', SHORT_ASCII],
	['medium ascii', MEDIUM_ASCII],
	['long ascii', LONG_ASCII],
	['short unicode', SHORT_UNICODE],
	['medium unicode', MEDIUM_UNICODE],
	['early unicode fail', EARLY_FAIL],
	['late unicode fail', LATE_FAIL],
	['early CR fail', CR_EARLY],
	['late CR fail', CR_LATE],
];

for (const [label, input] of SIZED_CASES) {
	registerPair(`${label} (${input.length})`, input);
}

// The empty-string case is named without a length suffix.
registerPair('empty', EMPTY);

await run();