A collection of lightweight TypeScript packages for the AT Protocol, the protocol powering Bluesky.
atproto bluesky typescript npm
101
fork

Configure Feed

Select the types of activity you want to include in your feed.

refactor(util-text): single-pass grapheme counting

Mary c998b350 2c6a8815

+100 -109
+100 -109
packages/misc/util-text/src/grapheme.c
··· 148 148 }; 149 149 // #endregion 150 150 151 - // #region UTF-8 decoder + grapheme counter 152 - 153 - static inline size_t decode_utf16_prop_cached( 154 - const char16_t *s, 155 - size_t len, 156 - uint32_t *prop, 157 - uint32_t *cached_hi, 158 - uint32_t *cached_base 159 - ) { 160 - uint32_t first = s[0]; 161 - 162 - if (first < 0xD800 || first > 0xDFFF) { 163 - *prop = char_break_bmp[first]; 164 - return 1; 165 - } 166 - 167 - if (first <= 0xDBFF && len >= 2) { 168 - uint32_t second = s[1]; 169 - if (second >= 0xDC00 && second <= 0xDFFF) { 170 - uint32_t cp = 0x10000 + ((first - 0xD800) << 10) + (second - 0xDC00); 171 - if (cp >= 0x1F1E6 && cp <= 0x1F1FF) { 172 - *prop = CHAR_BREAK_PROP_REGIONAL_INDICATOR; 173 - return 2; 174 - } 175 - 176 - uint32_t hi = cp >> 8; 177 - if (hi != *cached_hi) { 178 - *cached_hi = hi; 179 - *cached_base = char_break_major[hi]; 180 - } 181 - 182 - *prop = char_break_minor[*cached_base + (cp & 0xFF)]; 183 - return 2; 184 - } 185 - } 186 - 187 - *prop = char_break_bmp[0xFFFD]; 188 - return 1; 189 - } 151 + // #region grapheme counter 190 152 191 153 static inline bool is_gb9c_extend(uint32_t p) { 192 154 return p == CHAR_BREAK_PROP_ICB_EXTEND || ··· 211 173 return count; 212 174 } 213 175 214 - static inline bool is_all_ascii(const char16_t *str, int len) { 215 - int i = 0; 216 - 217 - for (; i + 7 < len; i += 8) { 218 - uint16_t m = str[i] | str[i+1] | str[i+2] | str[i+3] | 219 - str[i+4] | str[i+5] | str[i+6] | str[i+7]; 220 - if (m > 0x7F) { 221 - return false; 222 - } 223 - } 224 - 225 - for (; i < len; i++) { 226 - if (str[i] > 0x7F) { 227 - return false; 228 - } 229 - } 230 - 231 - return true; 232 - } 233 - 234 - static inline bool is_all_bmp_without_surrogates(const char16_t *str, int len) { 235 - for (int i = 0; i < len; i++) { 236 - uint16_t unit = str[i]; 237 - if (unit >= 0xD800 && unit <= 0xDFFF) { 238 - return false; 239 - } 240 - } 241 - 242 - return true; 243 - } 244 - 245 176 static inline uint8_t 
advance_gb9c(uint8_t state, uint32_t prop) { 246 177 if (state == 0) { 247 178 return prop == CHAR_BREAK_PROP_ICB_CONSONANT ? 1 : 0; ··· 294 225 return !no_break; 295 226 } 296 227 297 - static inline int grapheme_count_bmp(const char16_t *str, int len, int max_len) { 298 - int count = 1; 299 - grapheme_break_state st = {0}; 300 - uint32_t p0 = char_break_bmp[str[0]]; 228 + static int grapheme_count_impl(const char16_t *str, int len, int max_len) { 229 + if (len == 0) return 0; 301 230 302 - for (int i = 1; i < len; i++) { 303 - uint32_t p1 = char_break_bmp[str[i]]; 231 + int i = 0; 304 232 305 - if (is_grapheme_break(&st, p0, p1)) { 306 - count++; 307 - if (max_len >= 0 && count > max_len) { 308 - return count; 309 - } 233 + // fast ASCII prefix: skip in chunks of 8 234 + { 235 + int ascii_end = len - 7; 236 + while (i < ascii_end) { 237 + uint16_t m = str[i] | str[i+1] | str[i+2] | str[i+3] | 238 + str[i+4] | str[i+5] | str[i+6] | str[i+7]; 239 + if (m > 0x7F) break; 240 + i += 8; 310 241 } 242 + while (i < len && str[i] <= 0x7F) { 243 + i++; 244 + } 245 + if (i == len) { 246 + return ascii_grapheme_count(str, len); 247 + } 248 + } 311 249 312 - p0 = p1; 250 + // count graphemes in the ASCII prefix (adjusting for CRLF) 251 + int count = i; 252 + for (int j = 0; j + 1 < i; j++) { 253 + if (str[j] == 0x0D && str[j+1] == 0x0A) { 254 + count--; 255 + } 313 256 } 314 257 315 - return count; 316 - } 258 + // set up state machine from the last ASCII character (if any) 259 + grapheme_break_state st = {0}; 260 + uint32_t p0; 317 261 318 - static int grapheme_count_impl(const char16_t *str, int len, int max_len) { 319 - if (len == 0) return 0; 320 - 321 - if (is_all_ascii(str, len)) { 322 - return ascii_grapheme_count(str, len); 262 + if (i > 0) { 263 + p0 = char_break_bmp[str[i - 1]]; 264 + } else { 265 + // string starts with non-ASCII; decode first char properly 266 + uint32_t first = str[0]; 267 + if (first >= 0xD800 && first <= 0xDBFF && len >= 2) { 268 + uint32_t 
second = str[1]; 269 + if (second >= 0xDC00 && second <= 0xDFFF) { 270 + uint32_t cp = 0x10000 + ((first - 0xD800) << 10) + (second - 0xDC00); 271 + if (cp >= 0x1F1E6 && cp <= 0x1F1FF) { 272 + p0 = CHAR_BREAK_PROP_REGIONAL_INDICATOR; 273 + } else { 274 + uint32_t hi = cp >> 8; 275 + p0 = char_break_minor[char_break_major[hi] + (cp & 0xFF)]; 276 + } 277 + i = 2; 278 + } else { 279 + p0 = char_break_bmp[0xFFFD]; 280 + i = 1; 281 + } 282 + } else if (first >= 0xDC00 && first <= 0xDFFF) { 283 + p0 = char_break_bmp[0xFFFD]; 284 + i = 1; 285 + } else { 286 + p0 = char_break_bmp[first]; 287 + i = 1; 288 + } 289 + count = 1; 323 290 } 324 291 325 - if (is_all_bmp_without_surrogates(str, len)) { 326 - return grapheme_count_bmp(str, len, max_len); 327 - } 292 + // single pass: BMP-direct with inline surrogate handling 293 + while (i < len) { 294 + uint32_t first = str[i]; 328 295 329 - int count = 1; 330 - grapheme_break_state st = {0}; 331 - size_t off = 0; 332 - uint32_t cached_hi = UINT32_MAX; 333 - uint32_t cached_base = 0; 334 - uint32_t p0; 296 + if (likely(first < 0xD800 || first > 0xDFFF)) { 297 + uint32_t p1 = char_break_bmp[first]; 335 298 336 - off += decode_utf16_prop_cached(str + off, len - off, &p0, &cached_hi, &cached_base); 299 + if (is_grapheme_break(&st, p0, p1)) { 300 + count++; 301 + if (max_len >= 0 && count > max_len) { 302 + return count; 303 + } 304 + } 337 305 338 - while (off < (size_t)len) { 339 - uint32_t p1; 340 - size_t adv = decode_utf16_prop_cached(str + off, len - off, &p1, &cached_hi, &cached_base); 306 + p0 = p1; 307 + i++; 308 + } else if (first <= 0xDBFF && i + 1 < len) { 309 + uint32_t second = str[i + 1]; 310 + if (second >= 0xDC00 && second <= 0xDFFF) { 311 + uint32_t cp = 0x10000 + ((first - 0xD800) << 10) + (second - 0xDC00); 312 + uint32_t p1; 341 313 342 - if (is_grapheme_break(&st, p0, p1)) { 343 - count++; 344 - if (max_len >= 0 && count > max_len) { 345 - return count; 314 + if (cp >= 0x1F1E6 && cp <= 0x1F1FF) { 315 + p1 = 
CHAR_BREAK_PROP_REGIONAL_INDICATOR; 316 + } else { 317 + uint32_t hi = cp >> 8; 318 + p1 = char_break_minor[char_break_major[hi] + (cp & 0xFF)]; 319 + } 320 + 321 + if (is_grapheme_break(&st, p0, p1)) { 322 + count++; 323 + if (max_len >= 0 && count > max_len) { 324 + return count; 325 + } 326 + } 327 + 328 + p0 = p1; 329 + i += 2; 330 + } else { 331 + uint32_t p1 = char_break_bmp[0xFFFD]; 332 + if (is_grapheme_break(&st, p0, p1)) count++; 333 + p0 = p1; 334 + i++; 346 335 } 336 + } else { 337 + uint32_t p1 = char_break_bmp[0xFFFD]; 338 + if (is_grapheme_break(&st, p0, p1)) count++; 339 + p0 = p1; 340 + i++; 347 341 } 348 - 349 - p0 = p1; 350 - off += adv; 351 342 } 352 343 353 344 return count;