A collection of lightweight TypeScript packages for the AT Protocol, the protocol powering Bluesky.
atproto bluesky typescript npm
101
fork

Configure Feed

Select the types of activity you want to include in your feed.

refactor(util-text): alter grapheme counting bench cases

Mary a1e6900e 2705c52a

+106 -220
+34 -62
packages/misc/util-text/lib/index.bench.ts
··· 14 14 throw new Error(`native binding not available`); 15 15 } 16 16 17 - const benchFilter = process.env.BENCH_FILTER?.toLowerCase() ?? ''; 18 - const benchGroup = process.env.BENCH_GROUP?.toLowerCase() ?? 'all'; 19 - const benchCases = new Set( 20 - (process.env.BENCH_CASES ?? '') 21 - .split(',') 22 - .map((value) => value.trim().toLowerCase()) 23 - .filter(Boolean), 24 - ); 25 - 26 - const shouldRun = (group: 'length' | 'range', name: string): boolean => { 27 - if (benchGroup !== 'all' && benchGroup !== group) { 28 - return false; 29 - } 30 - 31 - if (benchCases.size !== 0) { 32 - return benchCases.has(name.toLowerCase()); 33 - } 34 - 35 - if (benchFilter === '') { 36 - return true; 37 - } 38 - 39 - return `${group}:${name}`.toLowerCase().includes(benchFilter); 40 - }; 41 - 17 + // grapheme counting is used for AT Protocol lexicon validation: 18 + // - display names: max 64 graphemes 19 + // - posts: max 300 graphemes 20 + // - descriptions: max 256-300 graphemes 21 + // - alt text: max 1000 graphemes 22 + // - emoji reactions: exactly 1 grapheme 42 23 const cases = { 43 - ascii: 'The quick brown fox jumps over the lazy dog. '.repeat(16), 44 - combining: 'e\u0301'.repeat(256), 45 - bmpMixed: ('e\u0301a\r\n' + 'नमस्ते दुनिया ').repeat(160), 46 - crlf: 'a\r\nb'.repeat(256), 47 - family: '\u{1F468}\u200D\u{1F469}\u200D\u{1F467}\u200D\u{1F466}'.repeat(64), 48 - flags: '\u{1F1FA}\u{1F1F8}'.repeat(256), 49 - devanagari: 'नमस्ते दुनिया '.repeat(128), 50 - mixed: ('hello \u{1F600} e\u0301 🇺🇸\r\n' + 'नमस्ते ').repeat(96), 24 + displayName: 'maria \u{1F338}', 25 + post: 'just mass-migrated to bsky from twitter!! 
\u{1F389}\u{1F389}\u{1F389}\n\nfollow me for cat pics \u{1F431} and hot takes \u{1F525}\n\n#bsky #newhere #introduction', 26 + postJa: 'きょうの天気はとても良かったです\u{2600}\uFE0F 散歩に行ってきました\u{1F6B6}\u200D\u2640\uFE0F\nお花見のシーズンですね\u{1F338}\u{1F338}', 27 + postEmoji: '\u{1F468}\u200D\u{1F4BB} shipping code at 2am \u{1F602}\u{1F602}\u{1F602} \u{1F1FA}\u{1F1F8}\u{1F1E7}\u{1F1F7} who needs sleep when you have \u2615\u2615\u2615 #devlife \u{1F525}\u{1F4AF}', 28 + altText: 'a photograph of a sunset over the pacific ocean. the sky is painted in gradients of deep orange \u{1F7E0}, pink \u{1F338}, and purple \u{1F49C}. in the foreground, silhouettes of palm trees frame the scene. a small sailboat is visible on the horizon. the water reflects the warm colors of the sky, creating a mirror-like effect on the calm surface.', 29 + skinTone: '\u{1F44B}\u{1F3FB} \u{1F44B}\u{1F3FC} \u{1F44B}\u{1F3FD} \u{1F44B}\u{1F3FE} \u{1F44B}\u{1F3FF}', 30 + korean: '안녕하세요! 오늘 블루스카이에 가입했어요 \u{1F60A} 잘 부탁드립니다 \u{1F64F}', 51 31 } as const; 52 32 53 - const lengths = Object.fromEntries( 54 - Object.entries(cases).map(([name, text]) => [name, getGraphemeLengthJs(text)]), 55 - ) as Record<keyof typeof cases, number>; 56 - 57 33 const rangeCases = { 58 - ascii: { text: cases.ascii, min: 0, max: cases.ascii.length + 1 }, 59 - bmpMixed: { text: cases.bmpMixed, min: 0, max: lengths.bmpMixed }, 60 - combining: { text: cases.combining, min: 0, max: lengths.combining }, 61 - crlf: { text: cases.crlf, min: 0, max: lengths.crlf }, 62 - family: { text: cases.family, min: 0, max: lengths.family }, 63 - flags: { text: cases.flags, min: 0, max: lengths.flags }, 64 - devanagari: { text: cases.devanagari, min: 0, max: lengths.devanagari }, 65 - mixed: { text: cases.mixed, min: 0, max: lengths.mixed }, 66 - overflowMixed: { text: cases.mixed, min: 0, max: 100 }, 34 + // typical validation: min=0, string fits within max — exercises the short-circuit 35 + displayName: { text: cases.displayName, min: 0, max: 64 }, 36 + post: { 
text: cases.post, min: 0, max: 300 }, 37 + postJa: { text: cases.postJa, min: 0, max: 300 }, 38 + postEmoji: { text: cases.postEmoji, min: 0, max: 300 }, 39 + altText: { text: cases.altText, min: 0, max: 1000 }, 40 + korean: { text: cases.korean, min: 0, max: 300 }, 41 + 42 + // emoji reaction validation: exactly 1 grapheme (min=1 forces counting) 43 + emojiReaction: { text: '\u{1F468}\u200D\u{1F469}\u200D\u{1F467}', min: 1, max: 1 }, 44 + 45 + // overflow: post exceeds max, needs counting to detect 46 + postOverflow: { text: cases.altText, min: 0, max: 50 }, 67 47 } as const; 68 48 69 49 for (const [name, text] of Object.entries(cases)) { ··· 82 62 } 83 63 } 84 64 85 - summary(() => { 86 - for (const [name, text] of Object.entries(cases)) { 87 - if (!shouldRun('length', name)) { 88 - continue; 89 - } 90 - 65 + for (const [name, text] of Object.entries(cases)) { 66 + summary(() => { 91 67 bench(`native length: ${name}`, function* () { 92 68 yield { 93 69 [0]() { ··· 109 85 }, 110 86 }; 111 87 }); 112 - } 113 - }); 114 - 115 - summary(() => { 116 - for (const [name, { text, min, max }] of Object.entries(rangeCases)) { 117 - if (!shouldRun('range', name)) { 118 - continue; 119 - } 88 + }); 89 + } 120 90 91 + for (const [name, { text, min, max }] of Object.entries(rangeCases)) { 92 + summary(() => { 121 93 bench(`native range: ${name}`, function* () { 122 94 yield { 123 95 [0]() { ··· 139 111 }, 140 112 }; 141 113 }); 142 - } 143 - }); 114 + }); 115 + } 144 116 145 117 await run();
+41 -96
packages/misc/util-text/lib/index.node.ts
··· 2 2 import { createRequire } from 'node:module'; 3 3 import { arch, platform } from 'node:process'; 4 4 5 - const segmenter = new Intl.Segmenter(); 5 + import { isAsciiWithoutCr } from './utils.ts'; 6 + 7 + import { 8 + getGraphemeLength as getGraphemeLengthJs, 9 + isGraphemeLengthInRange as isGraphemeLengthInRangeJs, 10 + } from './index.ts'; 6 11 7 12 type GraphemeBinding = { 8 13 getGraphemeLength: (str: string) => number; ··· 11 16 12 17 /** 13 18 * whether the native module is available for the current runtime. 19 + * @internal 14 20 */ 15 21 export let hasNative = false; 16 22 17 - let nativeGetGraphemeLength: ((str: string) => number) | null = null; 18 - let nativeIsGraphemeLengthInRange: ((str: string, min: number, max: number) => boolean) | null = null; 23 + /** 24 + * returns the grapheme length of a string 25 + * @param text string to count graphemes in 26 + * @returns grapheme count 27 + */ 28 + export let getGraphemeLength: (text: string) => number = getGraphemeLengthJs; 29 + 30 + /** 31 + * checks if the grapheme length of a string is within the specified range 32 + * @param text string to check 33 + * @param min minimum grapheme length (inclusive) 34 + * @param max maximum grapheme length (inclusive) 35 + * @returns true if the grapheme length is within range 36 + */ 37 + export let isGraphemeLengthInRange: (text: string, min: number, max: number) => boolean = isGraphemeLengthInRangeJs; 19 38 20 39 try { 21 40 const getPrebuildDir = (): string => { ··· 33 52 const require = createRequire(import.meta.url); 34 53 const binding: GraphemeBinding = require(`../prebuilds/${getPrebuildDir()}/grapheme.node`); 35 54 36 - nativeGetGraphemeLength = binding.getGraphemeLength; 37 - nativeIsGraphemeLengthInRange = binding.isGraphemeLengthInRange; 38 - hasNative = true; 39 - } catch {} 55 + const nativeGetGraphemeLength = binding.getGraphemeLength; 56 + const nativeIsGraphemeLengthInRange = binding.isGraphemeLengthInRange; 40 57 41 - const isAsciiWithoutCr = 
(text: string): boolean => { 42 - const len = text.length; 43 - let idx = 0; 58 + getGraphemeLength = (text) => { 59 + if (isAsciiWithoutCr(text)) { 60 + return text.length; 61 + } 62 + return nativeGetGraphemeLength(text); 63 + }; 44 64 45 - while (idx + 3 < len) { 46 - const a = text.charCodeAt(idx); 47 - const b = text.charCodeAt(idx + 1); 48 - const c = text.charCodeAt(idx + 2); 49 - const d = text.charCodeAt(idx + 3); 65 + isGraphemeLengthInRange = (text, min, max) => { 66 + const utf16Len = text.length; 50 67 51 - if ((a | b | c | d) > 0x7f || a === 0x0d || b === 0x0d || c === 0x0d || d === 0x0d) { 68 + if (utf16Len < min) { 52 69 return false; 53 70 } 54 - 55 - idx += 4; 56 - } 57 - 58 - while (idx < len) { 59 - const code = text.charCodeAt(idx); 60 - if (code > 0x7f || code === 0x0d) { 61 - return false; 71 + if (min === 0 && utf16Len <= max) { 72 + return true; 73 + } 74 + if (isAsciiWithoutCr(text)) { 75 + return utf16Len <= max; 62 76 } 63 77 64 - idx++; 65 - } 66 - 67 - return true; 68 - }; 69 - 70 - /** 71 - * returns the grapheme length of a string 72 - * @param text string to count graphemes in 73 - * @returns grapheme count 74 - */ 75 - export const getGraphemeLength = (text: string): number => { 76 - if (isAsciiWithoutCr(text)) { 77 - return text.length; 78 - } 79 - 80 - // native module handles non-ASCII much faster than Intl.Segmenter 81 - if (nativeGetGraphemeLength !== null) { 82 - return nativeGetGraphemeLength(text); 83 - } 84 - 85 - const iterator = segmenter.segment(text)[Symbol.iterator](); 86 - let count = 0; 87 - 88 - while (!iterator.next().done) { 89 - count++; 90 - } 91 - 92 - return count; 93 - }; 94 - 95 - /** 96 - * checks if the grapheme length of a string is within the specified range 97 - * @param text string to check 98 - * @param min minimum grapheme length (inclusive) 99 - * @param max maximum grapheme length (inclusive) 100 - * @returns true if the grapheme length is within range 101 - */ 102 - export const 
isGraphemeLengthInRange = (text: string, min: number, max: number): boolean => { 103 - const utf16Len = text.length; 104 - 105 - // UTF-16 length < min means grapheme count < min 106 - if (utf16Len < min) { 107 - return false; 108 - } 109 - 110 - // if there's no minimum constraint and UTF-16 length is within max, 111 - // grapheme count is definitely within max 112 - if (min === 0 && utf16Len <= max) { 113 - return true; 114 - } 115 - 116 - if (isAsciiWithoutCr(text)) { 117 - return utf16Len <= max; 118 - } 119 - 120 - // native module handles non-ASCII much faster 121 - if (nativeIsGraphemeLengthInRange !== null) { 122 78 return nativeIsGraphemeLengthInRange(text, min, max); 123 - } 79 + }; 124 80 125 - // count graphemes with early termination 126 - const iterator = segmenter.segment(text)[Symbol.iterator](); 127 - let count = 0; 128 - 129 - while (!iterator.next().done) { 130 - count++; 131 - if (count > max) { 132 - return false; 133 - } 134 - } 135 - 136 - return count >= min; 137 - }; 81 + hasNative = true; 82 + } catch {}
+1 -28
packages/misc/util-text/lib/index.rn.ts
··· 1 1 import { countGraphemes } from 'unicode-segmenter/grapheme'; 2 2 3 - const isAsciiWithoutCr = (text: string): boolean => { 4 - const len = text.length; 5 - let idx = 0; 6 - 7 - while (idx + 3 < len) { 8 - const a = text.charCodeAt(idx); 9 - const b = text.charCodeAt(idx + 1); 10 - const c = text.charCodeAt(idx + 2); 11 - const d = text.charCodeAt(idx + 3); 12 - 13 - if ((a | b | c | d) > 0x7f || a === 0x0d || b === 0x0d || c === 0x0d || d === 0x0d) { 14 - return false; 15 - } 16 - 17 - idx += 4; 18 - } 19 - 20 - while (idx < len) { 21 - const code = text.charCodeAt(idx); 22 - if (code > 0x7f || code === 0x0d) { 23 - return false; 24 - } 25 - 26 - idx++; 27 - } 28 - 29 - return true; 30 - }; 3 + import { isAsciiWithoutCr } from './utils.ts'; 31 4 32 5 /** 33 6 * returns the grapheme length of a string
+2 -34
packages/misc/util-text/lib/index.ts
··· 1 - const segmenter = new Intl.Segmenter(); 2 - 3 - const isAsciiWithoutCr = (text: string): boolean => { 4 - const len = text.length; 5 - let idx = 0; 6 - 7 - while (idx + 3 < len) { 8 - const a = text.charCodeAt(idx); 9 - const b = text.charCodeAt(idx + 1); 10 - const c = text.charCodeAt(idx + 2); 11 - const d = text.charCodeAt(idx + 3); 12 - 13 - if ((a | b | c | d) > 0x7f || a === 0x0d || b === 0x0d || c === 0x0d || d === 0x0d) { 14 - return false; 15 - } 16 - 17 - idx += 4; 18 - } 19 - 20 - while (idx < len) { 21 - const code = text.charCodeAt(idx); 22 - if (code > 0x7f || code === 0x0d) { 23 - return false; 24 - } 25 - 26 - idx++; 27 - } 1 + import { isAsciiWithoutCr } from './utils.ts'; 28 2 29 - return true; 30 - }; 31 - 32 - /** 33 - * whether the native module is available for the current runtime. 34 - */ 35 - export const hasNative = false; 3 + const segmenter = new Intl.Segmenter(); 36 4 37 5 /** 38 6 * returns the grapheme length of a string
+28
packages/misc/util-text/lib/utils.ts
··· 1 + export const isAsciiWithoutCr = (text: string): boolean => { 2 + const len = text.length; 3 + let idx = 0; 4 + 5 + while (idx + 3 < len) { 6 + const a = text.charCodeAt(idx); 7 + const b = text.charCodeAt(idx + 1); 8 + const c = text.charCodeAt(idx + 2); 9 + const d = text.charCodeAt(idx + 3); 10 + 11 + if ((a | b | c | d) > 0x7f || a === 0x0d || b === 0x0d || c === 0x0d || d === 0x0d) { 12 + return false; 13 + } 14 + 15 + idx += 4; 16 + } 17 + 18 + while (idx < len) { 19 + const code = text.charCodeAt(idx); 20 + if (code > 0x7f || code === 0x0d) { 21 + return false; 22 + } 23 + 24 + idx++; 25 + } 26 + 27 + return true; 28 + };