a collection of lightweight TypeScript packages for AT Protocol, the protocol powering Bluesky
atproto bluesky typescript npm
101
fork

Configure Feed

Select the types of activity you want to include in your feed.

refactor(bluesky-richtext-parser): first-char dispatch, manual autolink backpedal

Mary c4629cd9 ab76ddaa

+153 -15
+5
.changeset/ripe-towns-cough.md
··· 1 + --- 2 + '@atcute/bluesky-richtext-parser': patch 3 + --- 4 + 5 + first-char dispatch, manual autolink backpedal
+101
packages/bluesky/richtext-parser/lib/index.bench.ts
··· 1 + import { bench, run, do_not_optimize } from 'mitata'; 2 + 3 + import { tokenize } from './index.ts'; 4 + 5 + interface TestCase { 6 + name: string; 7 + text: string; 8 + } 9 + 10 + const TEST_CASES: TestCase[] = [ 11 + { 12 + name: 'short-ui: plain words', 13 + text: 'hello world', 14 + }, 15 + { 16 + name: 'short-ui: mention', 17 + text: '@alice.bsky.social hi there', 18 + }, 19 + { 20 + name: 'short-ui: topics', 21 + text: '#atproto and #bluesky', 22 + }, 23 + { 24 + name: 'short-ui: autolink', 25 + text: 'check this out: https://example.com', 26 + }, 27 + { 28 + name: 'short-ui: underscore word', 29 + text: 'foo_bar_baz', 30 + }, 31 + { 32 + name: 'short-ui: style mix', 33 + text: '**bold** and *italic* and `code`', 34 + }, 35 + { 36 + name: 'short-ui: emote', 37 + text: 'hello :wave:', 38 + }, 39 + { 40 + name: 'short-ui: markdown link', 41 + text: '[my site](https://example.com)', 42 + }, 43 + { 44 + name: 'short-ui: cashtags', 45 + text: 'cost is $AAPL and $BTC', 46 + }, 47 + { 48 + name: 'short-ui: no syntax', 49 + text: 'plain words only no syntax here', 50 + }, 51 + { 52 + name: 'mixed-social: mention topic link', 53 + text: 'hello @alice.bsky.social! check out #atproto and https://atproto.com', 54 + }, 55 + { 56 + name: 'mixed-social: nested emphasis', 57 + text: '___underlined__ but then not_ and ***bold** but then not*', 58 + }, 59 + { 60 + name: 'mixed-social: escapes and markdown', 61 + text: 'foo\\@bar and \\__underlined__ and [**mixed** formatting](example.com)', 62 + }, 63 + { 64 + name: 'mixed-social: delete emote link', 65 + text: 'watermelon ~~strike~~ and :party-parrot: and [repo](https://github.com/bluesky-social/atproto)', 66 + }, 67 + { 68 + name: 'mixed-social: multiline autolink', 69 + text: 'https://github.com/mary-ext/atproto-scraping/commit/\ncaaa495ae654ef8a98f223f3cecfe2ca261d6b4f', 70 + }, 71 + { 72 + name: 'mixed-social: prefixed autolink', 73 + text: 'abchttps://example.com/', 74 + }, 75 + { 76 + name: 'mixed-social: punctuation edge', 77 + text: '@@bsky.app #cool# $AAPL$ and ((https://foo.com/thing_(cool)))', 78 + }, 79 + { 80 + name: 'long-posts: english social post', 81 + text: 'Space penguin.\nThis is Arp 142, two interacting galaxies NGC 2936 and NGC 2937.\nProcessed by c.claude.\nhttps://www.flickr.com/photos/27527123@N02\n🔭 🧪', 82 + }, 83 + { 84 + name: 'long-posts: japanese news', 85 + text: '米SNSに党派色 Xに反発、Blueskyに大統領選挙後に100万人登録 https://www.nikkei.com/article/DGXZQOGN14EX90U4A111C2000000/?n_cid=SNSBS001 Blueskyは世界の利用者数が1500万人に達したと明らかにしました。 #ニュース', 86 + }, 87 + { 88 + name: 'long-posts: multilingual hashtags', 89 + text: '☀️Good morning☀️ #写真が好きな人と繋がりたい #Photography #Photo #風景写真 #青空 @bskyphotos.bsky.social', 90 + }, 91 + { 92 + name: 'long-posts: full feature mix', 93 + text: 'hello @bob.bsky.social and [my site](https://example.com) with some `inline code` and ~~deleted~~ words plus ***nested emphasis*** for parser stress', 94 + }, 95 + ]; 96 + 97 + for (const { name, text } of TEST_CASES) { 98 + bench(name, () => do_not_optimize(tokenize(text))); 99 + } 100 + 101 + await run();
+47 -15
packages/bluesky/richtext-parser/lib/index.ts
··· 10 10 const EMOTE_RE = /^:([\w-]+):/; 11 11 12 12 const AUTOLINK_RE = /^https?:\/\/[\S]+/; 13 - const AUTOLINK_BACKPEDAL_RE = /(?:(?<!\(.*)\))?[.,;]*$/; 13 + 14 + const trimAutolink = (url: string): string => { 15 + let end = url.length; 16 + 17 + while (end > 0) { 18 + const code = url.charCodeAt(end - 1); 19 + if (code === 46 || code === 44 || code === 59) { 20 + end -= 1; 21 + continue; 22 + } 23 + 24 + break; 25 + } 26 + 27 + if (end > 0 && url.charCodeAt(end - 1) === 41 && url.lastIndexOf('(', end - 1) === -1) { 28 + end -= 1; 29 + } 30 + 31 + return end === url.length ? url : url.slice(0, end); 32 + }; 14 33 15 34 const LINK_RE = 16 35 /^\[((?:\[[^\]]*\]|[^[\]]|\](?=[^[]*\]))*)\]\(\s*<?((?:\([^)]*\)|[^\s\\]|\\.)*?)>?(?:\s+['"]([^]*?)['"])?\s*\)/; ··· 188 207 const tokenizeAutolink = (src: string): AutolinkToken | undefined => { 189 208 const match = AUTOLINK_RE.exec(src); 190 209 if (match) { 191 - const url = match[0].replace(AUTOLINK_BACKPEDAL_RE, ''); 210 + const url = trimAutolink(match[0]); 192 211 193 212 return { 194 213 type: 'autolink', ··· 311 330 312 331 while (src) { 313 332 last = token; 333 + const first = src.charCodeAt(0); 314 334 315 - if ( 316 - (token = 317 - tokenizeEscape(src) || 318 - tokenizeAutolink(src) || 319 - tokenizeMention(src) || 320 - tokenizeTopic(src) || 321 - tokenizeCashtag(src) || 322 - tokenizeEmote(src) || 323 - tokenizeLink(src) || 324 - tokenizeEmStrongU(src) || 325 - tokenizeDelete(src) || 326 - tokenizeCode(src)) 327 - ) { 335 + if (first === 92) { 336 + token = tokenizeEscape(src); 337 + } else if (first === 104) { 338 + token = tokenizeAutolink(src); 339 + } else if (first === 64 || first === 65312) { 340 + token = tokenizeMention(src); 341 + } else if (first === 35 || first === 65283) { 342 + token = tokenizeTopic(src); 343 + } else if (first === 36 || first === 65284) { 344 + token = tokenizeCashtag(src); 345 + } else if (first === 58) { 346 + token = tokenizeEmote(src); 347 + } else if (first === 91) { 348 + token = tokenizeLink(src); 349 + } else if (first === 42 || first === 95) { 350 + token = tokenizeEmStrongU(src); 351 + } else if (first === 126) { 352 + token = tokenizeDelete(src); 353 + } else if (first === 96) { 354 + token = tokenizeCode(src); 355 + } else { 356 + token = undefined; 357 + } 358 + 359 + if (token) { 328 360 src = src.slice(token.raw.length); 329 361 tokens.push(token); 330 362 continue;