A simple tool that scrapes Twitter accounts and cross-posts their tweets to Bluesky accounts! Comes with a CLI and a web app for managing profiles! Works with images, videos, link embeds, and threads.
11
fork

Configure Feed

Select the types of activity you want to include in your feed.

feat: robust threading for split tweets and smart splitting

- Add bsky_tail_uri/cid to DB to track thread continuity across sessions.
- Implement smart splitting with (i/n) numbering and sentence boundary detection.
- Fix intra-batch threading using localProcessedMap.

jack 3ca09b2c 3eed0651

+134 -39
+25 -5
src/db.ts
··· 23 23 let schemaChanged = false; 24 24 const hasBskyIdentifier = tableInfo.some((col) => col.name === 'bsky_identifier'); 25 25 const hasTweetText = tableInfo.some((col) => col.name === 'tweet_text'); 26 + const hasTailUri = tableInfo.some((col) => col.name === 'bsky_tail_uri'); 26 27 27 - if (!hasBskyIdentifier || !hasTweetText) { 28 + if (!hasBskyIdentifier || !hasTweetText || !hasTailUri) { 28 29 console.log('🔄 Upgrading database schema...'); 29 30 30 31 // SQLite doesn't support easy PK changes, so we recreate the table if identifier is missing ··· 47 48 bsky_cid TEXT, 48 49 bsky_root_uri TEXT, 49 50 bsky_root_cid TEXT, 51 + bsky_tail_uri TEXT, 52 + bsky_tail_cid TEXT, 50 53 status TEXT NOT NULL, 51 54 created_at DATETIME DEFAULT CURRENT_TIMESTAMP, 52 55 PRIMARY KEY (twitter_id, bsky_identifier) ··· 63 66 64 67 // If old table didn't have tweet_text, we default to NULL 65 68 const textSelect = oldColumns.includes('tweet_text') ? 'tweet_text' : "NULL"; 69 + 70 + const tailUriSelect = oldColumns.includes('bsky_tail_uri') ? 'bsky_tail_uri' : "NULL"; 71 + const tailCidSelect = oldColumns.includes('bsky_tail_cid') ? 
'bsky_tail_cid' : "NULL"; 66 72 67 73 db.exec(` 68 74 INSERT INTO processed_tweets ( ··· 73 79 bsky_uri, 74 80 bsky_cid, 75 81 bsky_root_uri, 76 - bsky_root_cid, 82 + bsky_root_cid, 83 + bsky_tail_uri, 84 + bsky_tail_cid, 77 85 status, 78 86 created_at 79 87 ) ··· 85 93 bsky_uri, 86 94 bsky_cid, 87 95 bsky_root_uri, 88 - bsky_root_cid, 96 + bsky_root_cid, 97 + ${tailUriSelect}, 98 + ${tailCidSelect}, 89 99 status, 90 100 created_at 91 101 FROM processed_tweets_old; ··· 108 118 bsky_cid TEXT, 109 119 bsky_root_uri TEXT, 110 120 bsky_root_cid TEXT, 121 + bsky_tail_uri TEXT, 122 + bsky_tail_cid TEXT, 111 123 status TEXT NOT NULL, -- 'migrated', 'skipped', 'failed' 112 124 created_at DATETIME DEFAULT CURRENT_TIMESTAMP, 113 125 PRIMARY KEY (twitter_id, bsky_identifier) ··· 129 141 bsky_cid?: string; 130 142 bsky_root_uri?: string; 131 143 bsky_root_cid?: string; 144 + bsky_tail_uri?: string; 145 + bsky_tail_cid?: string; 132 146 status: 'migrated' | 'skipped' | 'failed'; 133 147 created_at?: string; 134 148 } ··· 147 161 bsky_cid: row.bsky_cid, 148 162 bsky_root_uri: row.bsky_root_uri, 149 163 bsky_root_cid: row.bsky_root_cid, 164 + bsky_tail_uri: row.bsky_tail_uri, 165 + bsky_tail_cid: row.bsky_tail_cid, 150 166 status: row.status, 151 167 created_at: row.created_at 152 168 }; ··· 155 171 saveTweet(tweet: ProcessedTweet) { 156 172 const stmt = db.prepare(` 157 173 INSERT OR REPLACE INTO processed_tweets 158 - (twitter_id, twitter_username, bsky_identifier, tweet_text, bsky_uri, bsky_cid, bsky_root_uri, bsky_root_cid, status) 159 - VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?) 174 + (twitter_id, twitter_username, bsky_identifier, tweet_text, bsky_uri, bsky_cid, bsky_root_uri, bsky_root_cid, bsky_tail_uri, bsky_tail_cid, status) 175 + VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) 
160 176 `); 161 177 stmt.run( 162 178 tweet.twitter_id, ··· 167 183 tweet.bsky_cid || null, 168 184 tweet.bsky_root_uri || null, 169 185 tweet.bsky_root_cid || null, 186 + tweet.bsky_tail_uri || null, 187 + tweet.bsky_tail_cid || null, 170 188 tweet.status, 171 189 ); 172 190 }, ··· 180 198 uri: row.bsky_uri, 181 199 cid: row.bsky_cid, 182 200 root: row.bsky_root_uri ? { uri: row.bsky_root_uri, cid: row.bsky_root_cid } : undefined, 201 + tail: (row.bsky_tail_uri && row.bsky_tail_cid) ? { uri: row.bsky_tail_uri, cid: row.bsky_tail_cid } : undefined, 183 202 migrated: row.status === 'migrated', 184 203 skipped: row.status === 'skipped', 185 204 }; ··· 196 215 uri: row.bsky_uri, 197 216 cid: row.bsky_cid, 198 217 root: row.bsky_root_uri ? { uri: row.bsky_root_uri, cid: row.bsky_root_cid } : undefined, 218 + tail: (row.bsky_tail_uri && row.bsky_tail_cid) ? { uri: row.bsky_tail_uri, cid: row.bsky_tail_cid } : undefined, 199 219 migrated: row.status === 'migrated', 200 220 skipped: row.status === 'skipped' 201 221 };
+109 -34
src/index.ts
··· 29 29 uri?: string; 30 30 cid?: string; 31 31 root?: { uri: string; cid: string }; 32 + tail?: { uri: string; cid: string }; 32 33 migrated?: boolean; 33 34 skipped?: boolean; 34 35 text?: string; ··· 189 190 bsky_cid: entry.cid, 190 191 bsky_root_uri: entry.root?.uri, 191 192 bsky_root_cid: entry.root?.cid, 193 + bsky_tail_uri: entry.tail?.uri, 194 + bsky_tail_cid: entry.tail?.cid, 192 195 status: entry.migrated || (entry.uri && entry.cid) ? 'migrated' : (entry.skipped ? 'skipped' : 'failed') 193 196 }); 194 197 } ··· 720 723 const chunks: string[] = []; 721 724 let remaining = text; 722 725 726 + // Reserve space for numbering like " (1/3)" -> approx 7 chars 727 + // We apply this reservation to the limit check 728 + const effectiveLimit = limit - 8; 729 + 723 730 while (remaining.length > 0) { 724 731 if (remaining.length <= limit) { 725 732 chunks.push(remaining); 726 733 break; 727 734 } 728 735 729 - // Try to split by paragraph 730 - let splitIndex = remaining.lastIndexOf('\n\n', limit); 736 + // Smart splitting priority: 737 + // 1. Double newline (paragraph) 738 + // 2. Sentence end (.!?) 739 + // 3. Space 740 + // 4. Force split 741 + 742 + let splitIndex = -1; 743 + 744 + // Check paragraphs 745 + let checkIndex = remaining.lastIndexOf('\n\n', effectiveLimit); 746 + if (checkIndex !== -1) splitIndex = checkIndex; 747 + 748 + // Check sentences 731 749 if (splitIndex === -1) { 732 - // Try to split by sentence 733 - splitIndex = remaining.lastIndexOf('. 
', limit); 734 - if (splitIndex === -1) { 735 - // Try to split by space 736 - splitIndex = remaining.lastIndexOf(' ', limit); 737 - if (splitIndex === -1) { 738 - // Force split 739 - splitIndex = limit; 750 + // Look for punctuation followed by space 751 + const sentenceMatches = Array.from(remaining.substring(0, effectiveLimit).matchAll(/[.!?]\s/g)); 752 + if (sentenceMatches.length > 0) { 753 + const lastMatch = sentenceMatches[sentenceMatches.length - 1]; 754 + if (lastMatch && lastMatch.index !== undefined) { 755 + splitIndex = lastMatch.index + 1; // Include punctuation 756 + } 740 757 } 741 - } else { 742 - splitIndex += 1; // Include the period 743 - } 758 + } 759 + 760 + // Check spaces 761 + if (splitIndex === -1) { 762 + checkIndex = remaining.lastIndexOf(' ', effectiveLimit); 763 + if (checkIndex !== -1) splitIndex = checkIndex; 764 + } 765 + 766 + // Force split if no good break point found 767 + if (splitIndex === -1) { 768 + splitIndex = effectiveLimit; 744 769 } 745 770 746 771 chunks.push(remaining.substring(0, splitIndex).trim()); ··· 884 909 }); 885 910 886 911 const processedTweets = loadProcessedTweets(bskyIdentifier); 887 - const toProcess = filteredTweets.filter((t) => !processedTweets[t.id_str || t.id || '']); 912 + 913 + // Maintain a local map that updates in real-time for intra-batch replies 914 + const localProcessedMap: ProcessedTweetsMap = { ...processedTweets }; 915 + 916 + const toProcess = filteredTweets.filter((t) => !localProcessedMap[t.id_str || t.id || '']); 888 917 889 918 if (toProcess.length === 0) { 890 919 console.log(`[${twitterUsername}] ✅ No new tweets to process for ${bskyIdentifier}.`); ··· 900 929 const tweetId = tweet.id_str || tweet.id; 901 930 if (!tweetId) continue; 902 931 903 - if (processedTweets[tweetId]) continue; 932 + if (localProcessedMap[tweetId]) continue; 904 933 905 934 const isRetweet = tweet.isRetweet || tweet.retweeted_status_id_str || tweet.text?.startsWith('RT @'); 906 935 ··· 909 938 if 
(!dryRun) { 910 939 // Save as skipped so we don't check it again 911 940 saveProcessedTweet(twitterUsername, bskyIdentifier, tweetId, { skipped: true, text: tweet.text }); 941 + localProcessedMap[tweetId] = { skipped: true, text: tweet.text }; 912 942 } 913 943 continue; 914 944 } ··· 930 960 let replyParentInfo: ProcessedTweetEntry | null = null; 931 961 932 962 if (isReply) { 933 - if (replyStatusId && processedTweets[replyStatusId]) { 963 + if (replyStatusId && localProcessedMap[replyStatusId]) { 934 964 console.log(`[${twitterUsername}] 🧵 Threading reply to post in ${bskyIdentifier}: ${replyStatusId}`); 935 - replyParentInfo = processedTweets[replyStatusId] ?? null; 965 + replyParentInfo = localProcessedMap[replyStatusId] ?? null; 936 966 } else { 937 967 console.log(`[${twitterUsername}] ⏩ Skipping external/unknown reply.`); 938 968 if (!dryRun) { 939 969 saveProcessedTweet(twitterUsername, bskyIdentifier, tweetId, { skipped: true, text: tweetText }); 970 + localProcessedMap[tweetId] = { skipped: true, text: tweetText }; 940 971 } 941 972 continue; 942 973 } ··· 1105 1136 1106 1137 if (tweet.is_quote_status && tweet.quoted_status_id_str) { 1107 1138 const quoteId = tweet.quoted_status_id_str; 1108 - const quoteRef = processedTweets[quoteId]; 1139 + const quoteRef = localProcessedMap[quoteId]; 1109 1140 if (quoteRef && !quoteRef.migrated && quoteRef.uri && quoteRef.cid) { 1110 1141 console.log(`[${twitterUsername}] 🔄 Found quoted tweet in local history. Natively embedding.`); 1111 1142 quoteEmbed = { $type: 'app.bsky.embed.record', record: { uri: quoteRef.uri, cid: quoteRef.cid } }; ··· 1175 1206 1176 1207 let lastPostInfo: ProcessedTweetEntry | null = replyParentInfo; 1177 1208 1209 + // We will save the first chunk as the "Root" of this tweet, and the last chunk as the "Tail". 
1210 + let firstChunkInfo: { uri: string; cid: string; root?: { uri: string; cid: string } } | null = null; 1211 + let lastChunkInfo: { uri: string; cid: string; root?: { uri: string; cid: string } } | null = null; 1212 + 1178 1213 for (let i = 0; i < chunks.length; i++) { 1179 - const chunk = chunks[i] as string; 1214 + let chunk = chunks[i] as string; 1215 + 1216 + // Add (i/n) if split 1217 + if (chunks.length > 1) { 1218 + chunk += ` (${i + 1}/${chunks.length})`; 1219 + } 1220 + 1180 1221 console.log(`[${twitterUsername}] 📤 Posting chunk ${i + 1}/${chunks.length}...`); 1181 1222 updateAppStatus({ message: `Posting chunk ${i + 1}/${chunks.length}...` }); 1182 1223 ··· 1220 1261 } 1221 1262 } 1222 1263 1264 + // Threading logic 1265 + // Determine actual parent URI/CID to reply to 1266 + let parentRef: { uri: string; cid: string } | null = null; 1267 + let rootRef: { uri: string; cid: string } | null = null; 1268 + 1223 1269 if (lastPostInfo?.uri && lastPostInfo?.cid) { 1270 + // If this is the start of a new tweet (i=0), check if parent has a tail 1271 + if (i === 0 && lastPostInfo.tail) { 1272 + parentRef = lastPostInfo.tail; 1273 + } else { 1274 + // Otherwise (intra-tweet or parent has no tail), use the main uri/cid (which is the previous post/chunk) 1275 + parentRef = { uri: lastPostInfo.uri, cid: lastPostInfo.cid }; 1276 + } 1277 + 1278 + rootRef = lastPostInfo.root || parentRef; // Propagate root, or use parent as root if none 1279 + } 1280 + 1281 + if (parentRef && rootRef) { 1224 1282 postRecord.reply = { 1225 - root: lastPostInfo.root || { uri: lastPostInfo.uri, cid: lastPostInfo.cid }, 1226 - parent: { uri: lastPostInfo.uri, cid: lastPostInfo.cid }, 1283 + root: rootRef, 1284 + parent: parentRef, 1227 1285 }; 1228 1286 } 1229 1287 ··· 1251 1309 } 1252 1310 } 1253 1311 1254 - const currentPostInfo = { 1255 - uri: response.uri, 1256 - cid: response.cid, 1257 - root: postRecord.reply ? 
postRecord.reply.root : { uri: response.uri, cid: response.cid }, 1258 - text: tweetText 1259 - }; 1312 + const currentPostInfo = { 1313 + uri: response.uri, 1314 + cid: response.cid, 1315 + root: postRecord.reply ? postRecord.reply.root : { uri: response.uri, cid: response.cid }, 1316 + // Text is just the current chunk text 1317 + text: chunk 1318 + }; 1260 1319 1261 - if (i === 0) { 1262 - saveProcessedTweet(twitterUsername, bskyIdentifier, tweetId, currentPostInfo); 1263 - } 1264 - 1265 - lastPostInfo = currentPostInfo; console.log(`[${twitterUsername}] ✅ Chunk ${i + 1} posted successfully.`); 1320 + if (i === 0) firstChunkInfo = currentPostInfo; 1321 + lastChunkInfo = currentPostInfo; 1322 + lastPostInfo = currentPostInfo; // Update for next iteration 1323 + 1324 + console.log(`[${twitterUsername}] ✅ Chunk ${i + 1} posted successfully.`); 1266 1325 1267 1326 if (chunks.length > 1) { 1268 1327 await new Promise((r) => setTimeout(r, 3000)); ··· 1271 1330 console.error(`[${twitterUsername}] ❌ Failed to post ${tweetId} (chunk ${i + 1}):`, err); 1272 1331 break; 1273 1332 } 1333 + } 1334 + 1335 + // Save to DB and Map 1336 + if (firstChunkInfo && lastChunkInfo) { 1337 + const entry: ProcessedTweetEntry = { 1338 + uri: firstChunkInfo.uri, 1339 + cid: firstChunkInfo.cid, 1340 + root: firstChunkInfo.root, 1341 + tail: { uri: lastChunkInfo.uri, cid: lastChunkInfo.cid }, // Save tail! 
1342 + text: tweetText 1343 + }; 1344 + 1345 + if (!dryRun) { 1346 + saveProcessedTweet(twitterUsername, bskyIdentifier, tweetId, entry); 1347 + localProcessedMap[tweetId] = entry; // Update local map for subsequent replies in this batch 1348 + } 1274 1349 } 1275 1350 1276 1351 // Add a random delay between 5s and 15s to be more human-like ··· 1320 1395 console.log("⚠️ Could not login to Bluesky, but proceeding with MOCK AGENT for Dry Run."); 1321 1396 // biome-ignore lint/suspicious/noExplicitAny: mock agent 1322 1397 agent = { 1323 - post: async (record: any) => ({ uri: 'at://mock/post', cid: 'mock-cid' }), 1398 + post: async (record: any) => ({ uri: 'at://did:plc:mock/app.bsky.feed.post/mock', cid: 'mock-cid' }), 1324 1399 uploadBlob: async (data: any) => ({ data: { blob: { ref: { toString: () => 'mock-blob' } } } }), 1325 1400 // Add other necessary methods if they are called outside of the already mocked dryRun blocks 1326 1401 // But since we mocked the calls inside processTweets for dryRun, we just need the object to exist. ··· 1556 1631 } 1557 1632 } 1558 1633 1559 - main(); 1634 + main();