this repo has no description
0
fork

Configure Feed

Select the types of activity you want to include in your feed.

optimize normalization

alice 40f8b1a2 88004274

+85 -33
+85 -33
packages/backend/src/lib/emojiNormalization.ts
··· 1 - import fs from 'fs'; 1 + import fs from 'node:fs'; 2 2 3 3 import { codePointToEmoji, emojiToCodePoint, lowercaseObject } from './helpers.js'; 4 4 import { Emoji, EmojiVariationSequence } from './types.js'; 5 5 6 6 // Load and parse normalization data 7 - // converted from: https://unicode.org/Public/emoji/12.1/emoji-variation-sequences.txt 8 - // regex in Sublime Text form: 9 - // find: ([0-9A-F]{4,5}) +FE0E +; +.+? style; +\# \((\d.\d)\) ([A-Z0-9\- ]+)\n[0-9A-F]{4,5} +FE0F +; +.+? style; +\# \(\d.\d\) [A-Z0-9\- ]+\n 10 - // replace: {"code": "$1", "textStyle": "$1 FE0E", "emojiStyle": "$1 FE0F", "version": "$2", "name": "$3"},\n 7 + // Converted from: https://unicode.org/Public/emoji/12.1/emoji-variation-sequences.txt 8 + // Regex in Sublime Text form: 9 + // Find: ([0-9A-F]{4,5}) +FE0E +; +.+? style; +\# \((\d.\d)\) ([A-Z0-9\- ]+)\n[0-9A-F]{4,5} +FE0F +; +.+? style; +\# \(\d.\d\) [A-Z0-9\- ]+\n 10 + // Replace: {"code": "$1", "textStyle": "$1 FE0E", "emojiStyle": "$1 FE0F", "version": "$2", "name": "$3"},\n 11 11 const eVSPath = new URL('./data/emojiVariationSequences.json', import.meta.url); 12 - 13 - // source: https://github.com/iamcal/emoji-data/blob/master/emoji.json 14 12 const eJSONPath = new URL('./data/emoji.json', import.meta.url); 15 13 16 - const emojiVariationSequences: EmojiVariationSequence[] = JSON.parse( 17 - fs.readFileSync(eVSPath, 'utf8'), 18 - ) as EmojiVariationSequence[]; 14 + // Initialize normalization maps as Maps for faster lookups 15 + const normalizationMap = new Map<string, string>(); 16 + const nonQualifiedMap = new Map<string, string>(); 19 17 20 - const emojiData: Emoji[] = JSON.parse(fs.readFileSync(eJSONPath, 'utf8')) as Emoji[]; 18 + // Cache for memoization 19 + const normalizationCache = new Map<string, string>(); 20 + 21 + // Function to load and process normalization data asynchronously 22 + async function initializeNormalizationMaps() { 23 + const [eVSData, eJSONData] = await Promise.all([ 24 + fs.promises.readFile(eVSPath, 'utf8'), 25 + fs.promises.readFile(eJSONPath, 'utf8'), 26 + ]); 21 27 22 - // Build normalization maps 23 - let normalizationMap: Record<string, string> = {}; 24 - emojiVariationSequences.forEach((seq) => { 25 - normalizationMap[seq.code] = seq.emojiStyle; 26 - normalizationMap[seq.textStyle] = seq.emojiStyle; 27 - }); 28 + const emojiVariationSequences: EmojiVariationSequence[] = JSON.parse(eVSData) as EmojiVariationSequence[]; 29 + const emojiData: Emoji[] = JSON.parse(eJSONData) as Emoji[]; 28 30 29 - normalizationMap = lowercaseObject(normalizationMap); 31 + for (const seq of emojiVariationSequences) { 32 + normalizationMap.set(seq.code.toLowerCase(), seq.emojiStyle); 33 + normalizationMap.set(seq.textStyle.toLowerCase(), seq.emojiStyle); 34 + } 30 35 31 - let nonQualifiedMap: Record<string, string> = {}; 32 - emojiData.forEach((emojiEntry) => { 33 - if (emojiEntry.non_qualified && emojiEntry.unified) { 34 - nonQualifiedMap[emojiEntry.non_qualified.replaceAll('-', ' ')] = emojiEntry.unified.replaceAll('-', ' '); 36 + const lowercasedNonQualifiedMap = lowercaseObject(Object.fromEntries(normalizationMap)); 37 + normalizationMap.clear(); 38 + for (const [key, value] of Object.entries(lowercasedNonQualifiedMap)) { 39 + normalizationMap.set(key, value); 35 40 } 36 - }); 37 41 38 - nonQualifiedMap = lowercaseObject(nonQualifiedMap); 42 + for (const emojiEntry of emojiData) { 43 + if (emojiEntry.non_qualified && emojiEntry.unified) { 44 + nonQualifiedMap.set( 45 + emojiEntry.non_qualified.replaceAll('-', ' ').toLowerCase(), 46 + emojiEntry.unified.replaceAll('-', ' ').toLowerCase(), 47 + ); 48 + } 49 + } 50 + 51 + const lowercasedNonQualified = lowercaseObject(Object.fromEntries(nonQualifiedMap)); 52 + nonQualifiedMap.clear(); 53 + for (const [key, value] of Object.entries(lowercasedNonQualified)) { 54 + nonQualifiedMap.set(key, value); 55 + } 56 + 57 + // Freeze the maps to prevent modifications 58 + Object.freeze(normalizationMap); 59 + Object.freeze(nonQualifiedMap); 60 + } 61 + 62 + // Initialize the maps at startup 63 + initializeNormalizationMaps().catch((error: unknown) => { 64 + console.error('Failed to initialize normalization maps:', error); 65 + process.exit(1); 66 + }); 39 67 40 68 export function normalizeEmoji(emoji: string): string { 69 + if (normalizationCache.has(emoji)) { 70 + return normalizationCache.get(emoji)!; 71 + } 72 + 41 73 // First Pass: Variation Sequence Normalization 42 - const emojiCodePoints = emojiToCodePoint(emoji); 43 - let firstPass; 44 - if (normalizationMap[emojiCodePoints]) { 45 - firstPass = normalizationMap[emojiCodePoints]; 46 - } else { 47 - firstPass = emojiCodePoints; 48 - } 74 + const emojiCodePoints = emojiToCodePoint(emoji).toLowerCase(); 75 + const firstPass = normalizationMap.get(emojiCodePoints) ?? emojiCodePoints; 49 76 let normalizedEmoji = codePointToEmoji(firstPass); 50 77 51 78 // Second Pass: Non-Qualified to Unified Normalization 52 - const unifiedCodePoints = nonQualifiedMap[firstPass]; 79 + const unifiedCodePoints = nonQualifiedMap.get(firstPass); 53 80 if (unifiedCodePoints && unifiedCodePoints !== firstPass) { 54 81 normalizedEmoji = codePointToEmoji(unifiedCodePoints); 55 82 } 56 83 84 + normalizationCache.set(emoji, normalizedEmoji); 57 85 return normalizedEmoji; 58 86 } 59 87 60 88 export function batchNormalizeEmojis(emojis: string[]): string[] { 61 - return emojis.map((emoji) => normalizeEmoji(emoji)); 89 + const result: string[] = new Array<string>(emojis.length); 90 + for (let i = 0; i < emojis.length; i++) { 91 + const emoji = emojis[i]; 92 + if (normalizationCache.has(emoji)) { 93 + result[i] = normalizationCache.get(emoji)!; 94 + if (emojiToCodePoint(emoji) !== emojiToCodePoint(result[i])) { 95 + console.log(`Cache hit for ${emojiToCodePoint(emoji)}: ${emojiToCodePoint(result[i])}`); 96 + } 97 + } else { 98 + // First Pass: Variation Sequence Normalization 99 + const emojiCodePoints = emojiToCodePoint(emoji).toLowerCase(); 100 + const firstPass = normalizationMap.get(emojiCodePoints) ?? emojiCodePoints; 101 + let normalizedEmoji = codePointToEmoji(firstPass); 102 + 103 + // Second Pass: Non-Qualified to Unified Normalization 104 + const unifiedCodePoints = nonQualifiedMap.get(firstPass); 105 + if (unifiedCodePoints && unifiedCodePoints !== firstPass) { 106 + normalizedEmoji = codePointToEmoji(unifiedCodePoints); 107 + } 108 + 109 + normalizationCache.set(emoji, normalizedEmoji); 110 + result[i] = normalizedEmoji; 111 + } 112 + } 113 + return result; 62 114 }