this repo has no description
13
fork

Configure Feed

Select the types of activity you want to include in your feed.

at 8a9c2d5e1b3778f1ea43c9bd5d325cfa72016584 216 lines 8.1 kB view raw
const std = @import("std");
const unicode = std.unicode;
const testing = std.testing;
const uucode = @import("uucode");

/// The method to use when calculating the width of a grapheme.
pub const Method = enum {
    /// Segment the string into grapheme clusters and measure each cluster as
    /// a single unit.
    unicode,
    /// Sum the width of every code point independently.
    wcwidth,
    /// Split the string on U+200D (zero-width joiner) and measure each piece
    /// with `.unicode`.
    no_zwj,
};

/// Calculate the width of a single code point from its East Asian Width
/// property and other Unicode properties, modeled on wcwidth.
///
/// Returns 0 for NUL and for zero-width code points, -1 for the remaining
/// control characters (below U+0020, and U+007F up to but excluding U+00A0),
/// 2 for fullwidth/wide code points, and 1 for everything else.
fn eawToWidth(cp: u21, eaw: uucode.types.EastAsianWidth) i16 {
    // Control characters
    if (cp == 0) return 0;
    if (cp < 32 or (cp >= 0x7f and cp < 0xa0)) return -1;

    // Use general category for comprehensive zero-width detection
    switch (uucode.get(.general_category, cp)) {
        .mark_nonspacing, .mark_enclosing => return 0,
        else => {},
    }

    // Additional zero-width characters not covered by general category
    if (cp == 0x00ad) return 0; // soft hyphen
    if (cp == 0x200b) return 0; // zero-width space
    if (cp == 0x200c) return 0; // zero-width non-joiner
    if (cp == 0x200d) return 0; // zero-width joiner
    if (cp == 0x2060) return 0; // word joiner
    if (cp == 0x034f) return 0; // combining grapheme joiner
    if (cp == 0xfeff) return 0; // zero-width no-break space (BOM)
    if (cp >= 0x180b and cp <= 0x180d) return 0; // Mongolian variation selectors
    if (cp >= 0xfe00 and cp <= 0xfe0f) return 0; // variation selectors
    if (cp >= 0xe0100 and cp <= 0xe01ef) return 0; // Plane-14 variation selectors

    // East Asian Width: fullwidth and wide occupy two columns. Every other
    // class (halfwidth, narrow, neutral, and also ambiguous) is one column.
    return switch (eaw) {
        .fullwidth, .wide => 2,
        else => 1,
    };
}

/// Measure a single grapheme cluster, given as its UTF-8 bytes, in columns.
///
/// The base width is the largest positive per-code-point width in the
/// cluster. A text variation selector (U+FE0E) raises the result to at least
/// 1; otherwise an emoji variation selector (U+FE0F), an emoji-presentation
/// code point, or exactly two regional indicators raise it to at least 2.
fn graphemeWidth(grapheme_bytes: []const u8) u16 {
    var cp_iter = uucode.utf8.Iterator.init(grapheme_bytes);
    var width: i16 = 0;
    var has_emoji_vs = false;
    var has_text_vs = false;
    var has_emoji_presentation = false;
    var ri_count: u8 = 0;

    while (cp_iter.next()) |cp| {
        // Variation selectors only steer presentation; they add no width.
        if (cp == 0xfe0f) {
            has_emoji_vs = true;
            continue;
        }
        if (cp == 0xfe0e) {
            has_text_vs = true;
            continue;
        }

        if (uucode.get(.is_emoji_presentation, cp)) {
            has_emoji_presentation = true;
        }

        // Count regional indicators (for flag emojis). Saturating so that an
        // unexpectedly long run can never overflow the counter.
        if (cp >= 0x1F1E6 and cp <= 0x1F1FF) {
            ri_count +|= 1;
        }

        // `width` starts at 0 and never decreases, so zero and negative
        // per-code-point widths are ignored by taking the maximum.
        const w = eawToWidth(cp, uucode.get(.east_asian_width, cp));
        width = @max(width, w);
    }

    if (has_text_vs) {
        // Explicit text presentation: at least one column.
        width = @max(1, width);
    } else if (has_emoji_vs or has_emoji_presentation or ri_count == 2) {
        // Emoji presentation or flag pair: at least two columns.
        width = @max(2, width);
    }

    return @intCast(@max(0, width));
}

/// Returns the width of the provided string, in columns, as measured by the
/// chosen method.
///
/// The total saturates at `std.math.maxInt(u16)` instead of overflowing when
/// the string is wider than a `u16` can represent.
pub fn gwidth(str: []const u8, method: Method) u16 {
    switch (method) {
        .unicode => {
            var total: u16 = 0;
            var grapheme_iter = uucode.grapheme.Iterator(uucode.utf8.Iterator).init(.init(str));

            // Byte offset where the cluster currently being read begins.
            var grapheme_start: usize = 0;

            while (grapheme_iter.next()) |result| {
                // Keep consuming code points until the cluster is complete.
                if (!result.is_break) continue;

                // The iterator's `i` is taken as the end of the cluster, so
                // the next cluster begins exactly there. Deriving the start
                // from the code point's encoded length instead is redundant
                // and can disagree with the actual byte offsets if the input
                // is not well-formed UTF-8.
                const grapheme_end = grapheme_iter.i;
                total +|= graphemeWidth(str[grapheme_start..grapheme_end]);
                grapheme_start = grapheme_end;
            }

            return total;
        },
        .wcwidth => {
            var total: u16 = 0;
            var iter = uucode.utf8.Iterator.init(str);
            while (iter.next()) |cp| {
                const w: i16 = switch (cp) {
                    // undo an override in zg for emoji skintone selectors
                    0x1f3fb...0x1f3ff => 2,
                    else => eawToWidth(cp, uucode.get(.east_asian_width, cp)),
                };
                // Negative widths (control characters) contribute nothing.
                const columns: u16 = @intCast(@max(0, w));
                total +|= columns;
            }
            return total;
        },
        .no_zwj => {
            var iter = std.mem.splitSequence(u8, str, "\u{200D}");
            var result: u16 = 0;
            while (iter.next()) |s| {
                result +|= gwidth(s, .unicode);
            }
            return result;
        },
    }
}

test "gwidth: a" {
    try testing.expectEqual(1, gwidth("a", .unicode));
    try testing.expectEqual(1, gwidth("a", .wcwidth));
    try testing.expectEqual(1, gwidth("a", .no_zwj));
}

test "gwidth: emoji with ZWJ" {
    // Woman (U+1F469) + ZWJ (U+200D) + rocket (U+1F680)
    const astronaut = "\u{1F469}\u{200D}\u{1F680}";
    try testing.expectEqual(2, gwidth(astronaut, .unicode));
    try testing.expectEqual(4, gwidth(astronaut, .wcwidth));
    try testing.expectEqual(4, gwidth(astronaut, .no_zwj));
}

test "gwidth: emoji with VS16 selector" {
    try testing.expectEqual(2, gwidth("\xE2\x9D\xA4\xEF\xB8\x8F", .unicode));
    try testing.expectEqual(1, gwidth("\xE2\x9D\xA4\xEF\xB8\x8F", .wcwidth));
    try testing.expectEqual(2, gwidth("\xE2\x9D\xA4\xEF\xB8\x8F", .no_zwj));
}

test "gwidth: emoji with skin tone selector" {
    // Waving hand (U+1F44B) + dark skin tone modifier (U+1F3FF)
    const wave = "\u{1F44B}\u{1F3FF}";
    try testing.expectEqual(2, gwidth(wave, .unicode));
    try testing.expectEqual(4, gwidth(wave, .wcwidth));
    try testing.expectEqual(2, gwidth(wave, .no_zwj));
}

test "gwidth: zero-width space" {
    try testing.expectEqual(0, gwidth("\u{200B}", .unicode));
    try testing.expectEqual(0, gwidth("\u{200B}", .wcwidth));
}

test "gwidth: zero-width non-joiner" {
    try testing.expectEqual(0, gwidth("\u{200C}", .unicode));
    try testing.expectEqual(0, gwidth("\u{200C}", .wcwidth));
}

test "gwidth: combining marks" {
    // Hebrew combining mark
    try testing.expectEqual(0, gwidth("\u{05B0}", .unicode));
    // Devanagari combining mark
    try testing.expectEqual(0, gwidth("\u{093C}", .unicode));
}

test "gwidth: flag emoji (regional indicators)" {
    // US flag: regional indicators U + S
    try testing.expectEqual(2, gwidth("\u{1F1FA}\u{1F1F8}", .unicode));
    // UK flag: regional indicators G + B
    try testing.expectEqual(2, gwidth("\u{1F1EC}\u{1F1E7}", .unicode));
}

test "gwidth: text variation selector" {
    // U+2764 (heavy black heart) + U+FE0E (text variation selector)
    // Should be width 1 with text presentation
    try testing.expectEqual(1, gwidth("\u{2764}\u{FE0E}", .unicode));
}

test "gwidth: keycap sequence" {
    // Digit 1 + U+FE0F + U+20E3 (combining enclosing keycap)
    // Should be width 2
    try testing.expectEqual(2, gwidth("1\u{FE0F}\u{20E3}", .unicode));
}

test "gwidth: base letter with combining mark" {
    // 'a' + combining acute accent (NFD form)
    // Should be width 1 (combining mark is zero-width)
    try testing.expectEqual(1, gwidth("a\u{0301}", .unicode));
}

test "gwidth: total saturates instead of overflowing" {
    // 70,000 single-column graphemes exceed what a u16 can hold.
    const long = "a" ** 70_000;
    const max_width: u16 = std.math.maxInt(u16);
    try testing.expectEqual(max_width, gwidth(long, .unicode));
    try testing.expectEqual(max_width, gwidth(long, .wcwidth));
    try testing.expectEqual(max_width, gwidth(long, .no_zwj));
}