fix: use unicode-aware word boundaries in text inputs

+244 -41

2 changed files

expand all

src

vxfw

TextField.zig

widgets

TextInput.zig

+122 -20

src/vxfw/TextField.zig

··· 1 1 const std = @import("std"); 2 + const uucode = @import("uucode"); 2 3 const vaxis = @import("../main.zig"); 3 4 4 5 const vxfw = @import("vxfw.zig"); ··· 346 347 self.buf.growGapRight(grapheme.len); 347 348 } 348 349 349 - /// Returns true if the byte is a word constituent: ASCII alnum, underscore, 350 - /// or any non-ASCII byte (part of a multi-byte UTF-8 sequence). This ensures 351 - /// word motion never splits inside non-ASCII characters like accented letters. 352 - fn isWordChar(c: u8) bool { 353 - return std.ascii.isAlphanumeric(c) or c == '_' or c >= 0x80; 350 + const DecodedCodepoint = struct { 351 + cp: u21, 352 + start: usize, 353 + len: usize, 354 + }; 355 + 356 + fn decodeCodepointAt(bytes: []const u8, start: usize) DecodedCodepoint { 357 + const first = bytes[start]; 358 + const len = std.unicode.utf8ByteSequenceLength(first) catch 1; 359 + const capped_len = @min(len, bytes.len - start); 360 + const slice = bytes[start .. start + capped_len]; 361 + const cp = std.unicode.utf8Decode(slice) catch { 362 + return .{ .cp = first, .start = start, .len = 1 }; 363 + }; 364 + return .{ .cp = cp, .start = start, .len = capped_len }; 365 + } 366 + 367 + fn isUtf8ContinuationByte(c: u8) bool { 368 + return (c & 0b1100_0000) == 0b1000_0000; 369 + } 370 + 371 + fn decodeCodepointBefore(bytes: []const u8, end: usize) DecodedCodepoint { 372 + var start = end - 1; 373 + while (start > 0 and isUtf8ContinuationByte(bytes[start])) : (start -= 1) {} 374 + const slice = bytes[start..end]; 375 + const cp = std.unicode.utf8Decode(slice) catch { 376 + return .{ .cp = bytes[end - 1], .start = end - 1, .len = 1 }; 377 + }; 378 + return .{ .cp = cp, .start = start, .len = end - start }; 379 + } 380 + 381 + /// Returns true if the codepoint is a readline-style word constituent. 382 + fn isWordCodepoint(cp: u21) bool { 383 + if (cp == '_') return true; 384 + return switch (uucode.get(.general_category, cp)) { 385 + .letter_uppercase, 386 + .letter_lowercase, 387 + .letter_titlecase, 388 + .letter_modifier, 389 + .letter_other, 390 + .number_decimal_digit, 391 + .number_letter, 392 + .number_other, 393 + .mark_nonspacing, 394 + .mark_spacing_combining, 395 + .mark_enclosing, 396 + .punctuation_connector, 397 + => true, 398 + else => false, 399 + }; 400 + } 401 + 402 + fn isWhitespaceCodepoint(cp: u21) bool { 403 + return switch (cp) { 404 + ' ', '\t', '\n', '\r', 0x0b, 0x0c, 0x85 => true, 405 + else => switch (uucode.get(.general_category, cp)) { 406 + .separator_space, 407 + .separator_line, 408 + .separator_paragraph, 409 + => true, 410 + else => false, 411 + }, 412 + }; 354 413 } 355 414 356 415 /// Moves the cursor backward by one word using character-class boundaries. ··· 359 418 const first_half = self.buf.firstHalf(); 360 419 var i: usize = first_half.len; 361 420 // Skip non-word characters 362 - while (i > 0 and !isWordChar(first_half[i - 1])) : (i -= 1) {} 421 + while (i > 0) { 422 + const decoded = decodeCodepointBefore(first_half, i); 423 + if (isWordCodepoint(decoded.cp)) break; 424 + i = decoded.start; 425 + } 363 426 // Skip word characters 364 - while (i > 0 and isWordChar(first_half[i - 1])) : (i -= 1) {} 427 + while (i > 0) { 428 + const decoded = decodeCodepointBefore(first_half, i); 429 + if (!isWordCodepoint(decoded.cp)) break; 430 + i = decoded.start; 431 + } 365 432 self.buf.moveGapLeft(self.buf.cursor - i); 366 433 } 367 434 ··· 372 439 const second_half = self.buf.secondHalf(); 373 440 var i: usize = 0; 374 441 // Skip non-word characters 375 - while (i < second_half.len and !isWordChar(second_half[i])) : (i += 1) {} 442 + while (i < second_half.len) { 443 + const decoded = decodeCodepointAt(second_half, i); 444 + if (isWordCodepoint(decoded.cp)) break; 445 + i += decoded.len; 446 + } 376 447 // Skip word characters 377 - while (i < second_half.len and isWordChar(second_half[i])) : (i += 1) {} 448 + while (i < second_half.len) { 449 + const decoded = decodeCodepointAt(second_half, i); 450 + if (!isWordCodepoint(decoded.cp)) break; 451 + i += decoded.len; 452 + } 378 453 self.buf.moveGapRight(i); 379 454 } 380 455 ··· 392 467 const first_half = self.buf.firstHalf(); 393 468 var i: usize = first_half.len; 394 469 // Skip trailing whitespace 395 - while (i > 0 and std.ascii.isWhitespace(first_half[i - 1])) : (i -= 1) {} 470 + while (i > 0) { 471 + const decoded = decodeCodepointBefore(first_half, i); 472 + if (!isWhitespaceCodepoint(decoded.cp)) break; 473 + i = decoded.start; 474 + } 396 475 // Skip non-whitespace 397 - while (i > 0 and !std.ascii.isWhitespace(first_half[i - 1])) : (i -= 1) {} 476 + while (i > 0) { 477 + const decoded = decodeCodepointBefore(first_half, i); 478 + if (isWhitespaceCodepoint(decoded.cp)) break; 479 + i = decoded.start; 480 + } 398 481 const to_delete = self.buf.cursor - i; 399 482 self.buf.moveGapLeft(to_delete); 400 483 self.buf.growGapRight(to_delete); ··· 406 489 const second_half = self.buf.secondHalf(); 407 490 var i: usize = 0; 408 491 // Skip non-word characters 409 - while (i < second_half.len and !isWordChar(second_half[i])) : (i += 1) {} 492 + while (i < second_half.len) { 493 + const decoded = decodeCodepointAt(second_half, i); 494 + if (isWordCodepoint(decoded.cp)) break; 495 + i += decoded.len; 496 + } 410 497 // Skip word characters 411 - while (i < second_half.len and isWordChar(second_half[i])) : (i += 1) {} 498 + while (i < second_half.len) { 499 + const decoded = decodeCodepointAt(second_half, i); 500 + if (!isWordCodepoint(decoded.cp)) break; 501 + i += decoded.len; 502 + } 412 503 self.buf.growGapRight(i); 413 504 } 414 505 ··· 738 829 try std.testing.expectEqualStrings("-latte", input.buf.secondHalf()); 739 830 } 740 831 741 - test "non-ASCII punctuation treated as word chars" { 832 + test "non-ASCII punctuation acts as a separator" { 742 833 var input = TextField.init(std.testing.allocator); 743 834 defer input.deinit(); 744 - // Em dash (U+2014, 3 bytes: E2 80 94) has bytes >= 0x80, so all bytes are 745 - // classified as word chars. The entire string is one continuous "word" — the 746 - // em dash does NOT act as a separator. This is a known limitation of the 747 - // byte-based classifier. 748 - try input.insertSliceAtCursor("hello\xe2\x80\x94world"); 835 + try input.insertSliceAtCursor("hello\u{2014}world"); 749 836 input.moveBackwardWordwise(); 750 - try std.testing.expectEqualStrings("", input.buf.firstHalf()); 837 + try std.testing.expectEqualStrings("hello\u{2014}", input.buf.firstHalf()); 838 + try std.testing.expectEqualStrings("world", input.buf.secondHalf()); 839 + 840 + input.buf.moveGapLeft(input.buf.firstHalf().len); 841 + input.moveForwardWordwise(); 842 + try std.testing.expectEqualStrings("hello", input.buf.firstHalf()); 843 + try std.testing.expectEqualStrings("\u{2014}world", input.buf.secondHalf()); 844 + } 845 + 846 + test "deleteWordBeforeWhitespace handles unicode whitespace" { 847 + var input = TextField.init(std.testing.allocator); 848 + defer input.deinit(); 849 + try input.insertSliceAtCursor("hello\u{3000}world"); 850 + input.deleteWordBeforeWhitespace(); 851 + try std.testing.expectEqualStrings("hello\u{3000}", input.buf.firstHalf()); 852 + try std.testing.expectEqualStrings("", input.buf.secondHalf()); 751 853 } 752 854 753 855 test "deleteWordBefore with non-ASCII text" {

+122 -21

src/widgets/TextInput.zig

··· 1 1 const std = @import("std"); 2 + const uucode = @import("uucode"); 2 3 const assert = std.debug.assert; 3 4 const Key = @import("../Key.zig"); 4 5 const Cell = @import("../Cell.zig"); ··· 265 266 self.buf.growGapRight(grapheme.len); 266 267 } 267 268 268 - /// Returns true if the byte is a word constituent: ASCII alnum, underscore, 269 - /// or any non-ASCII byte (part of a multi-byte UTF-8 sequence). This ensures 270 - /// word motion never splits inside non-ASCII characters like accented letters. 271 - fn isWordChar(c: u8) bool { 272 - return std.ascii.isAlphanumeric(c) or c == '_' or c >= 0x80; 269 + const DecodedCodepoint = struct { 270 + cp: u21, 271 + start: usize, 272 + len: usize, 273 + }; 274 + 275 + fn decodeCodepointAt(bytes: []const u8, start: usize) DecodedCodepoint { 276 + const first = bytes[start]; 277 + const len = std.unicode.utf8ByteSequenceLength(first) catch 1; 278 + const capped_len = @min(len, bytes.len - start); 279 + const slice = bytes[start .. start + capped_len]; 280 + const cp = std.unicode.utf8Decode(slice) catch { 281 + return .{ .cp = first, .start = start, .len = 1 }; 282 + }; 283 + return .{ .cp = cp, .start = start, .len = capped_len }; 284 + } 285 + 286 + fn isUtf8ContinuationByte(c: u8) bool { 287 + return (c & 0b1100_0000) == 0b1000_0000; 288 + } 289 + 290 + fn decodeCodepointBefore(bytes: []const u8, end: usize) DecodedCodepoint { 291 + var start = end - 1; 292 + while (start > 0 and isUtf8ContinuationByte(bytes[start])) : (start -= 1) {} 293 + const slice = bytes[start..end]; 294 + const cp = std.unicode.utf8Decode(slice) catch { 295 + return .{ .cp = bytes[end - 1], .start = end - 1, .len = 1 }; 296 + }; 297 + return .{ .cp = cp, .start = start, .len = end - start }; 298 + } 299 + 300 + /// Returns true if the codepoint is a readline-style word constituent. 301 + fn isWordCodepoint(cp: u21) bool { 302 + if (cp == '_') return true; 303 + return switch (uucode.get(.general_category, cp)) { 304 + .letter_uppercase, 305 + .letter_lowercase, 306 + .letter_titlecase, 307 + .letter_modifier, 308 + .letter_other, 309 + .number_decimal_digit, 310 + .number_letter, 311 + .number_other, 312 + .mark_nonspacing, 313 + .mark_spacing_combining, 314 + .mark_enclosing, 315 + .punctuation_connector, 316 + => true, 317 + else => false, 318 + }; 319 + } 320 + 321 + fn isWhitespaceCodepoint(cp: u21) bool { 322 + return switch (cp) { 323 + ' ', '\t', '\n', '\r', 0x0b, 0x0c, 0x85 => true, 324 + else => switch (uucode.get(.general_category, cp)) { 325 + .separator_space, 326 + .separator_line, 327 + .separator_paragraph, 328 + => true, 329 + else => false, 330 + }, 331 + }; 273 332 } 274 333 275 334 /// Moves the cursor backward by one word using character-class boundaries. ··· 278 337 const first_half = self.buf.firstHalf(); 279 338 var i: usize = first_half.len; 280 339 // Skip non-word characters 281 - while (i > 0 and !isWordChar(first_half[i - 1])) : (i -= 1) {} 340 + while (i > 0) { 341 + const decoded = decodeCodepointBefore(first_half, i); 342 + if (isWordCodepoint(decoded.cp)) break; 343 + i = decoded.start; 344 + } 282 345 // Skip word characters 283 - while (i > 0 and isWordChar(first_half[i - 1])) : (i -= 1) {} 346 + while (i > 0) { 347 + const decoded = decodeCodepointBefore(first_half, i); 348 + if (!isWordCodepoint(decoded.cp)) break; 349 + i = decoded.start; 350 + } 284 351 self.buf.moveGapLeft(self.buf.cursor - i); 285 352 } 286 353 ··· 291 358 const second_half = self.buf.secondHalf(); 292 359 var i: usize = 0; 293 360 // Skip non-word characters 294 - while (i < second_half.len and !isWordChar(second_half[i])) : (i += 1) {} 361 + while (i < second_half.len) { 362 + const decoded = decodeCodepointAt(second_half, i); 363 + if (isWordCodepoint(decoded.cp)) break; 364 + i += decoded.len; 365 + } 295 366 // Skip word characters 296 - while (i < second_half.len and isWordChar(second_half[i])) : (i += 1) {} 367 + while (i < second_half.len) { 368 + const decoded = decodeCodepointAt(second_half, i); 369 + if (!isWordCodepoint(decoded.cp)) break; 370 + i += decoded.len; 371 + } 297 372 self.buf.moveGapRight(i); 298 373 } 299 374 ··· 311 386 const first_half = self.buf.firstHalf(); 312 387 var i: usize = first_half.len; 313 388 // Skip trailing whitespace 314 - while (i > 0 and std.ascii.isWhitespace(first_half[i - 1])) : (i -= 1) {} 389 + while (i > 0) { 390 + const decoded = decodeCodepointBefore(first_half, i); 391 + if (!isWhitespaceCodepoint(decoded.cp)) break; 392 + i = decoded.start; 393 + } 315 394 // Skip non-whitespace 316 - while (i > 0 and !std.ascii.isWhitespace(first_half[i - 1])) : (i -= 1) {} 395 + while (i > 0) { 396 + const decoded = decodeCodepointBefore(first_half, i); 397 + if (isWhitespaceCodepoint(decoded.cp)) break; 398 + i = decoded.start; 399 + } 317 400 const to_delete = self.buf.cursor - i; 318 401 self.buf.moveGapLeft(to_delete); 319 402 self.buf.growGapRight(to_delete); ··· 325 408 const second_half = self.buf.secondHalf(); 326 409 var i: usize = 0; 327 410 // Skip non-word characters 328 - while (i < second_half.len and !isWordChar(second_half[i])) : (i += 1) {} 411 + while (i < second_half.len) { 412 + const decoded = decodeCodepointAt(second_half, i); 413 + if (isWordCodepoint(decoded.cp)) break; 414 + i += decoded.len; 415 + } 329 416 // Skip word characters 330 - while (i < second_half.len and isWordChar(second_half[i])) : (i += 1) {} 417 + while (i < second_half.len) { 418 + const decoded = decodeCodepointAt(second_half, i); 419 + if (!isWordCodepoint(decoded.cp)) break; 420 + i += decoded.len; 421 + } 331 422 self.buf.growGapRight(i); 332 423 } 333 424 ··· 595 686 try std.testing.expectEqualStrings("-latte", input.buf.secondHalf()); 596 687 } 597 688 598 - test "non-ASCII punctuation treated as word chars" { 689 + test "non-ASCII punctuation acts as a separator" { 599 690 var input = TextInput.init(std.testing.allocator); 600 691 defer input.deinit(); 601 - // Em dash (U+2014, 3 bytes: E2 80 94) has bytes >= 0x80, so all bytes are 602 - // classified as word chars. The entire string is one continuous "word" — the 603 - // em dash does NOT act as a separator. This is a known limitation of the 604 - // byte-based classifier; proper Unicode category classification would be 605 - // needed to treat non-ASCII punctuation as separators. 606 - try input.insertSliceAtCursor("hello\xe2\x80\x94world"); 692 + try input.insertSliceAtCursor("hello\u{2014}world"); 607 693 input.moveBackwardWordwise(); 608 - try std.testing.expectEqualStrings("", input.buf.firstHalf()); 694 + try std.testing.expectEqualStrings("hello\u{2014}", input.buf.firstHalf()); 695 + try std.testing.expectEqualStrings("world", input.buf.secondHalf()); 696 + 697 + input.buf.moveGapLeft(input.buf.firstHalf().len); 698 + input.moveForwardWordwise(); 699 + try std.testing.expectEqualStrings("hello", input.buf.firstHalf()); 700 + try std.testing.expectEqualStrings("\u{2014}world", input.buf.secondHalf()); 701 + } 702 + 703 + test "deleteWordBeforeWhitespace handles unicode whitespace" { 704 + var input = TextInput.init(std.testing.allocator); 705 + defer input.deinit(); 706 + try input.insertSliceAtCursor("hello\u{3000}world"); 707 + input.deleteWordBeforeWhitespace(); 708 + try std.testing.expectEqualStrings("hello\u{3000}", input.buf.firstHalf()); 709 + try std.testing.expectEqualStrings("", input.buf.secondHalf()); 609 710 } 610 711 611 712 test "deleteWordBefore with non-ASCII text" {

Configure Feed

Configure Feed