a collection of lightweight TypeScript packages for AT Protocol, the protocol powering Bluesky
atproto bluesky typescript npm
101
fork

Configure Feed

Select the types of activity you want to include in your feed.

refactor(lexicons): faster UTF-8 and grapheme length validation

Mary e73fddf6 2aee780c

+11 -68
+5
.changeset/cuddly-icons-show.md
···
1 + ---
2 + "@atcute/lexicons": patch
3 + ---
4 +
5 + faster UTF-8 length validation checks
+2 -20
packages/lexicons/lexicons/lib/syntax/uri.ts
···
1 - import { getUtf8Length } from '@atcute/uint8array';
1 + import { isUtf8LengthInRange } from '@atcute/uint8array';
2 2
3 3 /**
4 4 * represents a generic URI
···
13 13 return false;
14 14 }
15 15
16 - const MIN_LENGTH = 3;
17 - const MAX_LENGTH = 8192;
18 -
19 - const utf16Len = input.length;
20 - const maybeUtf8Len = utf16Len * 3;
21 -
22 - // fail early if estimated upper bound is too small
23 - if (maybeUtf8Len < MIN_LENGTH) {
24 - return false;
25 - }
26 -
27 - // skip calculation if UTF-16 length already satisfies both constraints
28 - if (utf16Len >= MIN_LENGTH && maybeUtf8Len <= MAX_LENGTH) {
29 - return URI_RE.test(input);
30 - }
31 -
32 - const utf8Len = getUtf8Length(input);
33 -
34 - if (utf8Len < MIN_LENGTH || utf8Len > MAX_LENGTH) {
16 + if (!isUtf8LengthInRange(input, 3, 8192)) {
35 17 return false;
36 18 }
37 19
+4 -48
packages/lexicons/lexicons/lib/validations/index.ts
···
1 - import { getUtf8Length } from '@atcute/uint8array';
2 - import { getGraphemeLength } from '@atcute/util-text';
1 + import { isUtf8LengthInRange } from '@atcute/uint8array';
2 + import { isGraphemeLengthInRange } from '@atcute/util-text';
3 3
4 4 import type { StandardSchemaV1 } from '@standard-schema/spec';
5 5
···
817 817 minLength: minLength,
818 818 maxLength: maxLength,
819 819 '~run'(input, _flags) {
820 - // UTF-8 conversion can be expensive, so we're going to do some safe naive
821 - // checks where we assume an upper-bound of the UTF-16 to UTF-8 conversion
822 -
823 - const utf16Len = input.length;
824 - const maybeUtf8Len = utf16Len * 3;
825 -
826 - // fail early if estimated upper bound is too small
827 - if (maybeUtf8Len < minLength) {
828 - return issue;
829 - }
830 -
831 - // skip calculation if UTF-16 length already satisfies both constraints
832 - if (utf16Len >= minLength && maybeUtf8Len <= maxLength) {
833 - return undefined;
834 - }
835 -
836 - const utf8Len = getUtf8Length(input);
837 -
838 - if (utf8Len < minLength) {
839 - return issue;
840 - }
841 -
842 - if (utf8Len > maxLength) {
820 + if (!isUtf8LengthInRange(input, minLength, maxLength)) {
843 821 return issue;
844 822 }
845 823
···
881 859 minGraphemes: minGraphemes,
882 860 maxGraphemes: maxGraphemes,
883 861 '~run'(input, _flags) {
884 - // grapheme conversion is expensive, so we're going to do some safe naive
885 - // checks where we assume 1 UTF-16 character = 1 grapheme.
886 -
887 - const utf16Len = input.length;
888 -
889 - // fail early if UTF-16 length is too small
890 - if (utf16Len < minGraphemes) {
891 - return issue;
892 - }
893 -
894 - // if there is no minimum bounds, we can safely skip when UTF-16 is
895 - // within the maximum bounds.
896 - if (minGraphemes === 0 && utf16Len <= maxGraphemes) {
897 - return undefined;
898 - }
899 -
900 - const graphemeLen = getGraphemeLength(input);
901 -
902 - if (graphemeLen < minGraphemes) {
903 - return issue;
904 - }
905 -
906 - if (graphemeLen > maxGraphemes) {
862 + if (!isGraphemeLengthInRange(input, minGraphemes, maxGraphemes)) {
907 863 return issue;
908 864 }
909 865