fix: missing chars · pds.dad/pumpkin-os@20038aa

+197 -20

2 changed files

expand all

smol-epub

src

html_strip.rs

src

apps

reader.rs

+112 -19

smol-epub/src/html_strip.rs

··· 53 53 #[repr(u8)] 54 54 enum Phase { 55 55 Text, 56 + Utf8Cont, 56 57 AfterLt, 57 58 TagName, 58 59 TagBody, ··· 114 115 trailing_nl: u8, // deferred newlines; flushed before next visible byte; capped at 2 115 116 has_output: bool, // true once any visible char emitted; suppresses leading whitespace 116 117 118 + // UTF-8 multi-byte accumulator (used in Utf8Cont phase) 119 + utf8_acc: u32, 120 + utf8_remaining: u8, 121 + 117 122 // deferred open-style markers; open-tag markers (bold on, heading on, etc.) 118 123 // appear AFTER paragraph-break newlines and BEFORE text. 119 124 // close-tag markers go to `pending` immediately (before paragraph newlines). ··· 154 159 last_was_space: true, 155 160 trailing_nl: 0, 156 161 has_output: false, 162 + utf8_acc: 0, 163 + utf8_remaining: 0, 157 164 deferred: [0u8; DEFERRED_CAP], 158 165 deferred_len: 0, 159 166 pending: [0u8; PENDING_CAP], ··· 226 233 self.phase = Phase::Entity; 227 234 } else if is_html_ws(b) { 228 235 self.queue_ws(); 236 + } else if b >= 0xC0 { 237 + // UTF-8 lead byte: start multi-byte accumulation 238 + if b < 0xE0 { 239 + self.utf8_acc = (b as u32) & 0x1F; 240 + self.utf8_remaining = 1; 241 + } else if b < 0xF0 { 242 + self.utf8_acc = (b as u32) & 0x0F; 243 + self.utf8_remaining = 2; 244 + } else { 245 + self.utf8_acc = (b as u32) & 0x07; 246 + self.utf8_remaining = 3; 247 + } 248 + self.phase = Phase::Utf8Cont; 249 + } else if b >= 0x80 { 250 + // stray continuation byte; skip silently 229 251 } else { 230 252 self.queue_text(b); 253 + } 254 + } 255 + 256 + // UTF-8 continuation bytes; accumulate then map to ASCII 257 + Phase::Utf8Cont => { 258 + if b & 0xC0 == 0x80 { 259 + self.utf8_acc = (self.utf8_acc << 6) | (b as u32 & 0x3F); 260 + self.utf8_remaining -= 1; 261 + if self.utf8_remaining == 0 { 262 + if let Some(ascii) = codepoint_to_byte(self.utf8_acc) { 263 + if is_html_ws(ascii) { 264 + self.queue_ws(); 265 + } else { 266 + self.queue_text(ascii); 267 + } 268 + } 269 + self.phase = Phase::Text; 270 + } 271 + } else { 272 + // broken sequence; emit replacement and reprocess byte 273 + self.queue_text(b'?'); 274 + self.phase = Phase::Text; 275 + advance = false; 231 276 } 232 277 } 233 278 ··· 914 959 last_was_space = true; 915 960 trailing_nl = 0; 916 961 } 962 + } else if b >= 0xC0 { 963 + // UTF-8 multi-byte sequence; decode and replace with ASCII approx 964 + let (cp, seq_len) = decode_utf8_char(buf, r, len); 965 + if let Some(ascii) = codepoint_to_byte(cp) { 966 + if is_html_ws(ascii) { 967 + if !last_was_space { 968 + buf[w] = b' '; 969 + w += 1; 970 + last_was_space = true; 971 + trailing_nl = 0; 972 + } 973 + } else { 974 + buf[w] = ascii; 975 + w += 1; 976 + last_was_space = false; 977 + trailing_nl = 0; 978 + } 979 + } 980 + r += seq_len; 981 + continue; 982 + } else if b >= 0x80 { 983 + // stray continuation byte; skip 984 + r += 1; 985 + continue; 917 986 } else { 918 987 buf[w] = b; 919 988 w += 1; ··· 1079 1148 match cp { 1080 1149 0 => None, 1081 1150 0x0001..=0x007F => Some(cp as u8), 1082 - 0x00A0 => Some(b' '), // nbsp 1083 - 0x00AD => Some(b'-'), // soft hyphen 1084 - 0x2013 | 0x2014 => Some(b'-'), 1085 - 0x2018..=0x201A => Some(b'\''), 1086 - 0x201C..=0x201E => Some(b'"'), 1087 - 0x2022 => Some(b'*'), 1088 - 0x2026 => Some(b'.'), 1089 - _ => Some(b'?'), // unicode placeholder 1151 + 0x00A0 => Some(b' '), // nbsp 1152 + 0x00AB | 0x00BB => Some(b'"'), // « » 1153 + 0x00AD => Some(b'-'), // soft hyphen 1154 + 0x00B7 => Some(b'.'), // middle dot 1155 + 0x00D7 => Some(b'x'), // multiplication sign 1156 + 0x00F7 => Some(b'/'), // division sign 1157 + 0x2010..=0x2015 => Some(b'-'), // hyphens, dashes (figure dash, horiz bar) 1158 + 0x2018..=0x201B => Some(b'\''), // single quotes (left, right, low-9, reversed-9) 1159 + 0x201C..=0x201F => Some(b'"'), // double quotes (left, right, low-9, reversed-9) 1160 + 0x2022 => Some(b'*'), // bullet 1161 + 0x2026 => Some(b'.'), // horizontal ellipsis 1162 + 0x2032 => Some(b'\''), // prime 1163 + 0x2033 => Some(b'"'), // double prime 1164 + 0x2039 | 0x203A => Some(b'\''), // single guillemets 1165 + 0x2212 => Some(b'-'), // minus sign 1166 + _ => Some(b'?'), // unmapped codepoint 1090 1167 } 1168 + } 1169 + 1170 + /// Decode one UTF-8 character starting at `buf[pos]` (which must be a lead byte >= 0xC0). 1171 + /// Returns `(codepoint, byte_length)`. On malformed input returns `(0xFFFD, 1)`. 1172 + fn decode_utf8_char(buf: &[u8], pos: usize, len: usize) -> (u32, usize) { 1173 + let b0 = buf[pos]; 1174 + let (mut cp, expected) = if b0 < 0xE0 { 1175 + ((b0 as u32) & 0x1F, 2) 1176 + } else if b0 < 0xF0 { 1177 + ((b0 as u32) & 0x0F, 3) 1178 + } else { 1179 + ((b0 as u32) & 0x07, 4) 1180 + }; 1181 + if pos + expected > len { 1182 + return (0xFFFD, len - pos); // truncated sequence; consume remaining 1183 + } 1184 + for i in 1..expected { 1185 + let cont = buf[pos + i]; 1186 + if cont & 0xC0 != 0x80 { 1187 + return (0xFFFD, i); // broken: stop before bad byte 1188 + } 1189 + cp = (cp << 6) | (cont as u32 & 0x3F); 1190 + } 1191 + (cp, expected) 1091 1192 } 1092 1193 1093 1194 // in-place entity decoding; separate from resolve_entity ··· 1145 1246 } 1146 1247 1147 1248 fn codepoint_to_decoded_inplace(cp: u32) -> DecodedInplace { 1148 - match cp { 1149 - 0 => DecodedInplace::None, 1150 - 0x0001..=0x007F => DecodedInplace::Byte(cp as u8), 1151 - 0x00A0 => DecodedInplace::Byte(b' '), 1152 - 0x00AD => DecodedInplace::Byte(b'-'), 1153 - 0x2013 | 0x2014 => DecodedInplace::Byte(b'-'), 1154 - 0x2018..=0x201A => DecodedInplace::Byte(b'\''), 1155 - 0x201C..=0x201E => DecodedInplace::Byte(b'"'), 1156 - 0x2022 => DecodedInplace::Byte(b'*'), 1157 - 0x2026 => DecodedInplace::Byte(b'.'), 1158 - _ => DecodedInplace::Byte(b'?'), 1249 + match codepoint_to_byte(cp) { 1250 + Some(b) => DecodedInplace::Byte(b), 1251 + None => DecodedInplace::None, 1159 1252 } 1160 1253 } 1161 1254

+85 -1

src/apps/reader.rs

··· 1266 1266 1267 1267 // helpers 1268 1268 1269 + /// Decode one UTF-8 character starting at `buf[pos]` (a lead byte >= 0xC0) 1270 + /// and map the codepoint to a printable ASCII replacement. 1271 + /// Returns `(ascii_byte, byte_length_consumed)`. 1272 + fn decode_utf8_to_ascii(buf: &[u8], pos: usize) -> (u8, usize) { 1273 + let b0 = buf[pos]; 1274 + let (mut cp, expected) = if b0 < 0xE0 { 1275 + ((b0 as u32) & 0x1F, 2) 1276 + } else if b0 < 0xF0 { 1277 + ((b0 as u32) & 0x0F, 3) 1278 + } else { 1279 + ((b0 as u32) & 0x07, 4) 1280 + }; 1281 + let len = buf.len(); 1282 + if pos + expected > len { 1283 + return (b'?', len - pos); 1284 + } 1285 + for i in 1..expected { 1286 + let cont = buf[pos + i]; 1287 + if cont & 0xC0 != 0x80 { 1288 + return (b'?', i); 1289 + } 1290 + cp = (cp << 6) | (cont as u32 & 0x3F); 1291 + } 1292 + let ascii = match cp { 1293 + 0x00A0 => b' ', // non-breaking space 1294 + 0x00AB | 0x00BB => b'"', // « » 1295 + 0x00AD => b'-', // soft hyphen 1296 + 0x00B7 => b'.', // middle dot 1297 + 0x00D7 => b'x', // multiplication sign 1298 + 0x00F7 => b'/', // division sign 1299 + 0x2010..=0x2015 => b'-', // hyphens, en-dash, em-dash, etc. 1300 + 0x2018..=0x201B => b'\'', // single curly quotes 1301 + 0x201C..=0x201F => b'"', // double curly quotes 1302 + 0x2022 => b'*', // bullet 1303 + 0x2026 => b'.', // horizontal ellipsis 1304 + 0x2032 => b'\'', // prime 1305 + 0x2033 => b'"', // double prime 1306 + 0x2039 | 0x203A => b'\'', // single guillemets 1307 + 0x2212 => b'-', // minus sign 1308 + _ => b'?', 1309 + }; 1310 + (ascii, expected) 1311 + } 1312 + 1269 1313 fn trim_trailing_cr(buf: &[u8], start: usize, end: usize) -> usize { 1270 1314 if end > start && buf[end - 1] == b'\r' { 1271 1315 end - 1 ··· 1420 1464 if lc >= max_l { 1421 1465 return (ls, lc); 1422 1466 } 1467 + i += 1; 1468 + continue; 1469 + } 1470 + 1471 + // UTF-8 multi-byte: decode entire sequence, use replacement char advance 1472 + if b >= 0xC0 { 1473 + let (repl, seq_len) = decode_utf8_to_ascii(buf, i); 1474 + let sty = current_style(bold, italic, heading); 1475 + let adv = fonts.advance(repl as char, sty) as u32; 1476 + px += adv; 1477 + if px > max_w { 1478 + if sp > ls { 1479 + emit!(ls, sp); 1480 + px -= sp_px; 1481 + ls = sp; 1482 + } else { 1483 + emit!(ls, i); 1484 + ls = i; 1485 + px = adv; 1486 + } 1487 + sp = ls; 1488 + sp_px = 0; 1489 + if lc >= max_l { 1490 + return (ls, lc); 1491 + } 1492 + } 1493 + i += seq_len; 1494 + continue; 1495 + } 1496 + if b >= 0x80 { 1497 + // stray continuation byte; skip without affecting layout 1423 1498 i += 1; 1424 1499 continue; 1425 1500 } ··· 2300 2375 j += 2; 2301 2376 continue; 2302 2377 } 2378 + // UTF-8 lead byte: decode full sequence, render ASCII replacement 2379 + if b >= 0xC0 { 2380 + let (repl, seq_len) = decode_utf8_to_ascii(line, j); 2381 + if (bitmap::FIRST_CHAR..=bitmap::LAST_CHAR).contains(&repl) { 2382 + cx += fs.draw_char(strip, repl as char, sty, cx, baseline) as i32; 2383 + } 2384 + j += seq_len; 2385 + continue; 2386 + } 2303 2387 if !(bitmap::FIRST_CHAR..=bitmap::LAST_CHAR).contains(&b) { 2304 2388 j += 1; 2305 - continue; // non-printable 2389 + continue; // stray continuation byte or control char 2306 2390 } 2307 2391 cx += fs.draw_char(strip, b as char, sty, cx, baseline) as i32; 2308 2392 j += 1;

Configure Feed

Configure Feed