A fork of pulp-os for the xteink4 adding custom apps
2
fork

Configure Feed

Select the types of activity you want to include in your feed.

pulled smol-epub out into separate repo

hans c740defe 693867c3

+3 -6969
+1
Cargo.lock
··· 1446 1446 [[package]] 1447 1447 name = "smol-epub" 1448 1448 version = "0.1.0" 1449 + source = "git+https://github.com/hansmrtn/smol-epub#ebabefc80ebfde53454eb1234d2efbf7e6336bfa" 1449 1450 dependencies = [ 1450 1451 "log", 1451 1452 "miniz_oxide",
+2 -2
Cargo.toml
··· 1 1 [workspace] 2 - members = [".", "smol-epub"] 2 + members = ["."] 3 3 4 4 [package] 5 5 edition = "2024" ··· 53 53 embedded-graphics-core = "0.4.1" 54 54 embedded-graphics = "0.8.2" 55 55 embedded-sdmmc = "0.9.0" 56 - smol-epub = { path = "smol-epub" } 56 + smol-epub = { git = "https://github.com/hansmrtn/smol-epub" } 57 57 58 58 59 59 [build-dependencies]
-19
smol-epub/Cargo.toml
··· 1 - [package] 2 - name = "smol-epub" 3 - version = "0.1.0" 4 - edition = "2024" 5 - rust-version = "1.88" 6 - description = "Minimal no_std EPUB parser with streaming decompression, HTML stripping, and optional 1-bit image decoders" 7 - license = "MIT OR Apache-2.0" 8 - repository = "https://github.com/pluots/smol-epub" 9 - keywords = ["epub", "no_std", "embedded", "e-ink", "ebook"] 10 - categories = ["no-std", "parser-implementations", "embedded"] 11 - readme = "README.md" 12 - 13 - [dependencies] 14 - miniz_oxide = { version = "0.8", default-features = false, features = ["with-alloc"] } 15 - log = "0.4" 16 - 17 - [features] 18 - default = ["images"] 19 - images = []
-119
smol-epub/README.md
··· 1 - # smol-epub 2 - 3 - Minimal `no_std` EPUB parser with streaming decompression, HTML stripping, 4 - CSS resolution, and optional 1-bit image decoders. 5 - 6 - Designed for memory-constrained embedded targets (≥ 140 KB heap), but works 7 - anywhere `alloc` is available. 8 - 9 - ## Features 10 - 11 - | Module | Purpose | 12 - |--------|---------| 13 - | `zip` | ZIP central-directory parser, streaming DEFLATE extraction | 14 - | `xml` | Minimal XML tag / attribute scanner (EPUB metadata) | 15 - | `css` | CSS property parser for EPUB stylesheets | 16 - | `epub` | EPUB structure: `container.xml` → OPF → spine / metadata / TOC | 17 - | `html_strip` | Single-pass, streaming HTML-to-styled-text converter | 18 - | `cache` | Chapter decompress-and-strip pipeline with cache metadata | 19 - | `png` | PNG decoder → 1-bit Floyd–Steinberg dithered bitmap *(feature `images`)* | 20 - | `jpeg` | JPEG decoder → 1-bit Floyd–Steinberg dithered bitmap *(feature `images`)* | 21 - 22 - ## Feature flags 23 - 24 - | Flag | Default | Description | 25 - |------|---------|-------------| 26 - | `images` | ✓ | Enable `png` and `jpeg` image decoders | 27 - 28 - ## Quick start 29 - 30 - ```rust 31 - use smol_epub::zip::{self, ZipIndex}; 32 - use smol_epub::epub::{self, EpubMeta, EpubSpine, EpubToc}; 33 - 34 - // 1. Build ZIP index from the EPUB file's central directory 35 - let mut zip = ZipIndex::new(); 36 - let (cd_offset, cd_size) = ZipIndex::parse_eocd(&tail_buf, file_size)?; 37 - // ... read the central directory bytes into `cd_buf` ... 38 - zip.parse_central_directory(&cd_buf)?; 39 - 40 - // 2. Parse EPUB structure 41 - let container = zip::extract_entry( 42 - zip.entry(zip.find("META-INF/container.xml").unwrap()), 43 - zip.entry(zip.find("META-INF/container.xml").unwrap()).local_offset, 44 - |off, buf| read_fn(off, buf), 45 - )?; 46 - let mut opf_path = [0u8; epub::OPF_PATH_CAP]; 47 - let opf_len = epub::parse_container(&container, &mut opf_path)?; 48 - 49 - // 3. 
Extract metadata and reading-order spine 50 - let mut meta = EpubMeta::new(); 51 - let mut spine = EpubSpine::new(); 52 - epub::parse_opf(&opf_data, opf_dir, &zip, &mut meta, &mut spine)?; 53 - println!("{} by {}", meta.title_str(), meta.author_str()); 54 - 55 - // 4. Optionally parse the table of contents 56 - let mut toc = EpubToc::new(); 57 - if let Some(src) = epub::find_toc_source(&opf_data, opf_dir, &zip) { 58 - epub::parse_toc(src, &toc_data, toc_dir, &spine, &zip, &mut toc); 59 - } 60 - 61 - // 5. Stream-decompress + HTML-strip a chapter 62 - let bytes_written = smol_epub::cache::stream_strip_entry( 63 - &entry, local_offset, 64 - |off, buf| read_fn(off, buf), // read closure 65 - |chunk| { output.extend(chunk); Ok(()) }, // output closure 66 - )?; 67 - ``` 68 - 69 - ## Streaming I/O model 70 - 71 - All functions that read from an external byte source accept a generic 72 - closure: 73 - 74 - ```rust 75 - FnMut(offset: u32, buf: &mut [u8]) -> Result<usize, E> 76 - ``` 77 - 78 - This works with SD cards, flash memory, `std::fs::File`, in-memory buffers, 79 - or any other random-access byte store — the crate never assumes a specific 80 - storage backend. 81 - 82 - ## Image decoders 83 - 84 - The `png` and `jpeg` modules decode images to 1-bit monochrome bitmaps 85 - using Floyd–Steinberg dithering, ideal for e-ink displays. Three decoder 86 - variants are provided for each format: 87 - 88 - | Function | Input | 89 - |----------|-------| 90 - | `decode_{png,jpeg}_fit` | In-memory `&[u8]` buffer | 91 - | `decode_{png,jpeg}_streaming` | Stored (uncompressed) ZIP entry via read closure | 92 - | `decode_{png,jpeg}_deflate_streaming` | DEFLATE-compressed ZIP entry via read closure | 93 - 94 - All variants accept `max_w` / `max_h` parameters and integer-downscale 95 - the image to fit. 
96 - 97 - ## Memory budget 98 - 99 - Typical peak heap usage on an embedded target: 100 - 101 - | Operation | Peak heap | 102 - |-----------|-----------| 103 - | ZIP index parse | ~5 KB | 104 - | Chapter stream-strip (DEFLATE) | ~51 KB | 105 - | PNG streaming decode | ~90 KB | 106 - | JPEG streaming decode | ~30 KB | 107 - | JPEG DEFLATE streaming decode | ~79 KB | 108 - 109 - Stack usage is kept low throughout; large structs like `DecompressorOxide` 110 - (~11 KB) are always heap-allocated via `Box`. 111 - 112 - ## License 113 - 114 - Licensed under either of 115 - 116 - - [MIT license](http://opensource.org/licenses/MIT) 117 - - [Apache License, Version 2.0](http://www.apache.org/licenses/LICENSE-2.0) 118 - 119 - at your option.
-442
smol-epub/src/cache.rs
··· 1 - //! EPUB chapter cache: streaming decompress + HTML strip pipeline. 2 - //! 3 - //! No persistent heap; ≈ 51 KB temporary per chapter. 4 - //! Cache directory layout uses 8.3-safe names: `_XXXXXXX/` with 5 - //! `META.BIN` + `CHnnn.TXT` files. 6 - 7 - use alloc::boxed::Box; 8 - use alloc::vec::Vec; 9 - 10 - use crate::html_strip::HtmlStripStream; 11 - use crate::zip::{METHOD_DEFLATE, METHOD_STORED, ZipEntry, ZipIndex}; 12 - 13 - const CACHE_MAGIC: u32 = 0x504C_5043; // "PLPC" 14 - const CACHE_VERSION: u8 = 1; 15 - const META_HEADER: usize = 16; 16 - 17 - /// Maximum number of chapters that can be tracked in a single cache. 18 - pub const MAX_CACHE_CHAPTERS: usize = 256; 19 - /// Maximum byte size of a `META.BIN` file (header + one `u32` per chapter). 20 - pub const META_MAX_SIZE: usize = META_HEADER + 4 * MAX_CACHE_CHAPTERS; 21 - 22 - const WINDOW_SIZE: usize = 32768; // DEFLATE sliding window 23 - const READ_BUF_SIZE: usize = 4096; // compressed read chunk 24 - const STRIP_BUF_SIZE: usize = 4096; // strip output accumulator 25 - const FLUSH_THRESHOLD: usize = STRIP_BUF_SIZE - 128; 26 - 27 - /// Compute the FNV-1a hash of `data`. 28 - #[inline] 29 - pub fn fnv1a(data: &[u8]) -> u32 { 30 - let mut h: u32 = 0x811c_9dc5; 31 - for &b in data { 32 - h ^= b as u32; 33 - h = h.wrapping_mul(0x0100_0193); 34 - } 35 - h 36 - } 37 - 38 - /// Generate an 8.3-safe cache directory name from a hash. 39 - /// 40 - /// Format: `_` followed by 7 uppercase hex digits of the lower 28 bits. 41 - pub fn dir_name_for_hash(name_hash: u32) -> [u8; 8] { 42 - let h = name_hash & 0x0FFF_FFFF; 43 - let mut buf = [0u8; 8]; 44 - buf[0] = b'_'; 45 - for i in 0..7 { 46 - let nibble = ((h >> (24 - i * 4)) & 0xF) as u8; 47 - buf[1 + i] = if nibble < 10 { 48 - b'0' + nibble 49 - } else { 50 - b'A' + nibble - 10 51 - }; 52 - } 53 - buf 54 - } 55 - 56 - /// Interpret an 8-byte directory name buffer as a UTF-8 `&str`. 
57 - #[inline] 58 - pub fn dir_name_str(buf: &[u8; 8]) -> &str { 59 - core::str::from_utf8(buf).unwrap_or("_0000000") 60 - } 61 - 62 - /// Generate an 8.3-safe chapter filename: `CH000.TXT` through `CH255.TXT`. 63 - pub fn chapter_file_name(idx: u16) -> [u8; 9] { 64 - debug_assert!(idx < 1000, "chapter index out of 3-digit range"); 65 - let mut n = *b"CH000.TXT"; 66 - n[2] = b'0' + ((idx / 100) % 10) as u8; 67 - n[3] = b'0' + ((idx / 10) % 10) as u8; 68 - n[4] = b'0' + (idx % 10) as u8; 69 - n 70 - } 71 - 72 - /// Interpret a 9-byte chapter filename buffer as a UTF-8 `&str`. 73 - #[inline] 74 - pub fn chapter_file_str(buf: &[u8; 9]) -> &str { 75 - core::str::from_utf8(buf).unwrap_or("CH000.TXT") 76 - } 77 - 78 - /// Filename used for the cache metadata file. 79 - pub const META_FILE: &str = "META.BIN"; 80 - 81 - /// Encode cache metadata into `buf`; returns the number of bytes written. 82 - /// 83 - /// The metadata header stores a magic value, version, the EPUB file size, 84 - /// a name hash, and a `u32` size for each cached chapter. 85 - pub fn encode_cache_meta( 86 - epub_size: u32, 87 - name_hash: u32, 88 - chapter_sizes: &[u32], 89 - buf: &mut [u8], 90 - ) -> usize { 91 - let count = chapter_sizes.len().min(MAX_CACHE_CHAPTERS); 92 - let total = META_HEADER + count * 4; 93 - debug_assert!( 94 - buf.len() >= total, 95 - "meta buffer too small: {} < {}", 96 - buf.len(), 97 - total 98 - ); 99 - 100 - buf[0..4].copy_from_slice(&CACHE_MAGIC.to_le_bytes()); 101 - buf[4] = CACHE_VERSION; 102 - buf[5] = count as u8; 103 - buf[6] = 0; 104 - buf[7] = 0; 105 - buf[8..12].copy_from_slice(&epub_size.to_le_bytes()); 106 - buf[12..16].copy_from_slice(&name_hash.to_le_bytes()); 107 - 108 - for (i, &size) in chapter_sizes.iter().enumerate().take(count) { 109 - let off = META_HEADER + i * 4; 110 - buf[off..off + 4].copy_from_slice(&size.to_le_bytes()); 111 - } 112 - 113 - total 114 - } 115 - 116 - /// Parse and validate a `META.BIN` blob. 
117 - /// 118 - /// On success, writes individual chapter sizes into `chapter_sizes_out` 119 - /// and returns the number of chapters. Returns an error if the magic, 120 - /// version, EPUB size, name hash, or chapter count do not match. 121 - pub fn parse_cache_meta( 122 - data: &[u8], 123 - epub_size: u32, 124 - name_hash: u32, 125 - expected_chapters: usize, 126 - chapter_sizes_out: &mut [u32], 127 - ) -> Result<usize, &'static str> { 128 - if data.len() < META_HEADER { 129 - return Err("cache: meta too short"); 130 - } 131 - 132 - let magic = u32::from_le_bytes([data[0], data[1], data[2], data[3]]); 133 - if magic != CACHE_MAGIC { 134 - return Err("cache: bad magic"); 135 - } 136 - 137 - if data[4] != CACHE_VERSION { 138 - return Err("cache: version mismatch"); 139 - } 140 - 141 - let stored_size = u32::from_le_bytes([data[8], data[9], data[10], data[11]]); 142 - let stored_hash = u32::from_le_bytes([data[12], data[13], data[14], data[15]]); 143 - 144 - if stored_size != epub_size { 145 - return Err("cache: epub size changed"); 146 - } 147 - if stored_hash != name_hash { 148 - return Err("cache: epub hash changed"); 149 - } 150 - 151 - let count = data[5] as usize; 152 - if count != expected_chapters { 153 - return Err("cache: chapter count mismatch"); 154 - } 155 - 156 - let needed = META_HEADER + count * 4; 157 - if data.len() < needed { 158 - return Err("cache: meta truncated"); 159 - } 160 - 161 - if chapter_sizes_out.len() < count { 162 - return Err("cache: output slice too small"); 163 - } 164 - 165 - for i in 0..count { 166 - let off = META_HEADER + i * 4; 167 - chapter_sizes_out[i] = 168 - u32::from_le_bytes([data[off], data[off + 1], data[off + 2], data[off + 3]]); 169 - } 170 - 171 - Ok(count) 172 - } 173 - 174 - /// Stream-decompress a ZIP entry, strip HTML, and emit plain-text chunks. 175 - /// 176 - /// `read_fn(offset, buf)` reads raw bytes from the underlying store. 177 - /// `output_fn(chunk)` receives stripped plain-text output incrementally. 
178 - /// 179 - /// Returns the total number of bytes written through `output_fn`. 180 - /// Peak temporary memory ≈ 47 KB (decompressor + sliding window + strip 181 - /// buffers). 182 - pub fn stream_strip_entry<E>( 183 - entry: &ZipEntry, 184 - local_offset: u32, 185 - mut read_fn: impl FnMut(u32, &mut [u8]) -> Result<usize, E>, 186 - mut output_fn: impl FnMut(&[u8]) -> Result<(), &'static str>, 187 - ) -> Result<u32, &'static str> { 188 - // skip local file header to reach entry data 189 - let mut header = [0u8; 30]; 190 - read_fn(local_offset, &mut header).map_err(|_| "cache: read local header failed")?; 191 - let skip = ZipIndex::local_header_data_skip(&header)?; 192 - let data_offset = local_offset + skip; 193 - 194 - match entry.method { 195 - METHOD_STORED => stream_stored(entry, data_offset, &mut read_fn, &mut output_fn), 196 - METHOD_DEFLATE => stream_deflate(entry, data_offset, &mut read_fn, &mut output_fn), 197 - _ => Err("cache: unsupported compression method"), 198 - } 199 - } 200 - 201 - // stored entry: read raw, strip HTML, write via callback; stack-only 202 - fn stream_stored<E>( 203 - entry: &ZipEntry, 204 - data_offset: u32, 205 - read_fn: &mut impl FnMut(u32, &mut [u8]) -> Result<usize, E>, 206 - output_fn: &mut impl FnMut(&[u8]) -> Result<(), &'static str>, 207 - ) -> Result<u32, &'static str> { 208 - let mut stripper = HtmlStripStream::new(); 209 - let mut read_buf = [0u8; READ_BUF_SIZE]; 210 - let mut strip_buf = [0u8; STRIP_BUF_SIZE]; 211 - let mut strip_pos: usize = 0; 212 - let mut total_written: u32 = 0; 213 - 214 - let size = entry.uncomp_size; 215 - let mut file_pos = data_offset; 216 - let mut remaining = size; 217 - 218 - log::info!("cache: streaming stored entry ({} bytes)", size); 219 - 220 - while remaining > 0 { 221 - let want = (remaining as usize).min(READ_BUF_SIZE); 222 - let n = 223 - read_fn(file_pos, &mut read_buf[..want]).map_err(|_| "cache: read failed (stored)")?; 224 - if n == 0 { 225 - return Err("cache: unexpected 
EOF in stored entry"); 226 - } 227 - file_pos += n as u32; 228 - remaining -= n as u32; 229 - 230 - feed_and_flush( 231 - &mut stripper, 232 - &read_buf[..n], 233 - &mut strip_buf, 234 - &mut strip_pos, 235 - &mut total_written, 236 - output_fn, 237 - )?; 238 - } 239 - 240 - // flush trailing stripper state (deferred newlines, etc.) 241 - let trailing = stripper.finish(&mut strip_buf[strip_pos..]); 242 - strip_pos += trailing; 243 - if strip_pos > 0 { 244 - output_fn(&strip_buf[..strip_pos])?; 245 - total_written += strip_pos as u32; 246 - } 247 - 248 - Ok(total_written) 249 - } 250 - 251 - // deflate entry: decompress into 32KB circular window, strip HTML; ~47KB temp 252 - fn stream_deflate<E>( 253 - entry: &ZipEntry, 254 - data_offset: u32, 255 - read_fn: &mut impl FnMut(u32, &mut [u8]) -> Result<usize, E>, 256 - output_fn: &mut impl FnMut(&[u8]) -> Result<(), &'static str>, 257 - ) -> Result<u32, &'static str> { 258 - use miniz_oxide::inflate::TINFLStatus; 259 - use miniz_oxide::inflate::core::{DecompressorOxide, decompress, inflate_flags}; 260 - 261 - let comp_size = entry.comp_size as usize; 262 - let uncomp_size = entry.uncomp_size; 263 - 264 - log::info!( 265 - "cache: streaming deflate {} -> {} bytes", 266 - comp_size, 267 - uncomp_size 268 - ); 269 - 270 - // ~11KB DecompressorOxide; alloc zeroed directly (Box::new overflows stack) 271 - 272 - let decomp_ptr = 273 - unsafe { alloc::alloc::alloc_zeroed(core::alloc::Layout::new::<DecompressorOxide>()) }; 274 - if decomp_ptr.is_null() { 275 - return Err("cache: OOM for decompressor"); 276 - } 277 - let mut decomp = unsafe { Box::from_raw(decomp_ptr as *mut DecompressorOxide) }; 278 - 279 - // 32KB circular dictionary 280 - let mut window = Vec::new(); 281 - window 282 - .try_reserve_exact(WINDOW_SIZE) 283 - .map_err(|_| "cache: OOM for window")?; 284 - window.resize(WINDOW_SIZE, 0); 285 - 286 - // 4KB read buffer 287 - let mut rbuf = Vec::new(); 288 - rbuf.try_reserve_exact(READ_BUF_SIZE) 289 - .map_err(|_| 
"cache: OOM for read buffer")?; 290 - rbuf.resize(READ_BUF_SIZE, 0); 291 - 292 - let mut stripper = HtmlStripStream::new(); 293 - let mut strip_buf = [0u8; STRIP_BUF_SIZE]; 294 - let mut strip_pos: usize = 0; 295 - let mut total_written: u32 = 0; 296 - 297 - let mut in_avail: usize = 0; 298 - let mut file_pos = data_offset; 299 - let mut comp_left = comp_size; 300 - let mut out_pos: usize = 0; // write position in circular window 301 - 302 - loop { 303 - // top up read buffer 304 - if in_avail < READ_BUF_SIZE && comp_left > 0 { 305 - let space = READ_BUF_SIZE - in_avail; 306 - let want = space.min(comp_left); 307 - match read_fn(file_pos, &mut rbuf[in_avail..in_avail + want]) { 308 - Ok(n) if n > 0 => { 309 - file_pos += n as u32; 310 - comp_left -= n; 311 - in_avail += n; 312 - } 313 - Ok(_) => { 314 - comp_left = 0; 315 - } 316 - Err(_) => return Err("cache: read failed during deflate"), 317 - } 318 - } 319 - 320 - if in_avail == 0 && out_pos == 0 { 321 - return Err("cache: empty deflate stream"); 322 - } 323 - 324 - // circular-buffer mode: do not set TINFL_FLAG_USING_NON_WRAPPING_OUTPUT_BUF 325 - let flags = if comp_left > 0 { 326 - inflate_flags::TINFL_FLAG_HAS_MORE_INPUT 327 - } else { 328 - 0 329 - }; 330 - 331 - let old_out_pos = out_pos; 332 - let (status, consumed, produced) = 333 - decompress(&mut decomp, &rbuf[..in_avail], &mut window, out_pos, flags); 334 - 335 - // feed new output to HTML stripper; always contiguous within window 336 - if produced > 0 { 337 - let end = old_out_pos + produced; 338 - debug_assert!( 339 - end <= WINDOW_SIZE, 340 - "deflate produced past window boundary: {} > {}", 341 - end, 342 - WINDOW_SIZE 343 - ); 344 - 345 - feed_and_flush( 346 - &mut stripper, 347 - &window[old_out_pos..end], 348 - &mut strip_buf, 349 - &mut strip_pos, 350 - &mut total_written, 351 - output_fn, 352 - )?; 353 - } 354 - 355 - out_pos += produced; 356 - 357 - if consumed > 0 && consumed < in_avail { 358 - rbuf.copy_within(consumed..in_avail, 0); 359 - 
} 360 - in_avail -= consumed; 361 - 362 - match status { 363 - TINFLStatus::Done => break, 364 - 365 - TINFLStatus::HasMoreOutput => { 366 - // window full; reset write pos, data stays for back-references 367 - out_pos = 0; 368 - } 369 - 370 - TINFLStatus::NeedsMoreInput => { 371 - if comp_left == 0 && in_avail == 0 { 372 - return Err("cache: truncated deflate stream"); 373 - } 374 - if consumed == 0 && produced == 0 && in_avail >= READ_BUF_SIZE { 375 - return Err("cache: deflate stream stuck"); 376 - } 377 - } 378 - 379 - _ => return Err("cache: deflate decompression error"), 380 - } 381 - } 382 - 383 - let trailing = stripper.finish(&mut strip_buf[strip_pos..]); 384 - strip_pos += trailing; 385 - if strip_pos > 0 { 386 - output_fn(&strip_buf[..strip_pos])?; 387 - total_written += strip_pos as u32; 388 - } 389 - 390 - Ok(total_written) 391 - } 392 - 393 - // feed input through stripper; flush to output_fn when FLUSH_THRESHOLD reached 394 - fn feed_and_flush( 395 - stripper: &mut HtmlStripStream, 396 - input: &[u8], 397 - strip_buf: &mut [u8; STRIP_BUF_SIZE], 398 - strip_pos: &mut usize, 399 - total_written: &mut u32, 400 - output_fn: &mut impl FnMut(&[u8]) -> Result<(), &'static str>, 401 - ) -> Result<(), &'static str> { 402 - let mut ip: usize = 0; 403 - 404 - while ip < input.len() { 405 - let avail_out = STRIP_BUF_SIZE - *strip_pos; 406 - if avail_out == 0 { 407 - // output buffer full; flush before continuing 408 - output_fn(&strip_buf[..*strip_pos])?; 409 - *total_written += *strip_pos as u32; 410 - *strip_pos = 0; 411 - continue; 412 - } 413 - 414 - let (consumed, written) = stripper.feed( 415 - &input[ip..], 416 - &mut strip_buf[*strip_pos..*strip_pos + avail_out], 417 - ); 418 - ip += consumed; 419 - *strip_pos += written; 420 - 421 - if consumed == 0 && written == 0 { 422 - // no progress: flush pending data, or skip byte to break deadlock 423 - if *strip_pos > 0 { 424 - output_fn(&strip_buf[..*strip_pos])?; 425 - *total_written += *strip_pos as u32; 426 
- *strip_pos = 0; 427 - } else { 428 - ip += 1; 429 - } 430 - continue; 431 - } 432 - 433 - // flush when buffer is sufficiently full 434 - if *strip_pos >= FLUSH_THRESHOLD { 435 - output_fn(&strip_buf[..*strip_pos])?; 436 - *total_written += *strip_pos as u32; 437 - *strip_pos = 0; 438 - } 439 - } 440 - 441 - Ok(()) 442 - }
-805
smol-epub/src/css.rs
··· 1 - //! Minimal CSS parser for EPUB stylesheets. 2 - //! 3 - //! Selectors: tag, `.class`, `tag.class`, grouped. Combinators are 4 - //! reduced to the rightmost simple selector. `@`-rules and 5 - //! pseudo-classes are skipped. 6 - //! 7 - //! Rule table is stack-allocated: `MAX_CSS_RULES` × ~16 B = 2 KB. 8 - 9 - /// Maximum number of CSS rules the parser will store. 10 - pub const MAX_CSS_RULES: usize = 128; 11 - 12 - // ── property flag bits (which fields in StyleProps are explicitly set) ── 13 - 14 - /// Flag: `font-weight` is explicitly set. 15 - pub const PROP_FONT_WEIGHT: u16 = 1 << 0; 16 - /// Flag: `font-style` is explicitly set. 17 - pub const PROP_FONT_STYLE: u16 = 1 << 1; 18 - /// Flag: `text-align` is explicitly set. 19 - pub const PROP_TEXT_ALIGN: u16 = 1 << 2; 20 - /// Flag: `text-indent` is explicitly set. 21 - pub const PROP_TEXT_INDENT: u16 = 1 << 3; 22 - /// Flag: `margin-left` is explicitly set. 23 - pub const PROP_MARGIN_LEFT: u16 = 1 << 4; 24 - /// Flag: `margin-right` is explicitly set. 25 - pub const PROP_MARGIN_RIGHT: u16 = 1 << 5; 26 - /// Flag: `margin-top` is explicitly set. 27 - pub const PROP_MARGIN_TOP: u16 = 1 << 6; 28 - /// Flag: `margin-bottom` is explicitly set. 29 - pub const PROP_MARGIN_BOTTOM: u16 = 1 << 7; 30 - /// Flag: `display` is explicitly set. 31 - pub const PROP_DISPLAY: u16 = 1 << 8; 32 - /// Flag: `text-decoration` is explicitly set. 33 - pub const PROP_TEXT_DECORATION: u16 = 1 << 9; 34 - 35 - // ── property value constants ──────────────────────────────────────── 36 - 37 - /// `font-weight: normal`. 38 - pub const FW_NORMAL: u8 = 0; 39 - /// `font-weight: bold`. 40 - pub const FW_BOLD: u8 = 1; 41 - 42 - /// `font-style: normal`. 43 - pub const FS_NORMAL: u8 = 0; 44 - /// `font-style: italic`. 45 - pub const FS_ITALIC: u8 = 1; 46 - 47 - /// `text-align: left`. 48 - pub const TA_LEFT: u8 = 0; 49 - /// `text-align: center`. 50 - pub const TA_CENTER: u8 = 1; 51 - /// `text-align: right`. 
52 - pub const TA_RIGHT: u8 = 2; 53 - /// `text-align: justify`. 54 - pub const TA_JUSTIFY: u8 = 3; 55 - 56 - /// `display` not explicitly set (inherit / default). 57 - pub const DISP_DEFAULT: u8 = 0; 58 - /// `display: none`. 59 - pub const DISP_NONE: u8 = 1; 60 - /// `display: block`. 61 - pub const DISP_BLOCK: u8 = 2; 62 - /// `display: inline`. 63 - pub const DISP_INLINE: u8 = 3; 64 - 65 - /// `text-decoration: none`. 66 - pub const TD_NONE: u8 = 0; 67 - /// `text-decoration: underline`. 68 - pub const TD_UNDERLINE: u8 = 1; 69 - /// `text-decoration: line-through`. 70 - pub const TD_LINE_THROUGH: u8 = 2; 71 - 72 - /// Resolved CSS properties for a single element. 73 - /// 74 - /// The `set` bitmask tracks which fields have been explicitly specified 75 - /// by a stylesheet rule. Lengths are stored in **quarter-em** units 76 - /// (`i8`): 1 em = 4, 0.5 em = 2, 2 em = 8. 77 - #[derive(Clone, Copy)] 78 - pub struct StyleProps { 79 - /// Bitmask of `PROP_*` flags indicating which fields are set. 80 - pub set: u16, 81 - /// `font-weight` — see [`FW_NORMAL`], [`FW_BOLD`]. 82 - pub font_weight: u8, 83 - /// `font-style` — see [`FS_NORMAL`], [`FS_ITALIC`]. 84 - pub font_style: u8, 85 - /// `text-align` — see [`TA_LEFT`], [`TA_CENTER`], etc. 86 - pub text_align: u8, 87 - /// `text-indent` in quarter-em units. 88 - pub text_indent: i8, 89 - /// `margin-left` in quarter-em units. 90 - pub margin_left: i8, 91 - /// `margin-right` in quarter-em units. 92 - pub margin_right: i8, 93 - /// `margin-top` in quarter-em units. 94 - pub margin_top: i8, 95 - /// `margin-bottom` in quarter-em units. 96 - pub margin_bottom: i8, 97 - /// `display` — see [`DISP_DEFAULT`], [`DISP_NONE`], etc. 98 - pub display: u8, 99 - /// `text-decoration` bitmask — see [`TD_NONE`], [`TD_UNDERLINE`], etc. 100 - pub text_decoration: u8, 101 - } 102 - 103 - impl StyleProps { 104 - /// A `StyleProps` with no fields set and all values at their defaults. 
105 - pub const EMPTY: Self = Self { 106 - set: 0, 107 - font_weight: FW_NORMAL, 108 - font_style: FS_NORMAL, 109 - text_align: TA_LEFT, 110 - text_indent: 0, 111 - margin_left: 0, 112 - margin_right: 0, 113 - margin_top: 0, 114 - margin_bottom: 0, 115 - display: DISP_DEFAULT, 116 - text_decoration: TD_NONE, 117 - }; 118 - 119 - // merge other's properties; only override if specificity >= best seen 120 - fn apply(&mut self, other: &Self, spec: u8, best: &mut [u8; 16]) { 121 - macro_rules! merge { 122 - ($field:ident, $bit:expr) => { 123 - if other.set & (1 << $bit) != 0 && spec >= best[$bit] { 124 - self.$field = other.$field; 125 - self.set |= 1 << $bit; 126 - best[$bit] = spec; 127 - } 128 - }; 129 - } 130 - merge!(font_weight, 0); 131 - merge!(font_style, 1); 132 - merge!(text_align, 2); 133 - merge!(text_indent, 3); 134 - merge!(margin_left, 4); 135 - merge!(margin_right, 5); 136 - merge!(margin_top, 6); 137 - merge!(margin_bottom, 7); 138 - merge!(display, 8); 139 - merge!(text_decoration, 9); 140 - } 141 - 142 - #[inline] 143 - /// Returns `true` if `font-weight` is set to bold. 144 - pub fn is_bold(&self) -> bool { 145 - self.set & PROP_FONT_WEIGHT != 0 && self.font_weight == FW_BOLD 146 - } 147 - 148 - #[inline] 149 - /// Returns `true` if `font-style` is set to italic. 150 - pub fn is_italic(&self) -> bool { 151 - self.set & PROP_FONT_STYLE != 0 && self.font_style == FS_ITALIC 152 - } 153 - 154 - #[inline] 155 - /// Returns `true` if `display` is set to `none`. 
156 - pub fn is_hidden(&self) -> bool { 157 - self.set & PROP_DISPLAY != 0 && self.display == DISP_NONE 158 - } 159 - } 160 - 161 - // selector 162 - 163 - #[derive(Clone, Copy)] 164 - struct Selector { 165 - tag: u8, // tag_id(); 0 = any 166 - class_hash: u16, // class_hash(); 0 = any 167 - specificity: u8, // tag=1, class=16, tag+class=17 168 - } 169 - 170 - impl Selector { 171 - const EMPTY: Self = Self { 172 - tag: 0, 173 - class_hash: 0, 174 - specificity: 0, 175 - }; 176 - 177 - fn matches(&self, elem_tag: u8, elem_class: u16) -> bool { 178 - let tag_ok = self.tag == 0 || self.tag == elem_tag; 179 - let cls_ok = self.class_hash == 0 || self.class_hash == elem_class; 180 - tag_ok && cls_ok 181 - } 182 - } 183 - 184 - // CssRule + CssRules 185 - 186 - #[derive(Clone, Copy)] 187 - struct CssRule { 188 - sel: Selector, // 4 bytes 189 - props: StyleProps, // 12 bytes 190 - } 191 - 192 - impl CssRule { 193 - const EMPTY: Self = Self { 194 - sel: Selector::EMPTY, 195 - props: StyleProps::EMPTY, 196 - }; 197 - } 198 - 199 - // parsed CSS rule table, stack-allocated (~2KB) 200 - /// Parsed CSS rule table (stack-allocated, up to [`MAX_CSS_RULES`] entries). 201 - pub struct CssRules { 202 - rules: [CssRule; MAX_CSS_RULES], 203 - count: usize, 204 - } 205 - 206 - impl Default for CssRules { 207 - fn default() -> Self { 208 - Self::new() 209 - } 210 - } 211 - 212 - impl CssRules { 213 - /// Create an empty rule table. 214 - pub const fn new() -> Self { 215 - Self { 216 - rules: [CssRule::EMPTY; MAX_CSS_RULES], 217 - count: 0, 218 - } 219 - } 220 - 221 - /// Remove all parsed rules. 222 - pub fn clear(&mut self) { 223 - self.count = 0; 224 - } 225 - 226 - #[inline] 227 - /// Number of rules currently stored. 228 - pub fn len(&self) -> usize { 229 - self.count 230 - } 231 - 232 - #[inline] 233 - /// Returns `true` if no rules have been parsed. 
234 - pub fn is_empty(&self) -> bool { 235 - self.count == 0 236 - } 237 - 238 - // parse stylesheet; may be called multiple times to accumulate rules 239 - /// Parse a CSS stylesheet and append rules to the table. 240 - pub fn parse(&mut self, css: &[u8]) { 241 - let mut pos: usize = 0; 242 - 243 - while pos < css.len() { 244 - pos = skip_ws_comments(css, pos); 245 - if pos >= css.len() { 246 - break; 247 - } 248 - 249 - // skip @-rules (may contain nested blocks) 250 - if css[pos] == b'@' { 251 - pos = skip_at_rule(css, pos); 252 - continue; 253 - } 254 - 255 - // selector(s) run until '{' 256 - let sel_start = pos; 257 - let Some(brace) = scan_to_byte(css, pos, b'{') else { 258 - break; 259 - }; 260 - let sel_text = &css[sel_start..brace]; 261 - pos = brace + 1; 262 - 263 - // declarations run until '}' 264 - let Some(end) = scan_to_byte(css, pos, b'}') else { 265 - break; 266 - }; 267 - let decl_text = &css[pos..end]; 268 - pos = end + 1; 269 - 270 - // parse the declaration block 271 - let props = parse_declarations(decl_text); 272 - if props.set == 0 { 273 - continue; // no usable properties 274 - } 275 - 276 - // split grouped selectors on ',' and add a rule for each 277 - for sel_part in sel_text.split(|&b| b == b',') { 278 - let sel = parse_selector(sel_part); 279 - if sel.specificity == 0 && sel.tag == 0 && sel.class_hash == 0 { 280 - continue; // unparseable 281 - } 282 - if self.count < MAX_CSS_RULES { 283 - self.rules[self.count] = CssRule { sel, props }; 284 - self.count += 1; 285 - } 286 - } 287 - } 288 - } 289 - 290 - // resolve effective style for tag + class; merged by specificity 291 - /// Resolve the effective style for an element given its tag and class names. 
292 - pub fn resolve(&self, tag_name: &[u8], class_name: &[u8]) -> StyleProps { 293 - let tid = tag_id(tag_name); 294 - let chash = if class_name.is_empty() { 295 - 0 296 - } else { 297 - class_hash(class_name) 298 - }; 299 - 300 - let mut result = StyleProps::EMPTY; 301 - let mut best = [0u8; 16]; 302 - 303 - for rule in &self.rules[..self.count] { 304 - if rule.sel.matches(tid, chash) { 305 - result.apply(&rule.props, rule.sel.specificity, &mut best); 306 - } 307 - } 308 - 309 - result 310 - } 311 - 312 - // resolve by pre-computed tag ID and class hash 313 - /// Resolve the effective style using precomputed tag-id and class-hash. 314 - pub fn resolve_by_id(&self, tid: u8, chash: u16) -> StyleProps { 315 - let mut result = StyleProps::EMPTY; 316 - let mut best = [0u8; 16]; 317 - 318 - for rule in &self.rules[..self.count] { 319 - if rule.sel.matches(tid, chash) { 320 - result.apply(&rule.props, rule.sel.specificity, &mut best); 321 - } 322 - } 323 - 324 - result 325 - } 326 - } 327 - 328 - // CSS parser internals 329 - 330 - // parse a single (possibly compound) selector 331 - fn parse_selector(raw: &[u8]) -> Selector { 332 - let raw = trim_css(raw); 333 - if raw.is_empty() { 334 - return Selector::EMPTY; 335 - } 336 - 337 - // take only the rightmost simple selector (ignore descendant/child combinators) 338 - let mut last_start = 0; 339 - let mut i = 0; 340 - while i < raw.len() { 341 - if raw[i] == b' ' || raw[i] == b'>' || raw[i] == b'+' || raw[i] == b'~' { 342 - let next = i + 1; 343 - // skip whitespace after combinator 344 - let mut j = next; 345 - while j < raw.len() && raw[j] == b' ' { 346 - j += 1; 347 - } 348 - if j < raw.len() { 349 - last_start = j; 350 - } 351 - i = j; 352 - } else { 353 - i += 1; 354 - } 355 - } 356 - 357 - let sel = trim_css(&raw[last_start..]); 358 - if sel.is_empty() || sel == b"*" { 359 - return Selector::EMPTY; 360 - } 361 - 362 - // strip pseudo-classes/elements (:hover, ::before, etc.) 
363 - let sel = if let Some(p) = sel.iter().position(|&b| b == b':') { 364 - &sel[..p] 365 - } else { 366 - sel 367 - }; 368 - 369 - // strip #id (take everything before '#') 370 - let sel = if let Some(p) = sel.iter().position(|&b| b == b'#') { 371 - if p == 0 { 372 - // bare #id selector; can't match by tag/class, skip 373 - return Selector::EMPTY; 374 - } 375 - &sel[..p] 376 - } else { 377 - sel 378 - }; 379 - 380 - // split tag.class 381 - let (tag_part, class_part) = if let Some(dot) = sel.iter().position(|&b| b == b'.') { 382 - (&sel[..dot], &sel[dot + 1..]) 383 - } else { 384 - (sel, &[] as &[u8]) 385 - }; 386 - 387 - let tid = if tag_part.is_empty() { 388 - 0 389 - } else { 390 - tag_id(tag_part) 391 - }; 392 - let chash = if class_part.is_empty() { 393 - 0 394 - } else { 395 - class_hash(class_part) 396 - }; 397 - 398 - let specificity = match (tid != 0, chash != 0) { 399 - (false, false) => 0, 400 - (true, false) => 1, // tag only 401 - (false, true) => 16, // class only 402 - (true, true) => 17, // tag + class 403 - }; 404 - 405 - Selector { 406 - tag: tid, 407 - class_hash: chash, 408 - specificity, 409 - } 410 - } 411 - 412 - // parse declaration block (between { and }) 413 - fn parse_declarations(block: &[u8]) -> StyleProps { 414 - let mut props = StyleProps::EMPTY; 415 - 416 - // split on ';', handle each property:value pair 417 - for decl in block.split(|&b| b == b';') { 418 - let decl = trim_css(decl); 419 - if decl.is_empty() { 420 - continue; 421 - } 422 - 423 - // split on first ':' 424 - let Some(colon) = decl.iter().position(|&b| b == b':') else { 425 - continue; 426 - }; 427 - let prop_name = trim_css(&decl[..colon]); 428 - let prop_value = trim_css(&decl[colon + 1..]); 429 - 430 - if prop_name.is_empty() || prop_value.is_empty() { 431 - continue; 432 - } 433 - 434 - parse_property(prop_name, prop_value, &mut props); 435 - } 436 - 437 - props 438 - } 439 - 440 - // map CSS property name + value to StyleProps fields 441 - fn 
parse_property(name: &[u8], value: &[u8], props: &mut StyleProps) { 442 - match name { 443 - b"font-weight" => { 444 - props.font_weight = match value { 445 - v if v.starts_with(b"bold") => FW_BOLD, 446 - v if starts_with_digit(v) && parse_int(v) >= 600 => FW_BOLD, 447 - _ => FW_NORMAL, 448 - }; 449 - props.set |= PROP_FONT_WEIGHT; 450 - } 451 - 452 - b"font-style" => { 453 - props.font_style = if value.starts_with(b"italic") || value.starts_with(b"oblique") { 454 - FS_ITALIC 455 - } else { 456 - FS_NORMAL 457 - }; 458 - props.set |= PROP_FONT_STYLE; 459 - } 460 - 461 - b"text-align" => { 462 - props.text_align = match value { 463 - v if v.starts_with(b"center") => TA_CENTER, 464 - v if v.starts_with(b"right") => TA_RIGHT, 465 - v if v.starts_with(b"justify") => TA_JUSTIFY, 466 - _ => TA_LEFT, 467 - }; 468 - props.set |= PROP_TEXT_ALIGN; 469 - } 470 - 471 - b"text-indent" => { 472 - props.text_indent = parse_length_qem(value); 473 - props.set |= PROP_TEXT_INDENT; 474 - } 475 - 476 - b"margin-left" | b"padding-left" => { 477 - props.margin_left = parse_length_qem(value); 478 - props.set |= PROP_MARGIN_LEFT; 479 - } 480 - 481 - b"margin-right" | b"padding-right" => { 482 - props.margin_right = parse_length_qem(value); 483 - props.set |= PROP_MARGIN_RIGHT; 484 - } 485 - 486 - b"margin-top" | b"padding-top" => { 487 - props.margin_top = parse_length_qem(value); 488 - props.set |= PROP_MARGIN_TOP; 489 - } 490 - 491 - b"margin-bottom" | b"padding-bottom" => { 492 - props.margin_bottom = parse_length_qem(value); 493 - props.set |= PROP_MARGIN_BOTTOM; 494 - } 495 - 496 - b"display" => { 497 - props.display = match value { 498 - v if v.starts_with(b"none") => DISP_NONE, 499 - v if v.starts_with(b"block") => DISP_BLOCK, 500 - v if v.starts_with(b"inline") => DISP_INLINE, 501 - _ => DISP_DEFAULT, 502 - }; 503 - props.set |= PROP_DISPLAY; 504 - } 505 - 506 - b"text-decoration" | b"text-decoration-line" => { 507 - props.text_decoration = if value.starts_with(b"underline") { 508 
- TD_UNDERLINE 509 - } else if value.starts_with(b"line-through") { 510 - TD_LINE_THROUGH 511 - } else { 512 - TD_NONE 513 - }; 514 - props.set |= PROP_TEXT_DECORATION; 515 - } 516 - 517 - // shorthand: margin: v1 [v2 [v3 [v4]]] 518 - b"margin" | b"padding" => { 519 - parse_margin_shorthand(value, props); 520 - } 521 - 522 - _ => {} // unknown property; ignore 523 - } 524 - } 525 - 526 - // parse margin shorthand (1-4 values) 527 - fn parse_margin_shorthand(value: &[u8], props: &mut StyleProps) { 528 - let mut vals = [0i8; 4]; 529 - let mut count = 0usize; 530 - 531 - // split value on whitespace, parse each part 532 - let mut pos = 0; 533 - let value = trim_css(value); 534 - 535 - while pos < value.len() && count < 4 { 536 - // skip whitespace 537 - while pos < value.len() && value[pos] == b' ' { 538 - pos += 1; 539 - } 540 - if pos >= value.len() { 541 - break; 542 - } 543 - // find end of token 544 - let start = pos; 545 - while pos < value.len() && value[pos] != b' ' { 546 - pos += 1; 547 - } 548 - vals[count] = parse_length_qem(&value[start..pos]); 549 - count += 1; 550 - } 551 - 552 - match count { 553 - 1 => { 554 - props.margin_top = vals[0]; 555 - props.margin_right = vals[0]; 556 - props.margin_bottom = vals[0]; 557 - props.margin_left = vals[0]; 558 - props.set |= 559 - PROP_MARGIN_TOP | PROP_MARGIN_RIGHT | PROP_MARGIN_BOTTOM | PROP_MARGIN_LEFT; 560 - } 561 - 2 => { 562 - props.margin_top = vals[0]; 563 - props.margin_bottom = vals[0]; 564 - props.margin_left = vals[1]; 565 - props.margin_right = vals[1]; 566 - props.set |= 567 - PROP_MARGIN_TOP | PROP_MARGIN_RIGHT | PROP_MARGIN_BOTTOM | PROP_MARGIN_LEFT; 568 - } 569 - 3 => { 570 - props.margin_top = vals[0]; 571 - props.margin_left = vals[1]; 572 - props.margin_right = vals[1]; 573 - props.margin_bottom = vals[2]; 574 - props.set |= 575 - PROP_MARGIN_TOP | PROP_MARGIN_RIGHT | PROP_MARGIN_BOTTOM | PROP_MARGIN_LEFT; 576 - } 577 - 4 => { 578 - props.margin_top = vals[0]; 579 - props.margin_right = vals[1]; 
580 - props.margin_bottom = vals[2]; 581 - props.margin_left = vals[3]; 582 - props.set |= 583 - PROP_MARGIN_TOP | PROP_MARGIN_RIGHT | PROP_MARGIN_BOTTOM | PROP_MARGIN_LEFT; 584 - } 585 - _ => {} 586 - } 587 - } 588 - 589 - // tag ID mapping: lowercase tag name -> compact u8 for selector matching. 590 - // 0 = unknown/any; known tags get stable IDs. 591 - 592 - /// Map an HTML tag name to a compact numeric id used by [`CssRules::resolve_by_id`]. 593 - pub fn tag_id(name: &[u8]) -> u8 { 594 - match name { 595 - b"p" => 1, 596 - b"div" => 2, 597 - b"span" => 3, 598 - b"h1" => 4, 599 - b"h2" => 5, 600 - b"h3" => 6, 601 - b"h4" => 7, 602 - b"h5" => 8, 603 - b"h6" => 9, 604 - b"em" => 10, 605 - b"i" => 11, 606 - b"b" => 12, 607 - b"strong" => 13, 608 - b"a" => 14, 609 - b"blockquote" => 15, 610 - b"ul" => 16, 611 - b"ol" => 17, 612 - b"li" => 18, 613 - b"pre" => 19, 614 - b"code" => 20, 615 - b"body" => 21, 616 - b"section" => 22, 617 - b"article" => 23, 618 - b"figure" => 24, 619 - b"figcaption" => 25, 620 - b"cite" => 26, 621 - b"small" => 27, 622 - b"sup" => 28, 623 - b"sub" => 29, 624 - b"table" => 30, 625 - b"tr" => 31, 626 - b"td" => 32, 627 - b"th" => 33, 628 - b"header" => 34, 629 - b"footer" => 35, 630 - b"aside" => 36, 631 - b"nav" => 37, 632 - b"dl" => 38, 633 - b"dt" => 39, 634 - b"dd" => 40, 635 - b"abbr" => 41, 636 - _ => 0, 637 - } 638 - } 639 - 640 - // class hash: FNV-1a folded to 16 bits. 641 - // 0 reserved for "no class constraint"; hash of 0 is mapped to 1. 642 - 643 - /// Compute a 16-bit hash of a CSS class name for [`CssRules::resolve_by_id`]. 
pub fn class_hash(name: &[u8]) -> u16 {
    // 32-bit FNV-1a: xor each byte in, then multiply by the FNV prime
    let mut h: u32 = 0x811c_9dc5;
    for &b in name {
        h ^= b as u32;
        h = h.wrapping_mul(0x0100_0193);
    }
    // fold the two 16-bit halves together; 0 is reserved, so remap it to 1
    let h16 = ((h >> 16) ^ h) as u16;
    if h16 == 0 { 1 } else { h16 }
}

// CSS length parsing

// parse CSS length to quarter-em units; handles em/rem/px/pt/0
//
// Accepts an optional leading sign, an integer part and up to two fractional
// digits (further digits are ignored). `px`/`pt` are converted at 4 units per
// quarter-em; every other unit (or none) is treated as em. The result is
// clamped to -126..=126.
fn parse_length_qem(val: &[u8]) -> i8 {
    // The final result is clamped to +/-126 quarter-em, which any value of a
    // few thousand already exceeds. Capping the integer part here keeps the
    // later `* 100` / `* 4` from overflowing i32 on absurdly long digit runs
    // without changing any in-range result.
    const WHOLE_CAP: i32 = 100_000;

    let val = trim_css(val);
    if val.is_empty() || val == b"0" || val == b"auto" || val == b"normal" {
        return 0;
    }

    // optional sign; CSS numbers may carry an explicit '+' as well as '-'
    let (neg, rest) = match val[0] {
        b'-' => (true, &val[1..]),
        b'+' => (false, &val[1..]),
        _ => (false, val),
    };

    // parse integer + fractional parts
    let mut whole: i32 = 0;
    let mut frac: i32 = 0; // hundredths
    let mut i = 0;
    let mut seen_dot = false;
    let mut frac_digits = 0u8;

    while i < rest.len() {
        let b = rest[i];
        if b.is_ascii_digit() {
            let digit = i32::from(b - b'0');
            if seen_dot {
                // keep two fractional digits, silently drop the rest
                if frac_digits < 2 {
                    frac = frac * 10 + digit;
                    frac_digits += 1;
                }
            } else {
                whole = (whole * 10 + digit).min(WHOLE_CAP);
            }
        } else if b == b'.' && !seen_dot {
            seen_dot = true;
        } else {
            // first non-numeric byte starts the unit
            break;
        }
        i += 1;
    }

    // normalise fractional part to hundredths
    if frac_digits == 1 {
        frac *= 10;
    }

    let unit = trim_css(&rest[i..]);

    // convert to quarter-em (4 qem = 1em), rounding to nearest
    let qem = if unit.starts_with(b"px") || unit.starts_with(b"pt") {
        // 16px ~= 1em -> 4px ~= 1 qem
        let total_px_100 = whole * 100 + frac;
        (total_px_100 + 200) / 400
    } else {
        // em, rem, or unknown: treat as em
        whole * 4 + (frac * 4 + 50) / 100
    };

    let signed = if neg { -qem } else { qem };
    signed.clamp(-126, 126) as i8
}

// scanning helpers

// strip leading and trailing CSS whitespace; all-whitespace input yields an empty slice
fn trim_css(data: &[u8]) -> &[u8] {
    let start = data
        .iter()
        .position(|b| !is_css_ws(*b))
        .unwrap_or(data.len());
    let end = data
        .iter()
        .rposition(|b| !is_css_ws(*b))
        .map(|p| p + 1)
        .unwrap_or(start);
    if start >= end { &[] } else { &data[start..end] }
}

// space, tab, LF, CR, form feed
#[inline]
fn is_css_ws(b: u8) -> bool {
    matches!(b, b' ' | b'\t' | b'\n' | b'\r' | 0x0C)
}

// advance `pos` past any run of whitespace and `/* ... */` comments
//
// An unterminated comment swallows the rest of the input, so the function
// returns `css.len()` rather than an offset inside the comment.
fn skip_ws_comments(css: &[u8], mut pos: usize) -> usize {
    loop {
        while pos < css.len() && is_css_ws(css[pos]) {
            pos += 1;
        }

        let rest = css.get(pos..).unwrap_or(&[]);
        if !rest.starts_with(b"/*") {
            return pos;
        }

        // look for the terminator after the opening "/*" (so "/*/" is not closed)
        match rest[2..].windows(2).position(|w| w == b"*/") {
            Some(close) => pos += 2 + close + 2,
            None => return css.len(),
        }
    }
}

// skip @-rule including nested brace blocks
//
// `pos` is the offset of the '@'. Returns the offset just past the terminating
// ';' (statement rule) or past the matching '}' (block rule), or `css.len()`
// when neither is found.
fn skip_at_rule(css: &[u8], pos: usize) -> usize {
    let mut p = pos + 1; // skip '@'
    while p < css.len() {
        if css[p] == b'{' {
            // block @-rule; count braces
            let mut depth = 1u32;
            p += 1;
            while p < css.len() && depth > 0 {
                match css[p] {
                    b'{' => depth += 1,
                    b'}' => depth -= 1,
                    _ => {}
                }
                p += 1;
            }
            return p;
        }
        if css[p] == b';' {
            // statement @-rule (@import, @charset)
            return p + 1;
        }
        p += 1;
    }
    css.len()
}

// absolute offset of the first `needle` at or after `pos`; `None` when absent
// or when `pos` lies beyond the end of `css`
fn scan_to_byte(css: &[u8], pos: usize, needle: u8) -> Option<usize> {
    css.get(pos..)?
        .iter()
        .position(|&b| b == needle)
        .map(|i| pos + i)
}

fn starts_with_digit(val: &[u8]) -> bool {
    val.first().is_some_and(|b| b.is_ascii_digit())
}

// parse the leading run of ASCII digits as a non-negative integer (saturating);
// returns 0 when `val` does not start with a digit
fn parse_int(val: &[u8]) -> i32 {
    let mut n: i32 = 0;
    for &b in val {
        if b.is_ascii_digit() {
            n = n.saturating_mul(10).saturating_add((b - b'0') as i32);
        } else {
            break;
        }
    }
    n
}
-985
smol-epub/src/epub.rs
··· 1 - //! EPUB structure parser: `container.xml` → OPF → spine + metadata. 2 - //! 3 - //! `container.xml` gives the OPF path; the OPF gives metadata, a 4 - //! manifest (`id` → `href`), and a spine (ordered `idref`s). Spine 5 - //! references are resolved through the manifest to ZIP entry indices. 6 - 7 - use alloc::vec::Vec; 8 - 9 - use crate::xml; 10 - use crate::zip::ZipIndex; 11 - 12 - /// Maximum byte length of an EPUB title. 13 - pub const TITLE_CAP: usize = 96; 14 - /// Maximum byte length of an EPUB author name. 15 - pub const AUTHOR_CAP: usize = 64; 16 - /// Maximum number of spine entries (reading-order items). 17 - pub const MAX_SPINE: usize = 256; 18 - /// Maximum byte length of the OPF file path inside the ZIP. 19 - pub const OPF_PATH_CAP: usize = 256; 20 - 21 - /// EPUB book metadata (title and author), stored inline with fixed-size buffers. 22 - pub struct EpubMeta { 23 - /// Raw UTF-8 bytes of the title (up to [`TITLE_CAP`] bytes). 24 - pub title: [u8; TITLE_CAP], 25 - /// Number of valid bytes in [`title`](Self::title). 26 - pub title_len: u8, 27 - /// Raw UTF-8 bytes of the author name (up to [`AUTHOR_CAP`] bytes). 28 - pub author: [u8; AUTHOR_CAP], 29 - /// Number of valid bytes in [`author`](Self::author). 30 - pub author_len: u8, 31 - } 32 - 33 - impl Default for EpubMeta { 34 - fn default() -> Self { 35 - Self::new() 36 - } 37 - } 38 - 39 - impl EpubMeta { 40 - /// Create a new, empty `EpubMeta`. 41 - pub const fn new() -> Self { 42 - Self { 43 - title: [0u8; TITLE_CAP], 44 - title_len: 0, 45 - author: [0u8; AUTHOR_CAP], 46 - author_len: 0, 47 - } 48 - } 49 - 50 - /// Return the title as a `&str`, or `""` if it is not valid UTF-8. 51 - pub fn title_str(&self) -> &str { 52 - core::str::from_utf8(&self.title[..self.title_len as usize]).unwrap_or("") 53 - } 54 - 55 - /// Return the author as a `&str`, or `""` if it is not valid UTF-8. 
56 - pub fn author_str(&self) -> &str { 57 - core::str::from_utf8(&self.author[..self.author_len as usize]).unwrap_or("") 58 - } 59 - 60 - fn set_title(&mut self, s: &[u8]) { 61 - let n = s.len().min(TITLE_CAP); 62 - self.title[..n].copy_from_slice(&s[..n]); 63 - self.title_len = n as u8; 64 - } 65 - 66 - fn set_author(&mut self, s: &[u8]) { 67 - let n = s.len().min(AUTHOR_CAP); 68 - self.author[..n].copy_from_slice(&s[..n]); 69 - self.author_len = n as u8; 70 - } 71 - } 72 - 73 - /// The EPUB reading-order spine: an ordered list of ZIP entry indices. 74 - pub struct EpubSpine { 75 - /// ZIP entry indices in reading order. 76 - pub items: [u16; MAX_SPINE], 77 - /// Number of valid entries in [`items`](Self::items). 78 - pub count: u16, 79 - } 80 - 81 - impl Default for EpubSpine { 82 - fn default() -> Self { 83 - Self::new() 84 - } 85 - } 86 - 87 - impl EpubSpine { 88 - /// Create a new, empty spine. 89 - pub const fn new() -> Self { 90 - Self { 91 - items: [0u16; MAX_SPINE], 92 - count: 0, 93 - } 94 - } 95 - 96 - #[inline] 97 - /// Number of items in the spine. 98 - pub fn len(&self) -> usize { 99 - self.count as usize 100 - } 101 - 102 - #[inline] 103 - /// Returns `true` if the spine contains no items. 104 - pub fn is_empty(&self) -> bool { 105 - self.count == 0 106 - } 107 - } 108 - 109 - // ── table of contents ─────────────────────────────────────────────── 110 - 111 - /// Maximum number of entries in the table of contents. 112 - pub const MAX_TOC: usize = 128; 113 - /// Maximum byte length of a single TOC entry title. 114 - pub const TOC_TITLE_CAP: usize = 48; 115 - 116 - /// A single entry in the EPUB table of contents. 117 - #[derive(Clone, Copy)] 118 - pub struct TocEntry { 119 - /// Raw UTF-8 bytes of the entry title. 120 - pub title: [u8; TOC_TITLE_CAP], 121 - /// Number of valid bytes in [`title`](Self::title). 122 - pub title_len: u8, 123 - /// Index into [`EpubSpine::items`]; `0xFFFF` means unresolved. 
124 - pub spine_idx: u16, 125 - } 126 - 127 - impl TocEntry { 128 - /// An empty, unresolved TOC entry. 129 - pub const EMPTY: Self = Self { 130 - title: [0u8; TOC_TITLE_CAP], 131 - title_len: 0, 132 - spine_idx: 0xFFFF, 133 - }; 134 - 135 - /// Return the entry title as a `&str`, or `""` if not valid UTF-8. 136 - pub fn title_str(&self) -> &str { 137 - core::str::from_utf8(&self.title[..self.title_len as usize]).unwrap_or("") 138 - } 139 - } 140 - 141 - /// EPUB table of contents (flat list of [`TocEntry`] items). 142 - pub struct EpubToc { 143 - /// TOC entries in document order. 144 - pub entries: [TocEntry; MAX_TOC], 145 - /// Number of valid entries. 146 - pub count: u16, 147 - } 148 - 149 - impl Default for EpubToc { 150 - fn default() -> Self { 151 - Self::new() 152 - } 153 - } 154 - 155 - impl EpubToc { 156 - /// Create a new, empty table of contents. 157 - pub const fn new() -> Self { 158 - Self { 159 - entries: [TocEntry::EMPTY; MAX_TOC], 160 - count: 0, 161 - } 162 - } 163 - 164 - /// Remove all entries. 165 - pub fn clear(&mut self) { 166 - self.count = 0; 167 - } 168 - 169 - #[inline] 170 - /// Number of entries in the TOC. 171 - pub fn len(&self) -> usize { 172 - self.count as usize 173 - } 174 - 175 - #[inline] 176 - /// Returns `true` if the TOC contains no entries. 177 - pub fn is_empty(&self) -> bool { 178 - self.count == 0 179 - } 180 - 181 - fn push(&mut self, title: &[u8], spine_idx: u16) { 182 - if (self.count as usize) >= MAX_TOC { 183 - return; 184 - } 185 - let i = self.count as usize; 186 - let n = title.len().min(TOC_TITLE_CAP); 187 - self.entries[i] = TocEntry::EMPTY; 188 - self.entries[i].title[..n].copy_from_slice(&title[..n]); 189 - self.entries[i].title_len = n as u8; 190 - self.entries[i].spine_idx = spine_idx; 191 - self.count += 1; 192 - } 193 - } 194 - 195 - /// Identifies where the table-of-contents data lives inside the EPUB ZIP. 
196 - #[derive(Clone, Copy, Debug)] 197 - pub enum TocSource { 198 - /// EPUB 2 NCX document (ZIP entry index). 199 - Ncx(usize), 200 - /// EPUB 3 Navigation Document (ZIP entry index). 201 - Nav(usize), 202 - } 203 - 204 - impl TocSource { 205 - /// Return the ZIP entry index regardless of variant. 206 - pub fn zip_index(&self) -> usize { 207 - match *self { 208 - TocSource::Ncx(i) | TocSource::Nav(i) => i, 209 - } 210 - } 211 - } 212 - 213 - // parse container.xml to find the OPF path; write into out 214 - /// Parse `META-INF/container.xml` and extract the OPF file path. 215 - /// 216 - /// Writes the path into `out` and returns its byte length. 217 - pub fn parse_container(data: &[u8], out: &mut [u8; OPF_PATH_CAP]) -> Result<usize, &'static str> { 218 - let mut found_len: Option<usize> = None; 219 - 220 - xml::for_each_tag(data, b"rootfile", |tag_bytes| { 221 - if found_len.is_some() { 222 - return; 223 - } 224 - if let Some(path) = xml::get_attr(tag_bytes, b"full-path") { 225 - let n = path.len().min(OPF_PATH_CAP); 226 - out[..n].copy_from_slice(&path[..n]); 227 - found_len = Some(n); 228 - } 229 - }); 230 - 231 - found_len.ok_or("epub: no rootfile full-path in container.xml") 232 - } 233 - 234 - /// Scan ZIP entries for a `.opf` file and return its path. 235 - /// 236 - /// This is a fallback for EPUBs that lack `META-INF/container.xml`. 237 - /// Writes the first `.opf` entry name into `out` and returns its byte length. 
238 - pub fn find_opf_in_zip( 239 - zip: &ZipIndex, 240 - out: &mut [u8; OPF_PATH_CAP], 241 - ) -> Result<usize, &'static str> { 242 - for i in 0..zip.count() { 243 - let name = zip.entry_name(i); 244 - let bytes = name.as_bytes(); 245 - if bytes.len() >= 5 && bytes[bytes.len() - 4..].eq_ignore_ascii_case(b".opf") { 246 - let n = bytes.len().min(OPF_PATH_CAP); 247 - out[..n].copy_from_slice(&bytes[..n]); 248 - return Ok(n); 249 - } 250 - } 251 - Err("epub: no .opf file found in archive") 252 - } 253 - 254 - /// Parse an OPF document: extract metadata and build the reading-order spine. 255 - /// 256 - /// Two-pass, zero heap: phase 1 collects `idref` byte offsets 257 - /// (`MAX_SPINE` × 4 = 1 KB stack); phase 2 resolves each `idref` 258 - /// through the manifest to a ZIP entry index. 259 - pub fn parse_opf( 260 - opf: &[u8], 261 - opf_dir: &str, 262 - zip: &ZipIndex, 263 - meta: &mut EpubMeta, 264 - spine: &mut EpubSpine, 265 - ) -> Result<(), &'static str> { 266 - *meta = EpubMeta::new(); 267 - spine.count = 0; 268 - 269 - if let Some(title) = xml::tag_text(opf, b"title") { 270 - meta.set_title(title); 271 - } 272 - if let Some(author) = xml::tag_text(opf, b"creator") { 273 - meta.set_author(author); 274 - } 275 - 276 - // phase 1: collect idref byte offsets; get_attr returns subslices so 277 - // pointer subtraction gives the offset; (start,len) = 4B each = 1KB total 278 - 279 - #[derive(Clone, Copy)] 280 - struct IdrefLoc { 281 - start: u16, 282 - len: u16, 283 - } 284 - 285 - let mut idref_locs = [IdrefLoc { start: 0, len: 0 }; MAX_SPINE]; 286 - let mut idref_count: usize = 0; 287 - 288 - xml::for_each_tag(opf, b"itemref", |tag_bytes| { 289 - if idref_count >= MAX_SPINE { 290 - return; 291 - } 292 - if let Some(idref) = xml::get_attr(tag_bytes, b"idref") { 293 - let offset = idref.as_ptr() as usize - opf.as_ptr() as usize; 294 - if offset <= u16::MAX as usize && idref.len() <= u16::MAX as usize { 295 - idref_locs[idref_count] = IdrefLoc { 296 - start: offset 
as u16, 297 - len: idref.len() as u16, 298 - }; 299 - idref_count += 1; 300 - } 301 - } 302 - }); 303 - 304 - // phase 2: for each idref, scan manifest for matching <item> and resolve href 305 - let mut path_buf = [0u8; 512]; 306 - 307 - for loc in &idref_locs[..idref_count] { 308 - let idref = &opf[loc.start as usize..loc.start as usize + loc.len as usize]; 309 - 310 - let mut found = false; 311 - xml::for_each_tag(opf, b"item", |item_tag| { 312 - if found { 313 - return; 314 - } 315 - let Some(id) = xml::get_attr(item_tag, b"id") else { 316 - return; 317 - }; 318 - if id != idref { 319 - return; 320 - } 321 - let Some(href) = xml::get_attr(item_tag, b"href") else { 322 - return; 323 - }; 324 - 325 - let decoded_href = percent_decode(href); 326 - let href_str = core::str::from_utf8(&decoded_href).unwrap_or(""); 327 - let full_len = resolve_path(opf_dir, href_str, &mut path_buf); 328 - let full_path = core::str::from_utf8(&path_buf[..full_len]).unwrap_or(""); 329 - 330 - if let Some(idx) = zip.find(full_path).or_else(|| zip.find_icase(full_path)) 331 - && (spine.count as usize) < MAX_SPINE 332 - { 333 - spine.items[spine.count as usize] = idx as u16; 334 - spine.count += 1; 335 - } 336 - found = true; 337 - }); 338 - } 339 - 340 - if spine.count == 0 { 341 - return Err("epub: spine is empty after resolution"); 342 - } 343 - 344 - Ok(()) 345 - } 346 - 347 - // locate TOC in ZIP: EPUB 3 nav first, EPUB 2 NCX fallback 348 - /// Search the OPF manifest for a table-of-contents source. 349 - /// 350 - /// Tries, in order: EPUB 3 `<item properties="nav">`, EPUB 2 351 - /// `<spine toc="id">`, and a media-type fallback for NCX files. 
352 - pub fn find_toc_source(opf: &[u8], opf_dir: &str, zip: &ZipIndex) -> Option<TocSource> { 353 - let mut path_buf = [0u8; 512]; 354 - 355 - // EPUB 3: manifest item with properties containing "nav" 356 - let mut nav_href_buf = [0u8; 256]; 357 - let mut nav_href_len: usize = 0; 358 - xml::for_each_tag(opf, b"item", |tag_bytes| { 359 - if nav_href_len > 0 { 360 - return; 361 - } 362 - if let Some(props) = xml::get_attr(tag_bytes, b"properties") { 363 - if props.split(|&b| b == b' ').any(|w| w == b"nav") { 364 - if let Some(href) = xml::get_attr(tag_bytes, b"href") { 365 - let n = href.len().min(nav_href_buf.len()); 366 - nav_href_buf[..n].copy_from_slice(&href[..n]); 367 - nav_href_len = n; 368 - } 369 - } 370 - } 371 - }); 372 - 373 - if nav_href_len > 0 { 374 - let decoded = percent_decode(&nav_href_buf[..nav_href_len]); 375 - let href_str = core::str::from_utf8(&decoded).unwrap_or(""); 376 - let full_len = resolve_path(opf_dir, href_str, &mut path_buf); 377 - let full_path = core::str::from_utf8(&path_buf[..full_len]).unwrap_or(""); 378 - if let Some(idx) = zip.find(full_path).or_else(|| zip.find_icase(full_path)) { 379 - log::info!("epub: TOC source = EPUB3 nav (zip #{})", idx); 380 - return Some(TocSource::Nav(idx)); 381 - } 382 - } 383 - 384 - // EPUB 2: <spine toc="id"> -> manifest item href 385 - let mut toc_id = [0u8; 64]; 386 - let mut toc_id_len: usize = 0; 387 - xml::for_each_tag(opf, b"spine", |tag_bytes| { 388 - if toc_id_len > 0 { 389 - return; 390 - } 391 - if let Some(attr) = xml::get_attr(tag_bytes, b"toc") { 392 - let n = attr.len().min(toc_id.len()); 393 - toc_id[..n].copy_from_slice(&attr[..n]); 394 - toc_id_len = n; 395 - } 396 - }); 397 - 398 - if toc_id_len > 0 { 399 - let target_id = &toc_id[..toc_id_len]; 400 - let mut ncx_href_buf = [0u8; 256]; 401 - let mut ncx_href_len: usize = 0; 402 - xml::for_each_tag(opf, b"item", |tag_bytes| { 403 - if ncx_href_len > 0 { 404 - return; 405 - } 406 - if let Some(id) = xml::get_attr(tag_bytes, 
b"id") { 407 - if id == target_id { 408 - if let Some(href) = xml::get_attr(tag_bytes, b"href") { 409 - let n = href.len().min(ncx_href_buf.len()); 410 - ncx_href_buf[..n].copy_from_slice(&href[..n]); 411 - ncx_href_len = n; 412 - } 413 - } 414 - } 415 - }); 416 - 417 - if ncx_href_len > 0 { 418 - let decoded = percent_decode(&ncx_href_buf[..ncx_href_len]); 419 - let href_str = core::str::from_utf8(&decoded).unwrap_or(""); 420 - let full_len = resolve_path(opf_dir, href_str, &mut path_buf); 421 - let full_path = core::str::from_utf8(&path_buf[..full_len]).unwrap_or(""); 422 - if let Some(idx) = zip.find(full_path).or_else(|| zip.find_icase(full_path)) { 423 - log::info!( 424 - "epub: TOC source = EPUB2 NCX via spine toc attr (zip #{})", 425 - idx 426 - ); 427 - return Some(TocSource::Ncx(idx)); 428 - } 429 - } 430 - } 431 - 432 - // fallback: find NCX by media-type (many EPUB 2 books omit toc attr on <spine>) 433 - let mut ncx_fb_href = [0u8; 256]; 434 - let mut ncx_fb_len: usize = 0; 435 - xml::for_each_tag(opf, b"item", |tag_bytes| { 436 - if ncx_fb_len > 0 { 437 - return; 438 - } 439 - if let Some(mt) = xml::get_attr(tag_bytes, b"media-type") { 440 - if mt == b"application/x-dtbncx+xml" { 441 - if let Some(href) = xml::get_attr(tag_bytes, b"href") { 442 - let n = href.len().min(ncx_fb_href.len()); 443 - ncx_fb_href[..n].copy_from_slice(&href[..n]); 444 - ncx_fb_len = n; 445 - } 446 - } 447 - } 448 - }); 449 - 450 - if ncx_fb_len > 0 { 451 - let decoded = percent_decode(&ncx_fb_href[..ncx_fb_len]); 452 - let href_str = core::str::from_utf8(&decoded).unwrap_or(""); 453 - let full_len = resolve_path(opf_dir, href_str, &mut path_buf); 454 - let full_path = core::str::from_utf8(&path_buf[..full_len]).unwrap_or(""); 455 - if let Some(idx) = zip.find(full_path).or_else(|| zip.find_icase(full_path)) { 456 - log::info!( 457 - "epub: TOC source = NCX via media-type fallback (zip #{})", 458 - idx 459 - ); 460 - return Some(TocSource::Ncx(idx)); 461 - } 462 - } 463 - 464 - 
log::warn!("epub: no TOC source found in OPF"); 465 - None 466 - } 467 - 468 - /// Parse a TOC document (NCX or Navigation Document) into `toc`. 469 - /// 470 - /// Dispatches to [`parse_ncx_toc`] or [`parse_nav_toc`] based on 471 - /// the [`TocSource`] variant. 472 - pub fn parse_toc( 473 - source: TocSource, 474 - data: &[u8], 475 - toc_dir: &str, 476 - spine: &EpubSpine, 477 - zip: &ZipIndex, 478 - toc: &mut EpubToc, 479 - ) { 480 - match source { 481 - TocSource::Ncx(_) => parse_ncx_toc(data, toc_dir, spine, zip, toc), 482 - TocSource::Nav(_) => parse_nav_toc(data, toc_dir, spine, zip, toc), 483 - } 484 - } 485 - 486 - /// Parse an EPUB 2 NCX document into flat TOC entries. 487 - /// 488 - /// Nested `<navPoint>` elements are flattened into a linear list. 489 - pub fn parse_ncx_toc( 490 - ncx: &[u8], 491 - ncx_dir: &str, 492 - spine: &EpubSpine, 493 - zip: &ZipIndex, 494 - toc: &mut EpubToc, 495 - ) { 496 - toc.clear(); 497 - let mut pos: usize = 0; 498 - let mut title_buf = [0u8; TOC_TITLE_CAP]; 499 - let mut title_len: usize = 0; 500 - 501 - while pos < ncx.len() { 502 - let Some(lt) = toc_find_byte(ncx, pos, b'<') else { 503 - break; 504 - }; 505 - pos = lt + 1; 506 - if pos >= ncx.len() { 507 - break; 508 - } 509 - 510 - // skip comments and PIs 511 - if ncx[pos] == b'!' || ncx[pos] == b'?' 
{ 512 - pos = toc_skip_to_gt(ncx, pos); 513 - continue; 514 - } 515 - 516 - // skip closing tags 517 - let is_close = ncx[pos] == b'/'; 518 - if is_close { 519 - pos = toc_skip_to_gt(ncx, pos + 1); 520 - continue; 521 - } 522 - 523 - // read tag name 524 - let name_start = pos; 525 - while pos < ncx.len() && !is_toc_delim(ncx[pos]) { 526 - pos += 1; 527 - } 528 - let name = &ncx[name_start..pos]; 529 - 530 - // <text>: capture label for the next <content> 531 - if name.eq_ignore_ascii_case(b"text") { 532 - pos = toc_skip_to_gt(ncx, pos); 533 - let text_start = pos; 534 - while pos < ncx.len() && ncx[pos] != b'<' { 535 - pos += 1; 536 - } 537 - let raw = toc_trim_ws(&ncx[text_start..pos]); 538 - title_len = raw.len().min(TOC_TITLE_CAP); 539 - title_buf[..title_len].copy_from_slice(&raw[..title_len]); 540 - continue; 541 - } 542 - 543 - // <content src="...">: emit TOC entry 544 - if name.eq_ignore_ascii_case(b"content") { 545 - let gt = toc_find_byte(ncx, pos, b'>').unwrap_or(ncx.len()); 546 - let tag_bytes = &ncx[name_start..gt]; 547 - if let Some(src) = xml::get_attr(tag_bytes, b"src") { 548 - let sidx = href_to_spine_idx(src, ncx_dir, spine, zip); 549 - toc.push(&title_buf[..title_len], sidx); 550 - } 551 - pos = if gt < ncx.len() { gt + 1 } else { gt }; 552 - continue; 553 - } 554 - 555 - pos = toc_skip_to_gt(ncx, pos); 556 - } 557 - 558 - let unresolved = (0..toc.len()) 559 - .filter(|&i| toc.entries[i].spine_idx == 0xFFFF) 560 - .count(); 561 - if unresolved > 0 { 562 - log::warn!( 563 - "epub: NCX TOC: {} of {} entries unresolved", 564 - unresolved, 565 - toc.len() 566 - ); 567 - } 568 - } 569 - 570 - /// Parse an EPUB 3 Navigation Document into flat TOC entries. 571 - /// 572 - /// Extracts `<a>` elements from the `<nav epub:type="toc">` region 573 - /// and flattens nested `<ol>` lists. 
574 - pub fn parse_nav_toc( 575 - nav: &[u8], 576 - nav_dir: &str, 577 - spine: &EpubSpine, 578 - zip: &ZipIndex, 579 - toc: &mut EpubToc, 580 - ) { 581 - toc.clear(); 582 - 583 - // restrict scanning to the <nav epub:type="toc"> ... </nav> region 584 - let Some((region_start, region_end)) = find_nav_toc_region(nav) else { 585 - log::warn!("epub: nav document has no <nav epub:type=\"toc\"> region"); 586 - return; 587 - }; 588 - let region = &nav[region_start..region_end]; 589 - 590 - let mut pos: usize = 0; 591 - while pos < region.len() { 592 - let Some(lt) = toc_find_byte(region, pos, b'<') else { 593 - break; 594 - }; 595 - pos = lt + 1; 596 - if pos >= region.len() { 597 - break; 598 - } 599 - 600 - if region[pos] == b'!' || region[pos] == b'?' || region[pos] == b'/' { 601 - pos = toc_skip_to_gt(region, pos); 602 - continue; 603 - } 604 - 605 - let name_start = pos; 606 - while pos < region.len() && !is_toc_delim(region[pos]) { 607 - pos += 1; 608 - } 609 - let name = &region[name_start..pos]; 610 - 611 - if !name.eq_ignore_ascii_case(b"a") { 612 - pos = toc_skip_to_gt(region, pos); 613 - continue; 614 - } 615 - 616 - // found <a ...>: extract href attribute 617 - let gt = toc_find_byte(region, pos, b'>').unwrap_or(region.len()); 618 - let tag_bytes = &region[name_start..gt]; 619 - let href = xml::get_attr(tag_bytes, b"href"); 620 - pos = if gt < region.len() { gt + 1 } else { gt }; 621 - 622 - // read text until </a>, stripping nested tags 623 - let mut title_buf = [0u8; TOC_TITLE_CAP]; 624 - let mut title_len: usize = 0; 625 - while pos < region.len() { 626 - if region[pos] == b'<' { 627 - // check for </a> 628 - if pos + 1 < region.len() && region[pos + 1] == b'/' { 629 - let cs = pos + 2; 630 - let mut ce = cs; 631 - while ce < region.len() && !is_toc_delim(region[ce]) { 632 - ce += 1; 633 - } 634 - if region[cs..ce].eq_ignore_ascii_case(b"a") { 635 - pos = toc_skip_to_gt(region, ce); 636 - break; 637 - } 638 - } 639 - // skip nested tag 640 - pos = 
toc_skip_to_gt(region, pos + 1); 641 - continue; 642 - } 643 - // accumulate text, collapse whitespace 644 - if title_len < TOC_TITLE_CAP { 645 - let b = region[pos]; 646 - if is_toc_ws(b) { 647 - if title_len > 0 && title_buf[title_len - 1] != b' ' { 648 - title_buf[title_len] = b' '; 649 - title_len += 1; 650 - } 651 - } else { 652 - title_buf[title_len] = b; 653 - title_len += 1; 654 - } 655 - } 656 - pos += 1; 657 - } 658 - 659 - // trim trailing whitespace 660 - while title_len > 0 && title_buf[title_len - 1] == b' ' { 661 - title_len -= 1; 662 - } 663 - 664 - if let Some(href) = href { 665 - let sidx = href_to_spine_idx(href, nav_dir, spine, zip); 666 - toc.push(&title_buf[..title_len], sidx); 667 - } 668 - } 669 - 670 - let unresolved = (0..toc.len()) 671 - .filter(|&i| toc.entries[i].spine_idx == 0xFFFF) 672 - .count(); 673 - if unresolved > 0 { 674 - log::warn!( 675 - "epub: nav TOC: {} of {} entries unresolved", 676 - unresolved, 677 - toc.len() 678 - ); 679 - } 680 - } 681 - 682 - // find the byte range of <nav epub:type="toc"> content; returns (start, end) 683 - fn find_nav_toc_region(data: &[u8]) -> Option<(usize, usize)> { 684 - let mut pos: usize = 0; 685 - while pos < data.len() { 686 - let Some(lt) = toc_find_byte(data, pos, b'<') else { 687 - break; 688 - }; 689 - pos = lt + 1; 690 - if pos >= data.len() { 691 - break; 692 - } 693 - if data[pos] == b'!' || data[pos] == b'?' 
|| data[pos] == b'/' { 694 - pos = toc_skip_to_gt(data, pos); 695 - continue; 696 - } 697 - 698 - let name_start = pos; 699 - while pos < data.len() && !is_toc_delim(data[pos]) { 700 - pos += 1; 701 - } 702 - let name = &data[name_start..pos]; 703 - 704 - if !name.eq_ignore_ascii_case(b"nav") { 705 - pos = toc_skip_to_gt(data, pos); 706 - continue; 707 - } 708 - 709 - // check for epub:type="toc" or type="toc" 710 - let gt = toc_find_byte(data, pos, b'>').unwrap_or(data.len()); 711 - let tag_bytes = &data[name_start..gt]; 712 - 713 - // epub:type may be space-separated tokens e.g. "toc landmarks" 714 - let is_toc = if let Some(t) = xml::get_attr(tag_bytes, b"epub:type") { 715 - t == b"toc" || t.split(|&b| b == b' ').any(|w| w == b"toc") 716 - } else { 717 - xml::get_attr(tag_bytes, b"type") 718 - .map(|t| t == b"toc" || t.split(|&b| b == b' ').any(|w| w == b"toc")) 719 - .unwrap_or(false) 720 - }; 721 - 722 - if !is_toc { 723 - pos = if gt < data.len() { gt + 1 } else { gt }; 724 - continue; 725 - } 726 - 727 - let content_start = if gt < data.len() { gt + 1 } else { gt }; 728 - 729 - // find closing </nav> 730 - let mut search = content_start; 731 - while search < data.len() { 732 - if data[search] == b'<' && search + 2 < data.len() && data[search + 1] == b'/' { 733 - let ts = search + 2; 734 - let mut te = ts; 735 - while te < data.len() && !is_toc_delim(data[te]) { 736 - te += 1; 737 - } 738 - if data[ts..te].eq_ignore_ascii_case(b"nav") { 739 - return Some((content_start, search)); 740 - } 741 - } 742 - search += 1; 743 - } 744 - // no closing tag; use rest of document 745 - return Some((content_start, data.len())); 746 - } 747 - None 748 - } 749 - 750 - // resolve TOC href to spine index; strip fragment, percent-decode, resolve relative path. 751 - // returns 0xFFFF if unresolvable. 
752 - fn href_to_spine_idx(href: &[u8], base_dir: &str, spine: &EpubSpine, zip: &ZipIndex) -> u16 { 753 - let decoded = percent_decode(href); 754 - let href_str = core::str::from_utf8(&decoded).unwrap_or(""); 755 - // strip fragment 756 - let href_no_frag = href_str.split('#').next().unwrap_or(href_str); 757 - if href_no_frag.is_empty() { 758 - return 0xFFFF; 759 - } 760 - 761 - let mut path_buf = [0u8; 512]; 762 - let full_len = resolve_path(base_dir, href_no_frag, &mut path_buf); 763 - let full_path = core::str::from_utf8(&path_buf[..full_len]).unwrap_or(""); 764 - 765 - // 1. exact match, then case-insensitive 766 - let zip_idx = zip 767 - .find(full_path) 768 - .or_else(|| zip.find_icase(full_path)) 769 - .or_else(|| { 770 - // 2. filename-only match (handles differing base dirs, leading ./, etc.) 771 - let filename = href_no_frag.rsplit('/').next().unwrap_or(href_no_frag); 772 - if filename.is_empty() { 773 - return None; 774 - } 775 - let fname = filename.as_bytes(); 776 - for i in 0..zip.count() { 777 - let entry_name = zip.entry_name(i).as_bytes(); 778 - let entry_fname = entry_name 779 - .rsplit(|&b| b == b'/') 780 - .next() 781 - .unwrap_or(entry_name); 782 - if entry_fname.eq_ignore_ascii_case(fname) { 783 - return Some(i); 784 - } 785 - } 786 - None 787 - }); 788 - 789 - let Some(zip_idx) = zip_idx else { 790 - return 0xFFFF; 791 - }; 792 - 793 - // 3. map zip entry index to spine position 794 - for i in 0..spine.len() { 795 - if spine.items[i] as usize == zip_idx { 796 - return i as u16; 797 - } 798 - } 799 - 800 - // 4. 
filename fallback against spine entry names 801 - let target_fname = zip 802 - .entry_name(zip_idx) 803 - .as_bytes() 804 - .rsplit(|&b| b == b'/') 805 - .next() 806 - .unwrap_or(b""); 807 - if !target_fname.is_empty() { 808 - for i in 0..spine.len() { 809 - let se = spine.items[i] as usize; 810 - let se_name = zip.entry_name(se).as_bytes(); 811 - let se_fname = se_name.rsplit(|&b| b == b'/').next().unwrap_or(se_name); 812 - if se_fname.eq_ignore_ascii_case(target_fname) { 813 - return i as u16; 814 - } 815 - } 816 - } 817 - 818 - 0xFFFF 819 - } 820 - 821 - // TOC scanning helpers (private) 822 - 823 - fn toc_find_byte(data: &[u8], start: usize, needle: u8) -> Option<usize> { 824 - data[start..] 825 - .iter() 826 - .position(|&b| b == needle) 827 - .map(|i| start + i) 828 - } 829 - 830 - fn toc_skip_to_gt(data: &[u8], mut pos: usize) -> usize { 831 - while pos < data.len() { 832 - if data[pos] == b'>' { 833 - return pos + 1; 834 - } 835 - pos += 1; 836 - } 837 - data.len() 838 - } 839 - 840 - #[inline] 841 - fn is_toc_delim(b: u8) -> bool { 842 - matches!(b, b' ' | b'\t' | b'\n' | b'\r' | b'>' | b'/') 843 - } 844 - 845 - #[inline] 846 - fn is_toc_ws(b: u8) -> bool { 847 - matches!(b, b' ' | b'\t' | b'\n' | b'\r') 848 - } 849 - 850 - fn toc_trim_ws(data: &[u8]) -> &[u8] { 851 - let start = data 852 - .iter() 853 - .position(|b| !is_toc_ws(*b)) 854 - .unwrap_or(data.len()); 855 - let end = data 856 - .iter() 857 - .rposition(|b| !is_toc_ws(*b)) 858 - .map(|p| p + 1) 859 - .unwrap_or(start); 860 - if start >= end { &[] } else { &data[start..end] } 861 - } 862 - 863 - // ── path helpers ──────────────────────────────────────────────────── 864 - 865 - /// Resolve a relative `href` against `base_dir`, writing the result 866 - /// into `out`. Returns the number of bytes written. 867 - /// 868 - /// Handles `../` segments, leading `./`, and absolute paths. 
pub fn resolve_path(base_dir: &str, href: &str, out: &mut [u8; 512]) -> usize {
    // A fragment (`#...`) never participates in path resolution.
    let href = href.split('#').next().unwrap_or(href);

    // Absolute hrefs (and hrefs with no base) are used as-is, minus any
    // leading slashes, since ZIP entry names are stored without them.
    if href.starts_with('/') || base_dir.is_empty() {
        return append_truncated(out, 0, href.trim_start_matches('/').as_bytes());
    }

    let mut base = base_dir.as_bytes();
    let mut rel = href.as_bytes();

    // Each leading `../` pops one directory off the base.
    while let Some(rest) = rel.strip_prefix(b"../") {
        rel = rest;
        base = parent_dir(base);
    }

    // A bare trailing `..` (no slash) also pops one directory.
    if rel == b".." {
        rel = &[];
        base = parent_dir(base);
    }

    // A single leading `./` is a no-op segment.
    if let Some(rest) = rel.strip_prefix(b"./") {
        rel = rest;
    }

    if base.is_empty() {
        return append_truncated(out, 0, rel);
    }

    // base + '/' + rel, silently truncated to the capacity of `out`.
    let w = append_truncated(out, 0, base);
    let w = append_truncated(out, w, b"/");
    append_truncated(out, w, rel)
}

// Drop the last path segment of `dir`; yields an empty slice when `dir`
// contains no '/'.
fn parent_dir(dir: &[u8]) -> &[u8] {
    match dir.iter().rposition(|&b| b == b'/') {
        Some(slash) => &dir[..slash],
        None => &[],
    }
}

// Copy as much of `src` as fits into `out` starting at index `at`;
// returns the new write position (never exceeds `out.len()`).
fn append_truncated(out: &mut [u8], at: usize, src: &[u8]) -> usize {
    let room = out.len().saturating_sub(at);
    let n = src.len().min(room);
    out[at..at + n].copy_from_slice(&src[..n]);
    at + n
}

/// Decode `%XX` escapes in `input`.
///
/// A `%` that is not followed by two hex digits is copied through
/// unchanged. Input without any `%` is returned as a plain copy.
fn percent_decode(input: &[u8]) -> Vec<u8> {
    if !input.contains(&b'%') {
        return Vec::from(input);
    }

    let mut out = Vec::with_capacity(input.len());
    let mut rest = input;
    while let Some((&first, tail)) = rest.split_first() {
        if first == b'%' {
            if let [hi, lo, after @ ..] = tail {
                if let (Some(h), Some(l)) = (hex_nibble(*hi), hex_nibble(*lo)) {
                    out.push((h << 4) | l);
                    rest = after;
                    continue;
                }
            }
        }
        // ordinary byte, or a malformed escape: keep it literally
        out.push(first);
        rest = tail;
    }
    out
}

/// Value of a single ASCII hex digit (either case), or `None` for any
/// other byte.
fn hex_nibble(b: u8) -> Option<u8> {
    match b {
        b'0'..=b'9' => Some(b - b'0'),
        b'a'..=b'f' => Some(b - b'a' + 10),
        b'A'..=b'F' => Some(b - b'A' + 10),
        _ => None,
    }
}

/// Check if a filename looks like an EPUB (`.epub` or `.epu` for FAT 8.3 truncation).
///
/// The comparison is ASCII case-insensitive. The two suffixes are tested
/// independently, so a name such as `a..epu` (which has a `.` five bytes
/// from the end) is still recognised by its `.epu` suffix.
pub fn is_epub_filename(name: &str) -> bool {
    let b = name.as_bytes();
    has_suffix_icase(b, b".epub") || has_suffix_icase(b, b".epu")
}

// ASCII case-insensitive `ends_with` on raw bytes.
fn has_suffix_icase(name: &[u8], suffix: &[u8]) -> bool {
    name.len() >= suffix.len()
        && name[name.len() - suffix.len()..].eq_ignore_ascii_case(suffix)
}
-1343
smol-epub/src/html_strip.rs
··· 1 - //! Single-pass HTML to styled-text converter for EPUB XHTML. 2 - //! 3 - //! [`HtmlStripStream`]: streaming `feed`/`finish` interface; emits 2-byte 4 - //! `[MARKER, tag]` style codes inline with plain text. 5 - //! 6 - //! [`strip_html_inplace`]: in-place variant for `container.xml` / OPF / TOC. 7 - //! 8 - //! Marker encoding: `[0x01, tag]`. Inline: `B`/`b` `I`/`i`. 9 - //! Block: `H`/`h` `Q`/`q` `S` (hr). Image: `P` (path follows). 10 - 11 - use alloc::vec::Vec; 12 - 13 - /// Escape byte that introduces a 2-byte style marker in the output stream. 14 - pub const MARKER: u8 = 0x01; 15 - 16 - /// Style tag: bold **on** (`[MARKER, BOLD_ON]`). 17 - pub const BOLD_ON: u8 = b'B'; 18 - /// Style tag: bold **off** (`[MARKER, BOLD_OFF]`). 19 - pub const BOLD_OFF: u8 = b'b'; 20 - /// Style tag: italic **on** (`[MARKER, ITALIC_ON]`). 21 - pub const ITALIC_ON: u8 = b'I'; 22 - /// Style tag: italic **off** (`[MARKER, ITALIC_OFF]`). 23 - pub const ITALIC_OFF: u8 = b'i'; 24 - /// Style tag: heading **on** (`[MARKER, HEADING_ON]`). 25 - pub const HEADING_ON: u8 = b'H'; 26 - /// Style tag: heading **off** (`[MARKER, HEADING_OFF]`). 27 - pub const HEADING_OFF: u8 = b'h'; 28 - /// Style tag: block-quote **on** (`[MARKER, QUOTE_ON]`). 29 - pub const QUOTE_ON: u8 = b'Q'; 30 - /// Style tag: block-quote **off** (`[MARKER, QUOTE_OFF]`). 31 - pub const QUOTE_OFF: u8 = b'q'; 32 - 33 - /// Style tag: thematic break / horizontal rule (`[MARKER, BREAK]`). 34 - pub const BREAK: u8 = b'S'; 35 - /// Style tag: inline image reference (`[MARKER, IMG_REF, len, path…]`). 36 - pub const IMG_REF: u8 = b'P'; 37 - 38 - /// Returns `true` if `b` is the [`MARKER`] escape byte. 
39 - #[inline] 40 - pub const fn is_marker(b: u8) -> bool { 41 - b == MARKER 42 - } 43 - 44 - const TAG_BUF_CAP: usize = 16; 45 - const ENTITY_BUF_CAP: usize = 12; 46 - const BANG_BUF_CAP: usize = 8; 47 - const PENDING_CAP: usize = 16; 48 - const DEFERRED_CAP: usize = 8; 49 - const IMG_SRC_CAP: usize = 128; // 3-byte marker header + up to 125 path bytes 50 - 51 - // state machine phases 52 - #[derive(Clone, Copy, PartialEq)] 53 - #[repr(u8)] 54 - enum Phase { 55 - Text, 56 - Utf8Cont, 57 - AfterLt, 58 - TagName, 59 - TagBody, 60 - Entity, 61 - SkipContent, 62 - SkipLt, 63 - SkipCloseName, 64 - SkipToGt, 65 - BangProbe, 66 - Comment, 67 - Cdata, 68 - Pi, 69 - BangOther, 70 - ImgBody, 71 - ImgAttrName, 72 - ImgAttrGap, 73 - ImgValStart, 74 - ImgSrcVal, 75 - ImgSkipVal, 76 - } 77 - 78 - impl Default for HtmlStripStream { 79 - fn default() -> Self { 80 - Self::new() 81 - } 82 - } 83 - 84 - /// Stateful, streaming HTML-to-styled-text converter (~80 bytes of state). 85 - /// 86 - /// Feed chunks of EPUB XHTML via [`feed`](Self::feed), then call 87 - /// [`finish`](Self::finish) to flush any trailing state. The output is 88 - /// plain text interspersed with 2-byte `[MARKER, tag]` style codes. 89 - pub struct HtmlStripStream { 90 - phase: Phase, 91 - 92 - // tag name accumulation 93 - tag_buf: [u8; TAG_BUF_CAP], 94 - tag_len: u8, 95 - is_close_tag: bool, 96 - enter_skip: bool, // tag is skip-content; enter SkipContent on > 97 - 98 - // entity accumulation 99 - entity_buf: [u8; ENTITY_BUF_CAP], 100 - entity_len: u8, 101 - 102 - // skip content 103 - skip_target: Option<SkipTag>, 104 - skip_match: bool, // in SkipToGt: did close tag name match? 
105 - 106 - // bang construct probing 107 - bang_buf: [u8; BANG_BUF_CAP], 108 - bang_len: u8, 109 - 110 - // terminator matching (comment / CDATA / PI) 111 - match_pos: u8, 112 - 113 - // output state 114 - last_was_space: bool, 115 - trailing_nl: u8, // deferred newlines; flushed before next visible byte; capped at 2 116 - has_output: bool, // true once any visible char emitted; suppresses leading whitespace 117 - 118 - // UTF-8 multi-byte accumulator (used in Utf8Cont phase) 119 - utf8_acc: u32, 120 - utf8_remaining: u8, 121 - 122 - // deferred open-style markers; open-tag markers (bold on, heading on, etc.) 123 - // appear AFTER paragraph-break newlines and BEFORE text. 124 - // close-tag markers go to `pending` immediately (before paragraph newlines). 125 - deferred: [u8; DEFERRED_CAP], 126 - deferred_len: u8, 127 - 128 - // pending output: bytes queued by classify_tag or queue_text not yet 129 - // drained to the caller's output slice 130 - pending: [u8; PENDING_CAP], 131 - pend_w: u8, 132 - pend_r: u8, 133 - 134 - // image src capture (<img src="...">) 135 - img_src: [u8; IMG_SRC_CAP], 136 - img_w: u8, // write cursor (accumulation at [3..]) / drain length 137 - img_r: u8, // drain read cursor 138 - capture_img: bool, // true while inside <img> tag body 139 - img_is_src: bool, // current attribute name matched "src" 140 - img_quote: u8, // active quote char in attribute value 141 - } 142 - 143 - impl HtmlStripStream { 144 - /// Create a new stream in its initial state. 
145 - pub const fn new() -> Self { 146 - Self { 147 - phase: Phase::Text, 148 - tag_buf: [0u8; TAG_BUF_CAP], 149 - tag_len: 0, 150 - is_close_tag: false, 151 - enter_skip: false, 152 - entity_buf: [0u8; ENTITY_BUF_CAP], 153 - entity_len: 0, 154 - skip_target: None, 155 - skip_match: false, 156 - bang_buf: [0u8; BANG_BUF_CAP], 157 - bang_len: 0, 158 - match_pos: 0, 159 - last_was_space: true, 160 - trailing_nl: 0, 161 - has_output: false, 162 - utf8_acc: 0, 163 - utf8_remaining: 0, 164 - deferred: [0u8; DEFERRED_CAP], 165 - deferred_len: 0, 166 - pending: [0u8; PENDING_CAP], 167 - pend_w: 0, 168 - pend_r: 0, 169 - img_src: [0u8; IMG_SRC_CAP], 170 - img_w: 0, 171 - img_r: 0, 172 - capture_img: false, 173 - img_is_src: false, 174 - img_quote: 0, 175 - } 176 - } 177 - 178 - /// Process a chunk of HTML input. 179 - /// 180 - /// Returns `(consumed, written)`. If `consumed < input.len()`, call 181 - /// again with the remaining input (the output buffer was full). 182 - pub fn feed(&mut self, input: &[u8], output: &mut [u8]) -> (usize, usize) { 183 - let ilen = input.len(); 184 - let olen = output.len(); 185 - let mut ip: usize = 0; 186 - let mut op: usize = 0; 187 - 188 - loop { 189 - // step 1: drain pending bytes to output 190 - while self.pend_r < self.pend_w { 191 - if op >= olen { 192 - return (ip, op); 193 - } 194 - output[op] = self.pending[self.pend_r as usize]; 195 - op += 1; 196 - self.pend_r += 1; 197 - } 198 - self.pend_r = 0; 199 - self.pend_w = 0; 200 - 201 - // step 1.5: drain image reference 202 - if !self.capture_img && self.img_r < self.img_w { 203 - while self.img_r < self.img_w { 204 - if op >= olen { 205 - return (ip, op); 206 - } 207 - output[op] = self.img_src[self.img_r as usize]; 208 - op += 1; 209 - self.img_r += 1; 210 - } 211 - self.img_r = 0; 212 - self.img_w = 0; 213 - } 214 - 215 - // step 2: check for end of input 216 - if ip >= ilen { 217 - return (ip, op); 218 - } 219 - 220 - // step 3: process one input byte 221 - let b = input[ip]; 222 
- let mut advance = true; 223 - 224 - match self.phase { 225 - // normal text 226 - Phase::Text => { 227 - if b == MARKER { 228 - // literal 0x01 in source; drop silently (SOH never in real EPUBs) 229 - } else if b == b'<' { 230 - self.phase = Phase::AfterLt; 231 - } else if b == b'&' { 232 - self.entity_len = 0; 233 - self.phase = Phase::Entity; 234 - } else if is_html_ws(b) { 235 - self.queue_ws(); 236 - } else if b >= 0xC0 { 237 - // UTF-8 lead byte: start multi-byte accumulation 238 - if b < 0xE0 { 239 - self.utf8_acc = (b as u32) & 0x1F; 240 - self.utf8_remaining = 1; 241 - } else if b < 0xF0 { 242 - self.utf8_acc = (b as u32) & 0x0F; 243 - self.utf8_remaining = 2; 244 - } else { 245 - self.utf8_acc = (b as u32) & 0x07; 246 - self.utf8_remaining = 3; 247 - } 248 - self.phase = Phase::Utf8Cont; 249 - } else if b >= 0x80 { 250 - // stray continuation byte; skip silently 251 - } else { 252 - self.queue_text(b); 253 - } 254 - } 255 - 256 - // UTF-8 continuation bytes; accumulate then map to ASCII 257 - Phase::Utf8Cont => { 258 - if b & 0xC0 == 0x80 { 259 - self.utf8_acc = (self.utf8_acc << 6) | (b as u32 & 0x3F); 260 - self.utf8_remaining -= 1; 261 - if self.utf8_remaining == 0 { 262 - if let Some(ascii) = codepoint_to_byte(self.utf8_acc) { 263 - if is_html_ws(ascii) { 264 - self.queue_ws(); 265 - } else { 266 - self.queue_text(ascii); 267 - } 268 - } 269 - self.phase = Phase::Text; 270 - } 271 - } else { 272 - // broken sequence; emit replacement and reprocess byte 273 - self.queue_text(b'?'); 274 - self.phase = Phase::Text; 275 - advance = false; 276 - } 277 - } 278 - 279 - // after '<' 280 - Phase::AfterLt => match b { 281 - b'!' => { 282 - self.bang_len = 0; 283 - self.phase = Phase::BangProbe; 284 - } 285 - b'?' 
=> { 286 - self.match_pos = 0; 287 - self.phase = Phase::Pi; 288 - } 289 - b'/' => { 290 - self.is_close_tag = true; 291 - self.tag_len = 0; 292 - self.enter_skip = false; 293 - self.phase = Phase::TagName; 294 - } 295 - b'>' => { 296 - // empty <>; ignore 297 - self.phase = Phase::Text; 298 - } 299 - _ => { 300 - self.is_close_tag = false; 301 - self.tag_len = 0; 302 - self.enter_skip = false; 303 - self.phase = Phase::TagName; 304 - advance = false; // reprocess in TagName 305 - } 306 - }, 307 - 308 - // accumulating tag name 309 - Phase::TagName => { 310 - if is_tag_delim(b) { 311 - self.classify_tag(); 312 - 313 - if b == b'>' { 314 - if self.capture_img { 315 - self.finish_img_tag(); 316 - } else { 317 - self.finish_tag(); 318 - } 319 - } else if self.capture_img { 320 - self.phase = Phase::ImgBody; 321 - } else { 322 - self.phase = Phase::TagBody; 323 - } 324 - } else if (self.tag_len as usize) < TAG_BUF_CAP { 325 - self.tag_buf[self.tag_len as usize] = b.to_ascii_lowercase(); 326 - self.tag_len += 1; 327 - } 328 - // overflow: stop accumulating, keep scanning for delimiter 329 - } 330 - 331 - // past tag name; skip attributes to '>' 332 - Phase::TagBody => { 333 - if b == b'>' { 334 - self.finish_tag(); 335 - } 336 - } 337 - 338 - // <img> attribute parsing; capture src="..." 
339 - Phase::ImgBody => { 340 - if b == b'>' { 341 - self.finish_img_tag(); 342 - } else if b.is_ascii_alphabetic() || b == b'_' { 343 - self.tag_len = 0; 344 - self.tag_buf[0] = b.to_ascii_lowercase(); 345 - self.tag_len = 1; 346 - self.phase = Phase::ImgAttrName; 347 - } 348 - // whitespace, '/', etc: stay 349 - } 350 - 351 - Phase::ImgAttrName => { 352 - if b == b'=' { 353 - let name = &self.tag_buf[..self.tag_len as usize]; 354 - self.img_is_src = name == b"src"; 355 - self.phase = Phase::ImgValStart; 356 - } else if b == b'>' { 357 - self.finish_img_tag(); 358 - } else if is_html_ws(b) || b == b'/' { 359 - self.phase = Phase::ImgAttrGap; 360 - } else if (self.tag_len as usize) < TAG_BUF_CAP { 361 - self.tag_buf[self.tag_len as usize] = b.to_ascii_lowercase(); 362 - self.tag_len += 1; 363 - } 364 - } 365 - 366 - Phase::ImgAttrGap => { 367 - if b == b'=' { 368 - let name = &self.tag_buf[..self.tag_len as usize]; 369 - self.img_is_src = name == b"src"; 370 - self.phase = Phase::ImgValStart; 371 - } else if b == b'>' { 372 - self.finish_img_tag(); 373 - } else if b.is_ascii_alphabetic() || b == b'_' { 374 - self.tag_len = 0; 375 - self.tag_buf[0] = b.to_ascii_lowercase(); 376 - self.tag_len = 1; 377 - self.phase = Phase::ImgAttrName; 378 - } 379 - // whitespace, '/': stay 380 - } 381 - 382 - Phase::ImgValStart => { 383 - if b == b'"' || b == b'\'' { 384 - self.img_quote = b; 385 - self.phase = if self.img_is_src { 386 - Phase::ImgSrcVal 387 - } else { 388 - Phase::ImgSkipVal 389 - }; 390 - } else if b == b'>' { 391 - self.finish_img_tag(); 392 - } else if !is_html_ws(b) { 393 - // unquoted attribute value 394 - self.img_quote = 0; 395 - if self.img_is_src { 396 - let pos = self.img_w as usize; 397 - if pos < IMG_SRC_CAP { 398 - self.img_src[pos] = b; 399 - self.img_w += 1; 400 - } 401 - self.phase = Phase::ImgSrcVal; 402 - } else { 403 - self.phase = Phase::ImgSkipVal; 404 - } 405 - } 406 - } 407 - 408 - Phase::ImgSrcVal => { 409 - let done = if self.img_quote != 
0 { 410 - b == self.img_quote 411 - } else { 412 - is_html_ws(b) || b == b'>' || b == b'/' 413 - }; 414 - if done { 415 - self.phase = Phase::ImgBody; 416 - if self.img_quote == 0 && b == b'>' { 417 - self.finish_img_tag(); 418 - } 419 - } else { 420 - let pos = self.img_w as usize; 421 - if pos < IMG_SRC_CAP { 422 - self.img_src[pos] = b; 423 - self.img_w += 1; 424 - } 425 - } 426 - } 427 - 428 - Phase::ImgSkipVal => { 429 - let done = if self.img_quote != 0 { 430 - b == self.img_quote 431 - } else { 432 - is_html_ws(b) || b == b'>' || b == b'/' 433 - }; 434 - if done { 435 - self.phase = Phase::ImgBody; 436 - if self.img_quote == 0 && b == b'>' { 437 - self.finish_img_tag(); 438 - } 439 - } 440 - } 441 - 442 - // entity accumulation 443 - Phase::Entity => { 444 - if b == b';' { 445 - let name = &self.entity_buf[..self.entity_len as usize]; 446 - match resolve_entity(name) { 447 - Some(b'\n') => { 448 - self.trailing_nl = self.trailing_nl.saturating_add(1).min(2); 449 - self.last_was_space = true; 450 - } 451 - Some(c) if is_html_ws(c) => { 452 - self.queue_ws(); 453 - } 454 - Some(c) => { 455 - self.queue_text(c); 456 - } 457 - None => { 458 - // unrecognised entity; emit literal '&' 459 - self.queue_text(b'&'); 460 - } 461 - } 462 - self.phase = Phase::Text; 463 - } else if is_entity_char(b) && (self.entity_len as usize) < ENTITY_BUF_CAP { 464 - self.entity_buf[self.entity_len as usize] = b; 465 - self.entity_len += 1; 466 - } else { 467 - // invalid char or overflow; emit literal '&' 468 - self.queue_text(b'&'); 469 - self.phase = Phase::Text; 470 - advance = false; // reprocess this byte as text 471 - } 472 - } 473 - 474 - // skip content (script / style / head) 475 - Phase::SkipContent => { 476 - if b == b'<' { 477 - self.phase = Phase::SkipLt; 478 - } 479 - } 480 - 481 - Phase::SkipLt => { 482 - if b == b'/' { 483 - self.tag_len = 0; 484 - self.phase = Phase::SkipCloseName; 485 - } else { 486 - self.phase = Phase::SkipContent; 487 - } 488 - } 489 - 490 - 
Phase::SkipCloseName => { 491 - if is_tag_delim(b) || b == b'>' { 492 - let matched = if let Some(target) = self.skip_target { 493 - let tgt = target.name(); 494 - let name = &self.tag_buf[..self.tag_len as usize]; 495 - name.len() == tgt.len() 496 - && name.iter().zip(tgt.iter()).all(|(a, t)| *a == *t) 497 - } else { 498 - false 499 - }; 500 - 501 - if b == b'>' { 502 - if matched { 503 - self.skip_target = None; 504 - self.phase = Phase::Text; 505 - } else { 506 - self.phase = Phase::SkipContent; 507 - } 508 - } else { 509 - self.skip_match = matched; 510 - self.phase = Phase::SkipToGt; 511 - } 512 - } else if (self.tag_len as usize) < TAG_BUF_CAP { 513 - self.tag_buf[self.tag_len as usize] = b.to_ascii_lowercase(); 514 - self.tag_len += 1; 515 - } 516 - } 517 - 518 - Phase::SkipToGt => { 519 - if b == b'>' { 520 - if self.skip_match { 521 - self.skip_target = None; 522 - self.phase = Phase::Text; 523 - } else { 524 - self.phase = Phase::SkipContent; 525 - } 526 - } 527 - } 528 - 529 - // bang construct probing (after '<!') 530 - Phase::BangProbe => { 531 - if b == b'>' { 532 - self.phase = Phase::Text; 533 - } else { 534 - let pos = self.bang_len as usize; 535 - if pos < BANG_BUF_CAP { 536 - self.bang_buf[pos] = b; 537 - self.bang_len += 1; 538 - } 539 - let n = self.bang_len as usize; 540 - 541 - if n == 1 { 542 - match b { 543 - b'-' | b'[' => {} 544 - _ => self.phase = Phase::BangOther, 545 - } 546 - } else if self.bang_buf[0] == b'-' { 547 - if n == 2 && b == b'-' { 548 - self.match_pos = 0; 549 - self.phase = Phase::Comment; 550 - } else { 551 - self.phase = Phase::BangOther; 552 - } 553 - } else { 554 - // bang_buf[0] == '[': check against "[CDATA[" 555 - const CDATA: &[u8] = b"[CDATA["; 556 - if n <= CDATA.len() && b == CDATA[n - 1] { 557 - if n == CDATA.len() { 558 - self.match_pos = 0; 559 - self.phase = Phase::Cdata; 560 - } 561 - } else { 562 - self.phase = Phase::BangOther; 563 - } 564 - } 565 - } 566 - } 567 - 568 - // comment: scanning for '-->' 
569 - Phase::Comment => match self.match_pos { 570 - 0 => { 571 - if b == b'-' { 572 - self.match_pos = 1; 573 - } 574 - } 575 - 1 => { 576 - if b == b'-' { 577 - self.match_pos = 2; 578 - } else { 579 - self.match_pos = 0; 580 - } 581 - } 582 - _ => { 583 - if b == b'>' { 584 - self.phase = Phase::Text; 585 - } else if b != b'-' { 586 - self.match_pos = 0; 587 - } 588 - } 589 - }, 590 - 591 - // CDATA: scanning for ']]>' 592 - Phase::Cdata => match self.match_pos { 593 - 0 => { 594 - if b == b']' { 595 - self.match_pos = 1; 596 - } 597 - } 598 - 1 => { 599 - if b == b']' { 600 - self.match_pos = 2; 601 - } else { 602 - self.match_pos = 0; 603 - } 604 - } 605 - _ => { 606 - if b == b'>' { 607 - self.phase = Phase::Text; 608 - } else if b != b']' { 609 - self.match_pos = 0; 610 - } 611 - } 612 - }, 613 - 614 - // PI: scanning for '?>' 615 - Phase::Pi => match self.match_pos { 616 - 0 => { 617 - if b == b'?' { 618 - self.match_pos = 1; 619 - } 620 - } 621 - _ => { 622 - if b == b'>' { 623 - self.phase = Phase::Text; 624 - } else if b != b'?' { 625 - self.match_pos = 0; 626 - } 627 - } 628 - }, 629 - 630 - // other bang construct: scanning for '>' 631 - Phase::BangOther => { 632 - if b == b'>' { 633 - self.phase = Phase::Text; 634 - } 635 - } 636 - } 637 - 638 - if advance { 639 - ip += 1; 640 - } 641 - } 642 - } 643 - 644 - /// Flush any pending state and append a terminal newline if content 645 - /// was produced. Returns the number of bytes written to `output`. 
646 - pub fn finish(&mut self, output: &mut [u8]) -> usize { 647 - let mut op: usize = 0; 648 - 649 - // drain remaining pending bytes 650 - while self.pend_r < self.pend_w && op < output.len() { 651 - output[op] = self.pending[self.pend_r as usize]; 652 - op += 1; 653 - self.pend_r += 1; 654 - } 655 - self.pend_r = 0; 656 - self.pend_w = 0; 657 - 658 - // terminal newline 659 - if self.has_output && op < output.len() { 660 - output[op] = b'\n'; 661 - op += 1; 662 - } 663 - 664 - self.phase = Phase::Text; 665 - op 666 - } 667 - 668 - #[inline] 669 - fn push_pending(&mut self, byte: u8) { 670 - let w = self.pend_w as usize; 671 - if w < PENDING_CAP { 672 - self.pending[w] = byte; 673 - self.pend_w += 1; 674 - } 675 - } 676 - 677 - fn push_deferred_marker(&mut self, tag: u8) { 678 - let n = self.deferred_len as usize; 679 - if n + 2 <= DEFERRED_CAP { 680 - self.deferred[n] = MARKER; 681 - self.deferred[n + 1] = tag; 682 - self.deferred_len += 2; 683 - } 684 - } 685 - 686 - // queue visible text byte; flush deferred newlines and style markers first 687 - fn queue_text(&mut self, b: u8) { 688 - // deferred newlines 689 - if self.has_output && self.trailing_nl > 0 { 690 - let nl = self.trailing_nl; 691 - for _ in 0..nl { 692 - self.push_pending(b'\n'); 693 - } 694 - } 695 - self.trailing_nl = 0; 696 - 697 - // deferred open-style markers 698 - let dlen = self.deferred_len as usize; 699 - for i in 0..dlen { 700 - self.push_pending(self.deferred[i]); 701 - } 702 - self.deferred_len = 0; 703 - 704 - self.push_pending(b); 705 - self.last_was_space = false; 706 - self.has_output = true; 707 - } 708 - 709 - // handle whitespace byte; collapse runs to a single space 710 - fn queue_ws(&mut self) { 711 - if self.last_was_space || !self.has_output { 712 - return; 713 - } 714 - self.last_was_space = true; 715 - 716 - // pending newlines already act as word separators 717 - if self.trailing_nl > 0 { 718 - return; 719 - } 720 - 721 - self.push_pending(b' '); 722 - } 723 - 724 - // 
classify accumulated tag name; push close markers to pending, open to deferred 725 - fn classify_tag(&mut self) { 726 - // copy tag name to a local to avoid borrowing self.tag_buf 727 - // while mutating self through push_pending / push_deferred 728 - let mut tn = [0u8; TAG_BUF_CAP]; 729 - let tn_len = self.tag_len as usize; 730 - tn[..tn_len].copy_from_slice(&self.tag_buf[..tn_len]); 731 - let name = &tn[..tn_len]; 732 - let is_close = self.is_close_tag; 733 - 734 - // skip-content tags (script/style/head); open only 735 - if !is_close && let Some(sk) = SkipTag::from_name(name) { 736 - self.skip_target = Some(sk); 737 - self.enter_skip = true; 738 - } 739 - 740 - // close-tag markers go out immediately (before deferred newlines) 741 - if is_close && let Some(m) = close_style_tag(name) { 742 - self.push_pending(MARKER); 743 - self.push_pending(m); 744 - } 745 - 746 - // block elements set deferred paragraph breaks 747 - if is_block_element(name) { 748 - self.trailing_nl = self.trailing_nl.max(2); 749 - self.last_was_space = true; 750 - } 751 - 752 - // open-tag markers are deferred (after newlines, before text); 753 - // inline markers too: <p><b>text -> \n\n[B]text, not [B]\n\ntext 754 - if !is_close && let Some(m) = open_style_tag(name) { 755 - self.push_deferred_marker(m); 756 - } 757 - 758 - // <br>: line break 759 - if name == b"br" { 760 - self.trailing_nl = self.trailing_nl.saturating_add(1).min(2); 761 - self.last_was_space = true; 762 - } 763 - 764 - // <hr>: scene break marker (deferred) 765 - if name == b"hr" && !is_close { 766 - self.push_deferred_marker(BREAK); 767 - } 768 - 769 - // <img>: enter image-src capture mode 770 - if name == b"img" && !is_close { 771 - self.capture_img = true; 772 - self.img_w = 3; // reserve [0..3] for marker header 773 - self.img_is_src = false; 774 - } 775 - } 776 - 777 - // transition out of TagName/TagBody on '>' 778 - fn finish_tag(&mut self) { 779 - if self.enter_skip { 780 - self.enter_skip = false; 781 - self.phase 
= Phase::SkipContent; 782 - } else { 783 - self.phase = Phase::Text; 784 - } 785 - } 786 - 787 - // finish <img> tag; emit image-ref marker if src was captured 788 - fn finish_img_tag(&mut self) { 789 - let path_len = (self.img_w as usize).saturating_sub(3); 790 - self.capture_img = false; 791 - 792 - if path_len > 0 { 793 - // block break before image 794 - if self.has_output { 795 - self.trailing_nl = self.trailing_nl.max(2); 796 - } 797 - // emit deferred newlines 798 - if self.has_output && self.trailing_nl > 0 { 799 - let nl = self.trailing_nl; 800 - for _ in 0..nl { 801 - self.push_pending(b'\n'); 802 - } 803 - } 804 - self.trailing_nl = 0; 805 - // flush deferred open-style markers 806 - let dlen = self.deferred_len as usize; 807 - for i in 0..dlen { 808 - self.push_pending(self.deferred[i]); 809 - } 810 - self.deferred_len = 0; 811 - // fill marker header [MARKER, IMG_REF, path_len]; 812 - // path bytes already at img_src[3..3+path_len] 813 - self.img_src[0] = MARKER; 814 - self.img_src[1] = IMG_REF; 815 - self.img_src[2] = path_len as u8; 816 - self.img_r = 0; 817 - // block break after image 818 - self.trailing_nl = 2; 819 - self.last_was_space = true; 820 - self.has_output = true; 821 - } else { 822 - self.img_w = 0; 823 - } 824 - 825 - self.phase = Phase::Text; 826 - } 827 - } 828 - 829 - /// Strip HTML tags from a complete buffer **in place**, producing plain text 830 - /// without style markers. 831 - /// 832 - /// The write cursor never passes the read cursor, so no extra allocation 833 - /// is needed. 
834 - pub fn strip_html_inplace(buf: &mut Vec<u8>) { 835 - let len = buf.len(); 836 - if len == 0 { 837 - return; 838 - } 839 - 840 - let mut r: usize = 0; 841 - let mut w: usize = 0; 842 - let mut last_was_space = true; 843 - let mut trailing_nl: u8 = 1; 844 - let mut skip_until: Option<SkipTag> = None; 845 - 846 - while r < len { 847 - if let Some(skip) = skip_until { 848 - if let Some(end_pos) = find_close_tag(&buf[r..], skip.name()) { 849 - r += end_pos; 850 - skip_until = None; 851 - } else { 852 - break; 853 - } 854 - continue; 855 - } 856 - 857 - let b = buf[r]; 858 - 859 - if b == b'<' { 860 - r += 1; 861 - if r >= len { 862 - break; 863 - } 864 - 865 - if buf[r] == b'!' { 866 - r = skip_bang_construct(buf, r); 867 - continue; 868 - } 869 - if buf[r] == b'?' { 870 - r = skip_pi(buf, r); 871 - continue; 872 - } 873 - 874 - let is_close = buf[r] == b'/'; 875 - if is_close { 876 - r += 1; 877 - } 878 - 879 - let name_start = r; 880 - while r < len && !is_tag_delim(buf[r]) { 881 - r += 1; 882 - } 883 - let mut tn = [0u8; 16]; 884 - let tn_len = (r - name_start).min(16); 885 - for i in 0..tn_len { 886 - tn[i] = buf[name_start + i].to_ascii_lowercase(); 887 - } 888 - let tag = &tn[..tn_len]; 889 - 890 - if !is_close && let Some(sk) = SkipTag::from_name(tag) { 891 - skip_until = Some(sk); 892 - } 893 - 894 - if is_block_element(tag) { 895 - while trailing_nl < 2 { 896 - buf[w] = b'\n'; 897 - w += 1; 898 - trailing_nl += 1; 899 - } 900 - last_was_space = true; 901 - } 902 - 903 - if tag == b"br" { 904 - buf[w] = b'\n'; 905 - w += 1; 906 - trailing_nl = trailing_nl.saturating_add(1); 907 - last_was_space = true; 908 - } 909 - 910 - while r < len && buf[r] != b'>' { 911 - r += 1; 912 - } 913 - if r < len { 914 - r += 1; 915 - } 916 - continue; 917 - } 918 - 919 - if b == b'&' { 920 - let (decoded, advance) = decode_entity_inplace(buf, r); 921 - r += advance; 922 - 923 - match decoded { 924 - DecodedInplace::Byte(b'\n') => { 925 - buf[w] = b'\n'; 926 - w += 1; 927 - 
trailing_nl = trailing_nl.saturating_add(1); 928 - last_was_space = true; 929 - } 930 - DecodedInplace::Byte(c) if is_html_ws(c) => { 931 - if !last_was_space { 932 - buf[w] = b' '; 933 - w += 1; 934 - last_was_space = true; 935 - trailing_nl = 0; 936 - } 937 - } 938 - DecodedInplace::Byte(c) => { 939 - buf[w] = c; 940 - w += 1; 941 - last_was_space = false; 942 - trailing_nl = 0; 943 - } 944 - 945 - DecodedInplace::None => { 946 - buf[w] = b'&'; 947 - w += 1; 948 - last_was_space = false; 949 - trailing_nl = 0; 950 - } 951 - } 952 - continue; 953 - } 954 - 955 - if is_html_ws(b) { 956 - if !last_was_space { 957 - buf[w] = b' '; 958 - w += 1; 959 - last_was_space = true; 960 - trailing_nl = 0; 961 - } 962 - } else if b >= 0xC0 { 963 - // UTF-8 multi-byte sequence; decode and replace with ASCII approx 964 - let (cp, seq_len) = decode_utf8_char(buf, r, len); 965 - if let Some(ascii) = codepoint_to_byte(cp) { 966 - if is_html_ws(ascii) { 967 - if !last_was_space { 968 - buf[w] = b' '; 969 - w += 1; 970 - last_was_space = true; 971 - trailing_nl = 0; 972 - } 973 - } else { 974 - buf[w] = ascii; 975 - w += 1; 976 - last_was_space = false; 977 - trailing_nl = 0; 978 - } 979 - } 980 - r += seq_len; 981 - continue; 982 - } else if b >= 0x80 { 983 - // stray continuation byte; skip 984 - r += 1; 985 - continue; 986 - } else { 987 - buf[w] = b; 988 - w += 1; 989 - last_was_space = false; 990 - trailing_nl = 0; 991 - } 992 - 993 - r += 1; 994 - } 995 - 996 - while w > 0 && (buf[w - 1] == b' ' || buf[w - 1] == b'\n') { 997 - w -= 1; 998 - } 999 - if w > 0 { 1000 - buf[w] = b'\n'; 1001 - w += 1; 1002 - } 1003 - 1004 - buf.truncate(w); 1005 - } 1006 - 1007 - fn is_block_element(name: &[u8]) -> bool { 1008 - matches!( 1009 - name, 1010 - b"p" | b"div" 1011 - | b"h1" 1012 - | b"h2" 1013 - | b"h3" 1014 - | b"h4" 1015 - | b"h5" 1016 - | b"h6" 1017 - | b"li" 1018 - | b"ul" 1019 - | b"ol" 1020 - | b"dl" 1021 - | b"dt" 1022 - | b"dd" 1023 - | b"tr" 1024 - | b"blockquote" 1025 - | 
b"section" 1026 - | b"article" 1027 - | b"aside" 1028 - | b"figure" 1029 - | b"figcaption" 1030 - | b"header" 1031 - | b"footer" 1032 - | b"nav" 1033 - | b"pre" 1034 - | b"hr" 1035 - | b"table" 1036 - ) 1037 - } 1038 - 1039 - // marker byte for formatting tags; used by classify_tag 1040 - fn open_style_tag(tag: &[u8]) -> Option<u8> { 1041 - match tag { 1042 - b"b" | b"strong" => Some(BOLD_ON), 1043 - b"i" | b"em" | b"cite" => Some(ITALIC_ON), 1044 - b"h1" | b"h2" | b"h3" | b"h4" | b"h5" | b"h6" => Some(HEADING_ON), 1045 - b"blockquote" => Some(QUOTE_ON), 1046 - _ => None, 1047 - } 1048 - } 1049 - 1050 - fn close_style_tag(tag: &[u8]) -> Option<u8> { 1051 - match tag { 1052 - b"b" | b"strong" => Some(BOLD_OFF), 1053 - b"i" | b"em" | b"cite" => Some(ITALIC_OFF), 1054 - b"h1" | b"h2" | b"h3" | b"h4" | b"h5" | b"h6" => Some(HEADING_OFF), 1055 - b"blockquote" => Some(QUOTE_OFF), 1056 - _ => None, 1057 - } 1058 - } 1059 - 1060 - #[derive(Clone, Copy)] 1061 - enum SkipTag { 1062 - Script, 1063 - Style, 1064 - Head, 1065 - } 1066 - 1067 - impl SkipTag { 1068 - fn from_name(name: &[u8]) -> Option<Self> { 1069 - match name { 1070 - b"script" => Some(Self::Script), 1071 - b"style" => Some(Self::Style), 1072 - b"head" => Some(Self::Head), 1073 - _ => None, 1074 - } 1075 - } 1076 - 1077 - fn name(&self) -> &'static [u8] { 1078 - match self { 1079 - Self::Script => b"script", 1080 - Self::Style => b"style", 1081 - Self::Head => b"head", 1082 - } 1083 - } 1084 - } 1085 - 1086 - fn find_close_tag(data: &[u8], name: &[u8]) -> Option<usize> { 1087 - let mut pos = 0; 1088 - while pos + 2 < data.len() { 1089 - if data[pos] == b'<' && data[pos + 1] == b'/' { 1090 - let tag_start = pos + 2; 1091 - let mut tag_pos = tag_start; 1092 - while tag_pos < data.len() && !is_tag_delim(data[tag_pos]) { 1093 - tag_pos += 1; 1094 - } 1095 - let tag_name = &data[tag_start..tag_pos]; 1096 - if tag_name.len() == name.len() 1097 - && tag_name 1098 - .iter() 1099 - .zip(name.iter()) 1100 - .all(|(a, b)| 
a.to_ascii_lowercase() == *b) 1101 - { 1102 - while tag_pos < data.len() && data[tag_pos] != b'>' { 1103 - tag_pos += 1; 1104 - } 1105 - return Some(tag_pos + 1); 1106 - } 1107 - } 1108 - pos += 1; 1109 - } 1110 - None 1111 - } 1112 - 1113 - // resolve entity name to output byte; None for unrecognised 1114 - fn resolve_entity(name: &[u8]) -> Option<u8> { 1115 - match name { 1116 - b"amp" => Some(b'&'), 1117 - b"lt" => Some(b'<'), 1118 - b"gt" => Some(b'>'), 1119 - b"quot" => Some(b'"'), 1120 - b"apos" => Some(b'\''), 1121 - b"nbsp" => Some(b' '), 1122 - b"mdash" | b"emdash" => Some(b'-'), 1123 - b"ndash" | b"endash" => Some(b'-'), 1124 - b"lsquo" | b"rsquo" | b"sbquo" => Some(b'\''), 1125 - b"ldquo" | b"rdquo" | b"bdquo" => Some(b'"'), 1126 - b"hellip" => Some(b'.'), 1127 - b"copy" => Some(b'c'), 1128 - b"reg" => Some(b'R'), 1129 - b"trade" => Some(b'T'), 1130 - b"times" => Some(b'x'), 1131 - b"divide" => Some(b'/'), 1132 - b"deg" => Some(b'*'), 1133 - b"plusmn" => Some(b'+'), 1134 - b"frac12" | b"frac14" | b"frac34" => Some(b'/'), 1135 - _ => { 1136 - if name.starts_with(b"#x") || name.starts_with(b"#X") { 1137 - codepoint_to_byte(parse_hex(&name[2..])) 1138 - } else if name.starts_with(b"#") { 1139 - codepoint_to_byte(parse_decimal(&name[1..])) 1140 - } else { 1141 - None 1142 - } 1143 - } 1144 - } 1145 - } 1146 - 1147 - fn codepoint_to_byte(cp: u32) -> Option<u8> { 1148 - match cp { 1149 - 0 => None, 1150 - 0x0001..=0x007F => Some(cp as u8), 1151 - 0x00A0 => Some(b' '), // nbsp 1152 - 0x00AB | 0x00BB => Some(b'"'), // « » 1153 - 0x00AD => Some(b'-'), // soft hyphen 1154 - 0x00B7 => Some(b'.'), // middle dot 1155 - 0x00D7 => Some(b'x'), // multiplication sign 1156 - 0x00F7 => Some(b'/'), // division sign 1157 - 0x2010..=0x2015 => Some(b'-'), // hyphens, dashes (figure dash, horiz bar) 1158 - 0x2018..=0x201B => Some(b'\''), // single quotes (left, right, low-9, reversed-9) 1159 - 0x201C..=0x201F => Some(b'"'), // double quotes (left, right, low-9, reversed-9) 1160 
- 0x2022 => Some(b'*'), // bullet 1161 - 0x2026 => Some(b'.'), // horizontal ellipsis 1162 - 0x2032 => Some(b'\''), // prime 1163 - 0x2033 => Some(b'"'), // double prime 1164 - 0x2039 | 0x203A => Some(b'\''), // single guillemets 1165 - 0x2212 => Some(b'-'), // minus sign 1166 - _ => Some(b'?'), // unmapped codepoint 1167 - } 1168 - } 1169 - 1170 - /// Decode one UTF-8 character starting at `buf[pos]` (which must be a lead byte >= 0xC0). 1171 - /// Returns `(codepoint, byte_length)`. On malformed input returns `(0xFFFD, 1)`. 1172 - fn decode_utf8_char(buf: &[u8], pos: usize, len: usize) -> (u32, usize) { 1173 - let b0 = buf[pos]; 1174 - let (mut cp, expected) = if b0 < 0xE0 { 1175 - ((b0 as u32) & 0x1F, 2) 1176 - } else if b0 < 0xF0 { 1177 - ((b0 as u32) & 0x0F, 3) 1178 - } else { 1179 - ((b0 as u32) & 0x07, 4) 1180 - }; 1181 - if pos + expected > len { 1182 - return (0xFFFD, len - pos); // truncated sequence; consume remaining 1183 - } 1184 - for i in 1..expected { 1185 - let cont = buf[pos + i]; 1186 - if cont & 0xC0 != 0x80 { 1187 - return (0xFFFD, i); // broken: stop before bad byte 1188 - } 1189 - cp = (cp << 6) | (cont as u32 & 0x3F); 1190 - } 1191 - (cp, expected) 1192 - } 1193 - 1194 - // in-place entity decoding; separate from resolve_entity 1195 - enum DecodedInplace { 1196 - Byte(u8), 1197 - None, 1198 - } 1199 - 1200 - fn decode_entity_inplace(input: &[u8], pos: usize) -> (DecodedInplace, usize) { 1201 - debug_assert!(input[pos] == b'&'); 1202 - 1203 - let remaining = &input[pos + 1..]; 1204 - let max_scan = remaining.len().min(12); 1205 - let semi = remaining[..max_scan].iter().position(|&b| b == b';'); 1206 - 1207 - let Some(semi) = semi else { 1208 - return (DecodedInplace::None, 1); 1209 - }; 1210 - 1211 - let entity = &remaining[..semi]; 1212 - let advance = 1 + semi + 1; 1213 - 1214 - let decoded = match entity { 1215 - b"amp" => DecodedInplace::Byte(b'&'), 1216 - b"lt" => DecodedInplace::Byte(b'<'), 1217 - b"gt" => DecodedInplace::Byte(b'>'), 
1218 - b"quot" => DecodedInplace::Byte(b'"'), 1219 - b"apos" => DecodedInplace::Byte(b'\''), 1220 - b"nbsp" => DecodedInplace::Byte(b' '), 1221 - b"mdash" | b"emdash" => DecodedInplace::Byte(b'-'), 1222 - b"ndash" | b"endash" => DecodedInplace::Byte(b'-'), 1223 - b"lsquo" | b"rsquo" | b"sbquo" => DecodedInplace::Byte(b'\''), 1224 - b"ldquo" | b"rdquo" | b"bdquo" => DecodedInplace::Byte(b'"'), 1225 - b"hellip" => DecodedInplace::Byte(b'.'), 1226 - b"copy" => DecodedInplace::Byte(b'c'), 1227 - b"reg" => DecodedInplace::Byte(b'R'), 1228 - b"trade" => DecodedInplace::Byte(b'T'), 1229 - b"times" => DecodedInplace::Byte(b'x'), 1230 - b"divide" => DecodedInplace::Byte(b'/'), 1231 - b"deg" => DecodedInplace::Byte(b'*'), 1232 - b"plusmn" => DecodedInplace::Byte(b'+'), 1233 - b"frac12" | b"frac14" | b"frac34" => DecodedInplace::Byte(b'/'), 1234 - _ => { 1235 - if entity.starts_with(b"#x") || entity.starts_with(b"#X") { 1236 - codepoint_to_decoded_inplace(parse_hex(&entity[2..])) 1237 - } else if entity.starts_with(b"#") { 1238 - codepoint_to_decoded_inplace(parse_decimal(&entity[1..])) 1239 - } else { 1240 - DecodedInplace::None 1241 - } 1242 - } 1243 - }; 1244 - 1245 - (decoded, advance) 1246 - } 1247 - 1248 - fn codepoint_to_decoded_inplace(cp: u32) -> DecodedInplace { 1249 - match codepoint_to_byte(cp) { 1250 - Some(b) => DecodedInplace::Byte(b), 1251 - None => DecodedInplace::None, 1252 - } 1253 - } 1254 - 1255 - fn parse_hex(bytes: &[u8]) -> u32 { 1256 - let mut val = 0u32; 1257 - for &b in bytes { 1258 - let nibble = match b { 1259 - b'0'..=b'9' => (b - b'0') as u32, 1260 - b'a'..=b'f' => (b - b'a' + 10) as u32, 1261 - b'A'..=b'F' => (b - b'A' + 10) as u32, 1262 - _ => return 0, 1263 - }; 1264 - val = val.wrapping_mul(16).wrapping_add(nibble); 1265 - } 1266 - val 1267 - } 1268 - 1269 - fn parse_decimal(bytes: &[u8]) -> u32 { 1270 - let mut val = 0u32; 1271 - for &b in bytes { 1272 - if b.is_ascii_digit() { 1273 - val = val.wrapping_mul(10).wrapping_add((b - b'0') as 
u32); 1274 - } else { 1275 - return 0; 1276 - } 1277 - } 1278 - val 1279 - } 1280 - 1281 - #[inline] 1282 - fn is_html_ws(b: u8) -> bool { 1283 - matches!(b, b' ' | b'\t' | b'\n' | b'\r' | 0x0C) 1284 - } 1285 - 1286 - #[inline] 1287 - fn is_tag_delim(b: u8) -> bool { 1288 - matches!(b, b' ' | b'\t' | b'\n' | b'\r' | b'>' | b'/') 1289 - } 1290 - 1291 - #[inline] 1292 - fn is_entity_char(b: u8) -> bool { 1293 - b.is_ascii_alphanumeric() || b == b'#' 1294 - } 1295 - 1296 - fn skip_to_gt(data: &[u8], mut pos: usize) -> usize { 1297 - while pos < data.len() { 1298 - if data[pos] == b'>' { 1299 - return pos + 1; 1300 - } 1301 - pos += 1; 1302 - } 1303 - data.len() 1304 - } 1305 - 1306 - fn skip_bang_construct(data: &[u8], pos: usize) -> usize { 1307 - let rest = &data[pos..]; 1308 - 1309 - if rest.starts_with(b"!--") { 1310 - let mut p = pos + 3; 1311 - while p + 2 < data.len() { 1312 - if data[p] == b'-' && data[p + 1] == b'-' && data[p + 2] == b'>' { 1313 - return p + 3; 1314 - } 1315 - p += 1; 1316 - } 1317 - return data.len(); 1318 - } 1319 - 1320 - if rest.starts_with(b"![CDATA[") { 1321 - let mut p = pos + 8; 1322 - while p + 2 < data.len() { 1323 - if data[p] == b']' && data[p + 1] == b']' && data[p + 2] == b'>' { 1324 - return p + 3; 1325 - } 1326 - p += 1; 1327 - } 1328 - return data.len(); 1329 - } 1330 - 1331 - skip_to_gt(data, pos) 1332 - } 1333 - 1334 - fn skip_pi(data: &[u8], pos: usize) -> usize { 1335 - let mut p = pos + 1; 1336 - while p + 1 < data.len() { 1337 - if data[p] == b'?' && data[p + 1] == b'>' { 1338 - return p + 2; 1339 - } 1340 - p += 1; 1341 - } 1342 - data.len() 1343 - }
-1368
smol-epub/src/jpeg.rs
//! Minimal baseline JPEG decoder producing 1-bit Floyd–Steinberg dithered bitmaps.
//!
//! Streams MCU-row-by-row via 4 KB chunked reads; peak RAM ≈ 30 KB.
//! Luminance (Y) channel only — chrominance is Huffman-decoded to
//! advance the bitstream, then discarded.
//!
//! Progressive JPEG (SOF2) is partially supported: first scan only
//! (DC + low-frequency AC).
//!
//! Output is packed 1-bit MSB-first, row-major — see [`DecodedImage`](crate::DecodedImage).

extern crate alloc;

use alloc::boxed::Box;
use alloc::vec;
use alloc::vec::Vec;

use crate::DecodedImage;

// JPEG marker bytes (the byte that follows 0xFF)

const M_SOF0: u8 = 0xC0;
const M_SOF2: u8 = 0xC2;
const M_DHT: u8 = 0xC4;
const M_SOI: u8 = 0xD8;
const M_EOI: u8 = 0xD9;
const M_SOS: u8 = 0xDA;
const M_DQT: u8 = 0xDB;
const M_DRI: u8 = 0xDD;
const M_RST0: u8 = 0xD0;
const M_RST7: u8 = 0xD7;

// limits

// maximum number of frame components accepted by the SOF parser
const MAX_COMP: usize = 4;
// images with more source pixels than this are rejected before decoding
const MAX_PIXELS: u32 = 2048 * 2048;

// header bytes to read for marker parsing; large APP/EXIF segments skipped by length
const HEADER_READ: usize = 32768;

// chunk size for streaming reads during MCU decode
const CHUNK_SIZE: usize = 4096;

// DEFLATE sliding-window size for streaming ZIP decompression
// (positions are wrapped with `& (DEFLATE_WINDOW - 1)`, so this must stay a power of two)
const DEFLATE_WINDOW: usize = 32768;

// zig-zag scan order

#[rustfmt::skip]
const ZZ: [usize; 64] = [
     0,  1,  8, 16,  9,  2,  3, 10,
    17, 24, 32, 25, 18, 11,  4,  5,
    12, 19, 26, 33, 40, 48, 41, 34,
    27, 20, 13,  6,  7, 14, 21, 28,
    35, 42, 49, 56, 57, 50, 43, 36,
    29, 22, 15, 23, 30, 37, 44, 51,
    58, 59, 52, 45, 38, 31, 39, 46,
    53, 60, 61, 54, 47, 55, 62, 63,
];

// IDCT constants (IJG ISLOW, CONST_BITS = 13)
// NOTE(review): the F* values look like round(x * 2^13) for the constant
// named in the identifier (e.g. F0298 ≈ 0.298631336 * 8192), and P1 looks
// like IJG's PASS1_BITS — confirm against the idct implementation.

const CB: i32 = 13;
const P1: i32 = 2;
const F0298: i32 = 2446;
const F0390: i32 = 3196;
const F0541: i32 = 4433;
const F0765: i32 = 6270;
const F0899: i32 = 7373;
const F1175: i32 = 9633;
const F1501: i32 = 12299;
const F1847: i32 = 15137;
const F1961: i32 = 16069;
const F2053: i32 = 16819;
const F2562: i32 = 20995;
const F3072: i32 = 25172;

// types

// one frame component as described by SOF (id, sampling, quant table);
// dc_tbl / ac_tbl are not written by the SOF parser — presumably filled in
// from the SOS header
#[derive(Clone, Copy, Default)]
struct Component {
    id: u8,
    h_samp: u8,
    v_samp: u8,
    qt_idx: u8,
    dc_tbl: u8,
    ac_tbl: u8,
}

// Huffman decoding table; populated by the DHT parser (not shown here).
// NOTE(review): `lut` presumably maps the next 8 bits to (symbol, code
// length) and mincode/maxcode/valptr are per-code-length tables — confirm.
struct HuffTable {
    lut: [(u8, u8); 256],
    mincode: [i32; 17],
    maxcode: [i32; 17],
    valptr: [usize; 17],
    values: [u8; 256],
}

// everything gathered from the header markers up to and including SOS
struct JpegState {
    width: u16,
    height: u16,
    num_comp: u8,
    comp: [Component; MAX_COMP],
    // largest horizontal / vertical sampling factor over all components
    max_h: u8,
    max_v: u8,
    qt: [[u16; 64]; 4],
    qt_ok: [bool; 4],
    dc_huff: [HuffTable; 4],
    ac_huff: [HuffTable; 4],
    dc_ok: [bool; 4],
    ac_ok: [bool; 4],
    restart_interval: u16,
    // byte offset of entropy data (relative to start of JPEG data)
    scan_start: usize,
    scan_num_comp: u8,
    scan_order: [u8; MAX_COMP],
    progressive: bool,
    // first-scan spectral selection start (0 = DC)
    scan_ss: u8,
    // first-scan spectral selection end (0 = DC only, 63 = all AC)
    scan_se: u8,
    // first-scan successive approximation low bit (point transform)
    scan_al: u8,
}

impl JpegState {
    // Allocate a zero-initialised state directly on the heap (the struct is
    // never built on the stack), then set the non-zero defaults: sampling
    // maxima of 1 and `maxcode` of -1 in every Huffman table.
    // Returns an error instead of aborting when the allocation fails.
    fn heap_new() -> Result<Box<Self>, &'static str> {
        let layout = core::alloc::Layout::new::<Self>();
        // SAFETY: `layout` is the layout of `Self`, which has non-zero size.
        let ptr = unsafe { alloc::alloc::alloc_zeroed(layout) };
        if ptr.is_null() {
            return Err("jpeg: OOM for decoder state");
        }
        // SAFETY: `ptr` is non-null and was allocated with `Self`'s layout by
        // the global allocator; every field is an integer, bool, or array of
        // those, for which the all-zero bit pattern is a valid value.
        let mut st = unsafe { Box::from_raw(ptr as *mut Self) };
        st.max_h = 1;
        st.max_v = 1;
        for ht in st.dc_huff.iter_mut().chain(st.ac_huff.iter_mut()) {
            ht.maxcode.fill(-1);
        }
        Ok(st)
    }
}

// byte source trait +
implementations 143 - 144 - // raw byte source for the JPEG bitstream 145 - trait JpegRead { 146 - fn read_byte(&mut self) -> Result<u8, &'static str>; 147 - fn is_eof(&self) -> bool; 148 - } 149 - 150 - // reads from an in-memory slice 151 - struct SliceReader<'a> { 152 - data: &'a [u8], 153 - pos: usize, 154 - } 155 - 156 - impl<'a> SliceReader<'a> { 157 - fn new(data: &'a [u8], start: usize) -> Self { 158 - Self { data, pos: start } 159 - } 160 - } 161 - 162 - impl JpegRead for SliceReader<'_> { 163 - #[inline] 164 - fn read_byte(&mut self) -> Result<u8, &'static str> { 165 - if self.pos >= self.data.len() { 166 - return Err("jpeg: unexpected end of data"); 167 - } 168 - let b = self.data[self.pos]; 169 - self.pos += 1; 170 - Ok(b) 171 - } 172 - 173 - #[inline] 174 - fn is_eof(&self) -> bool { 175 - self.pos >= self.data.len() 176 - } 177 - } 178 - 179 - // reads from SD via closure, buffering 4KB chunks 180 - struct ChunkReader<F> { 181 - read_fn: F, 182 - offset: u32, // absolute offset of next byte to fetch 183 - end: u32, // end-of-data offset (exclusive) 184 - buf: [u8; CHUNK_SIZE], 185 - pos: usize, 186 - len: usize, 187 - } 188 - 189 - impl<F: FnMut(u32, &mut [u8]) -> Result<usize, &'static str>> ChunkReader<F> { 190 - fn new(read_fn: F, start: u32, end: u32) -> Self { 191 - Self { 192 - read_fn, 193 - offset: start, 194 - end, 195 - buf: [0u8; CHUNK_SIZE], 196 - pos: 0, 197 - len: 0, 198 - } 199 - } 200 - 201 - fn refill(&mut self) -> Result<(), &'static str> { 202 - if self.offset >= self.end { 203 - self.len = 0; 204 - return Ok(()); 205 - } 206 - let want = CHUNK_SIZE.min((self.end - self.offset) as usize); 207 - let n = (self.read_fn)(self.offset, &mut self.buf[..want])?; 208 - if n == 0 { 209 - self.len = 0; 210 - return Ok(()); 211 - } 212 - self.offset += n as u32; 213 - self.pos = 0; 214 - self.len = n; 215 - Ok(()) 216 - } 217 - } 218 - 219 - impl<F: FnMut(u32, &mut [u8]) -> Result<usize, &'static str>> JpegRead for ChunkReader<F> { 220 - fn 
read_byte(&mut self) -> Result<u8, &'static str> { 221 - if self.pos >= self.len { 222 - self.refill()?; 223 - if self.len == 0 { 224 - return Err("jpeg: unexpected end of data"); 225 - } 226 - } 227 - let b = self.buf[self.pos]; 228 - self.pos += 1; 229 - Ok(b) 230 - } 231 - 232 - fn is_eof(&self) -> bool { 233 - self.pos >= self.len && self.offset >= self.end 234 - } 235 - } 236 - 237 - // streaming DEFLATE-from-SD reader; 4KB chunks in, one decompressed byte at a time out. 238 - // peak heap: ~47KB (11KB decompressor + 32KB window + 4KB read buf). 239 - struct DeflateReader<F> { 240 - read_fn: F, 241 - file_pos: u32, // absolute offset of next compressed byte 242 - comp_left: usize, // compressed bytes remaining in ZIP entry 243 - rbuf: Vec<u8>, // compressed-data read buffer 244 - in_avail: usize, // valid bytes in rbuf 245 - decomp: Box<miniz_oxide::inflate::core::DecompressorOxide>, // ~11KB 246 - window: Vec<u8>, // 32KB circular dictionary 247 - dict_pos: usize, // write position in window (cumulative, mod DEFLATE_WINDOW) 248 - read_pos: usize, // next byte to yield from window 249 - avail: usize, // decompressed bytes available (dict_pos - read_pos) 250 - done: bool, // true once miniz reports Done 251 - } 252 - 253 - impl<F: FnMut(u32, &mut [u8]) -> Result<usize, &'static str>> DeflateReader<F> { 254 - fn new(read_fn: F, data_offset: u32, comp_size: u32) -> Result<Self, &'static str> { 255 - use miniz_oxide::inflate::core::DecompressorOxide; 256 - 257 - let decomp_ptr = 258 - unsafe { alloc::alloc::alloc_zeroed(core::alloc::Layout::new::<DecompressorOxide>()) }; 259 - if decomp_ptr.is_null() { 260 - return Err("jpeg: OOM for DEFLATE decompressor"); 261 - } 262 - let decomp = unsafe { Box::from_raw(decomp_ptr as *mut DecompressorOxide) }; 263 - 264 - let mut window = Vec::new(); 265 - window 266 - .try_reserve_exact(DEFLATE_WINDOW) 267 - .map_err(|_| "jpeg: OOM for DEFLATE window")?; 268 - window.resize(DEFLATE_WINDOW, 0); 269 - 270 - let mut rbuf = 
Vec::new(); 271 - rbuf.try_reserve_exact(CHUNK_SIZE) 272 - .map_err(|_| "jpeg: OOM for DEFLATE read buffer")?; 273 - rbuf.resize(CHUNK_SIZE, 0); 274 - 275 - Ok(Self { 276 - read_fn, 277 - file_pos: data_offset, 278 - comp_left: comp_size as usize, 279 - rbuf, 280 - in_avail: 0, 281 - decomp, 282 - window, 283 - dict_pos: 0, 284 - read_pos: 0, 285 - avail: 0, 286 - done: false, 287 - }) 288 - } 289 - 290 - // decompress more data into the circular window 291 - fn pump(&mut self) -> Result<(), &'static str> { 292 - use miniz_oxide::inflate::TINFLStatus; 293 - use miniz_oxide::inflate::core::{decompress, inflate_flags}; 294 - 295 - if self.done { 296 - return Ok(()); 297 - } 298 - 299 - // top up read buffer from SD 300 - if self.in_avail < CHUNK_SIZE && self.comp_left > 0 { 301 - let space = CHUNK_SIZE - self.in_avail; 302 - let want = space.min(self.comp_left); 303 - match (self.read_fn)( 304 - self.file_pos, 305 - &mut self.rbuf[self.in_avail..self.in_avail + want], 306 - ) { 307 - Ok(n) if n > 0 => { 308 - self.file_pos += n as u32; 309 - self.comp_left -= n; 310 - self.in_avail += n; 311 - } 312 - Ok(_) => { 313 - self.comp_left = 0; 314 - } 315 - Err(e) => return Err(e), 316 - } 317 - } 318 - 319 - let flags = if self.comp_left > 0 { 320 - inflate_flags::TINFL_FLAG_HAS_MORE_INPUT 321 - } else { 322 - 0 323 - }; 324 - 325 - let write_pos = self.dict_pos & (DEFLATE_WINDOW - 1); 326 - let (status, consumed, produced) = decompress( 327 - &mut *self.decomp, 328 - &self.rbuf[..self.in_avail], 329 - &mut self.window, 330 - write_pos, 331 - flags, 332 - ); 333 - 334 - if consumed > 0 && consumed < self.in_avail { 335 - self.rbuf.copy_within(consumed..self.in_avail, 0); 336 - } 337 - self.in_avail -= consumed; 338 - 339 - self.dict_pos += produced; 340 - self.avail += produced; 341 - 342 - match status { 343 - TINFLStatus::Done => { 344 - self.done = true; 345 - } 346 - TINFLStatus::HasMoreOutput | TINFLStatus::NeedsMoreInput => {} 347 - _ => return Err("jpeg: DEFLATE 
decompression error"), 348 - } 349 - 350 - Ok(()) 351 - } 352 - 353 - // read up to buf.len() decompressed bytes into buf; return count read 354 - fn read_bytes(&mut self, buf: &mut [u8]) -> Result<usize, &'static str> { 355 - let mut total = 0usize; 356 - while total < buf.len() { 357 - if self.avail == 0 { 358 - if self.done { 359 - break; 360 - } 361 - self.pump()?; 362 - if self.avail == 0 { 363 - break; 364 - } 365 - } 366 - let rp = self.read_pos & (DEFLATE_WINDOW - 1); 367 - let contiguous = (DEFLATE_WINDOW - rp).min(self.avail); 368 - let n = contiguous.min(buf.len() - total); 369 - buf[total..total + n].copy_from_slice(&self.window[rp..rp + n]); 370 - self.read_pos += n; 371 - self.avail -= n; 372 - total += n; 373 - } 374 - Ok(total) 375 - } 376 - } 377 - 378 - impl<F: FnMut(u32, &mut [u8]) -> Result<usize, &'static str>> JpegRead for DeflateReader<F> { 379 - fn read_byte(&mut self) -> Result<u8, &'static str> { 380 - if self.avail == 0 { 381 - if self.done { 382 - return Err("jpeg: unexpected end of DEFLATE stream"); 383 - } 384 - self.pump()?; 385 - if self.avail == 0 { 386 - return Err("jpeg: unexpected end of DEFLATE stream"); 387 - } 388 - } 389 - let rp = self.read_pos & (DEFLATE_WINDOW - 1); 390 - let b = self.window[rp]; 391 - self.read_pos += 1; 392 - self.avail -= 1; 393 - Ok(b) 394 - } 395 - 396 - fn is_eof(&self) -> bool { 397 - self.avail == 0 && self.done 398 - } 399 - } 400 - 401 - // BitReader: generic over byte source 402 - 403 - struct BitReader<R> { 404 - source: R, 405 - buf: u32, 406 - avail: u8, 407 - marker: u8, // stashed marker byte (non-zero = encountered during next_byte) 408 - } 409 - 410 - impl<R: JpegRead> BitReader<R> { 411 - fn new(source: R) -> Self { 412 - Self { 413 - source, 414 - buf: 0, 415 - avail: 0, 416 - marker: 0, 417 - } 418 - } 419 - 420 - // fetch next entropy-coded byte, handling JPEG byte stuffing 421 - fn next_byte(&mut self) -> Result<u8, &'static str> { 422 - if self.marker != 0 { 423 - return Ok(0); 424 
- } 425 - let b = self.source.read_byte()?; 426 - if b != 0xFF { 427 - return Ok(b); 428 - } 429 - loop { 430 - if self.source.is_eof() { 431 - return Ok(0); 432 - } 433 - let next = self.source.read_byte()?; 434 - match next { 435 - 0x00 => return Ok(0xFF), 436 - 0xFF => continue, 437 - _ => { 438 - self.marker = next; 439 - return Ok(0); 440 - } 441 - } 442 - } 443 - } 444 - 445 - fn ensure(&mut self, n: u8) -> Result<(), &'static str> { 446 - while self.avail < n { 447 - let b = self.next_byte()?; 448 - self.buf |= (b as u32) << (24 - self.avail); 449 - self.avail += 8; 450 - } 451 - Ok(()) 452 - } 453 - 454 - #[inline] 455 - fn peek(&mut self, n: u8) -> Result<u32, &'static str> { 456 - self.ensure(n)?; 457 - Ok(self.buf >> (32 - n as u32)) 458 - } 459 - 460 - #[inline] 461 - fn drop_bits(&mut self, n: u8) { 462 - self.buf <<= n as u32; 463 - self.avail -= n; 464 - } 465 - 466 - #[inline] 467 - fn read_bits(&mut self, n: u8) -> Result<u32, &'static str> { 468 - if n == 0 { 469 - return Ok(0); 470 - } 471 - self.ensure(n)?; 472 - let val = self.buf >> (32 - n as u32); 473 - self.buf <<= n as u32; 474 - self.avail -= n; 475 - Ok(val) 476 - } 477 - 478 - // discard remaining bits, advance past the next restart marker 479 - fn consume_restart(&mut self) -> Result<(), &'static str> { 480 - self.buf = 0; 481 - self.avail = 0; 482 - 483 - // if next_byte already stashed a marker, check it now 484 - if self.marker != 0 { 485 - let m = self.marker; 486 - self.marker = 0; 487 - if m >= M_RST0 && m <= M_RST7 { 488 - return Ok(()); 489 - } 490 - // non-RST marker; keep going 491 - return Ok(()); 492 - } 493 - 494 - // scan forward for the restart marker 495 - loop { 496 - if self.source.is_eof() { 497 - return Ok(()); 498 - } 499 - let b = self.source.read_byte()?; 500 - if b != 0xFF { 501 - continue; 502 - } 503 - loop { 504 - if self.source.is_eof() { 505 - return Ok(()); 506 - } 507 - let m = self.source.read_byte()?; 508 - match m { 509 - 0xFF => continue, 510 - 0x00 
=> break, 511 - M_RST0..=M_RST7 => return Ok(()), 512 - _ => return Ok(()), 513 - } 514 - } 515 - } 516 - } 517 - } 518 - 519 - // public API 520 - 521 - // decode a baseline JPEG from an in-memory buffer 522 - /// Decode a JPEG from an in-memory buffer to a 1-bit dithered bitmap. 523 - /// 524 - /// The image is integer-downscaled so the result fits within 525 - /// `max_w` × `max_h` pixels. 526 - pub fn decode_jpeg_fit(data: &[u8], max_w: u16, max_h: u16) -> Result<DecodedImage, &'static str> { 527 - let st = parse_markers(data)?; 528 - 529 - validate_tables(&st)?; 530 - 531 - let reader = SliceReader::new(data, st.scan_start); 532 - decode_baseline(&st, BitReader::new(reader), max_w, max_h) 533 - } 534 - 535 - /// Decode a JPEG from a **stored** (uncompressed) ZIP entry by streaming 536 - /// 4 KB chunks through `read_fn`. 537 - /// 538 - /// `read_fn(offset, buf)` reads bytes at the given absolute offset and 539 - /// returns the number of bytes actually read. Progressive JPEGs are 540 - /// decoded using the first scan only. 
541 - pub fn decode_jpeg_streaming<F>( 542 - mut read_fn: F, 543 - data_offset: u32, 544 - data_size: u32, 545 - max_w: u16, 546 - max_h: u16, 547 - ) -> Result<DecodedImage, &'static str> 548 - where 549 - F: FnMut(u32, &mut [u8]) -> Result<usize, &'static str>, 550 - { 551 - // read the first portion of the JPEG for marker parsing 552 - let hdr_size = HEADER_READ.min(data_size as usize); 553 - let mut hdr = Vec::new(); 554 - hdr.try_reserve_exact(hdr_size) 555 - .map_err(|_| "jpeg: OOM for header")?; 556 - hdr.resize(hdr_size, 0); 557 - let n = read_fn(data_offset, &mut hdr)?; 558 - hdr.truncate(n); 559 - 560 - let st = parse_markers(&hdr)?; 561 - 562 - validate_tables(&st)?; 563 - 564 - // free header; marker data is now in JpegState 565 - drop(hdr); 566 - 567 - let scan_abs = data_offset + st.scan_start as u32; 568 - let end_abs = data_offset + data_size; 569 - let reader = ChunkReader::new(read_fn, scan_abs, end_abs); 570 - 571 - decode_baseline(&st, BitReader::new(reader), max_w, max_h) 572 - } 573 - 574 - /// Backward-compatible alias for [`decode_jpeg_streaming`]. 575 - pub fn decode_jpeg_sd<F>( 576 - read_fn: F, 577 - data_offset: u32, 578 - data_size: u32, 579 - max_w: u16, 580 - max_h: u16, 581 - ) -> Result<DecodedImage, &'static str> 582 - where 583 - F: FnMut(u32, &mut [u8]) -> Result<usize, &'static str>, 584 - { 585 - decode_jpeg_streaming(read_fn, data_offset, data_size, max_w, max_h) 586 - } 587 - 588 - /// Decode a JPEG from a **DEFLATE-compressed** ZIP entry by streaming 589 - /// reads through `read_fn`. 590 - /// 591 - /// Both ZIP decompression and MCU decode are streamed concurrently, 592 - /// so the full entry is never held in memory. Peak heap ≈ 79 KB. 
pub fn decode_jpeg_deflate_streaming<F>(
    read_fn: F,
    data_offset: u32,
    comp_size: u32,
    uncomp_size: u32,
    max_w: u16,
    max_h: u16,
) -> Result<DecodedImage, &'static str>
where
    F: FnMut(u32, &mut [u8]) -> Result<usize, &'static str>,
{
    // `data_offset` / `comp_size` describe the compressed bytes handed to
    // `read_fn`; `uncomp_size` only bounds how much header is inflated.
    let mut deflate = DeflateReader::new(read_fn, data_offset, comp_size)?;

    // decompress enough for marker parsing
    let hdr_size = HEADER_READ.min(uncomp_size as usize);
    let mut hdr = Vec::new();
    hdr.try_reserve_exact(hdr_size)
        .map_err(|_| "jpeg: OOM for header")?;
    hdr.resize(hdr_size, 0);
    let n = deflate.read_bytes(&mut hdr)?;
    hdr.truncate(n);

    let st = parse_markers(&hdr)?;

    validate_tables(&st)?;

    // advance past header bytes already decompressed beyond scan_start;
    // if scan_start > n skip forward; if <= n rewind read cursor
    let scan_start = st.scan_start;
    if scan_start > n {
        // rare: headers larger than HEADER_READ; skip forward
        // NOTE(review): parse_markers only walks `hdr`, so scan_start > n
        // looks unreachable unless the SOS parser can move past the end of
        // its input — confirm.
        let skip = scan_start - n;
        let mut trash = [0u8; 256];
        let mut left = skip;
        while left > 0 {
            let chunk = left.min(trash.len());
            let got = deflate.read_bytes(&mut trash[..chunk])?;
            if got == 0 {
                return Err("jpeg: truncated DEFLATE before scan data");
            }
            left -= got;
        }
    } else {
        // bytes from scan_start..n already in window; rewind read cursor
        // (at most HEADER_READ bytes were read, which equals DEFLATE_WINDOW,
        // so the rewound range is expected to still be in the window)
        let rewind = n - scan_start;
        deflate.read_pos -= rewind;
        deflate.avail += rewind;
    }

    // free header; marker data is in JpegState
    drop(hdr);

    decode_baseline(&st, BitReader::new(deflate), max_w, max_h)
}

/// Backward-compatible alias for [`decode_jpeg_deflate_streaming`].
649 - pub fn decode_jpeg_deflate_sd<F>( 650 - read_fn: F, 651 - data_offset: u32, 652 - comp_size: u32, 653 - uncomp_size: u32, 654 - max_w: u16, 655 - max_h: u16, 656 - ) -> Result<DecodedImage, &'static str> 657 - where 658 - F: FnMut(u32, &mut [u8]) -> Result<usize, &'static str>, 659 - { 660 - decode_jpeg_deflate_streaming(read_fn, data_offset, comp_size, uncomp_size, max_w, max_h) 661 - } 662 - 663 - // baseline decode core (generic over byte source) 664 - 665 - fn validate_tables(st: &JpegState) -> Result<(), &'static str> { 666 - for sci in 0..st.scan_num_comp as usize { 667 - let ci = st.scan_order[sci] as usize; 668 - let c = &st.comp[ci]; 669 - if !st.qt_ok[c.qt_idx as usize] { 670 - return Err("jpeg: missing quant table"); 671 - } 672 - if !st.dc_ok[c.dc_tbl as usize] { 673 - return Err("jpeg: missing DC Huffman table"); 674 - } 675 - if st.scan_se > 0 && !st.ac_ok[c.ac_tbl as usize] { 676 - return Err("jpeg: missing AC Huffman table"); 677 - } 678 - } 679 - Ok(()) 680 - } 681 - 682 - fn decode_baseline<R: JpegRead>( 683 - st: &JpegState, 684 - mut reader: BitReader<R>, 685 - max_w: u16, 686 - max_h: u16, 687 - ) -> Result<DecodedImage, &'static str> { 688 - let w = st.width as usize; 689 - let h = st.height as usize; 690 - if w == 0 || h == 0 { 691 - return Err("jpeg: zero dimensions"); 692 - } 693 - if (w as u32).saturating_mul(h as u32) > MAX_PIXELS { 694 - return Err("jpeg: exceeds pixel limit"); 695 - } 696 - 697 - let scale = { 698 - let sw = (w + max_w as usize - 1) / max_w as usize; 699 - let sh = (h + max_h as usize - 1) / max_h as usize; 700 - sw.max(sh).max(1) 701 - }; 702 - let out_w = (w / scale).max(1); 703 - let out_h = (h / scale).max(1); 704 - let out_stride = (out_w + 7) / 8; 705 - 706 - let mcu_w = st.max_h as usize * 8; 707 - let mcu_h = st.max_v as usize * 8; 708 - let mcus_x = (w + mcu_w - 1) / mcu_w; 709 - let mcus_y = (h + mcu_h - 1) / mcu_h; 710 - let row_w = mcus_x * mcu_w; 711 - 712 - if st.progressive { 713 - log::warn!( 714 - 
"jpeg: progressive {}x{} -> {}x{} (scale {}, first scan Ss={} Se={} Al={})", 715 - w, 716 - h, 717 - out_w, 718 - out_h, 719 - scale, 720 - st.scan_ss, 721 - st.scan_se, 722 - st.scan_al 723 - ); 724 - } else { 725 - log::info!( 726 - "jpeg: baseline {}x{} → {}x{} (scale {})", 727 - w, 728 - h, 729 - out_w, 730 - out_h, 731 - scale 732 - ); 733 - } 734 - 735 - // allocate buffers 736 - 737 - let mut y_row = vec![128u8; row_w * mcu_h]; 738 - let mut output = Vec::new(); 739 - output 740 - .try_reserve_exact(out_stride * out_h) 741 - .map_err(|_| "jpeg: OOM for output")?; 742 - output.resize(out_stride * out_h, 0u8); 743 - let mut err_cur = vec![0i16; out_w + 2]; 744 - let mut err_nxt = vec![0i16; out_w + 2]; 745 - 746 - let mut dc_pred = [0i32; MAX_COMP]; 747 - let mut block = [0i32; 64]; 748 - let mut pix = [0u8; 64]; 749 - let mut mcu_cnt: u32 = 0; 750 - let total_mcus = (mcus_x * mcus_y) as u32; 751 - let mut out_y: usize = 0; 752 - 753 - // MCU decode loop 754 - 755 - for mcu_row in 0..mcus_y { 756 - y_row.fill(128); 757 - 758 - for mcu_col in 0..mcus_x { 759 - for sci in 0..st.scan_num_comp as usize { 760 - let ci = st.scan_order[sci] as usize; 761 - let c = &st.comp[ci]; 762 - let is_y = ci == 0; 763 - 764 - for bv in 0..c.v_samp as usize { 765 - for bh in 0..c.h_samp as usize { 766 - if is_y { 767 - decode_block( 768 - &mut reader, 769 - &st.dc_huff[c.dc_tbl as usize], 770 - &st.ac_huff[c.ac_tbl as usize], 771 - &mut dc_pred[ci], 772 - &st.qt[c.qt_idx as usize], 773 - &mut block, 774 - st.scan_se as usize, 775 - st.scan_al, 776 - )?; 777 - idct(&block, &mut pix); 778 - let bx = mcu_col * mcu_w + bh * 8; 779 - let by = bv * 8; 780 - for r in 0..8 { 781 - let dst = (by + r) * row_w + bx; 782 - y_row[dst..dst + 8].copy_from_slice(&pix[r * 8..r * 8 + 8]); 783 - } 784 - } else { 785 - skip_block( 786 - &mut reader, 787 - &st.dc_huff[c.dc_tbl as usize], 788 - &st.ac_huff[c.ac_tbl as usize], 789 - &mut dc_pred[ci], 790 - st.scan_se as usize, 791 - )?; 792 - } 793 - 
// ── tail of the decode routine that opens before this span; its code is unchanged ──
                    }
                }
            }

            mcu_cnt += 1;

            // restart interval reached and more MCUs follow: consume the
            // restart marker and start DC prediction over
            if st.restart_interval > 0
                && mcu_cnt % st.restart_interval as u32 == 0
                && mcu_cnt < total_mcus
            {
                reader.consume_restart()?;
                dc_pred.fill(0);
            }
        }

        // dither this MCU row
        for py in 0..mcu_h {
            let src_y = mcu_row * mcu_h + py;
            if src_y >= h || out_y >= out_h {
                break;
            }
            // only every `scale`-th source row maps to an output row
            if src_y % scale != 0 {
                continue;
            }
            let row_off = py * row_w;
            let out_row = &mut output[out_y * out_stride..(out_y + 1) * out_stride];
            dither_row_grey(
                &y_row[row_off..],
                scale,
                out_w,
                &mut err_cur,
                &mut err_nxt,
                out_row,
            );
            out_y += 1;
            core::mem::swap(&mut err_cur, &mut err_nxt);
            err_nxt.fill(0);
        }
    }

    Ok(DecodedImage {
        width: out_w as u16,
        height: out_y as u16,
        data: output,
        stride: out_stride,
    })
}

// marker parsing (operates on &[u8] header buffer)

/// Walks the marker segments of `data` from SOI up to and including the
/// first SOS and records what they declare in a freshly allocated state.
///
/// On success `scan_start` holds the offset just past the SOS header.
/// Baseline (SOF0) and progressive (SOF2) frames are accepted; every other
/// SOF variant, a missing SOI, an EOI before SOS, or a segment that runs
/// past the buffer is reported as an error.
fn parse_markers(data: &[u8]) -> Result<Box<JpegState>, &'static str> {
    if data.len() < 2 || data[0] != 0xFF || data[1] != M_SOI {
        return Err("jpeg: invalid signature");
    }
    let mut st = JpegState::heap_new()?;
    let mut pos = 2usize;
    let len = data.len();

    loop {
        // resynchronise on the next 0xFF, then skip any run of fill bytes
        while pos < len && data[pos] != 0xFF {
            pos += 1;
        }
        while pos < len && data[pos] == 0xFF {
            pos += 1;
        }
        if pos >= len {
            return Err("jpeg: truncated");
        }
        let marker = data[pos];
        pos += 1;

        match marker {
            // stuffed zero / restart markers carry no segment
            0x00 | M_RST0..=M_RST7 => continue,

            M_SOF0 => parse_sof(data, &mut pos, &mut st, false)?,
            M_SOF2 => parse_sof(data, &mut pos, &mut st, true)?,
            0xC1 | 0xC3 | 0xC5..=0xCB | 0xCD..=0xCF => {
                return Err("jpeg: unsupported SOF variant");
            }
            M_DHT => parse_dht(data, &mut pos, &mut st)?,
            M_DQT => parse_dqt(data, &mut pos, &mut st)?,
            M_DRI => parse_dri(data, &mut pos, &mut st)?,
            M_SOS => {
                parse_sos(data, &mut pos, &mut st)?;
                st.scan_start = pos;
                return Ok(st);
            }
            M_EOI => return Err("jpeg: EOI before SOS"),
            _ => {
                // any other segment is skipped using its length field
                if pos + 2 > len {
                    return Err("jpeg: truncated marker");
                }
                let seg = be_u16(data, pos) as usize;
                if seg < 2 || pos + seg > len {
                    return Err("jpeg: bad marker length");
                }
                pos += seg;
            }
        }
    }
}

/// Parses a SOF segment whose length field starts at `*pos` and leaves
/// `*pos` on the first byte after the segment.
///
/// Stores dimensions, component count, per-component sampling factors and
/// quantisation-table indices, and the largest sampling factors seen.
/// Rejects precisions other than 8 bits, a component count of zero or above
/// `MAX_COMP`, zero sampling factors, and segments too short for what they
/// declare.
fn parse_sof(
    data: &[u8],
    pos: &mut usize,
    st: &mut JpegState,
    progressive: bool,
) -> Result<(), &'static str> {
    if *pos + 2 > data.len() {
        return Err("jpeg: SOF truncated");
    }
    let seg = be_u16(data, *pos) as usize;
    // length(2) + precision(1) + height(2) + width(2) + component count(1):
    // anything shorter would make the fixed-field reads below leave the segment
    if seg < 8 || *pos + seg > data.len() {
        return Err("jpeg: SOF truncated");
    }
    let end = *pos + seg;
    let p = *pos + 2;
    if data[p] != 8 {
        return Err("jpeg: only 8-bit precision");
    }
    st.height = be_u16(data, p + 1);
    st.width = be_u16(data, p + 3);
    st.num_comp = data[p + 5];
    st.progressive = progressive;
    if st.num_comp == 0 || st.num_comp as usize > MAX_COMP {
        return Err("jpeg: bad component count");
    }
    // the component table has to fit inside this segment, not merely the buffer
    if p + 6 + st.num_comp as usize * 3 > end {
        return Err("jpeg: SOF truncated");
    }
    let mut off = p + 6;
    st.max_h = 1;
    st.max_v = 1;
    for i in 0..st.num_comp as usize {
        st.comp[i].id = data[off];
        let samp = data[off + 1];
        st.comp[i].h_samp = samp >> 4;
        st.comp[i].v_samp = samp & 0x0F;
        st.comp[i].qt_idx = data[off + 2];
        if st.comp[i].h_samp == 0 || st.comp[i].v_samp == 0 {
            return Err("jpeg: zero sampling factor");
        }
        st.max_h = st.max_h.max(st.comp[i].h_samp);
        st.max_v = st.max_v.max(st.comp[i].v_samp);
        off += 3;
    }
    *pos = end;
    Ok(())
}

/// Parses a DQT segment (one or more quantisation tables) starting at the
/// length field at `*pos`; `*pos` ends up just past the segment.
///
/// A precision nibble of 0 selects 64 one-byte entries, any other value 64
/// big-endian 16-bit entries. Entries are stored in file order and the
/// table is flagged in `qt_ok`.
fn parse_dqt(data: &[u8], pos: &mut usize, st: &mut JpegState) -> Result<(), &'static str> {
    if *pos + 2 > data.len() {
        return Err("jpeg: DQT truncated");
    }
    let seg = be_u16(data, *pos) as usize;
    let end = *pos + seg;
    // the length field counts itself, so a value below 2 is malformed
    if seg < 2 || end > data.len() {
        return Err("jpeg: DQT truncated");
    }
    *pos += 2;
    while *pos < end {
        let info = data[*pos];
        *pos += 1;
        let prec = info >> 4;
        let id = (info & 0x0F) as usize;
        if id >= 4 {
            return Err("jpeg: DQT id out of range");
        }
        if prec == 0 {
            if *pos + 64 > end {
                return Err("jpeg: DQT truncated");
            }
            for i in 0..64 {
                st.qt[id][i] = data[*pos] as u16;
                *pos += 1;
            }
        } else {
            if *pos + 128 > end {
                return Err("jpeg: DQT truncated");
            }
            for i in 0..64 {
                st.qt[id][i] = be_u16(data, *pos);
                *pos += 2;
            }
        }
        st.qt_ok[id] = true;
    }
    Ok(())
}

/// Parses a DHT segment (one or more Huffman tables) starting at the length
/// field at `*pos`; `*pos` ends up just past the segment.
///
/// Class 0 tables go to `dc_huff`, every other class to `ac_huff`; the
/// matching `dc_ok` / `ac_ok` flag is set for each table built.
fn parse_dht(data: &[u8], pos: &mut usize, st: &mut JpegState) -> Result<(), &'static str> {
    if *pos + 2 > data.len() {
        return Err("jpeg: DHT truncated");
    }
    let seg = be_u16(data, *pos) as usize;
    let end = *pos + seg;
    // the length field counts itself, so a value below 2 is malformed
    if seg < 2 || end > data.len() {
        return Err("jpeg: DHT truncated");
    }
    *pos += 2;
    while *pos < end {
        // class/id byte + 16 per-length counts
        if *pos + 17 > end {
            return Err("jpeg: DHT truncated");
        }
        let info = data[*pos];
        *pos += 1;
        let class = info >> 4;
        let id = (info & 0x0F) as usize;
        if id >= 4 {
            return Err("jpeg: DHT id out of range");
        }
        let mut bits = [0u8; 16];
        bits.copy_from_slice(&data[*pos..*pos + 16]);
        *pos += 16;
        let total: usize = bits.iter().map(|&b| b as usize).sum();
        if total > 256 || *pos + total > end {
            return Err("jpeg: DHT value overflow");
        }
        let vals = &data[*pos..*pos + total];
        *pos += total;
        if class == 0 {
            build_huff_table(&mut st.dc_huff[id], &bits, vals);
            st.dc_ok[id] = true;
        } else {
            build_huff_table(&mut st.ac_huff[id], &bits, vals);
            st.ac_ok[id] = true;
        }
    }
    Ok(())
}

/// Parses a DRI segment: skips the two length bytes at `*pos` and stores
/// the following big-endian 16-bit restart interval.
fn parse_dri(data: &[u8], pos: &mut usize, st: &mut JpegState) -> Result<(), &'static str> {
    if *pos + 4 > data.len() {
        return Err("jpeg: DRI truncated");
    }
    *pos += 2;
    st.restart_interval = be_u16(data, *pos);
    *pos += 2;
    Ok(())
}

/// Parses the SOS header starting at the length field at `*pos` and leaves
/// `*pos` just past the last header byte that was read.
///
/// For each scan component the matching frame component receives its DC/AC
/// table selectors and its index is appended to `scan_order`. The spectral
/// selection bounds and the low nibble of the approximation byte are stored
/// in `scan_ss`, `scan_se` and `scan_al`.
fn parse_sos(data: &[u8], pos: &mut usize, st: &mut JpegState) -> Result<(), &'static str> {
    if *pos + 2 > data.len() {
        return Err("jpeg: SOS truncated");
    }
    let seg = be_u16(data, *pos) as usize;
    // need at least the length field plus the component count
    if seg < 3 || *pos + seg > data.len() {
        return Err("jpeg: SOS truncated");
    }
    *pos += 2;
    st.scan_num_comp = data[*pos];
    *pos += 1;
    if st.scan_num_comp == 0 || st.scan_num_comp > st.num_comp {
        return Err("jpeg: bad SOS component count");
    }
    // length(2) + count(1) + 2 bytes per component + Ss, Se, Ah/Al (3);
    // with this established every read below stays inside the segment
    if seg < 6 + 2 * st.scan_num_comp as usize {
        return Err("jpeg: SOS truncated");
    }
    for sci in 0..st.scan_num_comp as usize {
        let cs = data[*pos];
        let td_ta = data[*pos + 1];
        *pos += 2;
        let (dc_tbl, ac_tbl) = (td_ta >> 4, td_ta & 0x0F);
        // DHT only accepts table ids 0..=3, so a larger selector can never
        // name a table that was loaded
        if dc_tbl > 3 || ac_tbl > 3 {
            return Err("jpeg: SOS table selector out of range");
        }
        let Some(j) = (0..st.num_comp as usize).find(|&j| st.comp[j].id == cs) else {
            return Err("jpeg: SOS references unknown component");
        };
        st.comp[j].dc_tbl = dc_tbl;
        st.comp[j].ac_tbl = ac_tbl;
        st.scan_order[sci] = j as u8;
    }
    st.scan_ss = data[*pos];
    st.scan_se = data[*pos + 1];
    // `scan_se` bounds the coefficient index used by the block decoders;
    // a block only has coefficients 0..=63
    if st.scan_se > 63 {
        return Err("jpeg: bad spectral selection");
    }
    let ah_al = data[*pos + 2];
    st.scan_al = ah_al & 0x0F;
    *pos += 3;
    Ok(())
}

// Huffman table construction

/// Fills `table` from a DHT definition: `bits[i]` is the number of codes of
/// length `i + 1`, `vals` the symbols in code order.
///
/// Codes are assigned in increasing order per length. Codes of up to 8 bits
/// are expanded into the 256-entry `lut` as `(symbol, length)`; for every
/// length in use `valptr`, `mincode` and `maxcode` are recorded (unused
/// lengths keep `maxcode == -1`).
///
/// `vals` must hold at least the sum of `bits` entries (callers check this).
fn build_huff_table(table: &mut HuffTable, bits: &[u8; 16], vals: &[u8]) {
    let total: usize = bits.iter().map(|&b| b as usize).sum();
    table.values[..total].copy_from_slice(&vals[..total]);
    table.lut.fill((0, 0));
    table.maxcode.fill(-1);

    let mut code: u32 = 0;
    let mut si: usize = 0;

    for bl in 1..=16usize {
        let cnt = bits[bl - 1] as usize;
        if cnt > 0 {
            table.valptr[bl] = si;
            table.mincode[bl] = code as i32;
            for _ in 0..cnt {
                if bl <= 8 {
                    // every 8-bit pattern that starts with this code maps to it
                    let prefix = (code << (8 - bl)) as usize;
                    let fill = 1usize << (8 - bl);
                    for k in 0..fill {
                        // guard against over-subscribed tables
                        if prefix + k < 256 {
                            table.lut[prefix + k] = (vals[si], bl as u8);
                        }
                    }
                }
                si += 1;
                code += 1;
            }
            table.maxcode[bl] = (code - 1) as i32;
        }
        code <<= 1;
    }
}

// Huffman decode

/// Decodes one Huffman symbol from `r` using `t`.
///
/// Codes of up to 8 bits are resolved through the lookup table; longer codes
/// are matched length by length against `maxcode`. A bit pattern that
/// matches no code, or that resolves to an index outside the symbol list
/// (possible with a malformed table), yields an error.
fn huff_decode<R: JpegRead>(r: &mut BitReader<R>, t: &HuffTable) -> Result<u8, &'static str> {
    let peek8 = r.peek(8)? as usize;
    let (sym, nb) = t.lut[peek8];
    if nb > 0 {
        r.drop_bits(nb);
        return Ok(sym);
    }
    let peek16 = r.peek(16)? as i32;
    for bl in 9..=16u8 {
        let len = bl as usize;
        let code = peek16 >> (16 - bl);
        if t.maxcode[len] >= 0 && code <= t.maxcode[len] {
            let idx = t.valptr[len] as i32 + code - t.mincode[len];
            // an over-subscribed table can put `code` below `mincode` or the
            // index past the symbol list; report it instead of panicking
            let sym = usize::try_from(idx)
                .ok()
                .and_then(|i| t.values.get(i))
                .copied()
                .ok_or("jpeg: invalid Huffman code")?;
            r.drop_bits(bl);
            return Ok(sym);
        }
    }
    Err("jpeg: invalid Huffman code")
}

/// Converts a `size`-bit magnitude into its signed value: inputs below
/// `2^(size-1)` map to `bits - (2^size - 1)`, all others are returned as is.
///
/// `size` must be at least 1; callers only invoke this for non-zero sizes.
#[inline]
fn extend(bits: u32, size: u8) -> i32 {
    let half = 1u32 << (size as u32 - 1);
    if bits < half {
        bits as i32 - ((1u32 << size as u32) as i32 - 1)
    } else {
        bits as i32
    }
}

// block decode (Y) / skip (non-Y)

/// Decodes one 8×8 block into `blk`, dequantised and de-zigzagged.
///
/// The DC difference is added to `*dc_pred`; AC coefficients are read up to
/// index `se` (none when `se == 0`). Every coefficient is shifted left by
/// `al` and multiplied by the matching `qt` entry before being stored at
/// `ZZ[k]`. `se` is used as an index into `ZZ` and `qt` and must not exceed
/// 63.
fn decode_block<R: JpegRead>(
    r: &mut BitReader<R>,
    dc_ht: &HuffTable,
    ac_ht: &HuffTable,
    dc_pred: &mut i32,
    qt: &[u16; 64],
    blk: &mut [i32; 64],
    se: usize,
    al: u8,
) -> Result<(), &'static str> {
    blk.fill(0);

    let dc_size = huff_decode(r, dc_ht)?;
    if dc_size > 0 {
        if dc_size > 11 {
            return Err("jpeg: DC size > 11");
        }
        let bits = r.read_bits(dc_size)?;
        *dc_pred += extend(bits, dc_size);
    }
    blk[0] = ((*dc_pred) << al).wrapping_mul(qt[0] as i32);

    if se > 0 {
        let mut k: usize = 1;
        while k <= se {
            let sym = huff_decode(r, ac_ht)?;
            let run = (sym >> 4) as usize;
            let size = sym & 0x0F;
            if size == 0 {
                if run == 15 {
                    // run of sixteen zero coefficients
                    k += 16;
                } else {
                    // end of block
                    break;
                }
            } else {
                k += run;
                if k > se {
                    return Err("jpeg: AC index overflow");
                }
                let bits = r.read_bits(size)?;
                let val = extend(bits, size);
                blk[ZZ[k]] = (val << al).wrapping_mul(qt[k] as i32);
                k += 1;
            }
        }
    }
    Ok(())
}

/// Consumes one block from the bitstream without reconstructing it.
///
/// `*dc_pred` is still updated, and the same number of bits is consumed as a
/// full decode would, so the reader stays in step.
fn skip_block<R: JpegRead>(
    r: &mut BitReader<R>,
    dc_ht: &HuffTable,
    ac_ht: &HuffTable,
    dc_pred: &mut i32,
    se: usize,
) -> Result<(), &'static str> {
    let dc_size = huff_decode(r, dc_ht)?;
    if dc_size > 0 {
        // same limit as `decode_block`; larger sizes would overflow the
        // shifts in `extend`
        if dc_size > 11 {
            return Err("jpeg: DC size > 11");
        }
        let bits = r.read_bits(dc_size)?;
        *dc_pred += extend(bits, dc_size);
    }
    if se > 0 {
        let mut k: usize = 1;
        while k <= se {
            let sym = huff_decode(r, ac_ht)?;
            let run = (sym >> 4) as usize;
            let size = sym & 0x0F;
            if size == 0 {
                if run == 15 {
                    k += 16;
                } else {
                    break;
                }
            } else {
                k += run + 1;
                let _ = r.read_bits(size)?;
            }
        }
    }
    Ok(())
}

// integer IDCT (IJG ISLOW, two-pass row + col)

/// One 8-point pass over `d`, shared by the row and column passes of
/// [`idct`].
///
/// Returns the eight un-descaled outputs in position order:
/// `[t10+o3, t11+o2, t12+o1, t13+o0, t13-o0, t12-o1, t11-o2, t10-o3]`.
#[inline]
fn idct_butterfly(d: [i32; 8]) -> [i32; 8] {
    let [d0, d1, d2, d3, d4, d5, d6, d7] = d;

    // even part
    let z1 = (d2 + d6).wrapping_mul(F0541);
    let tmp2 = z1 + d6.wrapping_mul(-F1847);
    let tmp3 = z1 + d2.wrapping_mul(F0765);
    let tmp0 = (d0 + d4) << CB;
    let tmp1 = (d0 - d4) << CB;
    let (t10, t13) = (tmp0 + tmp3, tmp0 - tmp3);
    let (t11, t12) = (tmp1 + tmp2, tmp1 - tmp2);

    // odd part
    let (zz1, zz2, zz3, zz4) = (d7 + d1, d5 + d3, d7 + d3, d5 + d1);
    let z5 = (zz3 + zz4).wrapping_mul(F1175);
    let mut o0 = d7.wrapping_mul(F0298);
    let mut o1 = d5.wrapping_mul(F2053);
    let mut o2 = d3.wrapping_mul(F3072);
    let mut o3 = d1.wrapping_mul(F1501);
    let (s1, s2) = (zz1.wrapping_mul(-F0899), zz2.wrapping_mul(-F2562));
    let s3 = zz3.wrapping_mul(-F1961) + z5;
    let s4 = zz4.wrapping_mul(-F0390) + z5;
    o0 += s1 + s3;
    o1 += s2 + s4;
    o2 += s2 + s3;
    o3 += s1 + s4;

    [
        t10 + o3,
        t11 + o2,
        t12 + o1,
        t13 + o0,
        t13 - o0,
        t12 - o1,
        t11 - o2,
        t10 - o3,
    ]
}

/// Inverse DCT of one dequantised 8×8 block.
///
/// Pass 1 transforms each row into the workspace, pass 2 transforms each
/// column, adds the +128 level shift and clamps to `0..=255`. Rows and
/// columns whose AC terms are all zero take a constant-fill shortcut.
fn idct(block: &[i32; 64], out: &mut [u8; 64]) {
    let mut ws = [0i32; 64];

    for row in 0..8 {
        let b = row * 8;
        let d = [
            block[b],
            block[b + 1],
            block[b + 2],
            block[b + 3],
            block[b + 4],
            block[b + 5],
            block[b + 6],
            block[b + 7],
        ];

        if d[1..].iter().all(|&c| c == 0) {
            let dc = d[0] << P1;
            ws[b..b + 8].fill(dc);
            continue;
        }

        let sh = CB - P1;
        for (i, v) in idct_butterfly(d).into_iter().enumerate() {
            ws[b + i] = descale(v, sh);
        }
    }

    for col in 0..8 {
        let d = [
            ws[col],
            ws[col + 8],
            ws[col + 16],
            ws[col + 24],
            ws[col + 32],
            ws[col + 40],
            ws[col + 48],
            ws[col + 56],
        ];

        if d[1..].iter().all(|&c| c == 0) {
            let v = clamp(descale(d[0], P1 + 3) + 128);
            for i in 0..8 {
                out[col + 8 * i] = v;
            }
            continue;
        }

        let sh = CB + P1 + 3;
        for (i, v) in idct_butterfly(d).into_iter().enumerate() {
            out[col + 8 * i] = clamp(descale(v, sh) + 128);
        }
    }
}

// Floyd-Steinberg dithering

/// Dithers one output row to 1-bit from the MCU row buffer.
///
/// Output pixel `ox` samples `row[ox * scale]`. Pixels below the threshold
/// set their bit in `out_row` (MSB first). The quantisation error is spread
/// 7/16 to the right neighbour in `err_cur` and 3/16, 5/16, 1/16 to the
/// three neighbours in `err_nxt`. Both error rows are indexed with a +1
/// offset and must hold at least `out_w + 2` entries.
#[inline]
fn dither_row_grey(
    row: &[u8],
    scale: usize,
    out_w: usize,
    err_cur: &mut [i16],
    err_nxt: &mut [i16],
    out_row: &mut [u8],
) {
    for ox in 0..out_w {
        let sx = ox * scale;
        let g = row[sx] as i16;
        let val = (g + err_cur[ox + 1]).clamp(0, 255);
        let black = val < 128;
        let q = if black { 0i16 } else { 255 };
        let e = val - q;
        if black {
            out_row[ox / 8] |= 1 << (7 - (ox & 7));
        }
        err_cur[ox + 2] += e * 7 / 16;
        err_nxt[ox] += e * 3 / 16;
        err_nxt[ox + 1] += e * 5 / 16;
        err_nxt[ox + 2] += e / 16;
    }
}

// helpers

/// Rounding right shift: adds half of `2^n` before shifting by `n` (`n >= 1`).
#[inline]
fn descale(x: i32, n: i32) -> i32 {
    (x + (1 << (n - 1))) >> n
}

/// Saturates `x` to the `0..=255` range.
#[inline]
fn clamp(x: i32) -> u8 {
    x.clamp(0, 255) as u8
}

/// Reads a big-endian `u16` at offset `o`; panics if `o + 1` is out of range.
#[inline]
fn be_u16(d: &[u8], o: usize) -> u16 {
    u16::from_be_bytes([d[o], d[o + 1]])
}
-132
smol-epub/src/lib.rs
··· 1 - //! # smol-epub 2 - //! 3 - //! Minimal `no_std` EPUB parser with streaming decompression, HTML 4 - //! stripping, CSS resolution, and optional 1-bit image decoders. 5 - //! 6 - //! Designed for memory-constrained embedded targets (≥ 140 KB heap), 7 - //! but works anywhere `alloc` is available. 8 - //! 9 - //! ## Modules 10 - //! 11 - //! | Module | Purpose | 12 - //! |--------|---------| 13 - //! | [`zip`] | ZIP central-directory parser, streaming DEFLATE extraction | 14 - //! | [`xml`] | Minimal XML tag / attribute scanner (EPUB metadata) | 15 - //! | [`css`] | CSS property parser for EPUB stylesheets | 16 - //! | [`epub`] | EPUB structure: `container.xml` → OPF → spine / metadata / TOC | 17 - //! | [`html_strip`] | Single-pass, streaming HTML-to-styled-text converter | 18 - //! | [`cache`] | Chapter decompress-and-strip pipeline with cache metadata | 19 - //! | [`png`] | PNG decoder → 1-bit Floyd–Steinberg dithered bitmap *(feature `images`)* | 20 - //! | [`jpeg`] | JPEG decoder → 1-bit Floyd–Steinberg dithered bitmap *(feature `images`)* | 21 - //! 22 - //! ## Feature flags 23 - //! 24 - //! | Flag | Default | Description | 25 - //! |------|---------|-------------| 26 - //! | `images` | ✓ | Enable [`png`] and [`jpeg`] image decoders | 27 - //! 28 - //! ## Streaming I/O model 29 - //! 30 - //! Functions that read from an external byte source accept a generic 31 - //! closure with signature: 32 - //! 33 - //! ```text 34 - //! FnMut(offset: u32, buf: &mut [u8]) -> Result<usize, E> 35 - //! ``` 36 - //! 37 - //! This works with SD cards, flash, `std::fs::File`, in-memory 38 - //! buffers, or any other random-access byte store. 39 - //! 40 - //! ## Quick start 41 - //! 42 - //! ```rust,ignore 43 - //! use smol_epub::zip::ZipIndex; 44 - //! use smol_epub::epub::{self, EpubMeta, EpubSpine, EpubToc}; 45 - //! 46 - //! // 1. Build ZIP index from the file's central directory 47 - //! let mut zip = ZipIndex::new(); 48 - //! // ... 
parse_eocd, read CD, parse_central_directory ... 49 - //! 50 - //! // 2. Parse EPUB structure 51 - //! let container = smol_epub::zip::extract_entry(/* ... */)?; 52 - //! let mut opf_path = [0u8; epub::OPF_PATH_CAP]; 53 - //! let opf_len = epub::parse_container(&container, &mut opf_path)?; 54 - //! 55 - //! // 3. Extract metadata and reading-order spine 56 - //! let mut meta = EpubMeta::new(); 57 - //! let mut spine = EpubSpine::new(); 58 - //! epub::parse_opf(&opf_data, opf_dir, &zip, &mut meta, &mut spine)?; 59 - //! 60 - //! // 4. Optionally parse the table of contents 61 - //! let mut toc = EpubToc::new(); 62 - //! if let Some(src) = epub::find_toc_source(&opf_data, opf_dir, &zip) { 63 - //! epub::parse_toc(src, &toc_data, toc_dir, &spine, &zip, &mut toc); 64 - //! } 65 - //! 66 - //! // 5. Stream-decompress + HTML-strip chapters via cache module 67 - //! let bytes_written = smol_epub::cache::stream_strip_entry( 68 - //! &entry, local_offset, read_fn, output_fn, 69 - //! )?; 70 - //! ``` 71 - 72 - #![no_std] 73 - #![warn(missing_docs)] 74 - 75 - extern crate alloc; 76 - 77 - use alloc::vec::Vec; 78 - 79 - // ── public modules ────────────────────────────────────────────────── 80 - 81 - pub mod cache; 82 - pub mod css; 83 - pub mod epub; 84 - pub mod html_strip; 85 - pub mod xml; 86 - pub mod zip; 87 - 88 - #[cfg(feature = "images")] 89 - pub mod jpeg; 90 - #[cfg(feature = "images")] 91 - pub mod png; 92 - 93 - // ── shared types ──────────────────────────────────────────────────── 94 - 95 - /// A decoded 1-bit monochrome image, packed MSB-first, row-major. 96 - /// 97 - /// A **set** bit (1) represents black (ink); a **clear** bit (0) represents 98 - /// white (paper). This convention matches most e-ink controllers directly. 99 - /// 100 - /// Produced by the [`png`] and [`jpeg`] decoders when the `images` 101 - /// feature is enabled. 
///
/// # Layout
///
/// ```text
/// stride = ceil(width / 8)  bytes per row
/// data.len() == stride * height
/// ```
///
/// Pixel (x, y) is bit `(7 - x % 8)` of byte `data[y * stride + x / 8]`.
#[derive(Clone)]
pub struct DecodedImage {
    /// Width of the bitmap, in pixels.
    pub width: u16,
    /// Height of the bitmap, in pixels.
    pub height: u16,
    /// The packed 1-bit pixels: `stride * height` bytes.
    pub data: Vec<u8>,
    /// Number of bytes occupied by one row (`ceil(width / 8)`).
    pub stride: usize,
}

impl core::fmt::Debug for DecodedImage {
    /// Prints the dimensions and the byte count of the pixel buffer; the
    /// pixel bytes themselves are left out of the output.
    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
        let Self {
            width,
            height,
            data,
            stride,
        } = self;
        let byte_count = data.len();

        let mut summary = f.debug_struct("DecodedImage");
        summary.field("width", width);
        summary.field("height", height);
        summary.field("stride", stride);
        summary.field("data_len", &byte_count);
        summary.finish()
    }
}
-1074
smol-epub/src/png.rs
//! Minimal PNG decoder producing 1-bit Floyd–Steinberg dithered bitmaps.
//!
//! Streams row-by-row through `miniz_oxide`; peak RAM ≈ 90 KB
//! (32 KB dictionary + 11 KB decompressor + output bitmap).
//!
//! Supported colour types: greyscale, RGB, palette, grey+alpha, RGBA.
//! Interlaced (Adam7) images are rejected (rare in EPUB content and
//! would double code complexity).
//!
//! Output is packed 1-bit MSB-first, row-major — see [`DecodedImage`](crate::DecodedImage).

extern crate alloc;

use alloc::boxed::Box;
use alloc::vec;
use alloc::vec::Vec;

use crate::DecodedImage;

// PNG constants

// the 8-byte signature every PNG file starts with
const PNG_SIG: [u8; 8] = [137, 80, 78, 71, 13, 10, 26, 10];

// four-byte chunk type tags
const CHUNK_IHDR: [u8; 4] = *b"IHDR";
const CHUNK_PLTE: [u8; 4] = *b"PLTE";
const CHUNK_IDAT: [u8; 4] = *b"IDAT";

// colour-type codes for the colour types listed in the module docs
const COLOR_GREYSCALE: u8 = 0;
const COLOR_RGB: u8 = 2;
const COLOR_PALETTE: u8 = 3;
const COLOR_GREY_ALPHA: u8 = 4;
const COLOR_RGBA: u8 = 6;

// filter-type codes; the decoder takes the first byte of every scanline as
// its filter type
const FILTER_NONE: u8 = 0;
const FILTER_SUB: u8 = 1;
const FILTER_UP: u8 = 2;
const FILTER_AVERAGE: u8 = 3;
const FILTER_PAETH: u8 = 4;

// max total pixels we are willing to decode (memory guard)
const MAX_PIXELS: u32 = 800 * 800;

// miniz_oxide LZ dictionary size; must be a power of two >= 32768
const DICT_SIZE: usize = 32_768;

/// Backward-compatible alias for [`DecodedImage`](crate::DecodedImage).
pub type PngImage = DecodedImage;

/// Decode a PNG from an in-memory buffer to a 1-bit dithered bitmap.
///
/// The image is integer-downscaled so the result fits within
/// `max_w` × `max_h` pixels.
53 - pub fn decode_png_fit(data: &[u8], max_w: u16, max_h: u16) -> Result<DecodedImage, &'static str> { 54 - let header = parse_ihdr(data)?; 55 - let idat = collect_idat(data)?; 56 - let plte = collect_plte(data)?; 57 - 58 - if header.width.saturating_mul(header.height) > MAX_PIXELS { 59 - return Err("png: image exceeds pixel limit"); 60 - } 61 - 62 - // integer down-scale factor (1 = no scaling) 63 - let scale = { 64 - let sw = header 65 - .width 66 - .checked_add(max_w as u32 - 1) 67 - .unwrap_or(u32::MAX) 68 - / max_w as u32; 69 - let sh = header 70 - .height 71 - .checked_add(max_h as u32 - 1) 72 - .unwrap_or(u32::MAX) 73 - / max_h as u32; 74 - sw.max(sh).max(1) as usize 75 - }; 76 - 77 - let out_w = (header.width as usize / scale).max(1); 78 - let out_h = (header.height as usize / scale).max(1); 79 - let out_stride = (out_w + 7) / 8; 80 - 81 - // palette -> greyscale LUT (only used for colour type 3) 82 - let palette_grey = build_palette_lut(header.color_type, &plte)?; 83 - 84 - let scanline_bytes = header.scanline_bytes(); 85 - let bpp = header.bytes_per_pixel(); 86 - let src_h = header.height as usize; 87 - 88 - // allocate working buffers 89 - 90 - let mut output = Vec::new(); 91 - output 92 - .try_reserve_exact(out_stride * out_h) 93 - .map_err(|_| "png: OOM for output bitmap")?; 94 - output.resize(out_stride * out_h, 0u8); 95 - 96 - let mut prev_row = vec![0u8; scanline_bytes]; 97 - let mut curr_row = vec![0u8; scanline_bytes]; 98 - 99 - // Floyd-Steinberg error buffers (+2 for left/right sentinels) 100 - let mut err_cur = vec![0i16; out_w + 2]; 101 - let mut err_nxt = vec![0i16; out_w + 2]; 102 - 103 - // streaming scanline accumulator: 1 filter byte + scanline_bytes 104 - let row_total = 1 + scanline_bytes; 105 - let mut row_buf = vec![0u8; row_total]; 106 - let mut row_pos: usize = 0; 107 - 108 - // streaming decompressor (~11KB heap-allocated) 109 - 110 - let decomp_layout = core::alloc::Layout::new::<miniz_oxide::inflate::core::DecompressorOxide>(); 
111 - let decomp_ptr = unsafe { alloc::alloc::alloc_zeroed(decomp_layout) }; 112 - if decomp_ptr.is_null() { 113 - return Err("png: OOM for decompressor"); 114 - } 115 - let mut decomp = 116 - unsafe { Box::from_raw(decomp_ptr as *mut miniz_oxide::inflate::core::DecompressorOxide) }; 117 - 118 - // 32KB circular dictionary for wrapping-mode inflate 119 - let mut dict = vec![0u8; DICT_SIZE]; 120 - 121 - let mut in_pos: usize = 0; 122 - let mut dict_pos: usize = 0; // cumulative output pos 123 - let mut src_y: usize = 0; // source row counter 124 - let mut out_y: usize = 0; // output row counter 125 - 126 - loop { 127 - let has_more = in_pos < idat.len(); 128 - 129 - let flags = miniz_oxide::inflate::core::inflate_flags::TINFL_FLAG_PARSE_ZLIB_HEADER 130 - | if has_more { 131 - miniz_oxide::inflate::core::inflate_flags::TINFL_FLAG_HAS_MORE_INPUT 132 - } else { 133 - 0 134 - }; 135 - 136 - let write_pos = dict_pos & (DICT_SIZE - 1); 137 - let (status, consumed, produced) = miniz_oxide::inflate::core::decompress( 138 - &mut *decomp, 139 - &idat[in_pos..], 140 - &mut dict, 141 - write_pos, 142 - flags, 143 - ); 144 - 145 - in_pos += consumed; 146 - 147 - // feed decompressed bytes into the scanline accumulator 148 - for i in 0..produced { 149 - row_buf[row_pos] = dict[(write_pos + i) & (DICT_SIZE - 1)]; 150 - row_pos += 1; 151 - 152 - if row_pos == row_total { 153 - let filter = row_buf[0]; 154 - curr_row.copy_from_slice(&row_buf[1..]); 155 - 156 - unfilter_row(filter, &mut curr_row, &prev_row, bpp); 157 - 158 - // only dither + output rows that map to an output pixel row 159 - if src_y % scale == 0 && out_y < out_h { 160 - dither_row( 161 - &curr_row, 162 - &header, 163 - &palette_grey, 164 - scale, 165 - out_w, 166 - &mut err_cur, 167 - &mut err_nxt, 168 - &mut output[out_y * out_stride..(out_y + 1) * out_stride], 169 - ); 170 - out_y += 1; 171 - 172 - // swap error-diffusion buffers 173 - core::mem::swap(&mut err_cur, &mut err_nxt); 174 - err_nxt.fill(0); 175 - } 176 
- 177 - core::mem::swap(&mut prev_row, &mut curr_row); 178 - curr_row.fill(0); 179 - row_pos = 0; 180 - src_y += 1; 181 - } 182 - } 183 - 184 - dict_pos += produced; 185 - 186 - match status { 187 - miniz_oxide::inflate::TINFLStatus::Done => break, 188 - miniz_oxide::inflate::TINFLStatus::NeedsMoreInput => { 189 - if !has_more { 190 - return Err("png: truncated IDAT stream"); 191 - } 192 - if consumed == 0 && produced == 0 { 193 - return Err("png: IDAT decompression stuck"); 194 - } 195 - } 196 - miniz_oxide::inflate::TINFLStatus::HasMoreOutput => { 197 - // dictionary full; circular buffer recycles automatically 198 - if produced == 0 && consumed == 0 { 199 - return Err("png: decompression stalled (output)"); 200 - } 201 - } 202 - _ => return Err("png: IDAT decompression error"), 203 - } 204 - } 205 - 206 - if src_y < src_h { 207 - log::warn!("png: expected {} rows, got {}", src_h, src_y); 208 - } 209 - 210 - Ok(DecodedImage { 211 - width: out_w as u16, 212 - height: out_y as u16, 213 - data: output, 214 - stride: out_stride, 215 - }) 216 - } 217 - 218 - // ── streaming PNG decoders ────────────────────────────────────────── 219 - // Decode PNG images from ZIP entries without extracting to a contiguous 220 - // buffer; IDAT data is fed directly into zlib row-by-row. 221 - 222 - /// Read-chunk size used by the streaming decoders (bytes). 223 - const STREAMING_READ_BUF: usize = 4096; 224 - 225 - /// DEFLATE sliding-window for outer ZIP decompression (bytes). 
226 - const ZIP_DEFLATE_WINDOW: usize = 32_768; 227 - 228 - // sequential byte source for streaming PNG decoder 229 - trait ReadExact { 230 - fn read_exact(&mut self, buf: &mut [u8]) -> Result<(), &'static str>; 231 - 232 - fn skip(&mut self, mut n: usize) -> Result<(), &'static str> { 233 - let mut trash = [0u8; 64]; 234 - while n > 0 { 235 - let chunk = n.min(64); 236 - self.read_exact(&mut trash[..chunk])?; 237 - n -= chunk; 238 - } 239 - Ok(()) 240 - } 241 - } 242 - 243 - // reads sequentially from a STORED ZIP entry via a user-supplied closure 244 - struct StoredSource<F> { 245 - read_fn: F, 246 - offset: u32, 247 - end: u32, 248 - } 249 - 250 - impl<F: FnMut(u32, &mut [u8]) -> Result<usize, &'static str>> ReadExact for StoredSource<F> { 251 - fn read_exact(&mut self, buf: &mut [u8]) -> Result<(), &'static str> { 252 - let mut done = 0usize; 253 - while done < buf.len() { 254 - if self.offset >= self.end { 255 - return Err("png: unexpected EOF in stored entry"); 256 - } 257 - let remaining = (self.end - self.offset) as usize; 258 - let want = (buf.len() - done).min(remaining); 259 - let n = (self.read_fn)(self.offset, &mut buf[done..done + want])?; 260 - if n == 0 { 261 - return Err("png: unexpected EOF in stored entry"); 262 - } 263 - self.offset += n as u32; 264 - done += n; 265 - } 266 - Ok(()) 267 - } 268 - 269 - fn skip(&mut self, n: usize) -> Result<(), &'static str> { 270 - let n32 = n as u32; 271 - if self.offset.saturating_add(n32) > self.end { 272 - return Err("png: skip past end of stored entry"); 273 - } 274 - self.offset += n32; 275 - Ok(()) 276 - } 277 - } 278 - 279 - // reads sequentially from a DEFLATE-compressed ZIP entry via a user-supplied closure 280 - struct DeflateSource<F> { 281 - read_fn: F, 282 - file_pos: u32, 283 - comp_left: usize, 284 - rbuf: Vec<u8>, 285 - in_avail: usize, 286 - decomp: Box<miniz_oxide::inflate::core::DecompressorOxide>, 287 - window: Vec<u8>, 288 - dict_pos: usize, 289 - read_pos: usize, 290 - avail: usize, 291 - 
done: bool, 292 - } 293 - 294 - impl<F: FnMut(u32, &mut [u8]) -> Result<usize, &'static str>> DeflateSource<F> { 295 - fn new(read_fn: F, data_offset: u32, comp_size: u32) -> Result<Self, &'static str> { 296 - use miniz_oxide::inflate::core::DecompressorOxide; 297 - 298 - let decomp_ptr = 299 - unsafe { alloc::alloc::alloc_zeroed(core::alloc::Layout::new::<DecompressorOxide>()) }; 300 - if decomp_ptr.is_null() { 301 - return Err("png: OOM for DEFLATE decompressor"); 302 - } 303 - let decomp = unsafe { Box::from_raw(decomp_ptr as *mut DecompressorOxide) }; 304 - 305 - let mut window = Vec::new(); 306 - window 307 - .try_reserve_exact(ZIP_DEFLATE_WINDOW) 308 - .map_err(|_| "png: OOM for DEFLATE window")?; 309 - window.resize(ZIP_DEFLATE_WINDOW, 0); 310 - 311 - let mut rbuf = Vec::new(); 312 - rbuf.try_reserve_exact(STREAMING_READ_BUF) 313 - .map_err(|_| "png: OOM for DEFLATE read buffer")?; 314 - rbuf.resize(STREAMING_READ_BUF, 0); 315 - 316 - Ok(Self { 317 - read_fn, 318 - file_pos: data_offset, 319 - comp_left: comp_size as usize, 320 - rbuf, 321 - in_avail: 0, 322 - decomp, 323 - window, 324 - dict_pos: 0, 325 - read_pos: 0, 326 - avail: 0, 327 - done: false, 328 - }) 329 - } 330 - 331 - fn pump(&mut self) -> Result<(), &'static str> { 332 - use miniz_oxide::inflate::TINFLStatus; 333 - use miniz_oxide::inflate::core::{decompress, inflate_flags}; 334 - 335 - if self.done { 336 - return Ok(()); 337 - } 338 - 339 - if self.in_avail < STREAMING_READ_BUF && self.comp_left > 0 { 340 - let space = STREAMING_READ_BUF - self.in_avail; 341 - let want = space.min(self.comp_left); 342 - match (self.read_fn)( 343 - self.file_pos, 344 - &mut self.rbuf[self.in_avail..self.in_avail + want], 345 - ) { 346 - Ok(n) if n > 0 => { 347 - self.file_pos += n as u32; 348 - self.comp_left -= n; 349 - self.in_avail += n; 350 - } 351 - Ok(_) => { 352 - self.comp_left = 0; 353 - } 354 - Err(_) => return Err("png: read failed during DEFLATE"), 355 - } 356 - } 357 - 358 - let flags = if 
self.comp_left > 0 { 359 - inflate_flags::TINFL_FLAG_HAS_MORE_INPUT 360 - } else { 361 - 0 362 - }; 363 - 364 - let write_pos = self.dict_pos & (ZIP_DEFLATE_WINDOW - 1); 365 - let (status, consumed, produced) = decompress( 366 - &mut *self.decomp, 367 - &self.rbuf[..self.in_avail], 368 - &mut self.window, 369 - write_pos, 370 - flags, 371 - ); 372 - 373 - if consumed > 0 && consumed < self.in_avail { 374 - self.rbuf.copy_within(consumed..self.in_avail, 0); 375 - } 376 - self.in_avail -= consumed; 377 - self.dict_pos += produced; 378 - self.avail += produced; 379 - 380 - match status { 381 - TINFLStatus::Done => { 382 - self.done = true; 383 - } 384 - TINFLStatus::HasMoreOutput | TINFLStatus::NeedsMoreInput => {} 385 - _ => return Err("png: DEFLATE decompression error"), 386 - } 387 - 388 - Ok(()) 389 - } 390 - } 391 - 392 - impl<F: FnMut(u32, &mut [u8]) -> Result<usize, &'static str>> ReadExact for DeflateSource<F> { 393 - fn read_exact(&mut self, buf: &mut [u8]) -> Result<(), &'static str> { 394 - let mut total = 0usize; 395 - while total < buf.len() { 396 - if self.avail == 0 { 397 - if self.done { 398 - return Err("png: unexpected end of DEFLATE stream"); 399 - } 400 - self.pump()?; 401 - if self.avail == 0 { 402 - return Err("png: unexpected end of DEFLATE stream"); 403 - } 404 - } 405 - let rp = self.read_pos & (ZIP_DEFLATE_WINDOW - 1); 406 - let contiguous = (ZIP_DEFLATE_WINDOW - rp).min(self.avail); 407 - let n = contiguous.min(buf.len() - total); 408 - buf[total..total + n].copy_from_slice(&self.window[rp..rp + n]); 409 - self.read_pos += n; 410 - self.avail -= n; 411 - total += n; 412 - } 413 - Ok(()) 414 - } 415 - } 416 - 417 - // decode a PNG from a STORED ZIP entry by streaming from SD 418 - /// Decode a PNG from a **stored** (uncompressed) ZIP entry by streaming 419 - /// reads through `read_fn`. 420 - /// 421 - /// `read_fn(offset, buf)` reads bytes at the given absolute offset and 422 - /// returns the number of bytes actually read. 
423 - pub fn decode_png_streaming<F>( 424 - read_fn: F, 425 - data_offset: u32, 426 - data_size: u32, 427 - max_w: u16, 428 - max_h: u16, 429 - ) -> Result<DecodedImage, &'static str> 430 - where 431 - F: FnMut(u32, &mut [u8]) -> Result<usize, &'static str>, 432 - { 433 - let mut src = StoredSource { 434 - read_fn, 435 - offset: data_offset, 436 - end: data_offset + data_size, 437 - }; 438 - decode_png_from(&mut src, max_w, max_h) 439 - } 440 - 441 - /// Backward-compatible alias for [`decode_png_streaming`]. 442 - pub fn decode_png_sd<F>( 443 - read_fn: F, 444 - data_offset: u32, 445 - data_size: u32, 446 - max_w: u16, 447 - max_h: u16, 448 - ) -> Result<DecodedImage, &'static str> 449 - where 450 - F: FnMut(u32, &mut [u8]) -> Result<usize, &'static str>, 451 - { 452 - decode_png_streaming(read_fn, data_offset, data_size, max_w, max_h) 453 - } 454 - 455 - /// Decode a PNG from a **DEFLATE-compressed** ZIP entry by streaming 456 - /// reads through `read_fn`. 457 - /// 458 - /// Both ZIP decompression and PNG IDAT inflation are streamed 459 - /// concurrently, so the full entry is never held in memory. 460 - pub fn decode_png_deflate_streaming<F>( 461 - read_fn: F, 462 - data_offset: u32, 463 - comp_size: u32, 464 - max_w: u16, 465 - max_h: u16, 466 - ) -> Result<DecodedImage, &'static str> 467 - where 468 - F: FnMut(u32, &mut [u8]) -> Result<usize, &'static str>, 469 - { 470 - let mut src = DeflateSource::new(read_fn, data_offset, comp_size)?; 471 - decode_png_from(&mut src, max_w, max_h) 472 - } 473 - 474 - /// Backward-compatible alias for [`decode_png_deflate_streaming`]. 
475 - pub fn decode_png_deflate_sd<F>( 476 - read_fn: F, 477 - data_offset: u32, 478 - comp_size: u32, 479 - max_w: u16, 480 - max_h: u16, 481 - ) -> Result<DecodedImage, &'static str> 482 - where 483 - F: FnMut(u32, &mut [u8]) -> Result<usize, &'static str>, 484 - { 485 - decode_png_deflate_streaming(read_fn, data_offset, comp_size, max_w, max_h) 486 - } 487 - 488 - /// Core streaming PNG decoder; generic over byte source. 489 - /// Reads chunks sequentially, feeds IDAT into zlib row-by-row; 490 - /// never holds the full PNG in RAM. 491 - fn decode_png_from<R: ReadExact>( 492 - src: &mut R, 493 - max_w: u16, 494 - max_h: u16, 495 - ) -> Result<DecodedImage, &'static str> { 496 - // PNG signature 497 - let mut sig = [0u8; 8]; 498 - src.read_exact(&mut sig)?; 499 - if sig != PNG_SIG { 500 - return Err("png: invalid signature"); 501 - } 502 - 503 - // IHDR (must be first chunk) 504 - let mut chunk_hdr = [0u8; 8]; // 4-byte length + 4-byte type 505 - src.read_exact(&mut chunk_hdr)?; 506 - let ihdr_len = be_u32(&chunk_hdr, 0) as usize; 507 - if [chunk_hdr[4], chunk_hdr[5], chunk_hdr[6], chunk_hdr[7]] != CHUNK_IHDR || ihdr_len < 13 { 508 - return Err("png: missing or invalid IHDR"); 509 - } 510 - let mut ihdr_raw = [0u8; 13]; 511 - src.read_exact(&mut ihdr_raw)?; 512 - if ihdr_len > 13 { 513 - src.skip(ihdr_len - 13)?; 514 - } 515 - src.skip(4)?; // skip CRC 516 - 517 - let header = PngHeader { 518 - width: be_u32(&ihdr_raw, 0), 519 - height: be_u32(&ihdr_raw, 4), 520 - bit_depth: ihdr_raw[8], 521 - color_type: ihdr_raw[9], 522 - }; 523 - if header.width == 0 || header.height == 0 { 524 - return Err("png: zero dimensions"); 525 - } 526 - if ihdr_raw[12] != 0 { 527 - return Err("png: interlaced PNGs not supported"); 528 - } 529 - match (header.color_type, header.bit_depth) { 530 - (COLOR_GREYSCALE, 1 | 2 | 4 | 8 | 16) => {} 531 - (COLOR_RGB, 8 | 16) => {} 532 - (COLOR_PALETTE, 1 | 2 | 4 | 8) => {} 533 - (COLOR_GREY_ALPHA, 8 | 16) => {} 534 - (COLOR_RGBA, 8 | 16) => {} 
535 - _ => return Err("png: unsupported colour type / bit depth"), 536 - } 537 - if header.width.saturating_mul(header.height) > MAX_PIXELS { 538 - return Err("png: image exceeds pixel limit"); 539 - } 540 - 541 - // scan for PLTE, skip to first IDAT 542 - let mut plte: Option<Vec<u8>> = None; 543 - let first_idat_len: usize; 544 - loop { 545 - src.read_exact(&mut chunk_hdr)?; 546 - let clen = be_u32(&chunk_hdr, 0) as usize; 547 - let ctype = [chunk_hdr[4], chunk_hdr[5], chunk_hdr[6], chunk_hdr[7]]; 548 - if ctype == CHUNK_IDAT { 549 - first_idat_len = clen; 550 - break; 551 - } else if ctype == CHUNK_PLTE && clen <= 768 && clen % 3 == 0 { 552 - let mut p = Vec::new(); 553 - p.try_reserve_exact(clen).map_err(|_| "png: OOM for PLTE")?; 554 - p.resize(clen, 0); 555 - src.read_exact(&mut p)?; 556 - src.skip(4)?; // CRC 557 - plte = Some(p); 558 - } else { 559 - src.skip(clen + 4)?; // skip data + CRC 560 - } 561 - } 562 - 563 - let palette_grey = build_palette_lut(header.color_type, &plte)?; 564 - drop(plte); 565 - 566 - // output dimensions 567 - let scale = { 568 - let sw = header 569 - .width 570 - .checked_add(max_w as u32 - 1) 571 - .unwrap_or(u32::MAX) 572 - / max_w as u32; 573 - let sh = header 574 - .height 575 - .checked_add(max_h as u32 - 1) 576 - .unwrap_or(u32::MAX) 577 - / max_h as u32; 578 - sw.max(sh).max(1) as usize 579 - }; 580 - let out_w = (header.width as usize / scale).max(1); 581 - let out_h = (header.height as usize / scale).max(1); 582 - let out_stride = (out_w + 7) / 8; 583 - let scanline_bytes = header.scanline_bytes(); 584 - let bpp = header.bytes_per_pixel(); 585 - let src_h = header.height as usize; 586 - 587 - log::info!( 588 - "png: streaming {}x{} -> {}x{} (scale {})", 589 - header.width, 590 - header.height, 591 - out_w, 592 - out_h, 593 - scale 594 - ); 595 - 596 - // allocate working buffers 597 - let mut output = Vec::new(); 598 - output 599 - .try_reserve_exact(out_stride * out_h) 600 - .map_err(|_| "png: OOM for output bitmap")?; 
601 - output.resize(out_stride * out_h, 0u8); 602 - 603 - let mut prev_row = vec![0u8; scanline_bytes]; 604 - let mut curr_row = vec![0u8; scanline_bytes]; 605 - let mut err_cur = vec![0i16; out_w + 2]; 606 - let mut err_nxt = vec![0i16; out_w + 2]; 607 - let row_total = 1 + scanline_bytes; 608 - let mut row_buf = vec![0u8; row_total]; 609 - let mut row_pos: usize = 0; 610 - 611 - // streaming zlib decompressor for IDAT data 612 - let decomp_layout = core::alloc::Layout::new::<miniz_oxide::inflate::core::DecompressorOxide>(); 613 - let decomp_ptr = unsafe { alloc::alloc::alloc_zeroed(decomp_layout) }; 614 - if decomp_ptr.is_null() { 615 - return Err("png: OOM for decompressor"); 616 - } 617 - let mut decomp = 618 - unsafe { Box::from_raw(decomp_ptr as *mut miniz_oxide::inflate::core::DecompressorOxide) }; 619 - let mut dict = vec![0u8; DICT_SIZE]; 620 - let mut dict_pos: usize = 0; 621 - let mut src_y: usize = 0; 622 - let mut out_y: usize = 0; 623 - 624 - // feed IDAT chunks into zlib row-by-row 625 - let mut idat_buf = [0u8; STREAMING_READ_BUF]; 626 - let mut in_avail: usize = 0; 627 - let mut idat_chunk_left = first_idat_len; 628 - let mut more_idat = true; 629 - 630 - loop { 631 - // top up input buffer from the IDAT stream 632 - while in_avail < STREAMING_READ_BUF { 633 - if idat_chunk_left > 0 { 634 - let space = STREAMING_READ_BUF - in_avail; 635 - let want = idat_chunk_left.min(space); 636 - src.read_exact(&mut idat_buf[in_avail..in_avail + want])?; 637 - in_avail += want; 638 - idat_chunk_left -= want; 639 - } else if more_idat { 640 - src.skip(4)?; // CRC 641 - src.read_exact(&mut chunk_hdr)?; 642 - let clen = be_u32(&chunk_hdr, 0) as usize; 643 - let ctype = [chunk_hdr[4], chunk_hdr[5], chunk_hdr[6], chunk_hdr[7]]; 644 - if ctype == CHUNK_IDAT { 645 - idat_chunk_left = clen; 646 - } else { 647 - more_idat = false; 648 - break; 649 - } 650 - } else { 651 - break; 652 - } 653 - } 654 - 655 - let has_more = idat_chunk_left > 0 || more_idat; 656 - let flags 
= miniz_oxide::inflate::core::inflate_flags::TINFL_FLAG_PARSE_ZLIB_HEADER 657 - | if has_more { 658 - miniz_oxide::inflate::core::inflate_flags::TINFL_FLAG_HAS_MORE_INPUT 659 - } else { 660 - 0 661 - }; 662 - 663 - let write_pos = dict_pos & (DICT_SIZE - 1); 664 - let (status, consumed, produced) = miniz_oxide::inflate::core::decompress( 665 - &mut *decomp, 666 - &idat_buf[..in_avail], 667 - &mut dict, 668 - write_pos, 669 - flags, 670 - ); 671 - 672 - if consumed > 0 && consumed < in_avail { 673 - idat_buf.copy_within(consumed..in_avail, 0); 674 - } 675 - in_avail -= consumed; 676 - 677 - // feed decompressed bytes into the scanline accumulator 678 - for i in 0..produced { 679 - row_buf[row_pos] = dict[(write_pos + i) & (DICT_SIZE - 1)]; 680 - row_pos += 1; 681 - 682 - if row_pos == row_total { 683 - let filter = row_buf[0]; 684 - curr_row.copy_from_slice(&row_buf[1..]); 685 - 686 - unfilter_row(filter, &mut curr_row, &prev_row, bpp); 687 - 688 - if src_y % scale == 0 && out_y < out_h { 689 - dither_row( 690 - &curr_row, 691 - &header, 692 - &palette_grey, 693 - scale, 694 - out_w, 695 - &mut err_cur, 696 - &mut err_nxt, 697 - &mut output[out_y * out_stride..(out_y + 1) * out_stride], 698 - ); 699 - out_y += 1; 700 - core::mem::swap(&mut err_cur, &mut err_nxt); 701 - err_nxt.fill(0); 702 - } 703 - 704 - core::mem::swap(&mut prev_row, &mut curr_row); 705 - curr_row.fill(0); 706 - row_pos = 0; 707 - src_y += 1; 708 - } 709 - } 710 - 711 - dict_pos += produced; 712 - 713 - match status { 714 - miniz_oxide::inflate::TINFLStatus::Done => break, 715 - miniz_oxide::inflate::TINFLStatus::NeedsMoreInput => { 716 - if !has_more && in_avail == 0 { 717 - return Err("png: truncated IDAT stream"); 718 - } 719 - if consumed == 0 && produced == 0 && in_avail >= STREAMING_READ_BUF { 720 - return Err("png: IDAT decompression stuck"); 721 - } 722 - } 723 - miniz_oxide::inflate::TINFLStatus::HasMoreOutput => { 724 - if produced == 0 && consumed == 0 { 725 - return Err("png: 
decompression stalled (output)"); 726 - } 727 - } 728 - _ => return Err("png: IDAT decompression error"), 729 - } 730 - } 731 - 732 - if src_y < src_h { 733 - log::warn!("png: expected {} rows, got {}", src_h, src_y); 734 - } 735 - 736 - Ok(DecodedImage { 737 - width: out_w as u16, 738 - height: out_y as u16, 739 - data: output, 740 - stride: out_stride, 741 - }) 742 - } 743 - 744 - // IHDR / chunk parsing 745 - 746 - struct PngHeader { 747 - width: u32, 748 - height: u32, 749 - bit_depth: u8, 750 - color_type: u8, 751 - } 752 - 753 - impl PngHeader { 754 - // bytes per complete pixel; filter stride for Sub/Paeth; 1 for sub-byte depths 755 - fn bytes_per_pixel(&self) -> usize { 756 - let channels: usize = match self.color_type { 757 - COLOR_GREYSCALE => 1, 758 - COLOR_RGB => 3, 759 - COLOR_PALETTE => 1, 760 - COLOR_GREY_ALPHA => 2, 761 - COLOR_RGBA => 4, 762 - _ => 1, 763 - }; 764 - if self.bit_depth >= 8 { 765 - channels * (self.bit_depth as usize / 8) 766 - } else { 767 - 1 // sub-byte packed 768 - } 769 - } 770 - 771 - // byte length of one unfiltered row (without the leading filter byte) 772 - fn scanline_bytes(&self) -> usize { 773 - let bits_per_pixel: usize = match self.color_type { 774 - COLOR_GREYSCALE => self.bit_depth as usize, 775 - COLOR_RGB => 3 * self.bit_depth as usize, 776 - COLOR_PALETTE => self.bit_depth as usize, 777 - COLOR_GREY_ALPHA => 2 * self.bit_depth as usize, 778 - COLOR_RGBA => 4 * self.bit_depth as usize, 779 - _ => self.bit_depth as usize, 780 - }; 781 - (self.width as usize * bits_per_pixel + 7) / 8 782 - } 783 - } 784 - 785 - // big-endian u32 (PNG uses network byte order) 786 - #[inline] 787 - fn be_u32(d: &[u8], o: usize) -> u32 { 788 - u32::from_be_bytes([d[o], d[o + 1], d[o + 2], d[o + 3]]) 789 - } 790 - 791 - // iterator over PNG chunks; yields (type, data) pairs 792 - struct ChunkIter<'a> { 793 - data: &'a [u8], 794 - pos: usize, 795 - } 796 - 797 - impl<'a> ChunkIter<'a> { 798 - fn new(data: &'a [u8]) -> Result<Self, &'static 
str> { 799 - if data.len() < 8 || data[..8] != PNG_SIG { 800 - return Err("png: invalid signature"); 801 - } 802 - Ok(Self { data, pos: 8 }) 803 - } 804 - } 805 - 806 - impl<'a> Iterator for ChunkIter<'a> { 807 - type Item = ([u8; 4], &'a [u8]); 808 - 809 - fn next(&mut self) -> Option<Self::Item> { 810 - if self.pos + 12 > self.data.len() { 811 - return None; 812 - } 813 - let len = be_u32(self.data, self.pos) as usize; 814 - let ctype: [u8; 4] = self.data[self.pos + 4..self.pos + 8].try_into().ok()?; 815 - let data_start = self.pos + 8; 816 - let data_end = data_start + len; 817 - if data_end + 4 > self.data.len() { 818 - return None; 819 - } 820 - self.pos = data_end + 4; // CRC 821 - Some((ctype, &self.data[data_start..data_end])) 822 - } 823 - } 824 - 825 - fn parse_ihdr(data: &[u8]) -> Result<PngHeader, &'static str> { 826 - let mut chunks = ChunkIter::new(data)?; 827 - let (ctype, cdata) = chunks.next().ok_or("png: missing IHDR")?; 828 - if ctype != CHUNK_IHDR || cdata.len() < 13 { 829 - return Err("png: invalid IHDR"); 830 - } 831 - 832 - let header = PngHeader { 833 - width: be_u32(cdata, 0), 834 - height: be_u32(cdata, 4), 835 - bit_depth: cdata[8], 836 - color_type: cdata[9], 837 - }; 838 - 839 - if header.width == 0 || header.height == 0 { 840 - return Err("png: zero dimensions"); 841 - } 842 - if cdata[12] != 0 { 843 - return Err("png: interlaced PNGs not supported"); 844 - } 845 - 846 - match (header.color_type, header.bit_depth) { 847 - (COLOR_GREYSCALE, 1 | 2 | 4 | 8 | 16) => {} 848 - (COLOR_RGB, 8 | 16) => {} 849 - (COLOR_PALETTE, 1 | 2 | 4 | 8) => {} 850 - (COLOR_GREY_ALPHA, 8 | 16) => {} 851 - (COLOR_RGBA, 8 | 16) => {} 852 - _ => return Err("png: unsupported colour type / bit depth"), 853 - } 854 - 855 - Ok(header) 856 - } 857 - 858 - // concatenate all IDAT chunk payloads into a single buffer 859 - fn collect_idat(data: &[u8]) -> Result<Vec<u8>, &'static str> { 860 - let chunks = ChunkIter::new(data)?; 861 - let total: usize = chunks 862 - 
.filter(|(t, _)| *t == CHUNK_IDAT) 863 - .map(|(_, d)| d.len()) 864 - .sum(); 865 - if total == 0 { 866 - return Err("png: no IDAT data"); 867 - } 868 - 869 - let mut idat = Vec::new(); 870 - idat.try_reserve_exact(total) 871 - .map_err(|_| "png: IDAT too large for memory")?; 872 - for (ctype, cdata) in ChunkIter::new(data).unwrap() { 873 - if ctype == CHUNK_IDAT { 874 - idat.extend_from_slice(cdata); 875 - } 876 - } 877 - Ok(idat) 878 - } 879 - 880 - // read PLTE chunk if present; up to 768 bytes (256 x RGB) 881 - fn collect_plte(data: &[u8]) -> Result<Option<Vec<u8>>, &'static str> { 882 - for (ctype, cdata) in ChunkIter::new(data)? { 883 - if ctype == CHUNK_PLTE { 884 - if cdata.len() % 3 != 0 || cdata.len() > 768 { 885 - return Err("png: invalid PLTE"); 886 - } 887 - let mut plte = Vec::new(); 888 - plte.try_reserve_exact(cdata.len()) 889 - .map_err(|_| "png: OOM for PLTE")?; 890 - plte.extend_from_slice(cdata); 891 - return Ok(Some(plte)); 892 - } 893 - } 894 - Ok(None) 895 - } 896 - 897 - // build a 256-entry greyscale LUT from the palette 898 - fn build_palette_lut(color_type: u8, plte: &Option<Vec<u8>>) -> Result<[u8; 256], &'static str> { 899 - let mut lut = [0u8; 256]; 900 - if color_type == COLOR_PALETTE { 901 - let plte_data = plte.as_ref().ok_or("png: palette image without PLTE")?; 902 - for i in 0..plte_data.len() / 3 { 903 - let r = plte_data[i * 3] as u16; 904 - let g = plte_data[i * 3 + 1] as u16; 905 - let b = plte_data[i * 3 + 2] as u16; 906 - // BT.601 luma: 0.299R + 0.587G + 0.114B 907 - lut[i] = ((r * 77 + g * 150 + b * 29) >> 8) as u8; 908 - } 909 - } 910 - Ok(lut) 911 - } 912 - 913 - // unfiltering 914 - 915 - // reconstruct one scanline in-place given the previous unfiltered row; bpp = byte stride 916 - fn unfilter_row(filter: u8, row: &mut [u8], prev: &[u8], bpp: usize) { 917 - let len = row.len(); 918 - match filter { 919 - FILTER_NONE => {} 920 - FILTER_SUB => { 921 - for i in bpp..len { 922 - row[i] = row[i].wrapping_add(row[i - bpp]); 
923 - } 924 - } 925 - FILTER_UP => { 926 - for i in 0..len { 927 - row[i] = row[i].wrapping_add(prev[i]); 928 - } 929 - } 930 - FILTER_AVERAGE => { 931 - for i in 0..len { 932 - let a = if i >= bpp { row[i - bpp] as u16 } else { 0 }; 933 - let b = prev[i] as u16; 934 - row[i] = row[i].wrapping_add(((a + b) / 2) as u8); 935 - } 936 - } 937 - FILTER_PAETH => { 938 - for i in 0..len { 939 - let a = if i >= bpp { row[i - bpp] } else { 0 }; 940 - let b = prev[i]; 941 - let c = if i >= bpp { prev[i - bpp] } else { 0 }; 942 - row[i] = row[i].wrapping_add(paeth(a, b, c)); 943 - } 944 - } 945 - _ => {} // unknown filter; treat as None (best-effort) 946 - } 947 - } 948 - 949 - #[inline] 950 - fn paeth(a: u8, b: u8, c: u8) -> u8 { 951 - let a = a as i16; 952 - let b = b as i16; 953 - let c = c as i16; 954 - let p = a + b - c; 955 - let pa = (p - a).unsigned_abs(); 956 - let pb = (p - b).unsigned_abs(); 957 - let pc = (p - c).unsigned_abs(); 958 - if pa <= pb && pa <= pc { 959 - a as u8 960 - } else if pb <= pc { 961 - b as u8 962 - } else { 963 - c as u8 964 - } 965 - } 966 - 967 - // pixel -> greyscale conversion 968 - 969 - // sample one pixel from an unfiltered scanline; return 0-255 grey. 970 - // alpha pre-blended against white (e-paper background). 
971 - #[inline] 972 - fn pixel_to_grey(row: &[u8], x: usize, hdr: &PngHeader, pal: &[u8; 256]) -> u8 { 973 - match (hdr.color_type, hdr.bit_depth) { 974 - // greyscale 975 - (COLOR_GREYSCALE, 8) => row[x], 976 - (COLOR_GREYSCALE, 16) => row[x * 2], // high byte only 977 - (COLOR_GREYSCALE, bd) => unpack_sub_byte(row, x, bd), 978 - 979 - // RGB 980 - (COLOR_RGB, 8) => rgb_to_grey(row[x * 3], row[x * 3 + 1], row[x * 3 + 2]), 981 - (COLOR_RGB, 16) => rgb_to_grey(row[x * 6], row[x * 6 + 2], row[x * 6 + 4]), 982 - 983 - // palette 984 - (COLOR_PALETTE, 8) => pal[row[x] as usize], 985 - (COLOR_PALETTE, bd) => { 986 - let idx = unpack_sub_byte_raw(row, x, bd); 987 - pal[idx as usize] 988 - } 989 - 990 - // greyscale + alpha 991 - (COLOR_GREY_ALPHA, 8) => blend_white(row[x * 2], row[x * 2 + 1]), 992 - (COLOR_GREY_ALPHA, 16) => blend_white(row[x * 4], row[x * 4 + 2]), 993 - 994 - // RGBA 995 - (COLOR_RGBA, 8) => { 996 - let g = rgb_to_grey(row[x * 4], row[x * 4 + 1], row[x * 4 + 2]); 997 - blend_white(g, row[x * 4 + 3]) 998 - } 999 - (COLOR_RGBA, 16) => { 1000 - let g = rgb_to_grey(row[x * 8], row[x * 8 + 2], row[x * 8 + 4]); 1001 - blend_white(g, row[x * 8 + 6]) 1002 - } 1003 - 1004 - _ => 128, // unreachable for validated header 1005 - } 1006 - } 1007 - 1008 - // BT.601 luma from 8-bit RGB channels 1009 - #[inline] 1010 - fn rgb_to_grey(r: u8, g: u8, b: u8) -> u8 { 1011 - ((r as u16 * 77 + g as u16 * 150 + b as u16 * 29) >> 8) as u8 1012 - } 1013 - 1014 - // alpha-blend grey against white: out = grey*a/255 + 255*(255-a)/255 1015 - #[inline] 1016 - fn blend_white(grey: u8, alpha: u8) -> u8 { 1017 - let g = grey as u16; 1018 - let a = alpha as u16; 1019 - ((g * a + 255 * (255 - a)) / 255) as u8 1020 - } 1021 - 1022 - // unpack a sub-byte greyscale sample (1/2/4 bit) and scale to 0-255 1023 - #[inline] 1024 - fn unpack_sub_byte(row: &[u8], x: usize, bit_depth: u8) -> u8 { 1025 - let raw = unpack_sub_byte_raw(row, x, bit_depth); 1026 - let max = (1u16 << bit_depth) - 1; 1027 
- (raw as u16 * 255 / max) as u8 1028 - } 1029 - 1030 - // unpack a sub-byte sample without rescaling (for palette index) 1031 - #[inline] 1032 - fn unpack_sub_byte_raw(row: &[u8], x: usize, bit_depth: u8) -> u8 { 1033 - let bpp = bit_depth as usize; 1034 - let ppb = 8 / bpp; // pixels per byte 1035 - let byte_idx = x / ppb; 1036 - let bit_offset = (ppb - 1 - x % ppb) * bpp; 1037 - let mask = (1u8 << bpp) - 1; 1038 - (row[byte_idx] >> bit_offset) & mask 1039 - } 1040 - 1041 - // Floyd-Steinberg dithering 1042 - 1043 - // dither one source row into 1-bit output; pick every scale-th pixel 1044 - fn dither_row( 1045 - src_row: &[u8], 1046 - hdr: &PngHeader, 1047 - pal: &[u8; 256], 1048 - scale: usize, 1049 - out_w: usize, 1050 - err_cur: &mut [i16], 1051 - err_nxt: &mut [i16], 1052 - out_row: &mut [u8], 1053 - ) { 1054 - for ox in 0..out_w { 1055 - let sx = ox * scale; 1056 - let grey = pixel_to_grey(src_row, sx, hdr, pal) as i16; 1057 - // add accumulated error (offset by 1 for the left sentinel) 1058 - let val = (grey + err_cur[ox + 1]).clamp(0, 255); 1059 - // val < 128 -> black (bit set), else white (bit clear) 1060 - let black = val < 128; 1061 - let quantised = if black { 0i16 } else { 255 }; 1062 - let err = val - quantised; 1063 - 1064 - if black { 1065 - out_row[ox / 8] |= 1 << (7 - (ox & 7)); 1066 - } 1067 - 1068 - // distribute error to neighbours (Floyd-Steinberg weights) 1069 - err_cur[ox + 2] += err * 7 / 16; // right 1070 - err_nxt[ox] += err * 3 / 16; // below-left 1071 - err_nxt[ox + 1] += err * 5 / 16; // below 1072 - err_nxt[ox + 2] += err / 16; // below-right 1073 - } 1074 - }
-268
smol-epub/src/xml.rs
//! Minimal XML tag/attribute scanner for EPUB metadata.
//!
//! Not a general-purpose XML parser — handles `container.xml` and OPF
//! documents only. Single-pass, forward-only, namespace-aware, lenient.

/// Extract the value of an attribute from a raw XML opening-tag byte slice.
///
/// `tag_bytes` should start at the tag name (after `<`) and end before `>`.
/// Returns `None` if the attribute is not found.
///
/// Attribute names are compared exactly (including any namespace prefix).
/// Only quoted values (`"…"` or `'…'`) are returned; unquoted values and
/// valueless attributes are skipped.
pub fn get_attr<'a>(tag_bytes: &'a [u8], attr_name: &[u8]) -> Option<&'a [u8]> {
    let mut pos = 0;
    let len = tag_bytes.len();

    // skip the tag name itself
    while pos < len && !is_ws(tag_bytes[pos]) && tag_bytes[pos] != b'>' && tag_bytes[pos] != b'/' {
        pos += 1;
    }

    while pos < len {
        while pos < len && is_ws(tag_bytes[pos]) {
            pos += 1;
        }
        if pos >= len || tag_bytes[pos] == b'>' || tag_bytes[pos] == b'/' {
            break;
        }

        // attribute name
        let name_start = pos;
        while pos < len
            && tag_bytes[pos] != b'='
            && !is_ws(tag_bytes[pos])
            && tag_bytes[pos] != b'>'
            && tag_bytes[pos] != b'/'
        {
            pos += 1;
        }
        let name_end = pos;

        while pos < len && is_ws(tag_bytes[pos]) {
            pos += 1;
        }
        if pos >= len || tag_bytes[pos] != b'=' {
            // attribute without a value; move on to the next one
            continue;
        }
        pos += 1;
        while pos < len && is_ws(tag_bytes[pos]) {
            pos += 1;
        }
        if pos >= len {
            break;
        }

        let quote = tag_bytes[pos];
        if quote != b'"' && quote != b'\'' {
            // unquoted value: skip it
            while pos < len && !is_ws(tag_bytes[pos]) && tag_bytes[pos] != b'>' {
                pos += 1;
            }
            continue;
        }
        pos += 1;

        let value_start = pos;
        while pos < len && tag_bytes[pos] != quote {
            pos += 1;
        }
        let value_end = pos;
        if pos < len {
            pos += 1; // closing quote
        }

        if &tag_bytes[name_start..name_end] == attr_name {
            return Some(&tag_bytes[value_start..value_end]);
        }
    }

    None
}

/// Return the text content of the first element whose local name matches
/// `tag_name` (namespace-aware: `dc:title` matches `title`).
///
/// The returned slice runs from the end of the opening tag to the next
/// `</` and is trimmed of ASCII whitespace. Self-closing elements
/// (`<title/>`) have no text and are skipped. Returns `None` if no
/// matching element with a following `</` is found.
pub fn tag_text<'a>(data: &'a [u8], tag_name: &[u8]) -> Option<&'a [u8]> {
    let mut pos = 0;

    while pos < data.len() {
        let Some(lt) = find_byte(&data[pos..], b'<') else {
            break;
        };
        let lt = pos + lt;
        pos = lt + 1;

        if pos >= data.len() {
            break;
        }

        // closing tags, processing instructions, comments, CDATA, DOCTYPE
        let first = data[pos];
        if first == b'/' || first == b'?' || first == b'!' {
            pos = skip_construct(data, lt);
            continue;
        }

        let name_start = pos;
        while pos < data.len() && !is_tag_delim(data[pos]) {
            pos += 1;
        }
        let name = &data[name_start..pos];

        if !tag_name_matches(name, tag_name) {
            pos = skip_to_gt(data, pos);
            continue;
        }

        // `tag_end` is one past the closing `>`, so for a self-closing tag the
        // `/` sits two bytes back, not one.
        let tag_end = skip_to_gt(data, pos);
        let self_closing =
            tag_end >= 2 && data[tag_end - 1] == b'>' && data[tag_end - 2] == b'/';
        if self_closing {
            pos = tag_end;
            continue;
        }
        pos = tag_end;

        let text_start = pos;
        while pos + 1 < data.len() {
            if data[pos] == b'<' && data[pos + 1] == b'/' {
                return Some(trim_ws(&data[text_start..pos]));
            }
            pos += 1;
        }
        break;
    }

    None
}

/// Invoke `cb` for every opening tag whose local name matches `tag_name`
/// (namespace-aware). The callback receives the tag body bytes (from the
/// tag name up to but not including `>`).
pub fn for_each_tag<'a>(data: &'a [u8], tag_name: &[u8], mut cb: impl FnMut(&'a [u8])) {
    let mut pos = 0;

    while pos < data.len() {
        let Some(lt) = find_byte(&data[pos..], b'<') else {
            break;
        };
        let lt = pos + lt;
        pos = lt + 1;

        if pos >= data.len() {
            break;
        }

        let first = data[pos];
        if first == b'/' || first == b'?' || first == b'!' {
            pos = skip_construct(data, lt);
            continue;
        }

        let name_start = pos;
        while pos < data.len() && !is_tag_delim(data[pos]) {
            pos += 1;
        }
        let name = &data[name_start..pos];

        if !tag_name_matches(name, tag_name) {
            pos = skip_to_gt(data, pos);
            continue;
        }

        let content_start = name_start;
        let mut end = pos;
        while end < data.len() && data[end] != b'>' {
            end += 1;
        }

        cb(&data[content_start..end]);

        pos = if end < data.len() { end + 1 } else { end };
    }
}

// namespace-aware name match: "dc:title" matches "title"
fn tag_name_matches(full_name: &[u8], target: &[u8]) -> bool {
    if full_name == target {
        return true;
    }
    if full_name.len() > target.len() + 1 {
        let colon_pos = full_name.len() - target.len() - 1;
        if full_name[colon_pos] == b':' && &full_name[colon_pos + 1..] == target {
            return true;
        }
    }
    false
}

// index of the first occurrence of `needle`, if any
fn find_byte(haystack: &[u8], needle: u8) -> Option<usize> {
    haystack.iter().position(|&b| b == needle)
}

#[inline]
fn is_ws(b: u8) -> bool {
    matches!(b, b' ' | b'\t' | b'\n' | b'\r')
}

// bytes that terminate a tag name
#[inline]
fn is_tag_delim(b: u8) -> bool {
    matches!(b, b' ' | b'\t' | b'\n' | b'\r' | b'>' | b'/')
}

// index one past the next `>` at or after `pos`; `data.len()` if there is none
fn skip_to_gt(data: &[u8], mut pos: usize) -> usize {
    while pos < data.len() {
        if data[pos] == b'>' {
            return pos + 1;
        }
        pos += 1;
    }
    data.len()
}

// skip a non-element construct starting at `<` (index `lt_pos`): closing tag,
// `<?...?>`, `<!--...-->`, `<![CDATA[...]]>` or other `<!...>`; returns the
// index just past it, or `data.len()` if it is unterminated
fn skip_construct(data: &[u8], lt_pos: usize) -> usize {
    let pos = lt_pos + 1;
    if pos >= data.len() {
        return data.len();
    }

    match data[pos] {
        b'/' => skip_to_gt(data, pos),
        b'?' => {
            let mut p = pos + 1;
            while p + 1 < data.len() {
                if data[p] == b'?' && data[p + 1] == b'>' {
                    return p + 2;
                }
                p += 1;
            }
            data.len()
        }
        b'!' => {
            let rest = &data[pos + 1..];
            if rest.starts_with(b"--") {
                let mut p = pos + 3;
                while p + 2 < data.len() {
                    if data[p] == b'-' && data[p + 1] == b'-' && data[p + 2] == b'>' {
                        return p + 3;
                    }
                    p += 1;
                }
                data.len()
            } else if rest.starts_with(b"[CDATA[") {
                let mut p = pos + 8;
                while p + 2 < data.len() {
                    if data[p] == b']' && data[p + 1] == b']' && data[p + 2] == b'>' {
                        return p + 3;
                    }
                    p += 1;
                }
                data.len()
            } else {
                skip_to_gt(data, pos)
            }
        }
        _ => skip_to_gt(data, lt_pos),
    }
}

// strip leading and trailing ASCII whitespace
fn trim_ws(data: &[u8]) -> &[u8] {
    let start = data.iter().position(|b| !is_ws(*b)).unwrap_or(data.len());
    let end = data
        .iter()
        .rposition(|b| !is_ws(*b))
        .map(|p| p + 1)
        .unwrap_or(start);
    if start >= end { &[] } else { &data[start..end] }
}
-412
smol-epub/src/zip.rs
··· 1 - //! ZIP central-directory parser and streaming entry extraction. 2 - //! 3 - //! [`ZipIndex`] holds up to 256 entries inline (~5 KB); entry names are 4 - //! heap-allocated during parse. DEFLATE decompression streams in 4 KB 5 - //! chunks; `try_reserve` is used throughout for graceful OOM handling. 6 - 7 - use alloc::boxed::Box; 8 - use alloc::vec; 9 - use alloc::vec::Vec; 10 - 11 - const MAX_ENTRY_SIZE: u32 = 192 * 1024; // max uncompressed entry size (OOM guard) 12 - 13 - const EOCD_SIG: u32 = 0x0605_4b50; 14 - const CD_SIG: u32 = 0x0201_4b50; 15 - const LOCAL_SIG: u32 = 0x0403_4b50; 16 - 17 - /// ZIP compression method: stored (no compression). 18 - pub const METHOD_STORED: u16 = 0; 19 - /// ZIP compression method: DEFLATE. 20 - pub const METHOD_DEFLATE: u16 = 8; 21 - 22 - #[inline] 23 - fn le_u16(d: &[u8], o: usize) -> u16 { 24 - u16::from_le_bytes([d[o], d[o + 1]]) 25 - } 26 - 27 - #[inline] 28 - fn le_u32(d: &[u8], o: usize) -> u32 { 29 - u32::from_le_bytes([d[o], d[o + 1], d[o + 2], d[o + 3]]) 30 - } 31 - 32 - /// A single entry in the ZIP central directory. 33 - #[derive(Clone, Copy)] 34 - pub struct ZipEntry { 35 - /// Byte offset into the name pool where this entry's name starts. 36 - pub name_start: u16, 37 - /// Length of the entry name in bytes. 38 - pub name_len: u16, 39 - /// Byte offset of the local file header in the ZIP file. 40 - pub local_offset: u32, 41 - /// Compressed size in bytes. 42 - pub comp_size: u32, 43 - /// Uncompressed size in bytes. 44 - pub uncomp_size: u32, 45 - /// Compression method ([`METHOD_STORED`] or [`METHOD_DEFLATE`]). 46 - pub method: u16, 47 - } 48 - 49 - impl ZipEntry { 50 - const EMPTY: Self = Self { 51 - name_start: 0, 52 - name_len: 0, 53 - local_offset: 0, 54 - comp_size: 0, 55 - uncomp_size: 0, 56 - method: 0, 57 - }; 58 - } 59 - 60 - /// Maximum number of entries the [`ZipIndex`] can hold. 61 - pub const MAX_ENTRIES: usize = 256; 62 - 63 - /// In-memory index of a ZIP archive's central directory. 
64 - /// 65 - /// Holds up to [`MAX_ENTRIES`] entries inline (~5 KB); entry names are 66 - /// stored in a single heap-allocated byte pool. 67 - pub struct ZipIndex { 68 - entries: [ZipEntry; MAX_ENTRIES], 69 - count: u16, 70 - names: Vec<u8>, 71 - } 72 - 73 - impl Default for ZipIndex { 74 - fn default() -> Self { 75 - Self::new() 76 - } 77 - } 78 - 79 - impl ZipIndex { 80 - /// Create a new, empty index. 81 - pub const fn new() -> Self { 82 - Self { 83 - entries: [ZipEntry::EMPTY; MAX_ENTRIES], 84 - count: 0, 85 - names: Vec::new(), 86 - } 87 - } 88 - 89 - /// Remove all entries and free the name pool. 90 - pub fn clear(&mut self) { 91 - self.count = 0; 92 - self.names = Vec::new(); 93 - } 94 - 95 - /// Parse the End-of-Central-Directory record from the last bytes of a 96 - /// ZIP file. Returns `(cd_offset, cd_size)`. 97 - /// 98 - /// `tail` should be the final ≤ 65557 bytes of the file (22 bytes is 99 - /// the minimum for a ZIP with no comment). 100 - pub fn parse_eocd(tail: &[u8], file_size: u32) -> Result<(u32, u32), &'static str> { 101 - if tail.len() < 22 { 102 - return Err("zip: tail too short for EOCD"); 103 - } 104 - 105 - let mut i = tail.len() - 22; 106 - loop { 107 - if le_u32(tail, i) == EOCD_SIG { 108 - break; 109 - } 110 - if i == 0 { 111 - return Err("zip: EOCD signature not found"); 112 - } 113 - i -= 1; 114 - } 115 - 116 - let cd_size = le_u32(tail, i + 12); 117 - let cd_offset = le_u32(tail, i + 16); 118 - 119 - if cd_offset.saturating_add(cd_size) > file_size { 120 - return Err("zip: CD extends past EOF"); 121 - } 122 - 123 - Ok((cd_offset, cd_size)) 124 - } 125 - 126 - /// Parse a central-directory blob into this index, replacing any 127 - /// previously stored entries. 
128 - pub fn parse_central_directory(&mut self, cd: &[u8]) -> Result<(), &'static str> { 129 - self.count = 0; 130 - self.names.clear(); 131 - let _ = self.names.try_reserve(cd.len().min(8192)); 132 - 133 - let mut pos = 0; 134 - 135 - while pos + 46 <= cd.len() { 136 - if le_u32(cd, pos) != CD_SIG { 137 - break; 138 - } 139 - 140 - let method = le_u16(cd, pos + 10); 141 - let comp_size = le_u32(cd, pos + 20); 142 - let uncomp_size = le_u32(cd, pos + 24); 143 - let name_len = le_u16(cd, pos + 28) as usize; 144 - let extra_len = le_u16(cd, pos + 30) as usize; 145 - let comment_len = le_u16(cd, pos + 32) as usize; 146 - let local_offset = le_u32(cd, pos + 42); 147 - 148 - let name_start_in_cd = pos + 46; 149 - let entry_end = name_start_in_cd + name_len + extra_len + comment_len; 150 - 151 - if entry_end > cd.len() { 152 - return Err("zip: CD entry extends past buffer"); 153 - } 154 - 155 - let idx = self.count as usize; 156 - if idx < MAX_ENTRIES { 157 - let ns = self.names.len(); 158 - if ns + name_len <= u16::MAX as usize && self.names.try_reserve(name_len).is_ok() { 159 - self.names 160 - .extend_from_slice(&cd[name_start_in_cd..name_start_in_cd + name_len]); 161 - 162 - self.entries[idx] = ZipEntry { 163 - name_start: ns as u16, 164 - name_len: name_len as u16, 165 - local_offset, 166 - comp_size, 167 - uncomp_size, 168 - method, 169 - }; 170 - self.count += 1; 171 - } 172 - } 173 - 174 - pos = entry_end; 175 - } 176 - 177 - if self.count == 0 { 178 - return Err("zip: no entries in CD"); 179 - } 180 - 181 - Ok(()) 182 - } 183 - 184 - /// Number of entries in the index. 185 - #[inline] 186 - pub fn count(&self) -> usize { 187 - self.count as usize 188 - } 189 - 190 - /// Return a reference to the entry at `idx`. Panics if out of range. 191 - #[inline] 192 - pub fn entry(&self, idx: usize) -> &ZipEntry { 193 - assert!(idx < self.count as usize); 194 - &self.entries[idx] 195 - } 196 - 197 - /// Return the filename of the entry at `idx` as a `&str`. 
198 - pub fn entry_name(&self, idx: usize) -> &str { 199 - let e = self.entry(idx); 200 - let start = e.name_start as usize; 201 - let end = start + e.name_len as usize; 202 - core::str::from_utf8(&self.names[start..end]).unwrap_or("") 203 - } 204 - 205 - /// Find an entry by exact (case-sensitive) name. Returns its index. 206 - pub fn find(&self, name: &str) -> Option<usize> { 207 - let name_bytes = name.as_bytes(); 208 - for i in 0..self.count as usize { 209 - let e = &self.entries[i]; 210 - let start = e.name_start as usize; 211 - let end = start + e.name_len as usize; 212 - if &self.names[start..end] == name_bytes { 213 - return Some(i); 214 - } 215 - } 216 - None 217 - } 218 - 219 - /// Find an entry by case-insensitive ASCII name. Returns its index. 220 - pub fn find_icase(&self, name: &str) -> Option<usize> { 221 - let target = name.as_bytes(); 222 - for i in 0..self.count as usize { 223 - let e = &self.entries[i]; 224 - let start = e.name_start as usize; 225 - let end = start + e.name_len as usize; 226 - let entry_name = &self.names[start..end]; 227 - if entry_name.eq_ignore_ascii_case(target) { 228 - return Some(i); 229 - } 230 - } 231 - None 232 - } 233 - 234 - /// Given the first 30+ bytes of a local file header, return the number 235 - /// of bytes to skip past the header to reach the entry's data. 236 - pub fn local_header_data_skip(header: &[u8]) -> Result<u32, &'static str> { 237 - if header.len() < 30 { 238 - return Err("zip: local header too short"); 239 - } 240 - if le_u32(header, 0) != LOCAL_SIG { 241 - return Err("zip: bad local header signature"); 242 - } 243 - let name_len = le_u16(header, 26) as u32; 244 - let extra_len = le_u16(header, 28) as u32; 245 - Ok(30 + name_len + extra_len) 246 - } 247 - } 248 - 249 - // ── entry extraction ──────────────────────────────────────────────── 250 - 251 - /// Extract a complete ZIP entry into a heap-allocated `Vec<u8>`. 252 - /// 253 - /// Supports both stored and DEFLATE-compressed entries. 
The `read_fn` 254 - /// closure reads bytes at a given absolute offset. 255 - pub fn extract_entry<E, F>( 256 - entry: &ZipEntry, 257 - local_offset: u32, 258 - mut read_fn: F, 259 - ) -> Result<Vec<u8>, &'static str> 260 - where 261 - F: FnMut(u32, &mut [u8]) -> Result<usize, E>, 262 - { 263 - let mut header = [0u8; 30]; 264 - read_fn(local_offset, &mut header).map_err(|_| "zip: read local header failed")?; 265 - let skip = ZipIndex::local_header_data_skip(&header)?; 266 - let data_offset = local_offset + skip; 267 - 268 - if entry.uncomp_size > MAX_ENTRY_SIZE { 269 - return Err("zip: entry too large"); 270 - } 271 - 272 - match entry.method { 273 - METHOD_STORED => extract_stored(entry, data_offset, &mut read_fn), 274 - METHOD_DEFLATE => extract_deflate(entry, data_offset, &mut read_fn), 275 - _ => Err("zip: unsupported compression method"), 276 - } 277 - } 278 - 279 - fn extract_stored<E, F>( 280 - entry: &ZipEntry, 281 - data_offset: u32, 282 - read_fn: &mut F, 283 - ) -> Result<Vec<u8>, &'static str> 284 - where 285 - F: FnMut(u32, &mut [u8]) -> Result<usize, E>, 286 - { 287 - let size = entry.uncomp_size as usize; 288 - log::info!("zip: stored entry ({} bytes)", size); 289 - 290 - let mut out = Vec::new(); 291 - out.try_reserve_exact(size) 292 - .map_err(|_| "zip: chapter too large for memory")?; 293 - out.resize(size, 0); 294 - read_all(data_offset, &mut out, read_fn)?; 295 - Ok(out) 296 - } 297 - 298 - const DEFLATE_READ_BUF: usize = 4096; 299 - 300 - fn extract_deflate<E, F>( 301 - entry: &ZipEntry, 302 - data_offset: u32, 303 - read_fn: &mut F, 304 - ) -> Result<Vec<u8>, &'static str> 305 - where 306 - F: FnMut(u32, &mut [u8]) -> Result<usize, E>, 307 - { 308 - use miniz_oxide::inflate::TINFLStatus; 309 - use miniz_oxide::inflate::core::DecompressorOxide; 310 - use miniz_oxide::inflate::core::decompress; 311 - use miniz_oxide::inflate::core::inflate_flags; 312 - 313 - let comp_size = entry.comp_size as usize; 314 - let uncomp_size = entry.uncomp_size as 
usize; 315 - 316 - log::info!("zip: deflate stream {} -> {} bytes", comp_size, uncomp_size); 317 - 318 - let mut output = Vec::new(); 319 - output 320 - .try_reserve_exact(uncomp_size) 321 - .map_err(|_| "zip: chapter too large for memory")?; 322 - output.resize(uncomp_size, 0); 323 - 324 - // ~11KB DecompressorOxide; alloc zeroed directly (Box::new overflows stack) 325 - let decomp_ptr = 326 - unsafe { alloc::alloc::alloc_zeroed(core::alloc::Layout::new::<DecompressorOxide>()) }; 327 - if decomp_ptr.is_null() { 328 - return Err("zip: out of memory for decompressor"); 329 - } 330 - let mut decomp = unsafe { Box::from_raw(decomp_ptr as *mut DecompressorOxide) }; 331 - let mut out_pos: usize = 0; 332 - 333 - let mut rbuf = vec![0u8; DEFLATE_READ_BUF]; 334 - let mut in_avail: usize = 0; 335 - let mut file_pos = data_offset; 336 - let mut comp_left = comp_size; 337 - 338 - loop { 339 - // top up compressed read buffer 340 - if in_avail < DEFLATE_READ_BUF && comp_left > 0 { 341 - let space = DEFLATE_READ_BUF - in_avail; 342 - let want = space.min(comp_left); 343 - match read_fn(file_pos, &mut rbuf[in_avail..in_avail + want]) { 344 - Ok(n) if n > 0 => { 345 - file_pos += n as u32; 346 - comp_left -= n; 347 - in_avail += n; 348 - } 349 - Ok(_) => { 350 - comp_left = 0; 351 - } 352 - Err(_) => return Err("zip: read failed during deflate"), 353 - } 354 - } 355 - 356 - if in_avail == 0 && out_pos == 0 { 357 - return Err("zip: empty deflate stream"); 358 - } 359 - 360 - let flags = inflate_flags::TINFL_FLAG_USING_NON_WRAPPING_OUTPUT_BUF 361 - | if comp_left > 0 { 362 - inflate_flags::TINFL_FLAG_HAS_MORE_INPUT 363 - } else { 364 - 0 365 - }; 366 - 367 - let (status, consumed, produced) = 368 - decompress(&mut *decomp, &rbuf[..in_avail], &mut output, out_pos, flags); 369 - 370 - out_pos += produced; 371 - 372 - if consumed > 0 && consumed < in_avail { 373 - rbuf.copy_within(consumed..in_avail, 0); 374 - } 375 - in_avail -= consumed; 376 - 377 - match status { 378 - 
TINFLStatus::Done => break, 379 - TINFLStatus::NeedsMoreInput => { 380 - if comp_left == 0 && in_avail == 0 { 381 - return Err("zip: truncated deflate stream"); 382 - } 383 - if consumed == 0 && produced == 0 && in_avail >= DEFLATE_READ_BUF { 384 - return Err("zip: deflate stream stuck"); 385 - } 386 - } 387 - TINFLStatus::HasMoreOutput => { 388 - return Err("zip: deflate output exceeds declared size"); 389 - } 390 - _ => return Err("zip: deflate decompression error"), 391 - } 392 - } 393 - 394 - output.truncate(out_pos); 395 - Ok(output) 396 - } 397 - 398 - fn read_all<E, F>(offset: u32, buf: &mut [u8], read_fn: &mut F) -> Result<(), &'static str> 399 - where 400 - F: FnMut(u32, &mut [u8]) -> Result<usize, E>, 401 - { 402 - let mut total = 0usize; 403 - while total < buf.len() { 404 - let n = 405 - read_fn(offset + total as u32, &mut buf[total..]).map_err(|_| "zip: read failed")?; 406 - if n == 0 { 407 - return Err("zip: unexpected EOF"); 408 - } 409 - total += n; 410 - } 411 - Ok(()) 412 - }