feat: entity enrichment observations from Kindle importer

+246 -2

2 changed files

expand all

tests

test_kindle_importer.py

think

importers

kindle.py

+212

tests/test_kindle_importer.py

··· 7 7 import tempfile 8 8 from pathlib import Path 9 9 10 + from think.entities.observations import load_observations 10 11 from think.importers.kindle import KindleImporter, _parse_block, _parse_date 11 12 12 13 importer = KindleImporter() ··· 203 204 finally: 204 205 os.unlink(f.name) 205 206 os.environ.pop("JOURNAL_PATH", None) 207 + 208 + 209 + def test_observations_author_of(tmp_path, monkeypatch): 210 + content = _make_clippings_file([_make_clipping()]) 211 + with tempfile.NamedTemporaryFile(suffix=".txt", mode="w", delete=False) as f: 212 + f.write(content) 213 + f.flush() 214 + try: 215 + monkeypatch.setenv("JOURNAL_PATH", str(tmp_path)) 216 + importer.process(Path(f.name), tmp_path, facet="test.kindle") 217 + author_obs = load_observations("test.kindle", "Author Name") 218 + author_contents = [o["content"] for o in author_obs] 219 + assert "Author of Test Book (via Kindle, 2025-03-15)" in author_contents 220 + finally: 221 + os.unlink(f.name) 222 + 223 + 224 + def test_observations_by_author(tmp_path, monkeypatch): 225 + content = _make_clippings_file([_make_clipping()]) 226 + with tempfile.NamedTemporaryFile(suffix=".txt", mode="w", delete=False) as f: 227 + f.write(content) 228 + f.flush() 229 + try: 230 + monkeypatch.setenv("JOURNAL_PATH", str(tmp_path)) 231 + importer.process(Path(f.name), tmp_path, facet="test.kindle") 232 + book_obs = load_observations("test.kindle", "Test Book") 233 + book_contents = [o["content"] for o in book_obs] 234 + assert "By Author Name (via Kindle, 2025-03-15)" in book_contents 235 + finally: 236 + os.unlink(f.name) 237 + 238 + 239 + def test_observations_engagement(tmp_path, monkeypatch): 240 + content = _make_clippings_file( 241 + [ 242 + _make_clipping( 243 + meta="- Your Highlight on page 42 | location 100-101 | Added on Saturday, March 15, 2025 10:30:00 AM", 244 + ), 245 + _make_clipping( 246 + meta="- Your Highlight on page 43 | location 102-103 | Added on Saturday, March 15, 2025 10:31:00 AM", 247 + content="Second highlight.", 248 + ), 249 + _make_clipping( 250 + meta="- Your Note on page 44 | location 104 | Added on Saturday, March 15, 2025 10:32:00 AM", 251 + content="A note.", 252 + ), 253 + ] 254 + ) 255 + highlights_only = _make_clippings_file( 256 + [ 257 + _make_clipping( 258 + meta="- Your Highlight on page 42 | location 100-101 | Added on Saturday, March 15, 2025 10:30:00 AM", 259 + ), 260 + _make_clipping( 261 + meta="- Your Highlight on page 43 | location 102-103 | Added on Saturday, March 15, 2025 10:31:00 AM", 262 + content="Second highlight.", 263 + ), 264 + ] 265 + ) 266 + 267 + with tempfile.NamedTemporaryFile(suffix=".txt", mode="w", delete=False) as f: 268 + f.write(content) 269 + f.flush() 270 + try: 271 + monkeypatch.setenv("JOURNAL_PATH", str(tmp_path)) 272 + importer.process(Path(f.name), tmp_path, facet="test.kindle") 273 + book_obs = load_observations("test.kindle", "Test Book") 274 + book_contents = [o["content"] for o in book_obs] 275 + assert "2 highlights, 1 notes (via Kindle, 2025-03-15)" in book_contents 276 + finally: 277 + os.unlink(f.name) 278 + 279 + second_tmp_path = tmp_path / "highlights_only" 280 + second_tmp_path.mkdir() 281 + with tempfile.NamedTemporaryFile(suffix=".txt", mode="w", delete=False) as f: 282 + f.write(highlights_only) 283 + f.flush() 284 + try: 285 + monkeypatch.setenv("JOURNAL_PATH", str(second_tmp_path)) 286 + importer.process(Path(f.name), second_tmp_path, facet="test.kindle") 287 + book_obs = load_observations("test.kindle", "Test Book") 288 + book_contents = [o["content"] for o in book_obs] 289 + assert "2 highlights (via Kindle, 2025-03-15)" in book_contents 290 + finally: 291 + os.unlink(f.name) 292 + 293 + 294 + def test_observations_multi_book_author(tmp_path, monkeypatch): 295 + content = _make_clippings_file( 296 + [ 297 + _make_clipping(title="Test Book (Author Name)"), 298 + _make_clipping(title="Second Book (Author Name)"), 299 + ] 300 + ) 301 + with tempfile.NamedTemporaryFile(suffix=".txt", mode="w", delete=False) as f: 302 + f.write(content) 303 + f.flush() 304 + try: 305 + monkeypatch.setenv("JOURNAL_PATH", str(tmp_path)) 306 + importer.process(Path(f.name), tmp_path, facet="test.kindle") 307 + author_obs = load_observations("test.kindle", "Author Name") 308 + author_contents = [o["content"] for o in author_obs] 309 + assert "Author of Test Book (via Kindle, 2025-03-15)" in author_contents 310 + assert "Author of Second Book (via Kindle, 2025-03-15)" in author_contents 311 + finally: 312 + os.unlink(f.name) 313 + 314 + 315 + def test_observations_no_author(tmp_path, monkeypatch): 316 + content = _make_clippings_file([_make_clipping(title="Title Without Author")]) 317 + with tempfile.NamedTemporaryFile(suffix=".txt", mode="w", delete=False) as f: 318 + f.write(content) 319 + f.flush() 320 + try: 321 + monkeypatch.setenv("JOURNAL_PATH", str(tmp_path)) 322 + importer.process(Path(f.name), tmp_path, facet="test.kindle") 323 + book_obs = load_observations("test.kindle", "Title Without Author") 324 + book_contents = [o["content"] for o in book_obs] 325 + assert not any(c.startswith("By ") for c in book_contents) 326 + author_entities_dir = tmp_path / "facets" / "test.kindle" / "entities" 327 + if author_entities_dir.exists(): 328 + entity_names = { 329 + entity_dir.name 330 + for entity_dir in author_entities_dir.iterdir() 331 + if entity_dir.is_dir() 332 + } 333 + assert "author_name" not in entity_names 334 + finally: 335 + os.unlink(f.name) 336 + 337 + 338 + def test_observations_dedup_on_reimport(tmp_path, monkeypatch): 339 + content = _make_clippings_file([_make_clipping()]) 340 + with tempfile.NamedTemporaryFile(suffix=".txt", mode="w", delete=False) as f: 341 + f.write(content) 342 + f.flush() 343 + try: 344 + monkeypatch.setenv("JOURNAL_PATH", str(tmp_path)) 345 + importer.process(Path(f.name), tmp_path, facet="test.kindle") 346 + first = load_observations("test.kindle", "Test Book") 347 + first_by_author = [ 348 + o 349 + for o in first 350 + if o["content"] == "By Author Name (via Kindle, 2025-03-15)" 351 + ] 352 + assert len(first_by_author) == 1 353 + 354 + importer.process(Path(f.name), tmp_path, facet="test.kindle") 355 + second = load_observations("test.kindle", "Test Book") 356 + second_by_author = [ 357 + o 358 + for o in second 359 + if o["content"] == "By Author Name (via Kindle, 2025-03-15)" 360 + ] 361 + assert len(second_by_author) == 1 362 + finally: 363 + os.unlink(f.name) 364 + 365 + 366 + def test_observations_engagement_notes_only(tmp_path, monkeypatch): 367 + """Notes-only book gets notes count without highlights.""" 368 + content = _make_clippings_file( 369 + [ 370 + _make_clipping( 371 + meta="- Your Note on page 10 | Added on Saturday, March 15, 2025 10:30:00 AM", 372 + content="A note.", 373 + ), 374 + _make_clipping( 375 + meta="- Your Note on page 11 | Added on Saturday, March 15, 2025 10:31:00 AM", 376 + content="Another note.", 377 + ), 378 + ] 379 + ) 380 + with tempfile.NamedTemporaryFile(suffix=".txt", mode="w", delete=False) as f: 381 + f.write(content) 382 + f.flush() 383 + try: 384 + monkeypatch.setenv("JOURNAL_PATH", str(tmp_path)) 385 + importer.process(Path(f.name), tmp_path, facet="test.kindle") 386 + book_obs = load_observations("test.kindle", "Test Book") 387 + book_contents = [o["content"] for o in book_obs] 388 + assert "2 notes (via Kindle, 2025-03-15)" in book_contents 389 + # No highlights count should appear 390 + assert not any("highlights" in c for c in book_contents) 391 + finally: 392 + os.unlink(f.name) 393 + 394 + 395 + def test_observations_engagement_excludes_bookmarks(tmp_path, monkeypatch): 396 + content = _make_clippings_file( 397 + [ 398 + _make_clipping( 399 + meta="- Your Highlight on page 42 | location 100-101 | Added on Saturday, March 15, 2025 10:30:00 AM", 400 + ), 401 + _make_clipping( 402 + meta="- Your Bookmark on page 43 | location 102-103 | Added on Saturday, March 15, 2025 10:31:00 AM", 403 + content="Saved bookmark.", 404 + ), 405 + ] 406 + ) 407 + with tempfile.NamedTemporaryFile(suffix=".txt", mode="w", delete=False) as f: 408 + f.write(content) 409 + f.flush() 410 + try: 411 + monkeypatch.setenv("JOURNAL_PATH", str(tmp_path)) 412 + importer.process(Path(f.name), tmp_path, facet="test.kindle") 413 + book_obs = load_observations("test.kindle", "Test Book") 414 + book_contents = [o["content"] for o in book_obs] 415 + assert "1 highlights (via Kindle, 2025-03-15)" in book_contents 416 + finally: 417 + os.unlink(f.name) 206 418 207 419 208 420 # --- Registry test ---

+34 -2

think/importers/kindle.py

··· 331 331 ) 332 332 entry_segment_map = {idx: segment for idx, segment in enumerate(item_segments)} 333 333 manifest_entries: list[dict] = [] 334 + book_obs: dict[str, list[str]] = {} 335 + author_obs: dict[str, list[str]] = {} 334 336 for book_idx, (book_title, indices) in enumerate(sorted(books_map.items())): 335 337 book_entries = [entries[i] for i in indices] 336 338 author = book_entries[0].get("author", "") ··· 359 361 ], 360 362 } 361 363 ) 364 + obs_date = first_dt.strftime("%Y-%m-%d") 365 + highlight_only = sum( 366 + 1 for entry in book_entries if entry.get("clip_type") == "highlight" 367 + ) 368 + observations: list[str] = [] 369 + if author: 370 + observations.append(f"By {author} (via Kindle, {obs_date})") 371 + if highlight_only > 0 and note_count > 0: 372 + observations.append( 373 + f"{highlight_only} highlights, {note_count} notes " 374 + f"(via Kindle, {obs_date})" 375 + ) 376 + elif highlight_only > 0: 377 + observations.append( 378 + f"{highlight_only} highlights (via Kindle, {obs_date})" 379 + ) 380 + elif note_count > 0: 381 + observations.append(f"{note_count} notes (via Kindle, {obs_date})") 382 + if observations: 383 + book_obs[book_title] = observations 384 + if author: 385 + author_obs.setdefault(author, []).append( 386 + f"Author of {book_title} (via Kindle, {obs_date})" 387 + ) 362 388 write_content_manifest(import_id, manifest_entries) 363 389 364 390 segment_days = {day for day, _ in segments} ··· 371 397 ) 372 398 entity_defs: list[dict] = [] 373 399 for book in sorted(books): 374 - entity_defs.append({"name": book, "type": "Book"}) 400 + d: dict = {"name": book, "type": "Book"} 401 + if book in book_obs: 402 + d["observations"] = book_obs[book] 403 + entity_defs.append(d) 375 404 for author in sorted(authors): 376 405 if author: 377 - entity_defs.append({"name": author, "type": "Person"}) 406 + d = {"name": author, "type": "Person"} 407 + if author in author_obs: 408 + d["observations"] = author_obs[author] 409 + entity_defs.append(d) 378 410 379 411 resolved = seed_entities(facet, earliest_day, entity_defs) 380 412 entities_seeded = len(resolved)

Configure Feed

Configure Feed