Merge branch 'hopper-a5od7hcg-import-dry-run' · solpbc.org/solstone@4a4189f

+181 -18

2 changed files

expand all

tests

test_importer.py

think

importers

cli.py

+103

tests/test_importer.py

···
413 413     seg_key, seg_dir, seg_files = segments[0]
414 414     assert seg_key == "120001_300" # Deconflicted key
415 415     assert seg_dir == day_dir / "import.audio" / "120001_300"
416 +
417 +
418 +   def test_importer_dry_run_text(tmp_path, monkeypatch, capsys):
419 +       """Test --dry-run for text import prints plan without writing files."""
420 +       mod = importlib.import_module("think.importers.cli")
421 +
422 +       txt = tmp_path / "sample.txt"
423 +       txt.write_text("hello\nworld\n")
424 +
425 +       monkeypatch.setenv("JOURNAL_PATH", str(tmp_path))
426 +       monkeypatch.setattr(
427 +           "sys.argv",
428 +           ["sol import", str(txt), "--timestamp", "20240101_120000", "--dry-run"],
429 +       )
430 +
431 +       mod.main()
432 +
433 +       captured = capsys.readouterr()
434 +       assert "File:" in captured.out
435 +       assert "Size:" in captured.out
436 +       assert "Timestamp:" in captured.out
437 +       assert "Source:" in captured.out
438 +       assert "Stream:" in captured.out
439 +       assert "Target day:" in captured.out
440 +       assert "Content:" in captured.out
441 +       assert "characters" in captured.out
442 +       assert "lines" in captured.out
443 +       assert "import.text" in captured.out
444 +       assert "20240101" in captured.out
445 +       assert "12 characters" in captured.out
446 +       assert "2 lines" in captured.out
447 +
448 +       assert not (tmp_path / "imports").exists()
449 +       assert not (tmp_path / "20240101").exists()
450 +
451 +
452 +   def test_importer_dry_run_audio(tmp_path, monkeypatch, capsys):
453 +       """Test --dry-run for audio import prints plan without writing files."""
454 +       mod = importlib.import_module("think.importers.cli")
455 +
456 +       mp3 = tmp_path / "sample.mp3"
457 +       mp3.write_bytes(b"fake audio")
458 +
459 +       monkeypatch.setenv("JOURNAL_PATH", str(tmp_path))
460 +       monkeypatch.setattr(mod, "_get_audio_duration", lambda p: 420.0)
461 +       callosum_cls = MagicMock()
462 +       monkeypatch.setattr(mod, "CallosumConnection", callosum_cls)
463 +       monkeypatch.setattr(
464 +           "sys.argv",
465 +           ["sol import", str(mp3), "--timestamp", "20240101_120000", "--dry-run"],
466 +       )
467 +
468 +       mod.main()
469 +
470 +       captured = capsys.readouterr()
471 +       assert "File:" in captured.out
472 +       assert "Size:" in captured.out
473 +       assert "Timestamp:" in captured.out
474 +       assert "Source:" in captured.out
475 +       assert "Stream:" in captured.out
476 +       assert "Target day:" in captured.out
477 +       assert "Duration:" in captured.out
478 +       assert "Segments:" in captured.out
479 +       assert "Keys:" in captured.out
480 +       assert "import.audio" in captured.out
481 +       assert "20240101" in captured.out
482 +       assert "7.0 minutes" in captured.out
483 +       assert "2 (5-minute chunks)" in captured.out
484 +       assert "120000_300" in captured.out
485 +       assert "120500_300" in captured.out
486 +
487 +       assert not (tmp_path / "imports").exists()
488 +       assert not (tmp_path / "20240101").exists()
489 +       assert callosum_cls.call_count == 0
490 +
491 +
492 +   def test_importer_dry_run_auto(tmp_path, monkeypatch, capsys):
493 +       """Test --dry-run with --auto detects timestamp and prints summary."""
494 +       mod = importlib.import_module("think.importers.cli")
495 +
496 +       txt = tmp_path / "notes.txt"
497 +       txt.write_text("meeting notes")
498 +
499 +       monkeypatch.setenv("JOURNAL_PATH", str(tmp_path))
500 +       monkeypatch.setattr(
501 +           mod, "detect_created", lambda p, **kw: {"day": "20240315", "time": "140000"}
502 +       )
503 +       monkeypatch.setattr(
504 +           "sys.argv",
505 +           ["sol import", str(txt), "--auto", "--dry-run"],
506 +       )
507 +
508 +       mod.main()
509 +
510 +       captured = capsys.readouterr()
511 +       assert "Detected timestamp: 20240315_140000" in captured.out
512 +       assert "auto-importing" in captured.out
513 +       assert "import.text" in captured.out
514 +       assert "Target day: 20240315" in captured.out
515 +       assert "Content:" in captured.out
516 +
517 +       assert not (tmp_path / "imports").exists()
518 +       assert not (tmp_path / "20240315").exists()

+78 -18

think/importers/cli.py

···
14 14
15 15   from think.callosum import CallosumConnection
16 16   from think.detect_created import detect_created
17 -    from think.importers.audio import prepare_audio_segments
17 +    from think.importers.audio import _get_audio_duration, prepare_audio_segments
18 18   from think.importers.shared import (
19 19       _get_relative_path,
20 20       _is_in_imports,
21 21       _setup_import,
22 22   )
23 -    from think.importers.text import process_transcript
23 +    from think.importers.text import _read_transcript, process_transcript
24 24   from think.importers.utils import save_import_segments
25 25   from think.streams import stream_name, update_stream, write_segment_stream
26 26   from think.utils import day_path, get_journal, get_rev, segment_key, setup_cli
···
195 195         help="Auto-accept detected timestamp and proceed with import",
196 196     )
197 197     parser.add_argument(
198 +           "--dry-run",
199 +           action="store_true",
200 +           help="Show what would be imported without writing to the journal",
201 +       )
202 +       parser.add_argument(
198 203         "--backends",
199 204         action="store_true",
200 205         help="List syncable importer backends",
···
267 272     if not TIME_RE.fullmatch(args.timestamp):
268 273         raise SystemExit("timestamp must be in YYYYMMDD_HHMMSS format")
269 274
275 +       base_dt = dt.datetime.strptime(args.timestamp, "%Y%m%d_%H%M%S")
276 +       day = base_dt.strftime("%Y%m%d")
277 +
278 +       # Derive stream identity for this import
279 +       if args.source:
280 +           import_source = args.source
281 +       else:
282 +           # Auto-detect from file extension
283 +           _ext = os.path.splitext(args.media)[1].lower()
284 +           if _ext == ".m4a":
285 +               import_source = "apple"
286 +           elif _ext in {".txt", ".md", ".pdf"}:
287 +               import_source = "text"
288 +           else:
289 +               import_source = "audio"
290 +       stream = stream_name(import_source=import_source)
291 +
292 +       if args.dry_run:
293 +           from think.importers.plaud import format_size
294 +
295 +           # Print summary without writing anything
296 +           file_size = os.path.getsize(args.media)
297 +           display = _format_timestamp_display(args.timestamp)
298 +
299 +           print()
300 +           print(f" File: {args.media}")
301 +           print(f" Size: {format_size(file_size)}")
302 +           print(f" Timestamp: {args.timestamp} ({display})")
303 +           print(f" Source: {import_source}")
304 +           print(f" Stream: {stream}")
305 +           print(f" Target day: {day}")
306 +
307 +           ext = os.path.splitext(args.media)[1].lower()
308 +           if ext in {".txt", ".md", ".pdf"}:
309 +               text = _read_transcript(args.media)
310 +               chars = len(text)
311 +               lines = text.count("\n") + (1 if text and not text.endswith("\n") else 0)
312 +               print()
313 +               print(f" Content: {chars:,} characters, {lines:,} lines")
314 +           else:
315 +               duration = _get_audio_duration(args.media)
316 +               if duration is not None:
317 +                   segment_duration = 300
318 +                   num_segments = int(
319 +                       (duration + segment_duration - 1) // segment_duration
320 +                   )
321 +                   if num_segments == 0:
322 +                       num_segments = 1
323 +
324 +                   keys = []
325 +                   for i in range(num_segments):
326 +                       ts = base_dt + dt.timedelta(minutes=i * 5)
327 +                       keys.append(f"{ts.strftime('%H%M%S')}_{segment_duration}")
328 +
329 +                   if duration < 60:
330 +                       dur_str = f"{duration:.0f} seconds"
331 +                   elif duration < 3600:
332 +                       dur_str = f"{duration / 60:.1f} minutes"
333 +                   else:
334 +                       dur_str = f"{duration / 3600:.1f} hours"
335 +
336 +                   print()
337 +                   print(f" Duration: {dur_str}")
338 +                   print(f" Segments: {num_segments} (5-minute chunks)")
339 +                   print(f" Keys: {', '.join(keys)}")
340 +               else:
341 +                   print()
342 +                   print(" Duration: unknown (ffprobe failed)")
343 +           print()
344 +           return
345 +
270 346     # Check if file needs setup (not already in imports/)
271 347     needs_setup = not _is_in_imports(args.media)
272 348
···
282 358         )
283 359         print("Starting import...")
284 360
285 -        base_dt = dt.datetime.strptime(args.timestamp, "%Y%m%d_%H%M%S")
286 -        day = base_dt.strftime("%Y%m%d")
287 361     logger.info(f"Using provided timestamp: {args.timestamp}")
288 362     day_dir = str(day_path(day))
289 -
290 -        # Derive stream identity for this import
291 -        if args.source:
292 -            import_source = args.source
293 -        else:
294 -            # Auto-detect from file extension
295 -            _ext = os.path.splitext(args.media)[1].lower()
296 -            if _ext == ".m4a":
297 -                import_source = "apple"
298 -            elif _ext in {".txt", ".md", ".pdf"}:
299 -                import_source = "text"
300 -            else:
301 -                import_source = "audio"
302 -        stream = stream_name(import_source=import_source)
303 363
304 364     # Initialize importer tract state
305 365     _import_id = args.timestamp

Configure Feed

Configure Feed