Fix text import pipeline: raw path, segments.json, format_audio parsing

+13 -8

observe/hear.py

···
354 354     chunks: list[dict[str, Any]] = []
355 355
356 356     # Parse day and time from path structure
357 -       # Expected format: YYYYMMDD/HHMMSS_LEN/audio.jsonl
357 +       # Supports both layouts:
358 +       # YYYYMMDD/HHMMSS_LEN/audio.jsonl (legacy)
359 +       # YYYYMMDD/stream/HHMMSS_LEN/audio.jsonl (stream-based)
358 360     day_str = None
359 361     start_time = None
360 362     base_timestamp = 0
···
362 364     if file_path:
363 365         file_path = Path(file_path)
364 366         parts = file_path.parts
367 +           rev_parts = list(reversed(parts))
365 368
366 -           # Try to find YYYYMMDD and HHMMSS_LEN in path
367 -           for i, part in enumerate(reversed(parts)):
369 +           # Try to find YYYYMMDD in path
370 +           for i, part in enumerate(rev_parts):
368 371             if re.match(r"^\d{8}$", part):
369 372                 day_str = part
370 -                   # Check if previous part (parent dir) is HHMMSS_LEN segment
371 -                   if i > 0:
372 -                       from think.utils import segment_parse
373 +                   # Scan parts between file and day for a valid segment key
374 +                   from think.utils import segment_parse
373 375
374 -                       prev_part = list(reversed(parts))[i - 1]
375 -                       start_time, _ = segment_parse(prev_part)
376 +                   for j in range(1, i):
377 +                       parsed_time, _ = segment_parse(rev_parts[j])
378 +                       if parsed_time is not None:
379 +                           start_time = parsed_time
380 +                           break
376 381                 break
377 382
378 383     # Build header line

+158

tests/test_importer.py

···
116 116     ]
117 117     assert metadata1["imported"]["id"] == "20240101_120000"
118 118     assert "facet" not in metadata1["imported"]
119 +       # raw path should resolve from segment dir (3 levels deep) to imports/
120 +       assert metadata1["raw"] == "../../../imports/20240101_120000/sample.txt"
119 121
120 122     assert entries2 == [
121 123         {"start": "12:05:00", "speaker": "Unknown", "text": "seg2", "source": "import"}
122 124     ]
123 125     assert metadata2["imported"]["id"] == "20240101_120000"
124 126     assert "facet" not in metadata2["imported"]
127 +
128 +       # segments.json should be written in the import directory
129 +       segments_json = tmp_path / "imports" / "20240101_120000" / "segments.json"
130 +       assert segments_json.exists()
131 +       seg_data = json.loads(segments_json.read_text())
132 +       assert seg_data["day"] == "20240101"
133 +       assert "120000_300" in seg_data["segments"]
134 +       assert "120500_5" in seg_data["segments"]
135 +
136 +       # stream.json should be written in each segment directory
137 +       stream1 = day_dir / "import.text" / "120000_300" / "stream.json"
138 +       assert stream1.exists()
139 +       stream1_data = json.loads(stream1.read_text())
140 +       assert stream1_data["stream"] == "import.text"
141 +
142 +
143 +   def test_importer_pdf(tmp_path, monkeypatch):
144 +       """Test importing a PDF transcript file."""
145 +       mod = importlib.import_module("think.importers.cli")
146 +       text_mod = importlib.import_module("think.importers.text")
147 +
148 +       # Create a fake PDF file (content doesn't matter — pypdf is mocked)
149 +       pdf = tmp_path / "meeting.pdf"
150 +       pdf.write_bytes(b"%PDF-1.4 fake")
151 +
152 +       monkeypatch.setenv("JOURNAL_PATH", str(tmp_path))
153 +       monkeypatch.setattr(
154 +           mod, "detect_created", lambda p, **kw: {"day": "20251205", "time": "163000"}
155 +       )
156 +
157 +       # Mock _read_transcript to return extracted text (bypasses pypdf)
158 +       monkeypatch.setattr(
159 +           text_mod, "_read_transcript", lambda path: "Board meeting notes\nAction items"
160 +       )
161 +
162 +       # Mock segment detection: single segment for short text
163 +       def mock_detect_segment(text, start_time):
164 +           return [("16:30:00", text)]
165 +
166 +       monkeypatch.setattr(text_mod, "detect_transcript_segment", mock_detect_segment)
167 +
168 +       # Mock JSON conversion
169 +       def mock_detect_json(text, segment_start):
170 +           return [
171 +               {"start": segment_start, "speaker": "Jack", "text": "Board meeting notes"},
172 +               {"start": "16:30:30", "speaker": "Ramon", "text": "Action items"},
173 +               {"topics": "board meeting, action items", "setting": "workplace"},
174 +           ]
175 +
176 +       monkeypatch.setattr(text_mod, "detect_transcript_json", mock_detect_json)
177 +
178 +       # Mock CallosumConnection and status emitter
179 +       monkeypatch.setattr(mod, "CallosumConnection", lambda **kwargs: MagicMock())
180 +       monkeypatch.setattr(mod, "_status_emitter", lambda: None)
181 +
182 +       monkeypatch.setattr(
183 +           "sys.argv",
184 +           [
185 +               "sol import",
186 +               str(pdf),
187 +               "--timestamp",
188 +               "20251205_163000",
189 +               "--facet",
190 +               "work",
191 +               "--setting",
192 +               "board meeting",
193 +           ],
194 +       )
195 +       mod.main()
196 +
197 +       day_dir = day_path("20251205")
198 +       # Single segment, last segment defaults to 5s
199 +       f1 = day_dir / "import.text" / "163000_5" / "imported_audio.jsonl"
200 +       assert f1.exists()
201 +
202 +       lines = f1.read_text().strip().split("\n")
203 +       metadata = json.loads(lines[0])
204 +       entries = [json.loads(line) for line in lines[1:]]
205 +
206 +       # Verify metadata
207 +       assert metadata["imported"]["id"] == "20251205_163000"
208 +       assert metadata["imported"]["facet"] == "work"
209 +       assert metadata["imported"]["setting"] == "board meeting"
210 +       assert metadata["raw"] == "../../../imports/20251205_163000/meeting.pdf"
211 +
212 +       # Verify entries — text entries get source="import", topics/setting entry does not
213 +       assert entries[0] == {
214 +           "start": "16:30:00",
215 +           "speaker": "Jack",
216 +           "text": "Board meeting notes",
217 +           "source": "import",
218 +       }
219 +       assert entries[1] == {
220 +           "start": "16:30:30",
221 +           "speaker": "Ramon",
222 +           "text": "Action items",
223 +           "source": "import",
224 +       }
225 +       # Topics/setting metadata entry preserved without source field
226 +       assert entries[2] == {
227 +           "topics": "board meeting, action items",
228 +           "setting": "workplace",
229 +       }
230 +
231 +       # Verify .pdf auto-detected as text import (stream = import.text)
232 +       stream_json = day_dir / "import.text" / "163000_5" / "stream.json"
233 +       assert stream_json.exists()
234 +       stream_data = json.loads(stream_json.read_text())
235 +       assert stream_data["stream"] == "import.text"
236 +
237 +       # Verify segments.json written
238 +       segments_json = tmp_path / "imports" / "20251205_163000" / "segments.json"
239 +       assert segments_json.exists()
240 +
241 +
242 +   def test_format_audio_stream_path():
243 +       """Test format_audio correctly parses timestamps from stream-based paths."""
244 +       from observe.hear import format_audio
245 +
246 +       entries = [
247 +           {"imported": {"id": "20240101_120000"}, "raw": "test.txt"},
248 +           {"start": "12:00:00", "speaker": "Alice", "text": "Hello"},
249 +           {"start": "12:00:30", "speaker": "Bob", "text": "Hi there"},
250 +       ]
251 +
252 +       # Stream-based path: day/stream/segment/imported_audio.jsonl
253 +       context = {
254 +           "file_path": Path("/journal/20240101/import.text/120000_300/imported_audio.jsonl")
255 +       }
256 +       chunks, meta = format_audio(entries, context)
257 +
258 +       assert len(chunks) == 2
259 +       # Verify timestamps are non-zero (base_timestamp correctly parsed from path)
260 +       assert chunks[0]["timestamp"] > 0
261 +       assert chunks[1]["timestamp"] > chunks[0]["timestamp"]
262 +       # Verify header includes start time
263 +       assert meta.get("header") and "12:00" in meta["header"]
264 +
265 +
266 +   def test_format_audio_legacy_path():
267 +       """Test format_audio still works with legacy day/segment/ paths."""
268 +       from observe.hear import format_audio
269 +
270 +       entries = [
271 +           {"raw": "raw.flac", "model": "whisper-1"},
272 +           {"start": "12:34:56", "source": "mic", "text": "Test"},
273 +       ]
274 +
275 +       # Legacy path: day/segment/audio.jsonl (no stream directory)
276 +       context = {
277 +           "file_path": Path("/journal/20240101/123456_300/audio.jsonl")
278 +       }
279 +       chunks, meta = format_audio(entries, context)
280 +
281 +       assert len(chunks) == 1
282 +       assert chunks[0]["timestamp"] > 0
125 283
126 284
127 285 def test_get_audio_duration(tmp_path):

+6 -1

think/importers/cli.py

··· 400 400 except Exception as e: 401 401 logger.warning(f"Failed to write stream identity: {e}") 402 402 403 + # Save segment list for tracking (same as audio path) 404 + save_import_segments(journal_root, args.timestamp, created_segments, day) 405 + 403 406 # Emit observe.observed for text imports (already processed) 404 407 for seg in created_segments: 405 408 _callosum.emit( ··· 635 638 raise 636 639 637 640 finally: 638 - # Stop status thread 641 + # Stop status thread and Callosum connection 639 642 _status_running = False 640 643 if _status_thread: 641 644 _status_thread.join(timeout=6) 645 + if _callosum: 646 + _callosum.stop() 642 647 643 648 644 649 if __name__ == "__main__":

+2 -2

think/importers/shared.py

···
41 41           file_path: Path to write JSONL file
42 42           entries: List of transcript entries
43 43           import_id: Import identifier
44 -            raw_filename: Source file name (relative path from segment to imports/)
44 +            raw_filename: Source filename (basename only, used to build relative path)
45 45           facet: Optional facet name
46 46           setting: Optional setting description
47 47       """
···
56 56
57 57       # Add raw file reference (path relative from segment to imports directory)
58 58       if raw_filename:
59 -            metadata["raw"] = f"../../imports/{import_id}/{raw_filename}"
59 +            metadata["raw"] = f"../../../imports/{import_id}/{raw_filename}"
60 60
61 61       # Write JSONL: metadata first, then entries with source field
62 62       jsonl_lines = [json.dumps(metadata)]

Configure Feed

Configure Feed