Refactor describe.py for primary/secondary region detection with bounding boxes

+313 -75

observe/describe.py

··· 3 3 Describe screencast videos by detecting significant frame changes. 4 4 5 5 Processes per-monitor screencast files (.webm/.mp4/.mov), detects changes using 6 - RMS-based comparison, and sends full frames to Gemini for analysis. 6 + RMS-based comparison, and sends frames to Gemini for multi-stage analysis: 7 + 8 + 1. Initial categorization identifies primary/secondary regions with bounding boxes 9 + 2. Follow-up analysis (text extraction or meeting analysis) uses cropped regions 10 + based on category, or full frame when both regions need the same analysis type 7 11 """ 8 12 9 13 from __future__ import annotations ··· 26 30 from think.utils import setup_cli 27 31 28 32 logger = logging.getLogger(__name__) 33 + 34 + # Minimum box size in pixels for follow-up processing 35 + MIN_BOX_SIZE = 300 29 36 30 37 31 38 class RequestType(Enum): ··· 228 235 img.save(buf, format="PNG", compress_level=1) 229 236 return buf.getvalue() 230 237 238 + def _denormalize_box( 239 + self, box_2d: List[int], img_width: int, img_height: int 240 + ) -> tuple: 241 + """ 242 + Convert 0-1000 normalized coords to pixel coords. 243 + 244 + Parameters 245 + ---------- 246 + box_2d : List[int] 247 + Normalized coordinates [y0, x0, y1, x1] in 0-1000 range 248 + img_width : int 249 + Image width in pixels 250 + img_height : int 251 + Image height in pixels 252 + 253 + Returns 254 + ------- 255 + tuple 256 + Pixel coordinates (y0, x0, y1, x1) 257 + """ 258 + y0, x0, y1, x1 = box_2d 259 + return ( 260 + int(y0 * img_height / 1000), 261 + int(x0 * img_width / 1000), 262 + int(y1 * img_height / 1000), 263 + int(x1 * img_width / 1000), 264 + ) 265 + 266 + def _box_qualifies(self, box_pixels: tuple) -> bool: 267 + """ 268 + Check if denormalized box is >MIN_BOX_SIZE in both dimensions. 
269 + 270 + Parameters 271 + ---------- 272 + box_pixels : tuple 273 + Pixel coordinates (y0, x0, y1, x1) 274 + 275 + Returns 276 + ------- 277 + bool 278 + True if box qualifies for follow-up processing 279 + """ 280 + y0, x0, y1, x1 = box_pixels 281 + width = x1 - x0 282 + height = y1 - y0 283 + return width > MIN_BOX_SIZE and height > MIN_BOX_SIZE 284 + 285 + def _crop_to_box(self, img: Image.Image, box_pixels: tuple) -> Image.Image: 286 + """ 287 + Crop PIL Image using denormalized pixel coords. 288 + 289 + Parameters 290 + ---------- 291 + img : Image.Image 292 + Source image to crop 293 + box_pixels : tuple 294 + Pixel coordinates (y0, x0, y1, x1) 295 + 296 + Returns 297 + ------- 298 + Image.Image 299 + Cropped image 300 + """ 301 + y0, x0, y1, x1 = box_pixels 302 + # Clamp to image bounds 303 + x0 = max(0, min(x0, img.width)) 304 + x1 = max(0, min(x1, img.width)) 305 + y0 = max(0, min(y0, img.height)) 306 + y1 = max(0, min(y1, img.height)) 307 + # PIL crop uses (left, upper, right, lower) = (x0, y0, x1, y1) 308 + return img.crop((x0, y0, x1, y1)) 309 + 310 + def _get_follow_up_prompt(self, category: str) -> Optional[str]: 311 + """ 312 + Map category to follow-up prompt type. 
313 + 314 + Parameters 315 + ---------- 316 + category : str 317 + Category from initial analysis 318 + 319 + Returns 320 + ------- 321 + Optional[str] 322 + "meeting", "text", or None if no follow-up needed 323 + """ 324 + if category == "meeting": 325 + return "meeting" 326 + text_categories = CONFIG.get("text_extraction_categories", []) 327 + if category in text_categories: 328 + return "text" 329 + return None 330 + 231 331 def _user_contents(self, prompt: str, image, entities: bool = False) -> list: 232 332 """Build contents list with optional entity context.""" 233 333 contents = [prompt] ··· 280 380 Path to write JSONL output (when None, no output file is written) 281 381 """ 282 382 from think.batch import GeminiBatch 283 - from think.models import GEMINI_LITE 383 + from think.models import GEMINI_FLASH 284 384 285 385 # Load prompt templates 286 386 prompt_path = Path(__file__).parent / use_prompt ··· 331 431 "Analyze this screenshot frame from a screencast recording.", 332 432 frame_img, 333 433 ), 334 - model=GEMINI_LITE, 434 + model=GEMINI_FLASH, 335 435 system_instruction=system_instruction, 336 436 json_output=True, 337 437 temperature=0.7, ··· 347 447 req.request_type = RequestType.DESCRIBE_JSON 348 448 req.json_analysis = None # Will store the JSON analysis result 349 449 req.meeting_analysis = None # Will store meeting analysis if applicable 450 + req.extracted_text = None # Will store text extraction if applicable 350 451 req.requests = [] # Track all requests for this frame 351 452 req.initial_image = frame_img # Keep reference to close after completion 453 + req.pending_follow_ups = 0 # Track how many follow-ups are pending 454 + req.follow_up_source = None # "primary", "secondary", or "full" 352 455 353 456 batch.add(req) 354 457 ··· 361 464 total_frames = 0 362 465 failed_frames = 0 363 466 467 + # Track frames by frame_id for merging follow-up results 468 + frame_results = {} # frame_id -> result dict 469 + 364 470 # Stream results as they 
complete, with retry logic 365 471 async for req in batch.drain_batch(): 366 - total_frames += 1 472 + # Only count initial DESCRIBE_JSON requests as frames (not follow-ups) 473 + if req.request_type == RequestType.DESCRIBE_JSON: 474 + total_frames += 1 475 + 367 476 # Check for errors 368 477 has_error = bool(req.error) 369 478 error_msg = req.error ··· 386 495 except json.JSONDecodeError as e: 387 496 has_error = True 388 497 error_msg = f"Invalid JSON response: {e}" 498 + elif req.request_type == RequestType.DESCRIBE_TEXT: 499 + # Store text extraction result 500 + req.extracted_text = req.response 389 501 390 502 # Retry logic (up to 5 attempts total, so 4 retries) 391 503 if has_error and req.retry_count < 4: ··· 396 508 ) 397 509 continue # Don't output, wait for retry result 398 510 399 - # Track failure after all retries exhausted 400 - if has_error: 511 + # Track failure after all retries exhausted (only for initial requests) 512 + if has_error and req.request_type == RequestType.DESCRIBE_JSON: 401 513 failed_frames += 1 402 514 403 515 # Record this request's result (after retries are done) ··· 408 520 } 409 521 if req.retry_count > 0: 410 522 request_record["retries"] = req.retry_count 523 + if req.follow_up_source: 524 + request_record["source"] = req.follow_up_source 411 525 412 526 req.requests.append(request_record) 413 527 ··· 419 533 ) 420 534 421 535 if should_process_further: 422 - visible_category = req.json_analysis.get("visible", "") 536 + # Extract primary and secondary regions 537 + primary = req.json_analysis.get("primary", {}) 538 + secondary = req.json_analysis.get("secondary", False) 423 539 424 - # Check for meeting analysis 425 - if visible_category == "meeting": 426 - logger.info(f"Frame {req.frame_id}: Triggering meeting analysis") 427 - # Reload frame image from cached bytes (already full frame) 428 - meeting_img = Image.open(io.BytesIO(req.frame_bytes)) 540 + # Load full frame for potential processing 541 + full_img = 
Image.open(io.BytesIO(req.frame_bytes)) 542 + img_width, img_height = full_img.width, full_img.height 429 543 430 - batch.update( 431 - req, 432 - contents=self._user_contents( 433 - "Analyze this meeting screenshot.", 434 - meeting_img, 435 - entities=True, 436 - ), 437 - model=GEMINI_LITE, 438 - system_instruction=meeting_system_instruction, 439 - json_output=True, 440 - max_output_tokens=10240, 441 - thinking_budget=6144, 544 + # Analyze primary region 545 + primary_prompt_type = None 546 + primary_box_pixels = None 547 + if primary and primary.get("box_2d"): 548 + primary_box_pixels = self._denormalize_box( 549 + primary["box_2d"], img_width, img_height 442 550 ) 443 - # Don't close yet - batch needs it for encoding 444 - # Store reference for cleanup later 445 - req.meeting_image = meeting_img 551 + if self._box_qualifies(primary_box_pixels): 552 + primary_prompt_type = self._get_follow_up_prompt( 553 + primary.get("category", "") 554 + ) 446 555 447 - # Close initial image since DESCRIBE_JSON is complete 448 - if hasattr(req, "initial_image") and req.initial_image: 449 - req.initial_image.close() 450 - req.initial_image = None 556 + # Analyze secondary region 557 + secondary_prompt_type = None 558 + secondary_box_pixels = None 559 + if ( 560 + secondary 561 + and isinstance(secondary, dict) 562 + and secondary.get("box_2d") 563 + ): 564 + secondary_box_pixels = self._denormalize_box( 565 + secondary["box_2d"], img_width, img_height 566 + ) 567 + if self._box_qualifies(secondary_box_pixels): 568 + secondary_prompt_type = self._get_follow_up_prompt( 569 + secondary.get("category", "") 570 + ) 451 571 452 - req.request_type = RequestType.DESCRIBE_MEETING 453 - req.retry_count = 0 454 - continue # Don't output yet, wait for meeting analysis 572 + # Determine follow-up strategy 573 + follow_ups = [] 455 574 456 - # Check for text extraction 457 - text_categories = CONFIG.get("text_extraction_categories", []) 458 - if visible_category in text_categories: 575 + if 
primary_prompt_type and secondary_prompt_type: 576 + if primary_prompt_type == secondary_prompt_type: 577 + # Same prompt type - use full frame, single call 578 + follow_ups.append((primary_prompt_type, full_img, "full")) 579 + logger.info( 580 + f"Frame {req.frame_id}: Single {primary_prompt_type} follow-up (full frame)" 581 + ) 582 + else: 583 + # Different prompt types - parallel cropped calls 584 + primary_img = self._crop_to_box(full_img, primary_box_pixels) 585 + secondary_img = self._crop_to_box( 586 + full_img, secondary_box_pixels 587 + ) 588 + follow_ups.append((primary_prompt_type, primary_img, "primary")) 589 + follow_ups.append( 590 + (secondary_prompt_type, secondary_img, "secondary") 591 + ) 592 + logger.info( 593 + f"Frame {req.frame_id}: Parallel follow-ups - " 594 + f"primary={primary_prompt_type}, secondary={secondary_prompt_type}" 595 + ) 596 + elif primary_prompt_type: 597 + # Only primary needs follow-up 598 + primary_img = self._crop_to_box(full_img, primary_box_pixels) 599 + follow_ups.append((primary_prompt_type, primary_img, "primary")) 459 600 logger.info( 460 - f"Frame {req.frame_id}: Triggering text extraction for category '{visible_category}'" 601 + f"Frame {req.frame_id}: {primary_prompt_type} follow-up (primary)" 602 + ) 603 + elif secondary_prompt_type: 604 + # Only secondary needs follow-up 605 + secondary_img = self._crop_to_box(full_img, secondary_box_pixels) 606 + follow_ups.append( 607 + (secondary_prompt_type, secondary_img, "secondary") 461 608 ) 462 - # Reload frame image from cached bytes 463 - text_img = Image.open(io.BytesIO(req.frame_bytes)) 609 + logger.info( 610 + f"Frame {req.frame_id}: {secondary_prompt_type} follow-up (secondary)" 611 + ) 464 612 465 - # Update request for text extraction and re-add 466 - batch.update( 467 - req, 468 - contents=self._user_contents( 469 - "Extract text from this screenshot frame.", 470 - text_img, 471 - entities=True, 472 - ), 473 - model=GEMINI_LITE, 474 - 
system_instruction=text_system_instruction, 475 - json_output=False, 476 - max_output_tokens=8192, 477 - thinking_budget=4096, 478 - ) 479 - # Don't close yet - batch needs it for encoding 480 - # Store reference for cleanup later 481 - req.text_image = text_img 613 + # Create follow-up requests 614 + if follow_ups: 615 + req.pending_follow_ups = len(follow_ups) 482 616 483 617 # Close initial image since DESCRIBE_JSON is complete 484 618 if hasattr(req, "initial_image") and req.initial_image: 485 619 req.initial_image.close() 486 620 req.initial_image = None 487 621 488 - req.request_type = RequestType.DESCRIBE_TEXT 489 - req.retry_count = 0 490 - continue # Don't output yet, wait for text extraction 622 + for i, (prompt_type, img, source) in enumerate(follow_ups): 623 + if i == 0: 624 + # Reuse original request for first follow-up 625 + follow_req = req 626 + else: 627 + # Create new request for additional follow-ups 628 + follow_req = batch.create(contents=[]) 629 + # Copy essential metadata 630 + follow_req.frame_id = req.frame_id 631 + follow_req.timestamp = req.timestamp 632 + follow_req.frame_bytes = req.frame_bytes 633 + follow_req.json_analysis = req.json_analysis 634 + follow_req.meeting_analysis = req.meeting_analysis 635 + follow_req.extracted_text = req.extracted_text 636 + follow_req.requests = req.requests 637 + follow_req.pending_follow_ups = req.pending_follow_ups 491 638 492 - # Final output - this frame is complete 639 + follow_req.follow_up_source = source 640 + follow_req.retry_count = 0 641 + 642 + if prompt_type == "meeting": 643 + batch.update( 644 + follow_req, 645 + contents=self._user_contents( 646 + "Analyze this meeting screenshot.", 647 + img, 648 + entities=True, 649 + ), 650 + model=GEMINI_FLASH, 651 + system_instruction=meeting_system_instruction, 652 + json_output=True, 653 + max_output_tokens=10240, 654 + thinking_budget=6144, 655 + ) 656 + follow_req.request_type = RequestType.DESCRIBE_MEETING 657 + follow_req.follow_up_image = 
img 658 + else: # text 659 + batch.update( 660 + follow_req, 661 + contents=self._user_contents( 662 + "Extract text from this screenshot frame.", 663 + img, 664 + entities=True, 665 + ), 666 + model=GEMINI_FLASH, 667 + system_instruction=text_system_instruction, 668 + json_output=False, 669 + max_output_tokens=8192, 670 + thinking_budget=4096, 671 + ) 672 + follow_req.request_type = RequestType.DESCRIBE_TEXT 673 + follow_req.follow_up_image = img 674 + 675 + # Close full_img if we're not using it directly 676 + if not any(source == "full" for _, _, source in follow_ups): 677 + full_img.close() 678 + 679 + continue # Don't output yet, wait for follow-ups 680 + else: 681 + # No follow-ups needed, close full_img 682 + full_img.close() 683 + 684 + # Handle follow-up completion for parallel requests 685 + if req.request_type in ( 686 + RequestType.DESCRIBE_MEETING, 687 + RequestType.DESCRIBE_TEXT, 688 + ): 689 + # Store result in frame_results for merging 690 + if req.frame_id not in frame_results: 691 + frame_results[req.frame_id] = { 692 + "frame_id": req.frame_id, 693 + "timestamp": req.timestamp, 694 + "requests": req.requests, 695 + "analysis": req.json_analysis, 696 + "pending": req.pending_follow_ups, 697 + } 698 + if has_error: 699 + frame_results[req.frame_id]["error"] = error_msg 700 + 701 + result = frame_results[req.frame_id] 702 + 703 + # Merge this follow-up's result 704 + if req.meeting_analysis: 705 + result["meeting_analysis"] = req.meeting_analysis 706 + if req.extracted_text: 707 + result["extracted_text"] = req.extracted_text 708 + 709 + # Update requests list (avoid duplicates by using shared list) 710 + result["requests"] = req.requests 711 + 712 + # Decrement pending count 713 + result["pending"] -= 1 714 + 715 + # Close follow-up image 716 + if hasattr(req, "follow_up_image") and req.follow_up_image: 717 + req.follow_up_image.close() 718 + req.follow_up_image = None 719 + 720 + # If all follow-ups complete, output the result 721 + if 
result["pending"] <= 0: 722 + del result["pending"] # Remove internal tracking field 723 + 724 + # Write to file and optionally to stdout 725 + result_line = json.dumps(result) 726 + if output_file: 727 + output_file.write(result_line + "\n") 728 + output_file.flush() 729 + if logger.isEnabledFor(logging.DEBUG): 730 + print(result_line, flush=True) 731 + 732 + # Clean up frame_results entry 733 + del frame_results[req.frame_id] 734 + 735 + # Aggressively clear heavy fields 736 + req.frame_bytes = None 737 + req.json_analysis = None 738 + req.meeting_analysis = None 739 + req.extracted_text = None 740 + 741 + continue 742 + 743 + # Final output for frames with no follow-ups (DESCRIBE_JSON only) 493 744 result = { 494 745 "frame_id": req.frame_id, 495 746 "timestamp": req.timestamp, ··· 504 755 if req.json_analysis: 505 756 result["analysis"] = req.json_analysis 506 757 507 - # Add meeting analysis if we have it (from DESCRIBE_MEETING) 508 - if req.meeting_analysis: 509 - result["meeting_analysis"] = req.meeting_analysis 510 - 511 - # Add extracted text if we have it (from DESCRIBE_TEXT) 512 - if req.request_type == RequestType.DESCRIBE_TEXT and req.response: 513 - result["extracted_text"] = req.response 514 - 515 758 # Write to file and optionally to stdout 516 759 result_line = json.dumps(result) 517 760 if output_file: ··· 524 767 if hasattr(req, "initial_image") and req.initial_image: 525 768 req.initial_image.close() 526 769 req.initial_image = None 527 - if hasattr(req, "meeting_image") and req.meeting_image: 528 - req.meeting_image.close() 529 - req.meeting_image = None 530 - if hasattr(req, "text_image") and req.text_image: 531 - req.text_image.close() 532 - req.text_image = None 533 770 534 771 # Aggressively clear heavy fields now that request is finalized 535 772 req.frame_bytes = None 536 773 req.json_analysis = None 537 774 req.meeting_analysis = None 775 + req.extracted_text = None 538 776 539 777 # Close output file 540 778 if output_file:

+56 -17

observe/describe_json.txt

··· 1 - # Screenshot Context Analysis 1 + # Desktop Screenshot Context Analysis 2 2 3 - You are analyzing a desktop screenshot from a screencast recording. 3 + You are analyzing a full desktop screenshot to identify the types of applications and content visible. 4 4 5 - Respond with JSON describing the context: 5 + Respond with JSON describing the visual context and your best estimation of the appropriate category/categories. 6 6 7 7 ```json 8 8 { 9 - "visual_description": "<2 sentences describing what you see>", 10 - "visible": "<pick the most appropriate category: terminal|code|messaging|meeting|browsing|reading|media|gaming|productivity>" 9 + "visual_description": "<1–2 sentences describing what is visible>", 10 + "primary": { 11 + "category": "<use a valid category from below>", 12 + "box_2d": [y0, x0, y1, x1] 13 + }, 14 + "secondary": false | { 15 + "category": "<use a valid category from below>", 16 + "box_2d": [y0, x0, y1, x1] 17 + } 11 18 } 12 19 ``` 13 20 14 21 ## Field Notes 15 22 16 - - **visual_description**: Describe what you see in 2 good sentences for someone who is visually impaired. Describe the visual elements, layout, and content - don't transcribe text verbatim. 
17 - - **visible**: Pick the single most appropriate category for what is visible in the focus area 18 - - **terminal**: Command line interfaces 19 - - **code**: Code editors and IDEs 20 - - **messaging**: Chat, email, Slack, Discord, iMessage, Messages, any chat or messaging view 21 - - **meeting**: Video calls and conferencing such as Zoom, Webex, Meet, Teams, etc 22 - - **browsing**: General web browsing, social media posts, shopping 23 - - **reading**: Documents, articles, PDFs, documentation 24 - - **media**: Videos, video players, Youtube, social media images and videos 25 - - **gaming**: Any games or playing a game 26 - - **productivity**: Spreadsheets, presentations, word processors, calendars, any other workplace activities and tools 23 + ### visual_description 24 + 25 + * Describe the **overall desktop view** in **1–2 sentences** for a visually impaired user. 26 + * Focus on layout, window arrangement, and types of content. 27 + * Do **not** transcribe text verbatim. 28 + 29 + ### primary 30 + 31 + * Represents the **dominant or user-focused application/window** on the desktop. 32 + * Typically the largest, foreground, or task-relevant window. 33 + * **category**: Use a valid category from the list below. 34 + * **box_2d**: 35 + 36 + * Coordinates normalized to a 0–1000 range relative to the full screenshot (not raw pixels) 37 + * Format: `[y0, x0, y1, x1]` 38 + * Must **fully enclose** the primary application or content area 39 + 40 + ### secondary 41 + 42 + * Represents a **clearly distinct secondary application/window**, if present. 
43 + * Set **secondary** to `false` if: 44 + * One application occupies essentially the entire screen, or 45 + * Other windows are background-only or not meaningfully relevant 46 + * If another category is present choose the largest most visible one: 47 + * **category**: Use a valid category from the list below 48 + * **box_2d**: Must fully enclose the secondary application or content area 49 + * The secondary bounding box should not significantly overlap the primary bounding box. 27 50 28 - Keep responses concise and accurate. 51 + ### Categories 52 + 53 + * **terminal**: Command-line interfaces or shells 54 + * **code**: Code editors and IDEs 55 + * **messaging**: Chat, email, Slack, Discord, etc. 56 + * **meeting**: Video conferencing tools 57 + * **browsing**: Web browsing, social media, shopping 58 + * **reading**: Documents, articles, PDFs, documentation 59 + * **media**: Video or image viewing 60 + * **gaming**: Games or game interfaces 61 + * **productivity**: Spreadsheets, documents, presentations, calendars, task tools 62 + 63 + ## General Instructions 64 + 65 + * Bounding boxes must be accurate and conservative (fully containing windows). 66 + * Prefer correctness and clarity over exhaustiveness. 67 + * Output **only valid JSON**.

+6 -1

observe/screen.py

··· 146 146 # Add analysis if present 147 147 analysis = frame.get("analysis", {}) 148 148 if analysis: 149 - category = analysis.get("visible", "unknown") 149 + # Extract category from primary region, fall back to legacy visible field 150 + primary = analysis.get("primary", {}) 151 + if primary: 152 + category = primary.get("category", "unknown") 153 + else: 154 + category = analysis.get("visible", "unknown") 150 155 description = analysis.get("visual_description", "") 151 156 152 157 lines.append(f"**Category:** {category}")

+5 -5

tests/test_cluster.py

··· 133 133 # Raw screen.jsonl with frame analysis (what cluster_period should use) 134 134 (segment / "screen.jsonl").write_text( 135 135 '{"raw": "screen.webm"}\n' 136 - '{"timestamp": 10, "analysis": {"visible": "code_editor", ' 136 + '{"timestamp": 10, "analysis": {"primary": {"category": "code_editor"}, ' 137 137 '"visual_description": "VS Code with Python file"}}\n' 138 138 ) 139 139 # Also create screen.md (insight) to verify it's NOT used by cluster_period ··· 170 170 # Also create screen.jsonl to verify it's NOT used when insights=True, screen=False 171 171 (segment / "screen.jsonl").write_text( 172 172 '{"raw": "screen.webm"}\n' 173 - '{"timestamp": 10, "analysis": {"visible": "code_editor"}}\n' 173 + '{"timestamp": 10, "analysis": {"primary": {"category": "code_editor"}}}\n' 174 174 ) 175 175 176 176 # Test insights=True returns *.md summaries, not raw screen data ··· 200 200 segment.mkdir() 201 201 (segment / "screen.jsonl").write_text( 202 202 '{"raw": "screen.webm"}\n' 203 - '{"timestamp": 10, "analysis": {"visible": "code_editor"}}\n' 203 + '{"timestamp": 10, "analysis": {"primary": {"category": "code_editor"}}}\n' 204 204 ) 205 205 (segment / "screen.md").write_text("Screen summary insight") 206 206 ··· 228 228 segment.mkdir() 229 229 (segment / "screen.jsonl").write_text( 230 230 '{"raw": "screen.webm"}\n' 231 - '{"timestamp": 10, "analysis": {"visible": "code_editor", ' 231 + '{"timestamp": 10, "analysis": {"primary": {"category": "code_editor"}, ' 232 232 '"visual_description": "Primary monitor with VS Code"}}\n' 233 233 ) 234 234 (segment / "monitor_2_screen.jsonl").write_text( 235 235 '{"raw": "monitor_2.webm"}\n' 236 - '{"timestamp": 10, "analysis": {"visible": "browser", ' 236 + '{"timestamp": 10, "analysis": {"primary": {"category": "browser"}, ' 237 237 '"visual_description": "Secondary monitor with documentation"}}\n' 238 238 ) 239 239

+9 -6

tests/test_formatters.py

··· 181 181 entries = [ 182 182 { 183 183 "timestamp": 5, 184 - "analysis": {"visible": "code", "visual_description": "Python code"}, 184 + "analysis": { 185 + "primary": {"category": "code"}, 186 + "visual_description": "Python code", 187 + }, 185 188 "extracted_text": "def hello():\n pass", 186 189 } 187 190 ] ··· 199 202 """Test screen formatting with entity context.""" 200 203 from observe.screen import format_screen 201 204 202 - entries = [{"timestamp": 0, "analysis": {"visible": "browser"}}] 205 + entries = [{"timestamp": 0, "analysis": {"primary": {"category": "browser"}}}] 203 206 context = {"entity_names": "Alice, Bob", "include_entity_context": True} 204 207 205 208 chunks, meta = format_screen(entries, context) ··· 212 215 """Test screen formatting without entity context.""" 213 216 from observe.screen import format_screen 214 217 215 - entries = [{"timestamp": 0, "analysis": {"visible": "browser"}}] 218 + entries = [{"timestamp": 0, "analysis": {"primary": {"category": "browser"}}}] 216 219 context = {"include_entity_context": False} 217 220 218 221 chunks, meta = format_screen(entries, context) ··· 260 263 261 264 entries = [ 262 265 {"raw": "screen.webm"}, # Metadata line 263 - {"timestamp": 5, "analysis": {"visible": "code"}}, 266 + {"timestamp": 5, "analysis": {"primary": {"category": "code"}}}, 264 267 ] 265 268 266 269 chunks, meta = format_screen(entries) ··· 1486 1489 """Test format_screen returns indexer with topic.""" 1487 1490 from observe.screen import format_screen 1488 1491 1489 - entries = [{"timestamp": 0, "analysis": {"visible": "code"}}] 1492 + entries = [{"timestamp": 0, "analysis": {"primary": {"category": "code"}}}] 1490 1493 chunks, meta = format_screen(entries) 1491 1494 1492 1495 assert "indexer" in meta ··· 1587 1590 1588 1591 frame = { 1589 1592 "timestamp": 5, 1590 - "analysis": {"visible": "code"}, 1593 + "analysis": {"primary": {"category": "code"}}, 1591 1594 "extra_field": "value", 1592 1595 } 1593 1596 entries = [frame]

+45 -15

tests/test_screen_formatter.py

··· 11 11 frames = [ 12 12 { 13 13 "timestamp": 0, 14 - "analysis": {"visible": "code", "visual_description": "Editing Python"}, 14 + "analysis": { 15 + "primary": {"category": "code"}, 16 + "visual_description": "Editing Python", 17 + }, 15 18 }, 16 19 { 17 20 "timestamp": 30, 18 - "analysis": {"visible": "terminal", "visual_description": "Running tests"}, 21 + "analysis": { 22 + "primary": {"category": "terminal"}, 23 + "visual_description": "Running tests", 24 + }, 19 25 }, 20 26 { 21 27 "timestamp": 120, 22 - "analysis": {"visible": "browser", "visual_description": "Reading docs"}, 28 + "analysis": { 29 + "primary": {"category": "browser"}, 30 + "visual_description": "Reading docs", 31 + }, 23 32 }, 24 33 ] 25 34 ··· 48 57 frames = [ 49 58 { 50 59 "timestamp": 0, 51 - "analysis": {"visible": "code", "visual_description": "Code"}, 60 + "analysis": {"primary": {"category": "code"}, "visual_description": "Code"}, 52 61 }, 53 62 { 54 63 "timestamp": 60, 55 - "analysis": {"visible": "terminal", "visual_description": "Terminal"}, 64 + "analysis": { 65 + "primary": {"category": "terminal"}, 66 + "visual_description": "Terminal", 67 + }, 56 68 }, 57 69 ] 58 70 ··· 77 89 frames = [ 78 90 { 79 91 "timestamp": 0, 80 - "analysis": {"visible": "code", "visual_description": "Code"}, 92 + "analysis": {"primary": {"category": "code"}, "visual_description": "Code"}, 81 93 }, 82 94 { 83 95 "timestamp": 3600, 84 - "analysis": {"visible": "browser", "visual_description": "Browser"}, 96 + "analysis": { 97 + "primary": {"category": "browser"}, 98 + "visual_description": "Browser", 99 + }, 85 100 }, 86 101 ] 87 102 ··· 100 115 frames = [ 101 116 { 102 117 "timestamp": 0, 103 - "analysis": {"visible": "code", "visual_description": "Editing code"}, 118 + "analysis": { 119 + "primary": {"category": "code"}, 120 + "visual_description": "Editing code", 121 + }, 104 122 }, 105 123 { 106 124 "timestamp": 30, 107 - "analysis": {"visible": "browser", "visual_description": "Documentation"}, 
125 + "analysis": { 126 + "primary": {"category": "browser"}, 127 + "visual_description": "Documentation", 128 + }, 108 129 }, 109 130 ] 110 131 ··· 127 148 frames = [ 128 149 { 129 150 "timestamp": 0, 130 - "analysis": {"visible": "code", "visual_description": "Editing code"}, 151 + "analysis": { 152 + "primary": {"category": "code"}, 153 + "visual_description": "Editing code", 154 + }, 131 155 }, 132 156 ] 133 157 ··· 148 172 frames = [ 149 173 { 150 174 "timestamp": 0, 151 - "analysis": {"visible": "code", "visual_description": "Code"}, 175 + "analysis": {"primary": {"category": "code"}, "visual_description": "Code"}, 152 176 }, 153 177 ] 154 178 ··· 172 196 { 173 197 "timestamp": 0, 174 198 "analysis": { 175 - "visible": "terminal", 199 + "primary": {"category": "terminal"}, 176 200 "visual_description": "Terminal window", 177 201 }, 178 202 "extracted_text": "$ python test.py\nAll tests passed", ··· 198 222 frames = [ 199 223 { 200 224 "timestamp": 0, 201 - "analysis": {"visible": "code", "visual_description": "Frame 1"}, 225 + "analysis": { 226 + "primary": {"category": "code"}, 227 + "visual_description": "Frame 1", 228 + }, 202 229 }, 203 230 { 204 231 "timestamp": 30, 205 - "analysis": {"visible": "terminal", "visual_description": "Frame 2"}, 232 + "analysis": { 233 + "primary": {"category": "terminal"}, 234 + "visual_description": "Frame 2", 235 + }, 206 236 }, 207 237 ] 208 238 ··· 220 250 frames = [ 221 251 { 222 252 "timestamp": 0, 223 - "analysis": {"visible": "code", "visual_description": "Test"}, 253 + "analysis": {"primary": {"category": "code"}, "visual_description": "Test"}, 224 254 }, 225 255 ] 226 256

Configure Feed

Configure Feed