Simplify describe.py vision analysis prompt and add focus guidance

+79 -168

observe/describe.py

··· 5 5 Processes per-monitor screencast files (.webm/.mp4/.mov), detects changes using 6 6 RMS-based comparison, and sends frames to Gemini for multi-stage analysis: 7 7 8 - 1. Initial categorization identifies primary/secondary regions with bounding boxes 9 - 2. Follow-up analysis (text extraction or meeting analysis) uses cropped regions 10 - based on category, or full frame when both regions need the same analysis type 8 + 1. Initial categorization identifies primary/secondary app categories 9 + 2. Follow-up analysis (text extraction or meeting analysis) based on category 11 10 """ 12 11 13 12 from __future__ import annotations ··· 30 29 from think.utils import setup_cli 31 30 32 31 logger = logging.getLogger(__name__) 33 - 34 - # Minimum box size in pixels for follow-up processing 35 - MIN_BOX_SIZE = 300 36 32 37 33 38 34 class RequestType(Enum): ··· 235 231 img.save(buf, format="PNG", compress_level=1) 236 232 return buf.getvalue() 237 233 238 - def _denormalize_box( 239 - self, box_2d: List[int], img_width: int, img_height: int 240 - ) -> tuple: 241 - """ 242 - Convert 0-1000 normalized coords to pixel coords. 243 - 244 - Parameters 245 - ---------- 246 - box_2d : List[int] 247 - Normalized coordinates [y0, x0, y1, x1] in 0-1000 range 248 - img_width : int 249 - Image width in pixels 250 - img_height : int 251 - Image height in pixels 252 - 253 - Returns 254 - ------- 255 - tuple 256 - Pixel coordinates (y0, x0, y1, x1) 257 - """ 258 - y0, x0, y1, x1 = box_2d 259 - return ( 260 - int(y0 * img_height / 1000), 261 - int(x0 * img_width / 1000), 262 - int(y1 * img_height / 1000), 263 - int(x1 * img_width / 1000), 264 - ) 265 - 266 - def _box_qualifies(self, box_pixels: tuple) -> bool: 267 - """ 268 - Check if denormalized box is >MIN_BOX_SIZE in both dimensions. 
269 - 270 - Parameters 271 - ---------- 272 - box_pixels : tuple 273 - Pixel coordinates (y0, x0, y1, x1) 274 - 275 - Returns 276 - ------- 277 - bool 278 - True if box qualifies for follow-up processing 279 - """ 280 - y0, x0, y1, x1 = box_pixels 281 - width = x1 - x0 282 - height = y1 - y0 283 - return width > MIN_BOX_SIZE and height > MIN_BOX_SIZE 284 - 285 - def _crop_to_box(self, img: Image.Image, box_pixels: tuple) -> Image.Image: 286 - """ 287 - Crop PIL Image using denormalized pixel coords. 288 - 289 - Parameters 290 - ---------- 291 - img : Image.Image 292 - Source image to crop 293 - box_pixels : tuple 294 - Pixel coordinates (y0, x0, y1, x1) 295 - 296 - Returns 297 - ------- 298 - Image.Image 299 - Cropped image 300 - """ 301 - y0, x0, y1, x1 = box_pixels 302 - # Clamp to image bounds 303 - x0 = max(0, min(x0, img.width)) 304 - x1 = max(0, min(x1, img.width)) 305 - y0 = max(0, min(y0, img.height)) 306 - y1 = max(0, min(y1, img.height)) 307 - # PIL crop uses (left, upper, right, lower) = (x0, y0, x1, y1) 308 - return img.crop((x0, y0, x1, y1)) 309 - 310 234 def _get_follow_up_prompt(self, category: str) -> Optional[str]: 311 235 """ 312 236 Map category to follow-up prompt type. 
··· 380 304 Path to write JSONL output (when None, no output file is written) 381 305 """ 382 306 from think.batch import GeminiBatch 383 - from think.models import GEMINI_FLASH 307 + from think.models import GEMINI_FLASH, GEMINI_LITE 384 308 385 309 # Load prompt templates 386 310 prompt_path = Path(__file__).parent / use_prompt ··· 431 355 "Analyze this screenshot frame from a screencast recording.", 432 356 frame_img, 433 357 ), 434 - model=GEMINI_FLASH, 358 + model=GEMINI_LITE, 435 359 system_instruction=system_instruction, 436 360 json_output=True, 437 361 temperature=0.7, 438 - max_output_tokens=3072, 439 - thinking_budget=2048, 362 + max_output_tokens=1024, 363 + thinking_budget=1024, 440 364 ) 441 365 442 366 # Attach metadata for tracking (store bytes, not PIL images) ··· 451 375 req.requests = [] # Track all requests for this frame 452 376 req.initial_image = frame_img # Keep reference to close after completion 453 377 req.pending_follow_ups = 0 # Track how many follow-ups are pending 454 - req.follow_up_source = None # "primary", "secondary", or "full" 378 + req.follow_up_source = None # "primary" or "secondary" 455 379 456 380 batch.add(req) 457 381 ··· 533 457 ) 534 458 535 459 if should_process_further: 536 - # Extract primary and secondary regions 537 - primary = req.json_analysis.get("primary", {}) 538 - secondary = req.json_analysis.get("secondary", False) 460 + # Extract categories from new simplified format 461 + primary = req.json_analysis.get("primary", "") 462 + secondary = req.json_analysis.get("secondary", "none") 463 + overlap = req.json_analysis.get("overlap", True) 539 464 540 - # Load full frame for potential processing 541 - full_img = Image.open(io.BytesIO(req.frame_bytes)) 542 - img_width, img_height = full_img.width, full_img.height 465 + # Determine follow-up types 466 + primary_prompt_type = self._get_follow_up_prompt(primary) 467 + secondary_prompt_type = ( 468 + self._get_follow_up_prompt(secondary) 469 + if secondary != "none" 
470 + else None 471 + ) 472 + 473 + # Build follow-up list with category context for focus guidance 474 + # Each entry: (prompt_type, source, focus_categories, ignore_category) 475 + # Primary always triggers if it has a follow-up type 476 + # Secondary triggers only if overlap=false AND different type 477 + follow_ups = [] 543 478 544 - # Analyze primary region 545 - primary_prompt_type = None 546 - primary_box_pixels = None 547 - if primary and primary.get("box_2d"): 548 - primary_box_pixels = self._denormalize_box( 549 - primary["box_2d"], img_width, img_height 550 - ) 551 - if self._box_qualifies(primary_box_pixels): 552 - primary_prompt_type = self._get_follow_up_prompt( 553 - primary.get("category", "") 554 - ) 479 + has_secondary = secondary != "none" 480 + same_follow_up_type = ( 481 + primary_prompt_type 482 + and secondary_prompt_type 483 + and primary_prompt_type == secondary_prompt_type 484 + ) 555 485 556 - # Analyze secondary region 557 - secondary_prompt_type = None 558 - secondary_box_pixels = None 559 - if ( 560 - secondary 561 - and isinstance(secondary, dict) 562 - and secondary.get("box_2d") 563 - ): 564 - secondary_box_pixels = self._denormalize_box( 565 - secondary["box_2d"], img_width, img_height 566 - ) 567 - if self._box_qualifies(secondary_box_pixels): 568 - secondary_prompt_type = self._get_follow_up_prompt( 569 - secondary.get("category", "") 486 + if primary_prompt_type: 487 + if same_follow_up_type: 488 + # Both categories need same follow-up - focus on both 489 + follow_ups.append( 490 + (primary_prompt_type, "primary", [primary, secondary], None) 570 491 ) 571 - 572 - # Determine follow-up strategy 573 - follow_ups = [] 574 - 575 - if primary_prompt_type and secondary_prompt_type: 576 - if primary_prompt_type == secondary_prompt_type: 577 - # Same prompt type - use full frame, single call 578 - follow_ups.append((primary_prompt_type, full_img, "full")) 579 - logger.info( 580 - f"Frame {req.frame_id}: Single {primary_prompt_type} 
follow-up (full frame)" 492 + elif has_secondary: 493 + # Different types - focus on primary, ignore secondary 494 + follow_ups.append( 495 + (primary_prompt_type, "primary", [primary], secondary) 581 496 ) 582 497 else: 583 - # Different prompt types - parallel cropped calls 584 - primary_img = self._crop_to_box(full_img, primary_box_pixels) 585 - secondary_img = self._crop_to_box( 586 - full_img, secondary_box_pixels 587 - ) 588 - follow_ups.append((primary_prompt_type, primary_img, "primary")) 498 + # No secondary - no focus guidance needed 589 499 follow_ups.append( 590 - (secondary_prompt_type, secondary_img, "secondary") 500 + (primary_prompt_type, "primary", [primary], None) 591 501 ) 592 - logger.info( 593 - f"Frame {req.frame_id}: Parallel follow-ups - " 594 - f"primary={primary_prompt_type}, secondary={secondary_prompt_type}" 595 - ) 596 - elif primary_prompt_type: 597 - # Only primary needs follow-up 598 - primary_img = self._crop_to_box(full_img, primary_box_pixels) 599 - follow_ups.append((primary_prompt_type, primary_img, "primary")) 600 - logger.info( 601 - f"Frame {req.frame_id}: {primary_prompt_type} follow-up (primary)" 602 - ) 603 - elif secondary_prompt_type: 604 - # Only secondary needs follow-up 605 - secondary_img = self._crop_to_box(full_img, secondary_box_pixels) 502 + 503 + if ( 504 + not overlap 505 + and secondary_prompt_type 506 + and secondary_prompt_type != primary_prompt_type 507 + ): 508 + # Secondary needs different follow-up - focus on it, ignore primary 606 509 follow_ups.append( 607 - (secondary_prompt_type, secondary_img, "secondary") 608 - ) 609 - logger.info( 610 - f"Frame {req.frame_id}: {secondary_prompt_type} follow-up (secondary)" 510 + (secondary_prompt_type, "secondary", [secondary], primary) 611 511 ) 612 512 613 - # Create follow-up requests 513 + # Create follow-up requests (all use full frame) 614 514 if follow_ups: 515 + # Load full frame for follow-up processing 516 + full_img = 
Image.open(io.BytesIO(req.frame_bytes)) 615 517 req.pending_follow_ups = len(follow_ups) 616 518 617 519 # Close initial image since DESCRIBE_JSON is complete ··· 619 521 req.initial_image.close() 620 522 req.initial_image = None 621 523 622 - for i, (prompt_type, img, source) in enumerate(follow_ups): 524 + for i, (prompt_type, source, focus_cats, ignore_cat) in enumerate( 525 + follow_ups 526 + ): 623 527 if i == 0: 624 528 # Reuse original request for first follow-up 625 529 follow_req = req ··· 639 543 follow_req.follow_up_source = source 640 544 follow_req.retry_count = 0 641 545 546 + # Build focus guidance suffix 547 + focus_suffix = "" 548 + if len(focus_cats) > 1: 549 + # Multiple categories to focus on (same follow-up type) 550 + cats_str = " and ".join(focus_cats) 551 + focus_suffix = f" Focus on both the {cats_str} content." 552 + elif ignore_cat: 553 + # Single category, ignore the other 554 + focus_suffix = ( 555 + f" Focus on the {focus_cats[0]} content, " 556 + f"not the {ignore_cat} content." 
557 + ) 558 + 642 559 if prompt_type == "meeting": 643 560 batch.update( 644 561 follow_req, 645 562 contents=self._user_contents( 646 - "Analyze this meeting screenshot.", 647 - img, 563 + f"Analyze this meeting screenshot.{focus_suffix}", 564 + full_img, 648 565 entities=True, 649 566 ), 650 567 model=GEMINI_FLASH, ··· 654 571 thinking_budget=6144, 655 572 ) 656 573 follow_req.request_type = RequestType.DESCRIBE_MEETING 657 - follow_req.follow_up_image = img 658 574 else: # text 659 575 batch.update( 660 576 follow_req, 661 577 contents=self._user_contents( 662 - "Extract text from this screenshot frame.", 663 - img, 578 + f"Extract text from this screenshot.{focus_suffix}", 579 + full_img, 664 580 entities=True, 665 581 ), 666 582 model=GEMINI_FLASH, ··· 670 586 thinking_budget=4096, 671 587 ) 672 588 follow_req.request_type = RequestType.DESCRIBE_TEXT 673 - follow_req.follow_up_image = img 674 589 675 - # Close full_img if we're not using it directly 676 - if not any(source == "full" for _, _, source in follow_ups): 677 - full_img.close() 590 + logger.info( 591 + f"Frame {req.frame_id}: {len(follow_ups)} follow-up(s) - " 592 + f"{', '.join(pt for pt, _ , _, _ in follow_ups)}" 593 + ) 678 594 679 - continue # Don't output yet, wait for follow-ups 680 - else: 681 - # No follow-ups needed, close full_img 595 + # Close full_img after all follow-up requests are created 682 596 full_img.close() 597 + 598 + continue # Don't output yet, wait for follow-ups 683 599 684 600 # Handle follow-up completion for parallel requests 685 601 if req.request_type in ( ··· 711 627 712 628 # Decrement pending count 713 629 result["pending"] -= 1 714 - 715 - # Close follow-up image 716 - if hasattr(req, "follow_up_image") and req.follow_up_image: 717 - req.follow_up_image.close() 718 - req.follow_up_image = None 719 630 720 631 # If all follow-ups complete, output the result 721 632 if result["pending"] <= 0:

+21 -62

observe/describe_json.txt

··· 1 - # Desktop Screenshot Context Analysis 2 - 3 - You are analyzing a full desktop screenshot to identify the types of applications and content visible. 4 - 5 - Respond with JSON describing the visual context and your best estimation of the appropriate category/categories. 1 + You have one job: identify the primary foreground and (if present) secondary app categories in this desktop screenshot, and return ONLY this JSON: 6 2 7 - ```json 8 3 { 9 - "visual_description": "<1–2 sentences describing what is visible>", 10 - "primary": { 11 - "category": "<use a valid category from below>", 12 - "box_2d": [y0, x0, y1, x1] 13 - }, 14 - "secondary": false | { 15 - "category": "<use a valid category from below>", 16 - "box_2d": [y0, x0, y1, x1] 17 - } 4 + "visual_description":"<1–2 sentences describing what is visible>", 5 + "primary": "<largest and most visible app category>", 6 + "secondary": "<second most visible app category or 'none'>", 7 + "overlap": <boolean, does the primary overlap or cover the secondary, or is it fully standalone and separate> 18 8 } 19 - ``` 20 9 21 - ## Field Notes 10 + Rules: 11 + - For visual_description summarize the **overall desktop view** in **1–2 sentences** for a visually impaired user, focus on layout, window arrangement, and types of content. 12 + - For the most visible primary foreground app choose the best category from the list below. 13 + - Set "secondary" to "none" and "overlap" to true if the primary effectively fills the screen or no distinct second category/window is visible. 14 + - Set overlap to true if the primary app overlaps, covers, clips, or obscures the secondary in any way. 15 + - Only set a category for secondary if it is very visible and occupies more than 30% of the screen. 22 16 23 - ### visual_description 24 - 25 - * Describe the **overall desktop view** in **1–2 sentences** for a visually impaired user. 26 - * Focus on layout, window arrangement, and types of content. 
27 - * Do **not** transcribe text verbatim. 28 - 29 - ### primary 30 - 31 - * Represents the **dominant or user-focused application/window** on the desktop. 32 - * Typically the largest, foreground, or task-relevant window. 33 - * **category**: Use a valid category from the list below. 34 - * **box_2d**: 35 - 36 - * Pixel coordinates relative to the full screenshot 37 - * Format: `[y0, x0, y1, x1]` 38 - * Must **fully enclose** the primary application or content area 39 - 40 - ### secondary 41 - 42 - * Represents a **clearly distinct secondary application/window**, if present. 43 - * Set **secondary** to `false` if: 44 - * One application occupies essentially the entire screen, or 45 - * Other windows are background-only or not meaningfully relevant 46 - * If another category is present choose the largest most visible one: 47 - * **category**: Use a valid category from the list below 48 - * **box_2d**: Must fully enclose the secondary application or content area 49 - * The secondary bounding box should not significantly overlap the primary bounding box. 50 - 51 - ### Categories 52 - 53 - * **terminal**: Command-line interfaces or shells 54 - * **code**: Code editors and IDEs 55 - * **messaging**: Chat, email, Slack, Discord, etc. 56 - * **meeting**: Video conferencing tools 57 - * **browsing**: Web browsing, social media, shopping 58 - * **reading**: Documents, articles, PDFs, documentation 59 - * **media**: Video or image viewing 60 - * **gaming**: Games or game interfaces 61 - * **productivity**: Spreadsheets, documents, presentations, calendars, task tools 62 - 63 - ## General Instructions 64 - 65 - * Bounding boxes must be accurate and conservative (fully containing windows). 66 - * Prefer correctness and clarity over exhaustiveness. 67 - * Output **only valid JSON**. 
17 + Categories (choose one): 18 + - terminal: Command line interfaces, logs, shell 19 + - code: Code editors and IDEs 20 + - messaging: Chat or email apps (Slack, Discord, Messages/iMessage, Gmail, etc.) 21 + - meeting: Video calls/conferencing (Zoom, Meet, Teams, Webex, etc.) 22 + - browsing: General web browsing, social feeds, shopping 23 + - reading: Documents, articles, PDFs, documentation 24 + - media: Video players/streams, YouTube, image/video-heavy feeds 25 + - gaming: Video games, puzzles, idle games 26 + - productivity: Spreadsheets, slides, document editors, calendars, task and issue tracking tools, other workplace desktop or web apps and professional tools

+3 -6

observe/screen.py

··· 146 146 # Add analysis if present 147 147 analysis = frame.get("analysis", {}) 148 148 if analysis: 149 - # Extract category from primary region, fall back to legacy visible field 150 - primary = analysis.get("primary", {}) 151 - if primary: 152 - category = primary.get("category", "unknown") 153 - else: 154 - category = analysis.get("visible", "unknown") 149 + # New format: primary is a string category 150 + # Fall back to legacy "visible" field for old data 151 + category = analysis.get("primary", analysis.get("visible", "unknown")) 155 152 description = analysis.get("visual_description", "") 156 153 157 154 lines.append(f"**Category:** {category}")

+5 -5

tests/test_cluster.py

··· 133 133 # Raw screen.jsonl with frame analysis (what cluster_period should use) 134 134 (segment / "screen.jsonl").write_text( 135 135 '{"raw": "screen.webm"}\n' 136 - '{"timestamp": 10, "analysis": {"primary": {"category": "code_editor"}, ' 136 + '{"timestamp": 10, "analysis": {"primary": "code_editor", ' 137 137 '"visual_description": "VS Code with Python file"}}\n' 138 138 ) 139 139 # Also create screen.md (insight) to verify it's NOT used by cluster_period ··· 170 170 # Also create screen.jsonl to verify it's NOT used when insights=True, screen=False 171 171 (segment / "screen.jsonl").write_text( 172 172 '{"raw": "screen.webm"}\n' 173 - '{"timestamp": 10, "analysis": {"primary": {"category": "code_editor"}}}\n' 173 + '{"timestamp": 10, "analysis": {"primary": "code_editor"}}\n' 174 174 ) 175 175 176 176 # Test insights=True returns *.md summaries, not raw screen data ··· 200 200 segment.mkdir() 201 201 (segment / "screen.jsonl").write_text( 202 202 '{"raw": "screen.webm"}\n' 203 - '{"timestamp": 10, "analysis": {"primary": {"category": "code_editor"}}}\n' 203 + '{"timestamp": 10, "analysis": {"primary": "code_editor"}}\n' 204 204 ) 205 205 (segment / "screen.md").write_text("Screen summary insight") 206 206 ··· 228 228 segment.mkdir() 229 229 (segment / "screen.jsonl").write_text( 230 230 '{"raw": "screen.webm"}\n' 231 - '{"timestamp": 10, "analysis": {"primary": {"category": "code_editor"}, ' 231 + '{"timestamp": 10, "analysis": {"primary": "code_editor", ' 232 232 '"visual_description": "Primary monitor with VS Code"}}\n' 233 233 ) 234 234 (segment / "monitor_2_screen.jsonl").write_text( 235 235 '{"raw": "monitor_2.webm"}\n' 236 - '{"timestamp": 10, "analysis": {"primary": {"category": "browser"}, ' 236 + '{"timestamp": 10, "analysis": {"primary": "browser", ' 237 237 '"visual_description": "Secondary monitor with documentation"}}\n' 238 238 ) 239 239

+6 -6

tests/test_formatters.py

··· 182 182 { 183 183 "timestamp": 5, 184 184 "analysis": { 185 - "primary": {"category": "code"}, 185 + "primary": "code", 186 186 "visual_description": "Python code", 187 187 }, 188 188 "extracted_text": "def hello():\n pass", ··· 202 202 """Test screen formatting with entity context.""" 203 203 from observe.screen import format_screen 204 204 205 - entries = [{"timestamp": 0, "analysis": {"primary": {"category": "browser"}}}] 205 + entries = [{"timestamp": 0, "analysis": {"primary": "browser"}}] 206 206 context = {"entity_names": "Alice, Bob", "include_entity_context": True} 207 207 208 208 chunks, meta = format_screen(entries, context) ··· 215 215 """Test screen formatting without entity context.""" 216 216 from observe.screen import format_screen 217 217 218 - entries = [{"timestamp": 0, "analysis": {"primary": {"category": "browser"}}}] 218 + entries = [{"timestamp": 0, "analysis": {"primary": "browser"}}] 219 219 context = {"include_entity_context": False} 220 220 221 221 chunks, meta = format_screen(entries, context) ··· 263 263 264 264 entries = [ 265 265 {"raw": "screen.webm"}, # Metadata line 266 - {"timestamp": 5, "analysis": {"primary": {"category": "code"}}}, 266 + {"timestamp": 5, "analysis": {"primary": "code"}}, 267 267 ] 268 268 269 269 chunks, meta = format_screen(entries) ··· 1489 1489 """Test format_screen returns indexer with topic.""" 1490 1490 from observe.screen import format_screen 1491 1491 1492 - entries = [{"timestamp": 0, "analysis": {"primary": {"category": "code"}}}] 1492 + entries = [{"timestamp": 0, "analysis": {"primary": "code"}}] 1493 1493 chunks, meta = format_screen(entries) 1494 1494 1495 1495 assert "indexer" in meta ··· 1590 1590 1591 1591 frame = { 1592 1592 "timestamp": 5, 1593 - "analysis": {"primary": {"category": "code"}}, 1593 + "analysis": {"primary": "code"}, 1594 1594 "extra_field": "value", 1595 1595 } 1596 1596 entries = [frame]

+15 -15

tests/test_screen_formatter.py

··· 12 12 { 13 13 "timestamp": 0, 14 14 "analysis": { 15 - "primary": {"category": "code"}, 15 + "primary": "code", 16 16 "visual_description": "Editing Python", 17 17 }, 18 18 }, 19 19 { 20 20 "timestamp": 30, 21 21 "analysis": { 22 - "primary": {"category": "terminal"}, 22 + "primary": "terminal", 23 23 "visual_description": "Running tests", 24 24 }, 25 25 }, 26 26 { 27 27 "timestamp": 120, 28 28 "analysis": { 29 - "primary": {"category": "browser"}, 29 + "primary": "browser", 30 30 "visual_description": "Reading docs", 31 31 }, 32 32 }, ··· 57 57 frames = [ 58 58 { 59 59 "timestamp": 0, 60 - "analysis": {"primary": {"category": "code"}, "visual_description": "Code"}, 60 + "analysis": {"primary": "code", "visual_description": "Code"}, 61 61 }, 62 62 { 63 63 "timestamp": 60, 64 64 "analysis": { 65 - "primary": {"category": "terminal"}, 65 + "primary": "terminal", 66 66 "visual_description": "Terminal", 67 67 }, 68 68 }, ··· 89 89 frames = [ 90 90 { 91 91 "timestamp": 0, 92 - "analysis": {"primary": {"category": "code"}, "visual_description": "Code"}, 92 + "analysis": {"primary": "code", "visual_description": "Code"}, 93 93 }, 94 94 { 95 95 "timestamp": 3600, 96 96 "analysis": { 97 - "primary": {"category": "browser"}, 97 + "primary": "browser", 98 98 "visual_description": "Browser", 99 99 }, 100 100 }, ··· 116 116 { 117 117 "timestamp": 0, 118 118 "analysis": { 119 - "primary": {"category": "code"}, 119 + "primary": "code", 120 120 "visual_description": "Editing code", 121 121 }, 122 122 }, 123 123 { 124 124 "timestamp": 30, 125 125 "analysis": { 126 - "primary": {"category": "browser"}, 126 + "primary": "browser", 127 127 "visual_description": "Documentation", 128 128 }, 129 129 }, ··· 149 149 { 150 150 "timestamp": 0, 151 151 "analysis": { 152 - "primary": {"category": "code"}, 152 + "primary": "code", 153 153 "visual_description": "Editing code", 154 154 }, 155 155 }, ··· 172 172 frames = [ 173 173 { 174 174 "timestamp": 0, 175 - "analysis": {"primary": 
{"category": "code"}, "visual_description": "Code"}, 175 + "analysis": {"primary": "code", "visual_description": "Code"}, 176 176 }, 177 177 ] 178 178 ··· 196 196 { 197 197 "timestamp": 0, 198 198 "analysis": { 199 - "primary": {"category": "terminal"}, 199 + "primary": "terminal", 200 200 "visual_description": "Terminal window", 201 201 }, 202 202 "extracted_text": "$ python test.py\nAll tests passed", ··· 223 223 { 224 224 "timestamp": 0, 225 225 "analysis": { 226 - "primary": {"category": "code"}, 226 + "primary": "code", 227 227 "visual_description": "Frame 1", 228 228 }, 229 229 }, 230 230 { 231 231 "timestamp": 30, 232 232 "analysis": { 233 - "primary": {"category": "terminal"}, 233 + "primary": "terminal", 234 234 "visual_description": "Frame 2", 235 235 }, 236 236 }, ··· 250 250 frames = [ 251 251 { 252 252 "timestamp": 0, 253 - "analysis": {"primary": {"category": "code"}, "visual_description": "Test"}, 253 + "analysis": {"primary": "code", "visual_description": "Test"}, 254 254 }, 255 255 ] 256 256

Configure Feed