Refactor describe.py to category-specific prompt system

+18 -11

JOURNAL.md

··· 577 577 "frame_id": 123, 578 578 "timestamp": 45.67, 579 579 "requests": [ 580 - {"type": "describe_json", "model": "gemini-2.0-flash-lite", "duration": 0.5} 580 + {"type": "describe", "model": "gemini-2.5-flash-lite", "duration": 0.5}, 581 + {"type": "category", "category": "reading", "model": "gemini-3-flash", "duration": 1.2} 581 582 ], 582 583 "analysis": { 583 - "visual_description": "A terminal window showing command output with green text on dark background.", 584 - "visible": "terminal" 585 - } 584 + "visual_description": "Documentation page showing API reference.", 585 + "primary": "reading", 586 + "secondary": "none", 587 + "overlap": true 588 + }, 589 + "reading": "# API Reference\n\n## Authentication\n\nUse Bearer tokens..." 586 590 } 587 591 ``` 588 592 589 593 **Common fields:** 590 594 - `frame_id` – sequential frame number in the video 591 595 - `timestamp` – time in seconds from video start 592 - - `requests` – list of vision API requests made for this frame 593 - - `analysis` – categorization and visual description from initial analysis 596 + - `requests` – list of vision API requests made for this frame (type: "describe" for initial, "category" for follow-ups) 597 + - `analysis` – categorization result with `primary`, `secondary`, `overlap`, and `visual_description` 594 598 595 - **Optional fields (conditional processing):** 596 - - `extracted_text` – present when frame contains messaging, browsing, reading, or productivity content 597 - - `meeting_analysis` – present when frame contains video conferencing, includes participant detection and bounding boxes 599 + **Category-specific fields (conditional processing):** 600 + - `messaging` – markdown content when frame contains chat/email apps 601 + - `browsing` – markdown content when frame contains web browsing 602 + - `reading` – markdown content when frame contains documents/articles 603 + - `productivity` – markdown content when frame contains spreadsheets/slides/calendars 604 + - `meeting` 
– JSON object when frame contains video conferencing, includes participant detection and bounding boxes 598 605 - `error` – present when processing failed after retries 599 606 600 607 The vision analysis uses multi-stage conditional processing: 601 608 1. Initial categorization determines content type (terminal, code, messaging, meeting, browsing, reading, media, gaming, productivity) 602 - 2. Text extraction triggered for categories: messaging, browsing, reading, productivity 603 - 3. Meeting analysis triggered for meeting category, provides full-screen participant detection with entity recognition 609 + 2. Category-specific follow-up prompts are discovered from the `observe/describe/` directory 610 + 3. Follow-ups are triggered for: messaging, browsing, reading, productivity (markdown output), meeting (JSON output) 604 611 605 612 #### Event extracts 606 613

+3 -3

fixtures/journal/20240102/234567_300/screen.jsonl

··· 1 1 {"raw": "screen.webm"} 2 - {"frame_id": 1, "timestamp": 5.2, "monitor": "DP-1", "monitor_position": "center", "requests": [{"type": "describe_json", "model": "gemini-2.0-flash-lite", "duration": 0.4}], "analysis": {"visual_description": "VSCode IDE window with Python code open, showing the FastAPI application auth module with breakpoints set.", "visible": "code"}, "extracted_text": "class TokenValidator:\n def validate_token(self, token: str) -> bool:\n # Breakpoint here\n decoded = jwt.decode(token)"} 3 - {"frame_id": 2, "timestamp": 15.8, "monitor": "DP-1", "monitor_position": "center", "requests": [{"type": "describe_json", "model": "gemini-2.0-flash-lite", "duration": 0.5}], "analysis": {"visual_description": "Terminal window showing Docker logs from the FastAPI container, displaying request logs and database queries.", "visible": "terminal"}, "extracted_text": "fastapi_1 | INFO: 192.168.1.100:54321 - 'GET /auth/validate HTTP/1.1' 200 OK\npostgres_1 | LOG: duration: 2.451 ms statement: SELECT * FROM users WHERE id = $1"} 4 - {"frame_id": 3, "timestamp": 28.3, "monitor": "DP-1", "monitor_position": "center", "requests": [{"type": "describe_json", "model": "gemini-2.0-flash-lite", "duration": 0.3}], "analysis": {"visual_description": "Split terminal view with pytest output on the left and PostgreSQL query logs on the right.", "visible": "terminal"}, "extracted_text": "============================= test session starts ==============================\ntests/test_auth.py::test_token_validation PASSED\ntests/test_auth.py::test_timezone_handling FAILED"} 2 + {"frame_id": 1, "timestamp": 5.2, "requests": [{"type": "describe", "model": "gemini-2.5-flash-lite", "duration": 0.4}], "analysis": {"visual_description": "VSCode IDE window with Python code open, showing the FastAPI application auth module with breakpoints set.", "primary": "code", "secondary": "none", "overlap": true}} 3 + {"frame_id": 2, "timestamp": 15.8, "requests": [{"type": "describe", "model": 
"gemini-2.5-flash-lite", "duration": 0.5}], "analysis": {"visual_description": "Terminal window showing Docker logs from the FastAPI container, displaying request logs and database queries.", "primary": "terminal", "secondary": "none", "overlap": true}} 4 + {"frame_id": 3, "timestamp": 28.3, "requests": [{"type": "describe", "model": "gemini-2.5-flash-lite", "duration": 0.3}, {"type": "category", "category": "reading", "model": "gemini-3-flash-preview", "duration": 1.2}], "analysis": {"visual_description": "Documentation page showing pytest usage examples.", "primary": "reading", "secondary": "none", "overlap": true}, "reading": "# Pytest Usage\n\n## Running Tests\n\n```bash\npytest tests/test_auth.py -v\n```\n\n### Test Results\n- test_token_validation: PASSED\n- test_timezone_handling: FAILED"}

-8

observe/describe.json

··· 1 - { 2 - "text_extraction_categories": [ 3 - "messaging", 4 - "browsing", 5 - "reading", 6 - "productivity" 7 - ] 8 - }

+108 -192

observe/describe.py

··· 34 34 class RequestType(Enum): 35 35 """Type of vision analysis request.""" 36 36 37 - DESCRIBE_JSON = "describe_json" 38 - DESCRIBE_TEXT = "describe_text" 39 - DESCRIBE_MEETING = "describe_meeting" 37 + DESCRIBE = "describe" # Initial categorization 38 + CATEGORY = "category" # Category-specific follow-up 40 39 41 40 42 - def _load_config() -> dict: 41 + def _discover_category_prompts() -> dict[str, dict]: 43 42 """ 44 - Load describe.json configuration file. 43 + Discover available category prompts from describe/ directory. 44 + 45 + Each category has a .txt prompt and .json metadata file. 45 46 46 47 Returns 47 48 ------- 48 - dict 49 - Configuration dictionary 50 - 51 - Raises 52 - ------ 53 - SystemExit 54 - If config file is missing or invalid 49 + dict[str, dict] 50 + Mapping of category name to metadata (including 'prompt' text) 55 51 """ 56 - config_path = Path(__file__).parent / "describe.json" 57 - if not config_path.exists(): 58 - logger.error(f"Configuration file not found: {config_path}") 59 - raise SystemExit(1) 52 + describe_dir = Path(__file__).parent / "describe" 53 + if not describe_dir.exists(): 54 + logger.warning(f"Category prompts directory not found: {describe_dir}") 55 + return {} 60 56 61 - try: 62 - with open(config_path) as f: 63 - config = json.load(f) 64 - logger.debug(f"Loaded configuration from {config_path}") 65 - return config 66 - except json.JSONDecodeError as e: 67 - logger.error(f"Invalid JSON in configuration file {config_path}: {e}") 68 - raise SystemExit(1) 69 - except Exception as e: 70 - logger.error(f"Failed to load configuration from {config_path}: {e}") 71 - raise SystemExit(1) 57 + categories = {} 58 + for json_path in describe_dir.glob("*.json"): 59 + category = json_path.stem 60 + txt_path = describe_dir / f"{category}.txt" 61 + 62 + if not txt_path.exists(): 63 + logger.warning(f"Missing prompt file for category {category}: {txt_path}") 64 + continue 65 + 66 + try: 67 + with open(json_path) as f: 68 + metadata = 
json.load(f) 69 + metadata["prompt"] = txt_path.read_text() 70 + categories[category] = metadata 71 + logger.debug(f"Loaded category prompt: {category}") 72 + except Exception as e: 73 + logger.warning(f"Failed to load category {category}: {e}") 74 + 75 + return categories 72 76 73 77 74 - # Load configuration at module level 75 - CONFIG = _load_config() 78 + # Discover category prompts at module level 79 + CATEGORY_PROMPTS = _discover_category_prompts() 76 80 77 81 78 82 class VideoProcessor: ··· 231 235 img.save(buf, format="PNG", compress_level=1) 232 236 return buf.getvalue() 233 237 234 - def _get_follow_up_prompt(self, category: str) -> Optional[str]: 238 + def _get_category_prompt(self, category: str) -> Optional[dict]: 235 239 """ 236 - Map category to follow-up prompt type. 240 + Get category prompt metadata if available. 237 241 238 242 Parameters 239 243 ---------- ··· 242 246 243 247 Returns 244 248 ------- 245 - Optional[str] 246 - "meeting", "text", or None if no follow-up needed 249 + Optional[dict] 250 + Category metadata with 'prompt' and 'output' keys, or None if no follow-up 247 251 """ 248 - if category == "meeting": 249 - return "meeting" 250 - text_categories = CONFIG.get("text_extraction_categories", []) 251 - if category in text_categories: 252 - return "text" 253 - return None 252 + return CATEGORY_PROMPTS.get(category) 254 253 255 254 def _user_contents(self, prompt: str, image, entities: bool = False) -> list: 256 255 """Build contents list with optional entity context.""" ··· 287 286 288 287 async def process_with_vision( 289 288 self, 290 - use_prompt: str = "describe_json.txt", 289 + use_prompt: str = "describe.txt", 291 290 max_concurrent: int = 10, 292 291 output_path: Optional[Path] = None, 293 292 ) -> None: ··· 297 296 Parameters 298 297 ---------- 299 298 use_prompt : str 300 - Prompt template filename to use (default: describe_json.txt) 299 + Prompt template filename to use (default: describe.txt) 301 300 max_concurrent : int 
302 301 Maximum number of concurrent API requests (default: 10) 303 302 output_path : Optional[Path] ··· 306 305 from think.batch import GeminiBatch 307 306 from think.models import GEMINI_FLASH, GEMINI_LITE 308 307 309 - # Load prompt templates 308 + # Load primary categorization prompt 310 309 prompt_path = Path(__file__).parent / use_prompt 311 310 if not prompt_path.exists(): 312 311 raise FileNotFoundError(f"Prompt template not found: {prompt_path}") 313 312 314 313 system_instruction = prompt_path.read_text() 315 314 316 - # Load text extraction prompt 317 - text_prompt_path = Path(__file__).parent / "describe_text.txt" 318 - if not text_prompt_path.exists(): 319 - raise FileNotFoundError(f"Text prompt not found: {text_prompt_path}") 320 - 321 - text_system_instruction = text_prompt_path.read_text() 322 - 323 - # Load meeting analysis prompt 324 - meeting_prompt_path = Path(__file__).parent / "describe_meeting.txt" 325 - if not meeting_prompt_path.exists(): 326 - raise FileNotFoundError(f"Meeting prompt not found: {meeting_prompt_path}") 327 - 328 - meeting_system_instruction = meeting_prompt_path.read_text() 329 - 330 315 # Process video to get qualified frames (synchronous) 331 316 qualified_frames = self.process() 332 317 ··· 368 353 req.timestamp = frame_data["timestamp"] 369 354 req.retry_count = 0 370 355 req.frame_bytes = frame_data["frame_bytes"] # Store bytes for reuse 371 - req.request_type = RequestType.DESCRIBE_JSON 356 + req.request_type = RequestType.DESCRIBE 372 357 req.json_analysis = None # Will store the JSON analysis result 373 - req.meeting_analysis = None # Will store meeting analysis if applicable 374 - req.extracted_text = None # Will store text extraction if applicable 358 + req.category_results = {} # Will store category-specific results 375 359 req.requests = [] # Track all requests for this frame 376 360 req.initial_image = frame_img # Keep reference to close after completion 377 361 req.pending_follow_ups = 0 # Track how many 
follow-ups are pending 378 - req.follow_up_source = None # "primary" or "secondary" 362 + req.follow_up_category = None # Category name for follow-up requests 379 363 380 364 batch.add(req) 381 365 ··· 393 377 394 378 # Stream results as they complete, with retry logic 395 379 async for req in batch.drain_batch(): 396 - # Only count initial DESCRIBE_JSON requests as frames (not follow-ups) 397 - if req.request_type == RequestType.DESCRIBE_JSON: 380 + # Only count initial DESCRIBE requests as frames (not follow-ups) 381 + if req.request_type == RequestType.DESCRIBE: 398 382 total_frames += 1 399 383 400 384 # Check for errors ··· 403 387 404 388 # Handle based on request type 405 389 if not has_error: 406 - if req.request_type == RequestType.DESCRIBE_JSON: 390 + if req.request_type == RequestType.DESCRIBE: 407 391 # Parse JSON analysis 408 392 try: 409 393 analysis = json.loads(req.response) ··· 411 395 except json.JSONDecodeError as e: 412 396 has_error = True 413 397 error_msg = f"Invalid JSON response: {e}" 414 - elif req.request_type == RequestType.DESCRIBE_MEETING: 415 - # Parse meeting analysis 416 - try: 417 - meeting_data = json.loads(req.response) 418 - req.meeting_analysis = meeting_data # Store meeting analysis 419 - except json.JSONDecodeError as e: 420 - has_error = True 421 - error_msg = f"Invalid JSON response: {e}" 422 - elif req.request_type == RequestType.DESCRIBE_TEXT: 423 - # Store text extraction result 424 - req.extracted_text = req.response 398 + elif req.request_type == RequestType.CATEGORY: 399 + # Handle category-specific follow-up result 400 + category = req.follow_up_category 401 + cat_meta = self._get_category_prompt(category) 402 + if cat_meta and cat_meta.get("output") == "json": 403 + try: 404 + result = json.loads(req.response) 405 + req.category_results[category] = result 406 + except json.JSONDecodeError as e: 407 + has_error = True 408 + error_msg = f"Invalid JSON response for {category}: {e}" 409 + else: 410 + # Markdown output - 
store as-is 411 + req.category_results[category] = req.response 425 412 426 413 # Retry logic (up to 5 attempts total, so 4 retries) 427 414 if has_error and req.retry_count < 4: ··· 433 420 continue # Don't output, wait for retry result 434 421 435 422 # Track failure after all retries exhausted (only for initial requests) 436 - if has_error and req.request_type == RequestType.DESCRIBE_JSON: 423 + if has_error and req.request_type == RequestType.DESCRIBE: 437 424 failed_frames += 1 438 425 439 426 # Record this request's result (after retries are done) ··· 444 431 } 445 432 if req.retry_count > 0: 446 433 request_record["retries"] = req.retry_count 447 - if req.follow_up_source: 448 - request_record["source"] = req.follow_up_source 434 + if req.follow_up_category: 435 + request_record["category"] = req.follow_up_category 449 436 450 437 req.requests.append(request_record) 451 438 452 439 # Check if we should trigger follow-up analysis 453 440 should_process_further = ( 454 441 not has_error 455 - and req.request_type == RequestType.DESCRIBE_JSON 442 + and req.request_type == RequestType.DESCRIBE 456 443 and req.json_analysis 457 444 ) 458 445 459 446 if should_process_further: 460 - # Extract categories from new simplified format 447 + # Extract categories from analysis 461 448 primary = req.json_analysis.get("primary", "") 462 449 secondary = req.json_analysis.get("secondary", "none") 463 450 overlap = req.json_analysis.get("overlap", True) 464 451 465 - # Determine follow-up types 466 - primary_prompt_type = self._get_follow_up_prompt(primary) 467 - secondary_prompt_type = ( 468 - self._get_follow_up_prompt(secondary) 452 + # Determine which categories have follow-up prompts 453 + primary_meta = self._get_category_prompt(primary) 454 + secondary_meta = ( 455 + self._get_category_prompt(secondary) 469 456 if secondary != "none" 470 457 else None 471 458 ) 472 459 473 - # Build follow-up list with category context for focus guidance 474 - # Each entry: 
(prompt_type, source, focus_categories, ignore_category) 475 - # Primary always triggers if it has a follow-up type 476 - # Secondary triggers only if overlap=false AND different type 460 + # Build follow-up list: each category with a prompt gets a follow-up 461 + # Primary always triggers if it has a prompt 462 + # Secondary triggers only if overlap=false 477 463 follow_ups = [] 478 464 479 - has_secondary = secondary != "none" 480 - same_follow_up_type = ( 481 - primary_prompt_type 482 - and secondary_prompt_type 483 - and primary_prompt_type == secondary_prompt_type 484 - ) 465 + if primary_meta: 466 + follow_ups.append((primary, primary_meta)) 485 467 486 - if primary_prompt_type: 487 - if same_follow_up_type: 488 - # Both categories need same follow-up - focus on both 489 - follow_ups.append( 490 - (primary_prompt_type, "primary", [primary, secondary], None) 491 - ) 492 - elif has_secondary: 493 - # Different types - focus on primary, ignore secondary 494 - follow_ups.append( 495 - (primary_prompt_type, "primary", [primary], secondary) 496 - ) 497 - else: 498 - # No secondary - no focus guidance needed 499 - follow_ups.append( 500 - (primary_prompt_type, "primary", [primary], None) 501 - ) 468 + if not overlap and secondary_meta: 469 + follow_ups.append((secondary, secondary_meta)) 502 470 503 - if ( 504 - not overlap 505 - and secondary_prompt_type 506 - and secondary_prompt_type != primary_prompt_type 507 - ): 508 - # Secondary needs different follow-up - focus on it, ignore primary 509 - follow_ups.append( 510 - (secondary_prompt_type, "secondary", [secondary], primary) 511 - ) 512 - 513 - # Create follow-up requests (all use full frame) 471 + # Create follow-up requests 514 472 if follow_ups: 515 - # Load full frame for follow-up processing 516 473 full_img = Image.open(io.BytesIO(req.frame_bytes)) 517 474 req.pending_follow_ups = len(follow_ups) 518 475 519 - # Close initial image since DESCRIBE_JSON is complete 476 + # Close initial image since DESCRIBE 
is complete 520 477 if hasattr(req, "initial_image") and req.initial_image: 521 478 req.initial_image.close() 522 479 req.initial_image = None 523 480 524 - for i, (prompt_type, source, focus_cats, ignore_cat) in enumerate( 525 - follow_ups 526 - ): 481 + for i, (category, cat_meta) in enumerate(follow_ups): 527 482 if i == 0: 528 - # Reuse original request for first follow-up 529 483 follow_req = req 530 484 else: 531 - # Create new request for additional follow-ups 532 485 follow_req = batch.create(contents=[]) 533 - # Copy essential metadata 534 486 follow_req.frame_id = req.frame_id 535 487 follow_req.timestamp = req.timestamp 536 488 follow_req.frame_bytes = req.frame_bytes 537 489 follow_req.json_analysis = req.json_analysis 538 - follow_req.meeting_analysis = req.meeting_analysis 539 - follow_req.extracted_text = req.extracted_text 490 + follow_req.category_results = req.category_results 540 491 follow_req.requests = req.requests 541 492 follow_req.pending_follow_ups = req.pending_follow_ups 542 493 543 - follow_req.follow_up_source = source 494 + follow_req.follow_up_category = category 544 495 follow_req.retry_count = 0 496 + follow_req.request_type = RequestType.CATEGORY 545 497 546 - # Build focus guidance suffix 547 - focus_suffix = "" 548 - if len(focus_cats) > 1: 549 - # Multiple categories to focus on (same follow-up type) 550 - cats_str = " and ".join(focus_cats) 551 - focus_suffix = f" Focus on both the {cats_str} content." 552 - elif ignore_cat: 553 - # Single category, ignore the other 554 - focus_suffix = ( 555 - f" Focus on the {focus_cats[0]} content, " 556 - f"not the {ignore_cat} content." 
557 - ) 498 + # Determine output format from metadata 499 + is_json = cat_meta.get("output") == "json" 558 500 559 - if prompt_type == "meeting": 560 - batch.update( 561 - follow_req, 562 - contents=self._user_contents( 563 - f"Analyze this meeting screenshot.{focus_suffix}", 564 - full_img, 565 - entities=True, 566 - ), 567 - model=GEMINI_FLASH, 568 - system_instruction=meeting_system_instruction, 569 - json_output=True, 570 - max_output_tokens=10240, 571 - thinking_budget=6144, 572 - ) 573 - follow_req.request_type = RequestType.DESCRIBE_MEETING 574 - else: # text 575 - batch.update( 576 - follow_req, 577 - contents=self._user_contents( 578 - f"Extract text from this screenshot.{focus_suffix}", 579 - full_img, 580 - entities=True, 581 - ), 582 - model=GEMINI_FLASH, 583 - system_instruction=text_system_instruction, 584 - json_output=False, 585 - max_output_tokens=8192, 586 - thinking_budget=4096, 587 - ) 588 - follow_req.request_type = RequestType.DESCRIBE_TEXT 501 + batch.update( 502 + follow_req, 503 + contents=self._user_contents( 504 + f"Analyze this {category} screenshot.", 505 + full_img, 506 + entities=True, 507 + ), 508 + model=GEMINI_FLASH, 509 + system_instruction=cat_meta["prompt"], 510 + json_output=is_json, 511 + max_output_tokens=10240 if is_json else 8192, 512 + thinking_budget=6144 if is_json else 4096, 513 + ) 589 514 590 515 logger.info( 591 516 f"Frame {req.frame_id}: {len(follow_ups)} follow-up(s) - " 592 - f"{', '.join(pt for pt, _ , _, _ in follow_ups)}" 517 + f"{', '.join(cat for cat, _ in follow_ups)}" 593 518 ) 594 519 595 - # Close full_img after all follow-up requests are created 596 520 full_img.close() 597 - 598 521 continue # Don't output yet, wait for follow-ups 599 522 600 523 # Handle follow-up completion for parallel requests 601 - if req.request_type in ( 602 - RequestType.DESCRIBE_MEETING, 603 - RequestType.DESCRIBE_TEXT, 604 - ): 524 + if req.request_type == RequestType.CATEGORY: 605 525 # Store result in frame_results for 
merging 606 526 if req.frame_id not in frame_results: 607 527 frame_results[req.frame_id] = { ··· 616 536 617 537 result = frame_results[req.frame_id] 618 538 619 - # Merge this follow-up's result 620 - if req.meeting_analysis: 621 - result["meeting_analysis"] = req.meeting_analysis 622 - if req.extracted_text: 623 - result["extracted_text"] = req.extracted_text 539 + # Merge this follow-up's category result 540 + for category, cat_result in req.category_results.items(): 541 + result[category] = cat_result 624 542 625 543 # Update requests list (avoid duplicates by using shared list) 626 544 result["requests"] = req.requests ··· 646 564 # Aggressively clear heavy fields 647 565 req.frame_bytes = None 648 566 req.json_analysis = None 649 - req.meeting_analysis = None 650 - req.extracted_text = None 567 + req.category_results = None 651 568 652 569 continue 653 570 654 - # Final output for frames with no follow-ups (DESCRIBE_JSON only) 571 + # Final output for frames with no follow-ups (DESCRIBE only) 655 572 result = { 656 573 "frame_id": req.frame_id, 657 574 "timestamp": req.timestamp, ··· 682 599 # Aggressively clear heavy fields now that request is finalized 683 600 req.frame_bytes = None 684 601 req.json_analysis = None 685 - req.meeting_analysis = None 686 - req.extracted_text = None 602 + req.category_results = None 687 603 688 604 # Close output file 689 605 if output_file: ··· 753 669 parser.add_argument( 754 670 "--prompt", 755 671 type=str, 756 - default="describe_json.txt", 757 - help="Prompt template to use (default: describe_json.txt)", 672 + default="describe.txt", 673 + help="Prompt template to use (default: describe.txt)", 758 674 ) 759 675 parser.add_argument( 760 676 "-j",

+3

observe/describe/browsing.json

··· 1 + { 2 + "output": "markdown" 3 + }

+53

observe/describe/browsing.txt

··· 1 + # Web Browsing Text Extraction 2 + 3 + You are extracting text from a screenshot of web browsing activity (news sites, blogs, social media feeds, shopping, general websites). 4 + 5 + Extract the meaningful content and format it as markdown. 6 + 7 + ## Header 8 + 9 + Start with the site and page context: 10 + `# [Site Name - Page Title or Topic]` 11 + 12 + ## Content Prioritization 13 + 14 + Focus on the primary content, skip: 15 + - Navigation menus and headers 16 + - Sidebar widgets and ads 17 + - Cookie banners and popups 18 + - Footer links 19 + 20 + ## Article/Post Content 21 + 22 + For articles and blog posts: 23 + - Use `##` for article headlines 24 + - Preserve paragraph structure 25 + - Include author and date if visible 26 + - Extract image captions as `*[Image: caption]*` 27 + 28 + ## Social Media Feeds 29 + 30 + For feed content (Twitter/X, Reddit, Facebook, etc.): 31 + ```markdown 32 + **@username** (timestamp): 33 + Post content here 34 + 35 + [12 likes, 3 comments] 36 + ``` 37 + 38 + ## Shopping/Product Pages 39 + 40 + For e-commerce content: 41 + - Product name as heading 42 + - Price prominently noted 43 + - Key specifications in list format 44 + - Review snippets if visible 45 + 46 + ## Quality Standards 47 + 48 + - Preserve reading order (main content first) 49 + - Skip repeated navigation elements 50 + - Mark unclear text with `[unclear]` 51 + - Mark cut-off content with `...` 52 + 53 + Return ONLY the formatted markdown. Do NOT wrap in code fences or JSON.

+3

observe/describe/meeting.json

··· 1 + { 2 + "output": "json" 3 + }

+3

observe/describe/messaging.json

··· 1 + { 2 + "output": "markdown" 3 + }

+48

observe/describe/messaging.txt

··· 1 + # Messaging App Text Extraction 2 + 3 + You are extracting text from a screenshot of a messaging or email application (Slack, Discord, Messages, Gmail, Teams chat, etc.). 4 + 5 + Extract all visible conversation content and format it as markdown. 6 + 7 + ## Header 8 + 9 + Start with the application and channel/conversation context: 10 + `# [App Name - Channel/Conversation]` 11 + 12 + ## Conversation Format 13 + 14 + Extract messages with participant attribution: 15 + 16 + ```markdown 17 + **Alice**: Hey, how's it going? 18 + **Bob**: Pretty good, working on the new feature 19 + **Alice**: Nice! Let me know if you need help 20 + ``` 21 + 22 + - Use **bold** for participant names 23 + - Preserve message order (oldest to newest visible) 24 + - Include timestamps if clearly visible: `**Alice** (2:34 PM): message` 25 + - Mark reactions/emoji responses inline: `**Bob**: Sounds good [thumbs up from Alice]` 26 + 27 + ## Thread Context 28 + 29 + If viewing a thread or reply chain: 30 + - Note the parent message context 31 + - Indent replies or mark as thread replies 32 + 33 + ## Special Elements 34 + 35 + - **Quoted/forwarded messages**: Use `>` blockquote 36 + - **Code snippets**: Use appropriate code fences 37 + - **Links**: Include the visible link text 38 + - **Attachments**: Note as `[Attachment: filename or description]` 39 + - **Unread indicators**: Note where unread messages begin 40 + 41 + ## Quality Standards 42 + 43 + - Focus on message content, skip UI chrome 44 + - Preserve conversation flow and context 45 + - Mark unclear text with `[unclear]` 46 + - Mark cut-off messages with `...` 47 + 48 + Return ONLY the formatted markdown. Do NOT wrap in code fences or JSON.

+3

observe/describe/productivity.json

··· 1 + { 2 + "output": "markdown" 3 + }

+57

observe/describe/productivity.txt

··· 1 + # Productivity App Text Extraction 2 + 3 + You are extracting text from a screenshot of productivity software (spreadsheets, slides, calendars, task managers, issue trackers, project management tools). 4 + 5 + Extract all visible content with appropriate structure as markdown. 6 + 7 + ## Header 8 + 9 + Start with the application and context: 10 + `# [App Name - Document/View Title]` 11 + 12 + ## Spreadsheet Content 13 + 14 + For spreadsheet views: 15 + - Use markdown tables for visible data 16 + - Note column/row headers 17 + - Include cell formulas if visible: `=SUM(A1:A10)` 18 + - Note selected cells or ranges 19 + 20 + ```markdown 21 + | Name | Status | Due Date | 22 + |------|--------|----------| 23 + | Task 1 | Done | 2024-01-15 | 24 + | Task 2 | In Progress | 2024-01-20 | 25 + ``` 26 + 27 + ## Presentation Slides 28 + 29 + For slide content: 30 + - `##` for slide title 31 + - Bullet points for slide content 32 + - Note speaker notes if visible 33 + - Describe images/charts: `*[Chart: Sales by Quarter]*` 34 + 35 + ## Calendar Views 36 + 37 + For calendar content: 38 + - Note the date range visible 39 + - List events with times: `**9:00 AM** - Team Standup` 40 + - Include event details if expanded 41 + 42 + ## Task/Issue Trackers 43 + 44 + For project management views: 45 + - Task/issue titles and IDs 46 + - Status, assignee, priority 47 + - Due dates and labels 48 + - Preserve board/list structure 49 + 50 + ## Quality Standards 51 + 52 + - Preserve data relationships and hierarchy 53 + - Include key metadata (dates, statuses, assignees) 54 + - Mark unclear text with `[unclear]` 55 + - Mark cut-off content with `...` 56 + 57 + Return ONLY the formatted markdown. Do NOT wrap in code fences or JSON.

+3

observe/describe/reading.json

··· 1 + { 2 + "output": "markdown" 3 + }

+52

observe/describe/reading.txt

··· 1 + # Document Reading Text Extraction 2 + 3 + You are extracting text from a screenshot of document reading activity (PDFs, documentation, articles, ebooks, manuals). 4 + 5 + Extract all visible text with proper document structure as markdown. 6 + 7 + ## Header 8 + 9 + Start with the document context: 10 + `# [Document Title or Type]` 11 + 12 + ## Document Structure 13 + 14 + Preserve the document hierarchy: 15 + - Use `##`, `###` for section headings 16 + - Maintain numbered sections (e.g., "2.3 Configuration") 17 + - Preserve list formatting (bullets, numbered lists) 18 + - Keep table structure using markdown tables 19 + 20 + ## Code and Technical Content 21 + 22 + For documentation with code: 23 + ```language 24 + code here 25 + ``` 26 + 27 + For command examples: 28 + ```bash 29 + $ command example 30 + ``` 31 + 32 + ## References and Citations 33 + 34 + - Include footnotes as `[1]` with reference text if visible 35 + - Preserve figure/table captions: `*Figure 3: Description*` 36 + - Note page numbers if visible: `[Page 42]` 37 + 38 + ## Academic/Formal Documents 39 + 40 + - Preserve abstract/summary sections 41 + - Include author/date information 42 + - Note section numbers 43 + 44 + ## Quality Standards 45 + 46 + - Maintain logical reading flow 47 + - Preserve paragraph breaks 48 + - Skip page headers/footers unless meaningful 49 + - Mark unclear text with `[unclear]` 50 + - Mark cut-off text with `...` 51 + 52 + Return ONLY the formatted markdown. Do NOT wrap in code fences or JSON.

-58

observe/describe_text.txt

··· 1 - # Screenshot Text Extraction 2 - 3 - You are extracting text from a desktop screenshot. 4 - 5 - Extract all visible text from the image and format it as markdown. 6 - 7 - ## Text Extraction Guidelines 8 - 9 - **Start with a header**: 10 - - If you can see an application name and/or window title or active browser tab, include it as: `# [App Name - Title]` 11 - - If not explicitly visible, make your best guess based on the content and UI elements 12 - 13 - Extract ALL visible text using appropriate markdown formatting: 14 - 15 - - **Headings**: Use `#`, `##`, `###` for headers and titles 16 - - **Code blocks**: Use triple backticks with language tags for code or terminal output 17 - - **Lists**: Use `-` or `1.` for bullets and numbered lists 18 - - **Emphasis**: Use `**bold**` or `*italic*` for visually emphasized text 19 - - **Paragraphs**: Separate with blank lines 20 - - **Inline code**: Use backticks for technical terms or short code snippets 21 - 22 - ## Special Content Types 23 - 24 - **User Input Areas**: Mark active input fields or text being composed: 25 - ```markdown 26 - **[USER INPUT]**: <text being typed or edited> 27 - ``` 28 - 29 - **Terminal/Commands**: 30 - ```bash 31 - $ command here 32 - output here 33 - ``` 34 - 35 - **Code Editors**: Preserve structure with language tags: 36 - ```python 37 - def example(): 38 - pass 39 - ``` 40 - 41 - **Messaging/Chat**: Identify participants: 42 - ```markdown 43 - **Alice**: Hey, how's it going? 44 - **Bob**: Pretty good, working on the new feature 45 - ``` 46 - 47 - **Reading Content**: Extract cleanly with proper markdown structure 48 - 49 - ## Quality Standards 50 - 51 - - Preserve reading order (top-to-bottom, left-to-right) 52 - - Maintain paragraph breaks and logical sections 53 - - Skip repeated UI chrome or decorative elements 54 - - Mark unclear text with `[unclear]` 55 - - Mark cut-off text with `...` 56 - - Focus on meaningful content 57 - 58 - Return ONLY the formatted markdown. 
Do NOT wrap in code fences or JSON.

+24 -12

observe/screen.py

··· 157 157 lines.append(description) 158 158 lines.append("") 159 159 160 - # Add extracted text if present 161 - extracted_text = frame.get("extracted_text") 162 - if extracted_text: 163 - lines.append("**Extracted Text:**") 164 - lines.append("") 165 - lines.append("```") 166 - lines.append(extracted_text.strip()) 167 - lines.append("```") 168 - lines.append("") 160 + # Add category-specific content if present 161 + # New format uses category name as key (e.g., "meeting", "messaging") 162 + # Old format used "extracted_text" and "meeting_analysis" 163 + text_categories = ["messaging", "browsing", "reading", "productivity"] 164 + for cat in text_categories: 165 + if cat in frame: 166 + lines.append(f"**{cat.title()}:**") 167 + lines.append("") 168 + lines.append(frame[cat].strip()) 169 + lines.append("") 170 + break 171 + else: 172 + # Fall back to legacy extracted_text field 173 + extracted_text = frame.get("extracted_text") 174 + if extracted_text: 175 + lines.append("**Extracted Text:**") 176 + lines.append("") 177 + lines.append("```") 178 + lines.append(extracted_text.strip()) 179 + lines.append("```") 180 + lines.append("") 169 181 170 - # Add meeting analysis if present 171 - meeting = frame.get("meeting_analysis") 182 + # Add meeting analysis if present (new: "meeting", old: "meeting_analysis") 183 + meeting = frame.get("meeting") or frame.get("meeting_analysis") 172 184 if meeting: 173 - lines.append("**Meeting Analysis:**") 185 + lines.append("**Meeting:**") 174 186 lines.append("") 175 187 lines.append("```json") 176 188 lines.append(json.dumps(meeting, indent=2))

+78 -52

tests/test_describe_config.py

··· 1 - """Tests for observe/describe.py config loading.""" 1 + """Tests for observe/describe.py category prompt discovery.""" 2 2 3 3 import json 4 - import tempfile 5 4 from pathlib import Path 6 5 from unittest.mock import patch 7 6 8 7 import pytest 9 8 10 9 11 - def test_config_loads_successfully(): 12 - """Test that config loads successfully on import.""" 13 - from observe.describe import CONFIG 10 + def test_category_prompts_discovered(): 11 + """Test that category prompts are discovered on import.""" 12 + from observe.describe import CATEGORY_PROMPTS 13 + 14 + # Should have discovered some category prompts 15 + assert len(CATEGORY_PROMPTS) > 0 16 + # Meeting should be one of them 17 + assert "meeting" in CATEGORY_PROMPTS 18 + 19 + 20 + def test_category_prompts_have_required_fields(): 21 + """Test that discovered categories have required metadata.""" 22 + from observe.describe import CATEGORY_PROMPTS 23 + 24 + for category, metadata in CATEGORY_PROMPTS.items(): 25 + # Each category should have 'output' and 'prompt' fields 26 + assert "output" in metadata, f"Category {category} missing 'output' field" 27 + assert "prompt" in metadata, f"Category {category} missing 'prompt' field" 28 + # Output should be 'json' or 'markdown' 29 + assert metadata["output"] in ( 30 + "json", 31 + "markdown", 32 + ), f"Category {category} has invalid output: {metadata['output']}" 33 + # Prompt should be non-empty string 34 + assert isinstance(metadata["prompt"], str) 35 + assert len(metadata["prompt"]) > 0 36 + 14 37 15 - assert "text_extraction_categories" in CONFIG 16 - assert isinstance(CONFIG["text_extraction_categories"], list) 17 - assert len(CONFIG["text_extraction_categories"]) > 0 38 + def test_meeting_category_is_json(): 39 + """Test that meeting category outputs JSON.""" 40 + from observe.describe import CATEGORY_PROMPTS 18 41 42 + assert "meeting" in CATEGORY_PROMPTS 43 + assert CATEGORY_PROMPTS["meeting"]["output"] == "json" 19 44 20 - def 
test_config_has_expected_categories(): 21 - """Test that config contains text extraction categories.""" 22 - from observe.describe import CONFIG 23 45 24 - # Verify structure, not specific values (config can change) 25 - assert "text_extraction_categories" in CONFIG 26 - assert isinstance(CONFIG["text_extraction_categories"], list) 27 - # Should have at least one category configured 28 - assert len(CONFIG["text_extraction_categories"]) > 0 29 - # All entries should be non-empty strings 30 - for category in CONFIG["text_extraction_categories"]: 31 - assert isinstance(category, str) 32 - assert len(category) > 0 46 + def test_text_categories_are_markdown(): 47 + """Test that text-based categories output markdown.""" 48 + from observe.describe import CATEGORY_PROMPTS 33 49 50 + text_categories = ["messaging", "browsing", "reading", "productivity"] 51 + for category in text_categories: 52 + if category in CATEGORY_PROMPTS: 53 + assert ( 54 + CATEGORY_PROMPTS[category]["output"] == "markdown" 55 + ), f"Category {category} should output markdown" 34 56 35 - def test_config_loading_with_missing_file(tmp_path): 36 - """Test that config loading fails gracefully when file is missing.""" 37 - from observe.describe import _load_config 57 + 58 + def test_discover_category_prompts_with_missing_dir(tmp_path): 59 + """Test that discovery handles missing directory gracefully.""" 60 + from observe.describe import _discover_category_prompts 38 61 39 62 with patch("observe.describe.Path") as mock_path: 40 - # Mock the config path to point to non-existent file 41 - mock_config_path = tmp_path / "nonexistent.json" 42 - mock_path.return_value.parent = tmp_path 43 - mock_path.return_value.__truediv__.return_value = mock_config_path 63 + # Mock to point to non-existent directory 64 + mock_describe_dir = tmp_path / "nonexistent" 65 + mock_path.return_value.parent.__truediv__.return_value = mock_describe_dir 44 66 45 - with pytest.raises(SystemExit) as exc_info: 46 - _load_config() 47 - 
assert exc_info.value.code == 1 67 + result = _discover_category_prompts() 68 + assert result == {} 48 69 49 70 50 - def test_config_loading_with_invalid_json(tmp_path): 51 - """Test that config loading fails gracefully with invalid JSON.""" 52 - from observe.describe import _load_config 71 + def test_discover_category_prompts_with_valid_dir(tmp_path): 72 + """Test that discovery works with valid category files.""" 73 + from observe.describe import _discover_category_prompts 53 74 54 - # Create invalid JSON file 55 - invalid_json_path = tmp_path / "describe.json" 56 - invalid_json_path.write_text("{ invalid json }") 75 + # Create test category directory 76 + describe_dir = tmp_path / "describe" 77 + describe_dir.mkdir() 78 + 79 + # Create test category files 80 + (describe_dir / "test.json").write_text('{"output": "markdown"}') 81 + (describe_dir / "test.txt").write_text("Test prompt content") 57 82 58 83 with patch("observe.describe.Path") as mock_path: 59 - mock_path.return_value.parent = tmp_path 60 - mock_path.return_value.__truediv__.return_value = invalid_json_path 84 + mock_path.return_value.parent.__truediv__.return_value = describe_dir 61 85 62 - with pytest.raises(SystemExit) as exc_info: 63 - _load_config() 64 - assert exc_info.value.code == 1 86 + result = _discover_category_prompts() 87 + assert "test" in result 88 + assert result["test"]["output"] == "markdown" 89 + assert result["test"]["prompt"] == "Test prompt content" 65 90 66 91 67 - def test_config_loading_with_valid_json(tmp_path): 68 - """Test that config loading succeeds with valid JSON.""" 69 - from observe.describe import _load_config 92 + def test_discover_category_prompts_skips_incomplete(tmp_path): 93 + """Test that discovery skips categories without matching txt file.""" 94 + from observe.describe import _discover_category_prompts 70 95 71 - # Create valid JSON file 72 - valid_json_path = tmp_path / "describe.json" 73 - config_data = {"text_extraction_categories": ["code", "messaging", 
"reading"]} 74 - valid_json_path.write_text(json.dumps(config_data)) 96 + # Create test category directory 97 + describe_dir = tmp_path / "describe" 98 + describe_dir.mkdir() 99 + 100 + # Create JSON without matching txt 101 + (describe_dir / "incomplete.json").write_text('{"output": "json"}') 75 102 76 103 with patch("observe.describe.Path") as mock_path: 77 - mock_path.return_value.parent = tmp_path 78 - mock_path.return_value.__truediv__.return_value = valid_json_path 104 + mock_path.return_value.parent.__truediv__.return_value = describe_dir 79 105 80 - config = _load_config() 81 - assert config == config_data 106 + result = _discover_category_prompts() 107 + assert "incomplete" not in result

+9 -9

tests/test_formatters.py

··· 182 182 { 183 183 "timestamp": 5, 184 184 "analysis": { 185 - "primary": "code", 186 - "visual_description": "Python code", 185 + "primary": "reading", 186 + "visual_description": "Documentation page", 187 187 }, 188 - "extracted_text": "def hello():\n pass", 188 + "reading": "# API Reference\n\ndef hello():\n pass", 189 189 } 190 190 ] 191 191 ··· 195 195 assert "header" in meta 196 196 assert "Frame Analyses" in meta["header"] 197 197 assert chunks[0]["timestamp"] == 5000 # 5 seconds = 5000ms 198 - assert "**Category:** code" in chunks[0]["markdown"] 198 + assert "**Category:** reading" in chunks[0]["markdown"] 199 199 assert "def hello()" in chunks[0]["markdown"] 200 200 201 201 def test_format_screen_with_entity_context(self): ··· 240 240 # Monitor info should be in the header, not per-frame 241 241 assert "(left - DP-1)" in meta["header"] 242 242 243 - def test_format_screen_meeting_analysis(self): 244 - """Test screen formatting with meeting analysis.""" 243 + def test_format_screen_meeting(self): 244 + """Test screen formatting with meeting data.""" 245 245 from observe.screen import format_screen 246 246 247 247 entries = [ 248 248 { 249 249 "timestamp": 0, 250 - "analysis": {}, 251 - "meeting_analysis": {"participants": ["Alice", "Bob"]}, 250 + "analysis": {"primary": "meeting"}, 251 + "meeting": {"participants": ["Alice", "Bob"]}, 252 252 } 253 253 ] 254 254 255 255 chunks, meta = format_screen(entries) 256 256 257 - assert "Meeting Analysis" in chunks[0]["markdown"] 257 + assert "**Meeting:**" in chunks[0]["markdown"] 258 258 assert "Alice" in chunks[0]["markdown"] 259 259 260 260 def test_format_screen_extracts_metadata(self):

+9 -9

tests/test_screen_formatter.py

··· 190 190 assert "Alice, Bob, ProjectX" in markdown 191 191 192 192 193 - def test_format_screen_includes_extracted_text(): 194 - """Test that extracted text is included in output.""" 193 + def test_format_screen_includes_category_content(): 194 + """Test that category-specific content is included in output.""" 195 195 frames = [ 196 196 { 197 197 "timestamp": 0, 198 198 "analysis": { 199 - "primary": "terminal", 200 - "visual_description": "Terminal window", 199 + "primary": "productivity", 200 + "visual_description": "Spreadsheet view", 201 201 }, 202 - "extracted_text": "$ python test.py\nAll tests passed", 202 + "productivity": "| Name | Value |\n|------|-------|\n| Test | 123 |", 203 203 }, 204 204 ] 205 205 ··· 211 211 chunks, meta = format_screen(frames, context) 212 212 markdown = "\n".join([meta.get("header", "")] + [c["markdown"] for c in chunks]) 213 213 214 - # Should include extracted text in code block 215 - assert "**Extracted Text:**" in markdown 216 - assert "$ python test.py" in markdown 217 - assert "All tests passed" in markdown 214 + # Should include category content 215 + assert "**Productivity:**" in markdown 216 + assert "| Name | Value |" in markdown 217 + assert "| Test | 123 |" in markdown 218 218 219 219 220 220 def test_format_screen_returns_chunks_with_timestamps():

Configure Feed

Configure Feed