feat(screencast): add screencast_diff utility for identifying visually different frames

+466

2 changed files

expand all

observe

gnome

screencast_diff.py

pyproject.toml

+464

observe/gnome/screencast_diff.py

#!/usr/bin/env python3
"""
screencast_diff.py — find the most visually different frames in a screencast

Efficiently scans video and computes the minimum distance from each frame to
all earlier frames using intensity, gradient, and histogram-based scoring.
Frames with high min-distance are novel (never seen before), while repeated
frames (e.g., A→B→A toggles) score low. Identifies the top 10 most divergent
frames, then re-extracts only those frames as WebP images.

Usage:
    gnome-screencast-diff screencast.webm
    gnome-screencast-diff screencast.webm --interval 0.5  # sample every 0.5s
"""

import argparse
import html
import io
import sys
import time
import traceback
from contextlib import contextmanager
from http.server import BaseHTTPRequestHandler, HTTPServer
from pathlib import Path

import av
import numpy as np
from PIL import Image
from scipy.spatial.distance import jensenshannon

# Side length (in pixels) of the square average-pooling window applied to
# every sampled frame before any features are computed.
POOL_SIZE = 128

# Number of most-divergent frames that are re-extracted and served.
TOP_N = 10

# Weights of the three components of the frame difference score.
W_INTENSITY = 0.5
W_GRADIENT = 0.3
W_HISTOGRAM = 0.2

# Guards against division by zero and log(0).
EPS = 1e-10


# Frame comparison helper functions


def to_luma(img: Image.Image) -> np.ndarray:
    """Convert PIL image to luma (grayscale) as float32 normalized to [0, 1]."""
    if img.mode != "L":
        img = img.convert("L")
    return np.array(img, dtype=np.float32) / 255.0


def avg_pool(Y: np.ndarray, pool_h: int, pool_w: int) -> np.ndarray:
    """Average-pool a 2-D array with a (pool_h, pool_w) window.

    Rows/columns that do not fill a complete window are trimmed off. The
    window is clamped to the array size, so an input smaller than the window
    is reduced to a single cell instead of an empty (NaN-producing) array.
    """
    h, w = Y.shape
    pool_h = max(1, min(pool_h, h))
    pool_w = max(1, min(pool_w, w))
    new_h = h // pool_h
    new_w = w // pool_w

    # Trim to make dimensions divisible, then fold each window into its own
    # pair of axes and average over them.
    trimmed = Y[: new_h * pool_h, : new_w * pool_w]
    return trimmed.reshape(new_h, pool_h, new_w, pool_w).mean(axis=(1, 3))


def gradient_mag(Y: np.ndarray) -> np.ndarray:
    """Compute gradient magnitude from first differences in x and y.

    The first column/row is prepended to itself so the output keeps the
    input's shape (the leading difference is therefore zero).
    """
    dx = np.diff(Y, axis=1, prepend=Y[:, :1])
    dy = np.diff(Y, axis=0, prepend=Y[:1, :])
    return np.hypot(dx, dy)


def hist32(Y: np.ndarray) -> np.ndarray:
    """Compute a 32-bin histogram of Y over [0, 1], normalized to sum to 1.

    An all-zero histogram (empty input) is returned unnormalized.
    """
    hist, _ = np.histogram(Y, bins=32, range=(0, 1))
    hist = hist.astype(np.float32)
    hist_sum = hist.sum()
    if hist_sum > 0:
        hist = hist / hist_sum
    return hist


def js_div(p: np.ndarray, q: np.ndarray) -> float:
    """Jensen-Shannon divergence between two probability distributions.

    Returns 0.0 when the underlying distance is not finite.
    """
    # Add small epsilon to avoid log(0)
    p = np.clip(p, EPS, 1.0)
    q = np.clip(q, EPS, 1.0)
    dist = jensenshannon(p, q)
    # NOTE(review): for (near-)identical inputs the distance can come back as
    # NaN due to floating-point rounding; a NaN here would poison the
    # min-distance search, so treat it as "no divergence".
    if not np.isfinite(dist):
        return 0.0
    return float(dist**2)


def compute_frame_score(
    Y_prev: np.ndarray,
    Y_curr: np.ndarray,
    G_prev: np.ndarray,
    G_curr: np.ndarray,
    hist_prev: np.ndarray,
    hist_curr: np.ndarray,
) -> float:
    """
    Compute comprehensive frame difference score.

    S = 0.5*S_int + 0.3*S_grad + 0.2*JSD
    where:
        S_int  = mean((Y_curr - Y_prev)**2) / (var(Y_prev) + eps)
        S_grad = mean((G_curr - G_prev)**2) / (mean(G_prev**2) + eps)
        JSD    = jensen_shannon_divergence(hist_prev, hist_curr)

    The score is normalized by the *previous* frame's statistics, so it is
    not symmetric in its arguments.
    """
    # Intensity difference score
    S_int = np.mean((Y_curr - Y_prev) ** 2) / (np.var(Y_prev) + EPS)

    # Gradient difference score
    S_grad = np.mean((G_curr - G_prev) ** 2) / (np.mean(G_prev**2) + EPS)

    # Histogram JSD
    JSD = js_div(hist_prev, hist_curr)

    # Weighted combination
    return float(W_INTENSITY * S_int + W_GRADIENT * S_grad + W_HISTOGRAM * JSD)


class ScreencastDiffer:
    """Score sampled frames of a video by novelty and keep the top ones.

    Construction does all the work: the video is scanned once to score every
    sampled frame against all earlier sampled frames, the scores are sorted,
    and the video is decoded a second time to encode the highest-ranked
    frames as WebP.

    Attributes:
        video_path: Path of the analyzed video.
        sample_interval: Minimum spacing in seconds between sampled frames.
        frame_scores: ``(timestamp, score)`` pairs in scan order. The first
            sampled frame has no predecessor and therefore no entry.
        divergence_scores: ``frame_scores`` sorted by score, highest first.
        top_frames: ``{rank: (timestamp, score, webp_bytes)}``, rank 1-based.
        timings: Accumulated seconds per processing step.
    """

    def __init__(self, video_path: str, sample_interval: float = 1.0):
        """Analyze ``video_path`` immediately.

        Raises:
            FileNotFoundError: if ``video_path`` does not exist.
        """
        self.video_path = Path(video_path)
        if not self.video_path.exists():
            raise FileNotFoundError(f"Video file not found: {video_path}")

        self.sample_interval = sample_interval
        self.frame_scores = []  # List of (timestamp, score) - computed during scan
        self.divergence_scores = []  # List of (timestamp, score) - sorted by score
        self.top_frames = {}  # Dict of {idx: (timestamp, score, webp_bytes)} for top 10

        # Performance timing. "video_scan" and "top_frames_extract" are
        # wall-clock totals of the two decode passes; every other entry is a
        # sub-step measured inside one of those passes.
        self.timings = {
            "video_scan": 0.0,
            "frame_to_image": 0.0,
            "luma_compute": 0.0,
            "pooling": 0.0,
            "gradient": 0.0,
            "histogram": 0.0,
            "score_compute": 0.0,
            "top_frames_extract": 0.0,
            "webp_encode": 0.0,
        }

        print(
            f"Scanning {video_path} with min-distance-to-history comparison...",
            file=sys.stderr,
        )
        self._process_video()
        print("Sorting divergence scores...", file=sys.stderr)
        self._compute_divergence()
        print("Extracting top 10 frames as WebP...", file=sys.stderr)
        self._extract_top_frames()
        print(
            f"Found {len(self.divergence_scores)} scored frames, ready to serve top 10",
            file=sys.stderr,
        )
        self._print_timings()

    @contextmanager
    def _timed(self, key: str):
        """Add the wall-clock duration of the ``with`` body to ``timings[key]``."""
        start = time.perf_counter()
        try:
            yield
        finally:
            self.timings[key] += time.perf_counter() - start

    def _iter_sampled_frames(self, container):
        """Yield ``(timestamp, frame)`` for frames at least one interval apart.

        Both decode passes go through this generator so that the second pass
        visits exactly the frames that were scored in the first one.
        """
        last_sampled = -self.sample_interval
        for frame in container.decode(video=0):
            if frame.pts is None:
                continue

            timestamp = frame.time if frame.time is not None else 0.0
            if timestamp - last_sampled < self.sample_interval:
                continue

            last_sampled = timestamp
            yield timestamp, frame

    def _process_video(self):
        """Scan video at intervals and compute min distance to any earlier frame."""
        try:
            t_scan_start = time.perf_counter()
            with av.open(str(self.video_path)) as container:
                stream = container.streams.video[0]
                duration = (
                    float(stream.duration * stream.time_base)
                    if stream.duration
                    else None
                )

                if duration:
                    print(f"Video duration: {duration:.2f}s", file=sys.stderr)

                # Features of every frame sampled so far: (Yd, G, hist) tuples
                history = []

                sampled = self._iter_sampled_frames(container)
                for frame_count, (timestamp, frame) in enumerate(sampled, 1):
                    with self._timed("frame_to_image"):
                        img = frame.to_image()

                    with self._timed("luma_compute"):
                        Y = to_luma(img)

                    # Average pool to reduce resolution
                    with self._timed("pooling"):
                        Yd = avg_pool(Y, POOL_SIZE, POOL_SIZE)

                    with self._timed("gradient"):
                        G = gradient_mag(Yd)

                    with self._timed("histogram"):
                        hist = hist32(Yd)

                    # Novelty = distance to the *closest* earlier frame, so a
                    # frame that repeats anything seen before scores low.
                    if history:
                        with self._timed("score_compute"):
                            min_score = float("inf")
                            for Y_prev, G_prev, hist_prev in history:
                                score = compute_frame_score(
                                    Y_prev, Yd, G_prev, G, hist_prev, hist
                                )
                                min_score = min(min_score, score)
                        self.frame_scores.append((timestamp, min_score))

                    # Store current frame features for future comparisons
                    history.append((Yd, G, hist))

                    if frame_count % 10 == 0:
                        print(
                            f"  Scanned {frame_count} frames at {timestamp:.1f}s",
                            file=sys.stderr,
                        )

            self.timings["video_scan"] = time.perf_counter() - t_scan_start

        except Exception as e:
            print(f"ERROR: Failed to process video: {e}", file=sys.stderr)
            traceback.print_exc(file=sys.stderr)
            raise

    def _compute_divergence(self):
        """Sort frame scores by divergence (highest first)."""
        # Scores are already computed during scan, just need to sort
        self.divergence_scores = sorted(
            self.frame_scores, key=lambda x: x[1], reverse=True
        )

    def _extract_top_frames(self, n: int = TOP_N):
        """Extract and encode top N most divergent frames as WebP.

        Fills ``self.top_frames`` keyed by 1-based divergence rank.
        """
        # timestamp -> (rank, score) for the frames we want to re-extract
        wanted = {
            ts: (rank, score)
            for rank, (ts, score) in enumerate(self.divergence_scores[:n], 1)
        }
        if not wanted:
            return

        t_extract_start = time.perf_counter()

        try:
            with av.open(str(self.video_path)) as container:
                for timestamp, frame in self._iter_sampled_frames(container):
                    hit = wanted.get(timestamp)
                    if hit is None:
                        continue
                    rank, score = hit

                    img = frame.to_image()

                    # Encode as WebP with quality setting
                    with self._timed("webp_encode"):
                        buf = io.BytesIO()
                        img.save(buf, format="WEBP", quality=85)
                        webp_bytes = buf.getvalue()

                    self.top_frames[rank] = (timestamp, score, webp_bytes)

                    # Stop decoding as soon as every wanted frame is found
                    # (there may be fewer than n of them).
                    if len(self.top_frames) >= len(wanted):
                        break

        except Exception as e:
            print(f"ERROR: Failed to extract top frames: {e}", file=sys.stderr)
            traceback.print_exc(file=sys.stderr)
            raise

        self.timings["top_frames_extract"] = time.perf_counter() - t_extract_start

    def get_top_divergent(self, n: int = TOP_N):
        """Get the top N most divergent frames as ``(timestamp, score)`` pairs."""
        return self.divergence_scores[:n]

    def _print_timings(self):
        """Print performance timing breakdown to stderr."""
        # Only the two decode passes are summed: all other timers measure
        # sub-steps inside those passes and would be counted twice.
        total = self.timings["video_scan"] + self.timings["top_frames_extract"]
        rule = "=" * 60

        print(f"\n{rule}", file=sys.stderr)
        print(f"Performance Breakdown (total: {total:.2f}s)", file=sys.stderr)
        print(rule, file=sys.stderr)

        # Sort by time descending
        sorted_timings = sorted(self.timings.items(), key=lambda x: x[1], reverse=True)

        for name, duration in sorted_timings:
            pct = (duration / total * 100) if total > 0 else 0
            bar = "█" * int(pct / 2)  # Scale to 50 chars max
            print(f"{name:20s} {duration:7.2f}s {pct:5.1f}% {bar}", file=sys.stderr)

        print(rule, file=sys.stderr)

        # Calculate derived metrics
        scored = len(self.frame_scores)
        if scored > 0:
            scanned = scored + 1  # +1 for first frame with no score
            per_frame = total / scanned
            per_score = self.timings["score_compute"] / scored
            print(f"Frames scanned: {scanned}", file=sys.stderr)
            print(f"Frames scored: {scored}", file=sys.stderr)
            print(f"Time per frame: {per_frame:.3f}s", file=sys.stderr)
            print(f"Time per score: {per_score*1000:.3f}ms", file=sys.stderr)

        print(f"{rule}\n", file=sys.stderr)


def make_handler(differ: ScreencastDiffer):
    """Create request handler with access to differ instance."""

    class RequestHandler(BaseHTTPRequestHandler):
        def log_message(self, format, *args):
            """Log each request to stderr as ``<client> - <message>``."""
            sys.stderr.write(f"{self.address_string()} - {format % args}\n")

        def do_GET(self):
            """Route ``/`` to the index page and ``/frame/<rank>`` to an image."""
            if self.path == "/":
                self._serve_index()
            elif self.path.startswith("/frame/"):
                self._serve_frame()
            else:
                self.send_error(404, "Not found")

        def _serve_index(self):
            """Serve HTML page with top 10 frames."""
            # The file name comes from the command line / file system, so it
            # is escaped before being embedded in markup.
            video_name = html.escape(differ.video_path.name)

            parts = [
                "<!DOCTYPE html>",
                "<html><head>",
                "<title>Screencast Divergence</title>",
                "<style>",
                "body { font-family: monospace; margin: 20px; background: #1e1e1e; color: #ccc; }",
                "h1 { color: #fff; }",
                ".frame { margin: 20px 0; padding: 10px; border: 1px solid #444; background: #2e2e2e; }",
                ".frame img { max-width: 800px; display: block; margin: 10px 0; }",
                ".info { color: #8cf; }",
                "</style>",
                "</head><body>",
                "<h1>Top 10 Most Divergent Frames</h1>",
                f"<p>Video: {video_name}</p>",
                "<p>Scoring method: Min distance to any earlier frame (intensity + gradient + histogram JSD)</p>",
                f"<p>Total frames analyzed: {len(differ.frame_scores)} (sampled every {differ.sample_interval}s)</p>",
            ]

            # The enumeration rank matches the keys of differ.top_frames.
            for idx, (timestamp, score) in enumerate(
                differ.get_top_divergent(10), 1
            ):
                parts.append('<div class="frame">')
                parts.append(
                    f'<div class="info">#{idx} - Timestamp: {timestamp:.2f}s - Divergence Score: {score}</div>'
                )
                parts.append(
                    f'<img src="/frame/{idx}" alt="Frame at {timestamp:.2f}s">'
                )
                parts.append("</div>")

            parts.append("</body></html>")
            output = "\n".join(parts).encode("utf-8")

            self.send_response(200)
            self.send_header("Content-Type", "text/html; charset=utf-8")
            self.send_header("Content-Length", str(len(output)))
            self.end_headers()
            self.wfile.write(output)

        def _serve_frame(self):
            """Serve individual frame image addressed by its 1-based rank."""
            try:
                idx = int(self.path.split("/")[-1])
            except (ValueError, IndexError):
                self.send_error(400, "Invalid frame index")
                return

            entry = differ.top_frames.get(idx)
            if entry is None:
                self.send_error(404, "Frame not found")
                return

            _, _, webp_bytes = entry
            self.send_response(200)
            self.send_header("Content-Type", "image/webp")
            self.send_header("Content-Length", str(len(webp_bytes)))
            self.end_headers()
            self.wfile.write(webp_bytes)

    return RequestHandler


def main():
    """Parse arguments, analyze the video, and serve the results over HTTP."""
    parser = argparse.ArgumentParser(
        description="Find and display the most visually different frames in a screencast"
    )
    parser.add_argument("video", help="Path to screencast webm file")
    parser.add_argument(
        "--interval",
        type=float,
        default=1.0,
        help="Sample interval in seconds (default: 1.0)",
    )
    parser.add_argument(
        "--port", type=int, default=9999, help="Server port (default: 9999)"
    )
    args = parser.parse_args()

    differ = ScreencastDiffer(args.video, sample_interval=args.interval)

    print(f"\nServer running at http://0.0.0.0:{args.port}/")
    print("Press Ctrl+C to stop")

    # NOTE(review): binding to 0.0.0.0 exposes the extracted frames to every
    # host that can reach this machine — confirm that this is intended.
    # The context manager closes the listening socket on exit. shutdown() is
    # deliberately not called: it is meant to be invoked from a thread other
    # than the one running serve_forever().
    with HTTPServer(("0.0.0.0", args.port), make_handler(differ)) as server:
        try:
            server.serve_forever()
        except KeyboardInterrupt:
            print("\nShutting down...")


if __name__ == "__main__":
    main()

pyproject.toml

··· 44 44 "numpy", 45 45 "dbus-next", 46 46 "av", 47 + "imagehash", 47 48 48 49 "sqlite-utils", 49 50 "openai>=1.2.0", ··· 107 108 think-messages = "think.messages:main" 108 109 gnome-screencast = "observe.gnome.screencast:main" 109 110 gnome-screencast-viewer = "observe.gnome.screencast_viewer:main" 111 + gnome-screencast-diff = "observe.gnome.screencast_diff:main" 110 112 111 113 [project.urls] 112 114 Homepage = "https://github.com/yourusername/sunstone"

Configure Feed

Configure Feed