A loose federation of distributed, typed datasets
1<!DOCTYPE html>
2<html xmlns="http://www.w3.org/1999/xhtml" lang="en" xml:lang="en"><head>
3
4<meta charset="utf-8">
5<meta name="generator" content="quarto-1.7.34">
6
7<meta name="viewport" content="width=device-width, initial-scale=1.0, user-scalable=yes">
8
9<meta name="description" content="Loading and iterating typed WebDataset tar files">
10
11<title>Datasets – atdata</title>
12<style>
13code{white-space: pre-wrap;}
14span.smallcaps{font-variant: small-caps;}
15div.columns{display: flex; gap: min(4vw, 1.5em);}
16div.column{flex: auto; overflow-x: auto;}
17div.hanging-indent{margin-left: 1.5em; text-indent: -1.5em;}
18ul.task-list{list-style: none;}
19ul.task-list li input[type="checkbox"] {
20 width: 0.8em;
21 margin: 0 0.8em 0.2em -1em; /* quarto-specific, see https://github.com/quarto-dev/quarto-cli/issues/4556 */
22 vertical-align: middle;
23}
24/* CSS for syntax highlighting */
25html { -webkit-text-size-adjust: 100%; }
26pre > code.sourceCode { white-space: pre; position: relative; }
27pre > code.sourceCode > span { display: inline-block; line-height: 1.25; }
28pre > code.sourceCode > span:empty { height: 1.2em; }
29.sourceCode { overflow: visible; }
30code.sourceCode > span { color: inherit; text-decoration: inherit; }
31div.sourceCode { margin: 1em 0; }
32pre.sourceCode { margin: 0; }
33@media screen {
34div.sourceCode { overflow: auto; }
35}
36@media print {
37pre > code.sourceCode { white-space: pre-wrap; }
38pre > code.sourceCode > span { text-indent: -5em; padding-left: 5em; }
39}
40pre.numberSource code
41 { counter-reset: source-line 0; }
42pre.numberSource code > span
43 { position: relative; left: -4em; counter-increment: source-line; }
44pre.numberSource code > span > a:first-child::before
45 { content: counter(source-line);
46 position: relative; left: -1em; text-align: right; vertical-align: baseline;
47 border: none; display: inline-block;
48 -webkit-touch-callout: none; -webkit-user-select: none;
49 -khtml-user-select: none; -moz-user-select: none;
50 -ms-user-select: none; user-select: none;
51 padding: 0 4px; width: 4em;
52 }
53pre.numberSource { margin-left: 3em; padding-left: 4px; }
54div.sourceCode
55 { }
56@media screen {
57pre > code.sourceCode > span > a:first-child::before { text-decoration: underline; }
58}
59</style>
60
61
62<script src="../site_libs/quarto-nav/quarto-nav.js"></script>
63<script src="../site_libs/quarto-nav/headroom.min.js"></script>
64<script src="../site_libs/clipboard/clipboard.min.js"></script>
65<script src="../site_libs/quarto-search/autocomplete.umd.js"></script>
66<script src="../site_libs/quarto-search/fuse.min.js"></script>
67<script src="../site_libs/quarto-search/quarto-search.js"></script>
68<meta name="quarto:offset" content="../">
69<script src="../site_libs/quarto-html/quarto.js" type="module"></script>
70<script src="../site_libs/quarto-html/tabsets/tabsets.js" type="module"></script>
71<script src="../site_libs/quarto-html/popper.min.js"></script>
72<script src="../site_libs/quarto-html/tippy.umd.min.js"></script>
73<script src="../site_libs/quarto-html/anchor.min.js"></script>
74<link href="../site_libs/quarto-html/tippy.css" rel="stylesheet">
75<link href="../site_libs/quarto-html/quarto-syntax-highlighting-9582434199d49cc9e91654cdeeb4866b.css" rel="stylesheet" class="quarto-color-scheme" id="quarto-text-highlighting-styles">
76<link href="../site_libs/quarto-html/quarto-syntax-highlighting-dark-8dcd8563ea6803ab7cbb3d71ca5772e1.css" rel="stylesheet" class="quarto-color-scheme quarto-color-alternate" id="quarto-text-highlighting-styles">
77<link href="../site_libs/quarto-html/quarto-syntax-highlighting-9582434199d49cc9e91654cdeeb4866b.css" rel="stylesheet" class="quarto-color-scheme-extra" id="quarto-text-highlighting-styles">
78<script src="../site_libs/bootstrap/bootstrap.min.js"></script>
79<link href="../site_libs/bootstrap/bootstrap-icons.css" rel="stylesheet">
80<link href="../site_libs/bootstrap/bootstrap-62bce24ca844314e7bb1a34dbdfe05cc.min.css" rel="stylesheet" append-hash="true" class="quarto-color-scheme" id="quarto-bootstrap" data-mode="light">
81<link href="../site_libs/bootstrap/bootstrap-dark-7964ffd8887b0991fe8d71c6c8bc75d6.min.css" rel="stylesheet" append-hash="true" class="quarto-color-scheme quarto-color-alternate" id="quarto-bootstrap" data-mode="dark">
82<link href="../site_libs/bootstrap/bootstrap-62bce24ca844314e7bb1a34dbdfe05cc.min.css" rel="stylesheet" append-hash="true" class="quarto-color-scheme-extra" id="quarto-bootstrap" data-mode="light">
83<script id="quarto-search-options" type="application/json">{
84 "location": "navbar",
85 "copy-button": false,
86 "collapse-after": 3,
87 "panel-placement": "end",
88 "type": "overlay",
89 "limit": 50,
90 "keyboard-shortcut": [
91 "f",
92 "/",
93 "s"
94 ],
95 "show-item-context": false,
96 "language": {
97 "search-no-results-text": "No results",
98 "search-matching-documents-text": "matching documents",
99 "search-copy-link-title": "Copy link to search",
100 "search-hide-matches-text": "Hide additional matches",
101 "search-more-match-text": "more match in this document",
102 "search-more-matches-text": "more matches in this document",
103 "search-clear-button-title": "Clear",
104 "search-text-placeholder": "",
105 "search-detached-cancel-button-title": "Cancel",
106 "search-submit-button-title": "Submit",
107 "search-label": "Search"
108 }
109}</script>
110
111
112<link rel="stylesheet" href="../assets/styles.css">
113</head>
114
115<body class="nav-sidebar docked nav-fixed quarto-light"><script id="quarto-html-before-body" type="application/javascript">
// Mirror the colour mode of a bootstrap stylesheet <link> onto <body>.
// A `data-mode` attribute equal to "dark" selects the `quarto-dark` class;
// any other value (or no attribute) selects `quarto-light`. The opposite
// class is removed so the two never coexist.
const toggleBodyColorMode = (bsSheetEl) => {
  const darkRequested = bsSheetEl.getAttribute("data-mode") === "dark";
  const bodyClasses = window.document.querySelector("body").classList;
  const [incoming, outgoing] = darkRequested
    ? ["quarto-dark", "quarto-light"]
    : ["quarto-light", "quarto-dark"];
  bodyClasses.add(incoming);
  bodyClasses.remove(outgoing);
}
// Re-derive the <body> colour class from whichever `#quarto-bootstrap`
// stylesheet link is not currently marked rel="disabled-stylesheet".
// Does nothing when no such link is found.
const toggleBodyColorPrimary = () => {
  const activeSheet = window.document.querySelector("link#quarto-bootstrap:not([rel=disabled-stylesheet])");
  if (!activeSheet) {
    return;
  }
  toggleBodyColorMode(activeSheet);
}
// Reflect the active scheme on every `.quarto-color-scheme-toggle` control:
// the `alternate` class is added when `alternate` is truthy and removed
// otherwise.
const setColorSchemeToggle = (alternate) => {
  const classOp = alternate ? "add" : "remove";
  Array.from(window.document.querySelectorAll('.quarto-color-scheme-toggle')).forEach((toggleEl) => {
    if (!toggleEl) {
      return;
    }
    toggleEl.classList[classOp]("alternate");
  });
};
// Switch the page between the primary and the alternate colour scheme.
//   alternate: truthy -> enable the alternate stylesheets,
//              falsy  -> disable them and make sure the primary ones are on.
// Transitions on the margin-sidebar nav links are suspended while the
// stylesheets are swapped and restored afterwards.
const toggleColorMode = (alternate) => {
  const marginNavLinks = '#quarto-margin-sidebar .nav-link';
  const primaryStylesheets = window.document.querySelectorAll('link.quarto-color-scheme:not(.quarto-color-alternate)');
  const alternateStylesheets = window.document.querySelectorAll('link.quarto-color-scheme.quarto-color-alternate');

  manageTransitions(marginNavLinks, false);
  if (!alternate) {
    disableStylesheet(alternateStylesheets);
    enableStylesheet(primaryStylesheets);
    toggleBodyColorPrimary();
  } else {
    // note: dark is layered on light, so the primary sheets stay enabled
    enableStylesheet(alternateStylesheets);
    for (const sheetNode of alternateStylesheets) {
      if (sheetNode.id !== "quarto-bootstrap") {
        continue;
      }
      toggleBodyColorMode(sheetNode);
    }
  }
  manageTransitions(marginNavLinks, true);

  // Keep the toggle controls in step with the stylesheets
  setColorSchemeToggle(alternate);

  // Workaround: Safari does not properly recolor the scrollbar when
  // toggling (#1455), so nudge the scroll position by one pixel and back.
  const userAgent = navigator.userAgent;
  const looksLikeSafari = userAgent.indexOf('Safari') > 0 && userAgent.indexOf('Chrome') == -1;
  if (looksLikeSafari) {
    manageTransitions("body", false);
    window.scrollTo(0, 1);
    setTimeout(() => {
      window.scrollTo(0, 0);
      manageTransitions("body", true);
    }, 40);
  }
}
// Switch off every <link> in `stylesheets` by setting its rel to the
// non-standard value 'disabled-stylesheet'.
const disableStylesheet = (stylesheets) => {
  Array.from(stylesheets).forEach((sheet) => {
    sheet.rel = 'disabled-stylesheet';
  });
}
// Switch on every <link> in `stylesheets` by restoring rel="stylesheet".
// Links that already carry that rel are not reassigned: per the original
// note, Chrome will still show a flash of unstyled content without this check.
const enableStylesheet = (stylesheets) => {
  Array.from(stylesheets).forEach((sheet) => {
    if (sheet.rel === 'stylesheet') {
      return;
    }
    sheet.rel = 'stylesheet';
  });
}
// Allow or suppress CSS transitions on every element matching `selector`
// by removing (allow) or adding (suppress) the `notransition` class.
const manageTransitions = (selector, allowTransitions) => {
  const classOp = allowTransitions ? 'remove' : 'add';
  Array.from(window.document.querySelectorAll(selector)).forEach((node) => {
    node.classList[classOp]('notransition');
  });
}
// True when the page was opened through the file: protocol.
const isFileUrl = () => window.location.protocol === 'file:';
// True only when the stored colour-scheme sentinel is exactly "alternate";
// a null sentinel, or any other value, yields false.
const hasAlternateSentinel = () => getColorSchemeSentinel() === "alternate";
// Remember which colour scheme is active.
//   alternate: truthy stores "alternate", falsy stores "default".
// Pages not served from file: persist the value in localStorage under
// "quarto-color-scheme". file: pages keep it in the in-memory
// `localAlternateSentinel` instead.
// Touching localStorage can throw (storage blocked by browser settings,
// quota exhausted). In that case the value is kept in memory so the
// caller's toggle sequence is not aborted half-way.
const setStyleSentinel = (alternate) => {
  const value = alternate ? "alternate" : "default";
  if (!isFileUrl()) {
    try {
      window.localStorage.setItem("quarto-color-scheme", value);
      return;
    } catch (_err) {
      // Storage unavailable: fall through to the in-memory sentinel.
    }
  }
  localAlternateSentinel = value;
}
// Read back the remembered colour scheme ("alternate" / "default").
// file: pages always use the in-memory `localAlternateSentinel`. Other
// pages prefer the "quarto-color-scheme" entry in localStorage and fall
// back to the in-memory value when no entry exists.
// Accessing localStorage can throw when the browser blocks storage; the
// in-memory value is returned in that case so the script keeps running.
const getColorSchemeSentinel = () => {
  if (isFileUrl()) {
    return localAlternateSentinel;
  }
  try {
    const storageValue = window.localStorage.getItem("quarto-color-scheme");
    return storageValue != null ? storageValue : localAlternateSentinel;
  } catch (_err) {
    return localAlternateSentinel;
  }
}
// Push the matching theme to an embedded giscus comment frame, if any.
//   isAlternate: whether the alternate colour scheme is now active.
// Theme names come from the `#giscus-base-theme` / `#giscus-alt-theme`
// elements' values, defaulting to 'light' / 'dark' when absent.
// NOTE(review): the `darkModeDefault` parameter is never read here; the
// outer `authorPrefersDark` constant decides which theme counts as base.
const toggleGiscusIfUsed = (isAlternate, darkModeDefault) => {
  const baseTheme = document.querySelector('#giscus-base-theme')?.value ?? 'light';
  const alternateTheme = document.querySelector('#giscus-alt-theme')?.value ?? 'dark';

  // The alternate theme applies when exactly one of "author prefers dark"
  // and "alternate scheme active" holds.
  const useAlternateTheme = Boolean(authorPrefersDark) !== Boolean(isAlternate);
  const newTheme = useAlternateTheme ? alternateTheme : baseTheme;

  const giscusFrame = window.document.querySelector('iframe.giscus-frame');
  if (giscusFrame === null) {
    return;
  }
  // From: https://github.com/giscus/giscus/issues/336
  giscusFrame.contentWindow.postMessage(
    { giscus: { setConfig: { theme: newTheme } } },
    'https://giscus.app'
  );
};
// Site-level defaults: this page is authored light-first.
const authorPrefersDark = false;
const darkModeDefault = authorPrefersDark;

// Mark the "-extra" copies of the highlighting and bootstrap stylesheets
// as disabled before any toggling happens.
for (const extraSheetSelector of [
  'link#quarto-text-highlighting-styles.quarto-color-scheme-extra',
  'link#quarto-bootstrap.quarto-color-scheme-extra',
]) {
  document.querySelector(extraSheetSelector).rel = 'disabled-stylesheet';
}

// In-memory copy of the colour-scheme sentinel (used for file: pages and
// as the fallback when nothing has been stored yet).
let localAlternateSentinel = darkModeDefault ? 'alternate' : 'default';

// Dark / light mode switch, invoked by the navbar toggle
window.quartoToggleColorScheme = () => {
  // Flip whatever scheme is currently recorded
  const toAlternate = !hasAlternateSentinel();
  toggleColorMode(toAlternate);
  setStyleSentinel(toAlternate);
  toggleGiscusIfUsed(toAlternate, darkModeDefault);
  // Let listeners re-measure layout under the new styles
  window.dispatchEvent(new Event('resize'));
};

// Apply the remembered scheme on load
toggleColorMode(Boolean(hasAlternateSentinel()));
277 </script>
278
279<div id="quarto-search-results"></div>
280 <header id="quarto-header" class="headroom fixed-top">
281 <nav class="navbar navbar-expand-lg " data-bs-theme="dark">
282 <div class="navbar-container container-fluid">
283 <div class="navbar-brand-container mx-auto">
284 <a class="navbar-brand" href="../index.html">
285 <span class="navbar-title">atdata</span>
286 </a>
287 </div>
288 <div id="quarto-search" class="" title="Search"></div>
<!-- Navbar collapse toggle. No role override: the native button role is
     what assistive technology should announce (role="menu" would require
     menuitem children that this control does not have). -->
<button class="navbar-toggler" type="button" data-bs-toggle="collapse" data-bs-target="#navbarCollapse" aria-controls="navbarCollapse" aria-expanded="false" aria-label="Toggle navigation" onclick="if (window.quartoToggleHeadroom) { window.quartoToggleHeadroom(); }">
  <span class="navbar-toggler-icon"></span>
</button>
292 <div class="collapse navbar-collapse" id="navbarCollapse">
293 <ul class="navbar-nav navbar-nav-scroll me-auto">
294 <li class="nav-item">
295 <a class="nav-link active" href="../index.html" aria-current="page">
296<span class="menu-text">Guide</span></a>
297 </li>
298 <li class="nav-item dropdown ">
299 <a class="nav-link dropdown-toggle" href="#" id="nav-menu-tutorials" role="link" data-bs-toggle="dropdown" aria-expanded="false">
300 <span class="menu-text">Tutorials</span>
301 </a>
302 <ul class="dropdown-menu" aria-labelledby="nav-menu-tutorials">
303 <li>
304 <a class="dropdown-item" href="../tutorials/quickstart.html">
305 <span class="dropdown-text">Quick Start</span></a>
306 </li>
307 <li>
308 <a class="dropdown-item" href="../tutorials/local-workflow.html">
309 <span class="dropdown-text">Local Workflow</span></a>
310 </li>
311 <li>
312 <a class="dropdown-item" href="../tutorials/atmosphere.html">
313 <span class="dropdown-text">Atmosphere Publishing</span></a>
314 </li>
315 <li>
316 <a class="dropdown-item" href="../tutorials/promotion.html">
317 <span class="dropdown-text">Promotion Workflow</span></a>
318 </li>
319 </ul>
320 </li>
321 <li class="nav-item dropdown ">
322 <a class="nav-link dropdown-toggle" href="#" id="nav-menu-reference" role="link" data-bs-toggle="dropdown" aria-expanded="false">
323 <span class="menu-text">Reference</span>
324 </a>
325 <ul class="dropdown-menu" aria-labelledby="nav-menu-reference">
326 <li>
327 <a class="dropdown-item" href="../reference/architecture.html">
328 <span class="dropdown-text">Architecture Overview</span></a>
329 </li>
330 <li>
331 <a class="dropdown-item" href="../reference/packable-samples.html">
332 <span class="dropdown-text">Packable Samples</span></a>
333 </li>
334 <li>
335 <a class="dropdown-item" href="../reference/datasets.html">
336 <span class="dropdown-text">Datasets</span></a>
337 </li>
338 <li>
339 <a class="dropdown-item" href="../reference/lenses.html">
340 <span class="dropdown-text">Lenses</span></a>
341 </li>
342 <li>
343 <a class="dropdown-item" href="../reference/local-storage.html">
344 <span class="dropdown-text">Local Storage</span></a>
345 </li>
346 <li>
347 <a class="dropdown-item" href="../reference/atmosphere.html">
348 <span class="dropdown-text">Atmosphere</span></a>
349 </li>
350 <li>
351 <a class="dropdown-item" href="../reference/promotion.html">
352 <span class="dropdown-text">Promotion</span></a>
353 </li>
354 <li>
355 <a class="dropdown-item" href="../reference/load-dataset.html">
356 <span class="dropdown-text">load_dataset API</span></a>
357 </li>
358 <li>
359 <a class="dropdown-item" href="../reference/protocols.html">
360 <span class="dropdown-text">Protocols</span></a>
361 </li>
362 <li>
363 <a class="dropdown-item" href="../reference/uri-spec.html">
364 <span class="dropdown-text">URI Specification</span></a>
365 </li>
366 <li>
367 <a class="dropdown-item" href="../reference/troubleshooting.html">
368 <span class="dropdown-text">Troubleshooting &amp; FAQ</span></a>
369 </li>
370 <li>
371 <a class="dropdown-item" href="../reference/deployment.html">
372 <span class="dropdown-text">Deployment Guide</span></a>
373 </li>
374 </ul>
375 </li>
376 <li class="nav-item">
377 <a class="nav-link" href="../api/index.html">
378<span class="menu-text">API</span></a>
379 </li>
380</ul>
381 <ul class="navbar-nav navbar-nav-scroll ms-auto">
382 <li class="nav-item compact">
<!-- Icon-only link: menu-text is empty, so the aria-label on the icon
     supplies the link's accessible name.
     NOTE(review): "your-org" in the href looks like an unfilled template
     placeholder; confirm the repository URL. -->
<a class="nav-link" href="https://github.com/your-org/atdata"> <i class="bi bi-github" role="img" aria-label="GitHub">
</i>
<span class="menu-text"></span></a>
386 </li>
387</ul>
388 </div> <!-- /navcollapse -->
389 <div class="quarto-navbar-tools">
390 <a href="" class="quarto-color-scheme-toggle quarto-navigation-tool px-1" onclick="window.quartoToggleColorScheme(); return false;" title="Toggle dark mode"><i class="bi"></i></a>
391</div>
392 </div> <!-- /container-fluid -->
393 </nav>
394 <nav class="quarto-secondary-nav">
395 <div class="container-fluid d-flex">
396 <button type="button" class="quarto-btn-toggle btn" data-bs-toggle="collapse" role="button" data-bs-target=".quarto-sidebar-collapse-item" aria-controls="quarto-sidebar" aria-expanded="false" aria-label="Toggle sidebar navigation" onclick="if (window.quartoToggleHeadroom) { window.quartoToggleHeadroom(); }">
397 <i class="bi bi-layout-text-sidebar-reverse"></i>
398 </button>
399 <nav class="quarto-page-breadcrumbs" aria-label="breadcrumb"><ol class="breadcrumb"><li class="breadcrumb-item"><a href="../reference/architecture.html">Reference</a></li><li class="breadcrumb-item"><a href="../reference/datasets.html">Datasets</a></li></ol></nav>
400 <a class="flex-grow-1" role="navigation" data-bs-toggle="collapse" data-bs-target=".quarto-sidebar-collapse-item" aria-controls="quarto-sidebar" aria-expanded="false" aria-label="Toggle sidebar navigation" onclick="if (window.quartoToggleHeadroom) { window.quartoToggleHeadroom(); }">
401 </a>
402 </div>
403 </nav>
404</header>
405<!-- content -->
406<div id="quarto-content" class="quarto-container page-columns page-rows-contents page-layout-article page-navbar">
407<!-- sidebar -->
408 <nav id="quarto-sidebar" class="sidebar collapse collapse-horizontal quarto-sidebar-collapse-item sidebar-navigation docked overflow-auto">
409 <div class="sidebar-menu-container">
410 <ul class="list-unstyled mt-1">
411 <li class="sidebar-item">
412 <div class="sidebar-item-container">
413 <a href="../index.html" class="sidebar-item-text sidebar-link">
414 <span class="menu-text">atdata</span></a>
415 </div>
416</li>
417 <li class="sidebar-item sidebar-item-section">
418 <div class="sidebar-item-container">
419 <a class="sidebar-item-text sidebar-link text-start" data-bs-toggle="collapse" data-bs-target="#quarto-sidebar-section-1" role="navigation" aria-expanded="true">
420 <span class="menu-text">Getting Started</span></a>
421 <a class="sidebar-item-toggle text-start" data-bs-toggle="collapse" data-bs-target="#quarto-sidebar-section-1" role="navigation" aria-expanded="true" aria-label="Toggle section">
422 <i class="bi bi-chevron-right ms-2"></i>
423 </a>
424 </div>
425 <ul id="quarto-sidebar-section-1" class="collapse list-unstyled sidebar-section depth1 show">
426 <li class="sidebar-item">
427 <div class="sidebar-item-container">
428 <a href="../tutorials/quickstart.html" class="sidebar-item-text sidebar-link">
429 <span class="menu-text">Quick Start</span></a>
430 </div>
431</li>
432 <li class="sidebar-item">
433 <div class="sidebar-item-container">
434 <a href="../tutorials/local-workflow.html" class="sidebar-item-text sidebar-link">
435 <span class="menu-text">Local Workflow</span></a>
436 </div>
437</li>
438 <li class="sidebar-item">
439 <div class="sidebar-item-container">
440 <a href="../tutorials/atmosphere.html" class="sidebar-item-text sidebar-link">
441 <span class="menu-text">Atmosphere Publishing</span></a>
442 </div>
443</li>
444 <li class="sidebar-item">
445 <div class="sidebar-item-container">
446 <a href="../tutorials/promotion.html" class="sidebar-item-text sidebar-link">
447 <span class="menu-text">Promotion Workflow</span></a>
448 </div>
449</li>
450 </ul>
451 </li>
452 <li class="sidebar-item sidebar-item-section">
453 <div class="sidebar-item-container">
454 <a class="sidebar-item-text sidebar-link text-start" data-bs-toggle="collapse" data-bs-target="#quarto-sidebar-section-2" role="navigation" aria-expanded="true">
455 <span class="menu-text">Reference</span></a>
456 <a class="sidebar-item-toggle text-start" data-bs-toggle="collapse" data-bs-target="#quarto-sidebar-section-2" role="navigation" aria-expanded="true" aria-label="Toggle section">
457 <i class="bi bi-chevron-right ms-2"></i>
458 </a>
459 </div>
460 <ul id="quarto-sidebar-section-2" class="collapse list-unstyled sidebar-section depth1 show">
461 <li class="sidebar-item">
462 <div class="sidebar-item-container">
463 <a href="../reference/architecture.html" class="sidebar-item-text sidebar-link">
464 <span class="menu-text">Architecture Overview</span></a>
465 </div>
466</li>
467 <li class="sidebar-item">
468 <div class="sidebar-item-container">
469 <a href="../reference/packable-samples.html" class="sidebar-item-text sidebar-link">
470 <span class="menu-text">Packable Samples</span></a>
471 </div>
472</li>
473 <li class="sidebar-item">
474 <div class="sidebar-item-container">
475 <a href="../reference/datasets.html" class="sidebar-item-text sidebar-link active">
476 <span class="menu-text">Datasets</span></a>
477 </div>
478</li>
479 <li class="sidebar-item">
480 <div class="sidebar-item-container">
481 <a href="../reference/lenses.html" class="sidebar-item-text sidebar-link">
482 <span class="menu-text">Lenses</span></a>
483 </div>
484</li>
485 <li class="sidebar-item">
486 <div class="sidebar-item-container">
487 <a href="../reference/local-storage.html" class="sidebar-item-text sidebar-link">
488 <span class="menu-text">Local Storage</span></a>
489 </div>
490</li>
491 <li class="sidebar-item">
492 <div class="sidebar-item-container">
493 <a href="../reference/atmosphere.html" class="sidebar-item-text sidebar-link">
494 <span class="menu-text">Atmosphere (ATProto Integration)</span></a>
495 </div>
496</li>
497 <li class="sidebar-item">
498 <div class="sidebar-item-container">
499 <a href="../reference/promotion.html" class="sidebar-item-text sidebar-link">
500 <span class="menu-text">Promotion Workflow</span></a>
501 </div>
502</li>
503 <li class="sidebar-item">
504 <div class="sidebar-item-container">
505 <a href="../reference/load-dataset.html" class="sidebar-item-text sidebar-link">
506 <span class="menu-text">load_dataset API</span></a>
507 </div>
508</li>
509 <li class="sidebar-item">
510 <div class="sidebar-item-container">
511 <a href="../reference/protocols.html" class="sidebar-item-text sidebar-link">
512 <span class="menu-text">Protocols</span></a>
513 </div>
514</li>
515 <li class="sidebar-item">
516 <div class="sidebar-item-container">
517 <a href="../reference/uri-spec.html" class="sidebar-item-text sidebar-link">
518 <span class="menu-text">URI Specification</span></a>
519 </div>
520</li>
521 <li class="sidebar-item">
522 <div class="sidebar-item-container">
523 <a href="../reference/troubleshooting.html" class="sidebar-item-text sidebar-link">
524 <span class="menu-text">Troubleshooting &amp; FAQ</span></a>
525 </div>
526</li>
527 <li class="sidebar-item">
528 <div class="sidebar-item-container">
529 <a href="../reference/deployment.html" class="sidebar-item-text sidebar-link">
530 <span class="menu-text">Deployment Guide</span></a>
531 </div>
532</li>
533 </ul>
534 </li>
535 </ul>
536 </div>
537</nav>
538<div id="quarto-sidebar-glass" class="quarto-sidebar-collapse-item" data-bs-toggle="collapse" data-bs-target=".quarto-sidebar-collapse-item"></div>
539<!-- margin-sidebar -->
540 <div id="quarto-margin-sidebar" class="sidebar margin-sidebar">
541 <nav id="TOC" role="doc-toc" class="toc-active">
542 <h2 id="toc-title">On this page</h2>
543
544 <ul>
545 <li><a href="#creating-a-dataset" id="toc-creating-a-dataset" class="nav-link active" data-scroll-target="#creating-a-dataset">Creating a Dataset</a></li>
546 <li><a href="#data-sources" id="toc-data-sources" class="nav-link" data-scroll-target="#data-sources">Data Sources</a>
547 <ul class="collapse">
548 <li><a href="#url-source-default" id="toc-url-source-default" class="nav-link" data-scroll-target="#url-source-default">URL Source (default)</a></li>
549 <li><a href="#s3-source" id="toc-s3-source" class="nav-link" data-scroll-target="#s3-source">S3 Source</a></li>
550 </ul></li>
551 <li><a href="#iteration-modes" id="toc-iteration-modes" class="nav-link" data-scroll-target="#iteration-modes">Iteration Modes</a>
552 <ul class="collapse">
553 <li><a href="#ordered-iteration" id="toc-ordered-iteration" class="nav-link" data-scroll-target="#ordered-iteration">Ordered Iteration</a></li>
554 <li><a href="#shuffled-iteration" id="toc-shuffled-iteration" class="nav-link" data-scroll-target="#shuffled-iteration">Shuffled Iteration</a></li>
555 </ul></li>
556 <li><a href="#samplebatch" id="toc-samplebatch" class="nav-link" data-scroll-target="#samplebatch">SampleBatch</a></li>
557 <li><a href="#type-transformations-with-lenses" id="toc-type-transformations-with-lenses" class="nav-link" data-scroll-target="#type-transformations-with-lenses">Type Transformations with Lenses</a></li>
558 <li><a href="#dataset-properties" id="toc-dataset-properties" class="nav-link" data-scroll-target="#dataset-properties">Dataset Properties</a>
559 <ul class="collapse">
560 <li><a href="#shard-list" id="toc-shard-list" class="nav-link" data-scroll-target="#shard-list">Shard List</a></li>
561 <li><a href="#metadata" id="toc-metadata" class="nav-link" data-scroll-target="#metadata">Metadata</a></li>
562 </ul></li>
563 <li><a href="#writing-datasets" id="toc-writing-datasets" class="nav-link" data-scroll-target="#writing-datasets">Writing Datasets</a></li>
564 <li><a href="#parquet-export" id="toc-parquet-export" class="nav-link" data-scroll-target="#parquet-export">Parquet Export</a></li>
565 <li><a href="#url-formats" id="toc-url-formats" class="nav-link" data-scroll-target="#url-formats">URL Formats</a></li>
566 <li><a href="#dataset-properties-1" id="toc-dataset-properties-1" class="nav-link" data-scroll-target="#dataset-properties-1">Dataset Properties</a>
567 <ul class="collapse">
568 <li><a href="#source" id="toc-source" class="nav-link" data-scroll-target="#source">Source</a></li>
569 <li><a href="#sample-type" id="toc-sample-type" class="nav-link" data-scroll-target="#sample-type">Sample Type</a></li>
570 </ul></li>
571 <li><a href="#related" id="toc-related" class="nav-link" data-scroll-target="#related">Related</a></li>
572 </ul>
573<div class="toc-actions"><ul><li><a href="https://github.com/your-org/atdata/edit/main/reference/datasets.qmd" class="toc-action"><i class="bi bi-github"></i>Edit this page</a></li><li><a href="https://github.com/your-org/atdata/issues/new" class="toc-action"><i class="bi empty"></i>Report an issue</a></li></ul></div></nav>
574 </div>
575<!-- main -->
576<main class="content" id="quarto-document-content">
577
578
579<header id="title-block-header" class="quarto-title-block default"><nav class="quarto-page-breadcrumbs quarto-title-breadcrumbs d-none d-lg-block" aria-label="breadcrumb"><ol class="breadcrumb"><li class="breadcrumb-item"><a href="../reference/architecture.html">Reference</a></li><li class="breadcrumb-item"><a href="../reference/datasets.html">Datasets</a></li></ol></nav>
580<div class="quarto-title">
581<h1 class="title">Datasets</h1>
582</div>
583
584<div>
585 <div class="description">
586 Loading and iterating typed WebDataset tar files
587 </div>
588</div>
589
590
591<div class="quarto-title-meta">
592
593
594
595
596 </div>
597
598
599
600</header>
601
602
603<p>The <code>Dataset</code> class provides typed iteration over WebDataset tar files with automatic batching and lens transformations.</p>
604<section id="creating-a-dataset" class="level2">
605<h2 class="anchored" data-anchor-id="creating-a-dataset">Creating a Dataset</h2>
606<div id="ef6e2916" class="cell">
607<div class="sourceCode cell-code" id="cb1"><pre class="sourceCode python code-with-copy"><code class="sourceCode python"><span id="cb1-1"><a href="#cb1-1" aria-hidden="true" tabindex="-1"></a><span class="im">import</span> atdata</span>
608<span id="cb1-2"><a href="#cb1-2" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> numpy.typing <span class="im">import</span> NDArray</span>
609<span id="cb1-3"><a href="#cb1-3" aria-hidden="true" tabindex="-1"></a></span>
610<span id="cb1-4"><a href="#cb1-4" aria-hidden="true" tabindex="-1"></a><span class="at">@atdata.packable</span></span>
611<span id="cb1-5"><a href="#cb1-5" aria-hidden="true" tabindex="-1"></a><span class="kw">class</span> ImageSample:</span>
612<span id="cb1-6"><a href="#cb1-6" aria-hidden="true" tabindex="-1"></a> image: NDArray</span>
613<span id="cb1-7"><a href="#cb1-7" aria-hidden="true" tabindex="-1"></a> label: <span class="bu">str</span></span>
614<span id="cb1-8"><a href="#cb1-8" aria-hidden="true" tabindex="-1"></a></span>
615<span id="cb1-9"><a href="#cb1-9" aria-hidden="true" tabindex="-1"></a><span class="co"># Single shard (string URL - most common)</span></span>
616<span id="cb1-10"><a href="#cb1-10" aria-hidden="true" tabindex="-1"></a>dataset <span class="op">=</span> atdata.Dataset[ImageSample](<span class="st">"data-000000.tar"</span>)</span>
617<span id="cb1-11"><a href="#cb1-11" aria-hidden="true" tabindex="-1"></a></span>
618<span id="cb1-12"><a href="#cb1-12" aria-hidden="true" tabindex="-1"></a><span class="co"># Multiple shards with brace notation</span></span>
619<span id="cb1-13"><a href="#cb1-13" aria-hidden="true" tabindex="-1"></a>dataset <span class="op">=</span> atdata.Dataset[ImageSample](<span class="st">"data-{000000..000009}.tar"</span>)</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
620</div>
621<p>The type parameter <code>[ImageSample]</code> specifies what sample type the dataset contains. This enables type-safe iteration and automatic deserialization.</p>
622</section>
623<section id="data-sources" class="level2">
624<h2 class="anchored" data-anchor-id="data-sources">Data Sources</h2>
625<p>Datasets can be created from different data sources using the <code>DataSource</code> protocol:</p>
626<section id="url-source-default" class="level3">
627<h3 class="anchored" data-anchor-id="url-source-default">URL Source (default)</h3>
628<p>When you pass a string to <code>Dataset</code>, it automatically wraps it in a <code>URLSource</code>:</p>
629<div id="9cb82ec3" class="cell">
630<div class="sourceCode cell-code" id="cb2"><pre class="sourceCode python code-with-copy"><code class="sourceCode python"><span id="cb2-1"><a href="#cb2-1" aria-hidden="true" tabindex="-1"></a><span class="co"># These are equivalent:</span></span>
631<span id="cb2-2"><a href="#cb2-2" aria-hidden="true" tabindex="-1"></a>dataset <span class="op">=</span> atdata.Dataset[ImageSample](<span class="st">"data-{000000..000009}.tar"</span>)</span>
632<span id="cb2-3"><a href="#cb2-3" aria-hidden="true" tabindex="-1"></a>dataset <span class="op">=</span> atdata.Dataset[ImageSample](atdata.URLSource(<span class="st">"data-{000000..000009}.tar"</span>))</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
633</div>
634</section>
635<section id="s3-source" class="level3">
636<h3 class="anchored" data-anchor-id="s3-source">S3 Source</h3>
637<p>For private S3 buckets or S3-compatible storage (Cloudflare R2, MinIO), use <code>S3Source</code>:</p>
638<div id="c2cb3b94" class="cell">
639<div class="sourceCode cell-code" id="cb3"><pre class="sourceCode python code-with-copy"><code class="sourceCode python"><span id="cb3-1"><a href="#cb3-1" aria-hidden="true" tabindex="-1"></a><span class="co"># From explicit credentials</span></span>
640<span id="cb3-2"><a href="#cb3-2" aria-hidden="true" tabindex="-1"></a>source <span class="op">=</span> atdata.S3Source(</span>
641<span id="cb3-3"><a href="#cb3-3" aria-hidden="true" tabindex="-1"></a> bucket<span class="op">=</span><span class="st">"my-bucket"</span>,</span>
642<span id="cb3-4"><a href="#cb3-4" aria-hidden="true" tabindex="-1"></a> keys<span class="op">=</span>[<span class="st">"data-000000.tar"</span>, <span class="st">"data-000001.tar"</span>],</span>
643<span id="cb3-5"><a href="#cb3-5" aria-hidden="true" tabindex="-1"></a> endpoint<span class="op">=</span><span class="st">"https://my-r2-account.r2.cloudflarestorage.com"</span>,</span>
644<span id="cb3-6"><a href="#cb3-6" aria-hidden="true" tabindex="-1"></a> access_key<span class="op">=</span><span class="st">"AKID..."</span>,</span>
645<span id="cb3-7"><a href="#cb3-7" aria-hidden="true" tabindex="-1"></a> secret_key<span class="op">=</span><span class="st">"SECRET..."</span>,</span>
646<span id="cb3-8"><a href="#cb3-8" aria-hidden="true" tabindex="-1"></a>)</span>
647<span id="cb3-9"><a href="#cb3-9" aria-hidden="true" tabindex="-1"></a>dataset <span class="op">=</span> atdata.Dataset[ImageSample](source)</span>
648<span id="cb3-10"><a href="#cb3-10" aria-hidden="true" tabindex="-1"></a></span>
649<span id="cb3-11"><a href="#cb3-11" aria-hidden="true" tabindex="-1"></a><span class="co"># From S3 URLs</span></span>
650<span id="cb3-12"><a href="#cb3-12" aria-hidden="true" tabindex="-1"></a>source <span class="op">=</span> atdata.S3Source.from_urls([</span>
651<span id="cb3-13"><a href="#cb3-13" aria-hidden="true" tabindex="-1"></a> <span class="st">"s3://my-bucket/data-000000.tar"</span>,</span>
652<span id="cb3-14"><a href="#cb3-14" aria-hidden="true" tabindex="-1"></a> <span class="st">"s3://my-bucket/data-000001.tar"</span>,</span>
653<span id="cb3-15"><a href="#cb3-15" aria-hidden="true" tabindex="-1"></a>])</span>
654<span id="cb3-16"><a href="#cb3-16" aria-hidden="true" tabindex="-1"></a>dataset <span class="op">=</span> atdata.Dataset[ImageSample](source)</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
655</div>
656<div class="callout callout-style-default callout-note callout-titled">
657<div class="callout-header d-flex align-content-center">
658<div class="callout-icon-container">
659<i class="callout-icon"></i>
660</div>
661<div class="callout-title-container flex-fill">
662Note
663</div>
664</div>
665<div class="callout-body-container callout-body">
666<p><code>S3Source</code> uses boto3 for streaming, enabling authentication with private buckets. For public S3 URLs, a string URL with <code>URLSource</code> works directly.</p>
667</div>
668</div>
669</section>
670</section>
671<section id="iteration-modes" class="level2">
672<h2 class="anchored" data-anchor-id="iteration-modes">Iteration Modes</h2>
673<section id="ordered-iteration" class="level3">
674<h3 class="anchored" data-anchor-id="ordered-iteration">Ordered Iteration</h3>
675<p>Iterate through samples in their original order:</p>
676<div id="d3f45d97" class="cell">
677<div class="sourceCode cell-code" id="cb4"><pre class="sourceCode python code-with-copy"><code class="sourceCode python"><span id="cb4-1"><a href="#cb4-1" aria-hidden="true" tabindex="-1"></a><span class="co"># With batching (default batch_size=1)</span></span>
678<span id="cb4-2"><a href="#cb4-2" aria-hidden="true" tabindex="-1"></a><span class="cf">for</span> batch <span class="kw">in</span> dataset.ordered(batch_size<span class="op">=</span><span class="dv">32</span>):</span>
679<span id="cb4-3"><a href="#cb4-3" aria-hidden="true" tabindex="-1"></a> images <span class="op">=</span> batch.image <span class="co"># numpy array (32, H, W, C)</span></span>
680<span id="cb4-4"><a href="#cb4-4" aria-hidden="true" tabindex="-1"></a> labels <span class="op">=</span> batch.label <span class="co"># list of 32 strings</span></span>
681<span id="cb4-5"><a href="#cb4-5" aria-hidden="true" tabindex="-1"></a></span>
682<span id="cb4-6"><a href="#cb4-6" aria-hidden="true" tabindex="-1"></a><span class="co"># Without batching (raw samples)</span></span>
683<span id="cb4-7"><a href="#cb4-7" aria-hidden="true" tabindex="-1"></a><span class="cf">for</span> sample <span class="kw">in</span> dataset.ordered(batch_size<span class="op">=</span><span class="va">None</span>):</span>
684<span id="cb4-8"><a href="#cb4-8" aria-hidden="true" tabindex="-1"></a> <span class="bu">print</span>(sample.label)</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
685</div>
686</section>
687<section id="shuffled-iteration" class="level3">
688<h3 class="anchored" data-anchor-id="shuffled-iteration">Shuffled Iteration</h3>
689<p>Iterate with randomized order at both shard and sample levels:</p>
690<div id="fb5204d1" class="cell">
691<div class="sourceCode cell-code" id="cb5"><pre class="sourceCode python code-with-copy"><code class="sourceCode python"><span id="cb5-1"><a href="#cb5-1" aria-hidden="true" tabindex="-1"></a><span class="cf">for</span> batch <span class="kw">in</span> dataset.shuffled(batch_size<span class="op">=</span><span class="dv">32</span>):</span>
692<span id="cb5-2"><a href="#cb5-2" aria-hidden="true" tabindex="-1"></a> <span class="co"># Samples are shuffled</span></span>
693<span id="cb5-3"><a href="#cb5-3" aria-hidden="true" tabindex="-1"></a> process(batch)</span>
694<span id="cb5-4"><a href="#cb5-4" aria-hidden="true" tabindex="-1"></a></span>
695<span id="cb5-5"><a href="#cb5-5" aria-hidden="true" tabindex="-1"></a><span class="co"># Control shuffle buffer sizes</span></span>
696<span id="cb5-6"><a href="#cb5-6" aria-hidden="true" tabindex="-1"></a><span class="cf">for</span> batch <span class="kw">in</span> dataset.shuffled(</span>
697<span id="cb5-7"><a href="#cb5-7" aria-hidden="true" tabindex="-1"></a> buffer_shards<span class="op">=</span><span class="dv">100</span>, <span class="co"># Shards to buffer (default: 100)</span></span>
698<span id="cb5-8"><a href="#cb5-8" aria-hidden="true" tabindex="-1"></a> buffer_samples<span class="op">=</span><span class="dv">10000</span>, <span class="co"># Samples to buffer (default: 10,000)</span></span>
699<span id="cb5-9"><a href="#cb5-9" aria-hidden="true" tabindex="-1"></a> batch_size<span class="op">=</span><span class="dv">32</span>,</span>
700<span id="cb5-10"><a href="#cb5-10" aria-hidden="true" tabindex="-1"></a>):</span>
701<span id="cb5-11"><a href="#cb5-11" aria-hidden="true" tabindex="-1"></a> process(batch)</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
702</div>
703<div class="callout callout-style-default callout-tip callout-titled">
704<div class="callout-header d-flex align-content-center">
705<div class="callout-icon-container">
706<i class="callout-icon"></i>
707</div>
708<div class="callout-title-container flex-fill">
709Tip
710</div>
711</div>
712<div class="callout-body-container callout-body">
713<p>Larger buffer sizes increase randomness but use more memory. For training, <code>buffer_samples=10000</code> is usually a good balance.</p>
714</div>
715</div>
716</section>
717</section>
718<section id="samplebatch" class="level2">
719<h2 class="anchored" data-anchor-id="samplebatch">SampleBatch</h2>
720<p>When iterating with a <code>batch_size</code>, each iteration yields a <code>SampleBatch</code> with automatic attribute aggregation.</p>
721<div id="c1093b41" class="cell">
722<div class="sourceCode cell-code" id="cb6"><pre class="sourceCode python code-with-copy"><code class="sourceCode python"><span id="cb6-1"><a href="#cb6-1" aria-hidden="true" tabindex="-1"></a><span class="at">@atdata.packable</span></span>
723<span id="cb6-2"><a href="#cb6-2" aria-hidden="true" tabindex="-1"></a><span class="kw">class</span> Sample:</span>
724<span id="cb6-3"><a href="#cb6-3" aria-hidden="true" tabindex="-1"></a> features: NDArray <span class="co"># shape (256,)</span></span>
725<span id="cb6-4"><a href="#cb6-4" aria-hidden="true" tabindex="-1"></a> label: <span class="bu">str</span></span>
726<span id="cb6-5"><a href="#cb6-5" aria-hidden="true" tabindex="-1"></a> score: <span class="bu">float</span></span>
727<span id="cb6-6"><a href="#cb6-6" aria-hidden="true" tabindex="-1"></a></span>
728<span id="cb6-7"><a href="#cb6-7" aria-hidden="true" tabindex="-1"></a><span class="cf">for</span> batch <span class="kw">in</span> dataset.ordered(batch_size<span class="op">=</span><span class="dv">16</span>):</span>
729<span id="cb6-8"><a href="#cb6-8" aria-hidden="true" tabindex="-1"></a> <span class="co"># NDArray fields are stacked with a batch dimension</span></span>
730<span id="cb6-9"><a href="#cb6-9" aria-hidden="true" tabindex="-1"></a> features <span class="op">=</span> batch.features <span class="co"># numpy array (16, 256)</span></span>
731<span id="cb6-10"><a href="#cb6-10" aria-hidden="true" tabindex="-1"></a></span>
732<span id="cb6-11"><a href="#cb6-11" aria-hidden="true" tabindex="-1"></a> <span class="co"># Other fields become lists</span></span>
733<span id="cb6-12"><a href="#cb6-12" aria-hidden="true" tabindex="-1"></a> labels <span class="op">=</span> batch.label <span class="co"># list of 16 strings</span></span>
734<span id="cb6-13"><a href="#cb6-13" aria-hidden="true" tabindex="-1"></a> scores <span class="op">=</span> batch.score <span class="co"># list of 16 floats</span></span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
735</div>
736<p>Results are cached, so accessing the same attribute multiple times is efficient.</p>
737</section>
738<section id="type-transformations-with-lenses" class="level2">
739<h2 class="anchored" data-anchor-id="type-transformations-with-lenses">Type Transformations with Lenses</h2>
740<p>View a dataset through a different sample type using registered lenses:</p>
741<div id="044cffe6" class="cell">
742<div class="sourceCode cell-code" id="cb7"><pre class="sourceCode python code-with-copy"><code class="sourceCode python"><span id="cb7-1"><a href="#cb7-1" aria-hidden="true" tabindex="-1"></a><span class="at">@atdata.packable</span></span>
743<span id="cb7-2"><a href="#cb7-2" aria-hidden="true" tabindex="-1"></a><span class="kw">class</span> SimplifiedSample:</span>
744<span id="cb7-3"><a href="#cb7-3" aria-hidden="true" tabindex="-1"></a> label: <span class="bu">str</span></span>
745<span id="cb7-4"><a href="#cb7-4" aria-hidden="true" tabindex="-1"></a></span>
746<span id="cb7-5"><a href="#cb7-5" aria-hidden="true" tabindex="-1"></a><span class="at">@atdata.lens</span></span>
747<span id="cb7-6"><a href="#cb7-6" aria-hidden="true" tabindex="-1"></a><span class="kw">def</span> simplify(src: ImageSample) <span class="op">-></span> SimplifiedSample:</span>
748<span id="cb7-7"><a href="#cb7-7" aria-hidden="true" tabindex="-1"></a> <span class="cf">return</span> SimplifiedSample(label<span class="op">=</span>src.label)</span>
749<span id="cb7-8"><a href="#cb7-8" aria-hidden="true" tabindex="-1"></a></span>
750<span id="cb7-9"><a href="#cb7-9" aria-hidden="true" tabindex="-1"></a><span class="co"># Transform dataset to different type</span></span>
751<span id="cb7-10"><a href="#cb7-10" aria-hidden="true" tabindex="-1"></a>simple_ds <span class="op">=</span> dataset.as_type(SimplifiedSample)</span>
752<span id="cb7-11"><a href="#cb7-11" aria-hidden="true" tabindex="-1"></a></span>
753<span id="cb7-12"><a href="#cb7-12" aria-hidden="true" tabindex="-1"></a><span class="cf">for</span> batch <span class="kw">in</span> simple_ds.ordered(batch_size<span class="op">=</span><span class="dv">16</span>):</span>
754<span id="cb7-13"><a href="#cb7-13" aria-hidden="true" tabindex="-1"></a> <span class="bu">print</span>(batch.label) <span class="co"># Only label field available</span></span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
755</div>
756<p>See <a href="../reference/lenses.html">Lenses</a> for details on defining transformations.</p>
757</section>
758<section id="dataset-properties" class="level2">
759<h2 class="anchored" data-anchor-id="dataset-properties">Dataset Properties</h2>
760<section id="shard-list" class="level3">
761<h3 class="anchored" data-anchor-id="shard-list">Shard List</h3>
762<p>Get the list of individual tar files:</p>
763<div id="ce9df6da" class="cell">
764<div class="sourceCode cell-code" id="cb8"><pre class="sourceCode python code-with-copy"><code class="sourceCode python"><span id="cb8-1"><a href="#cb8-1" aria-hidden="true" tabindex="-1"></a>dataset <span class="op">=</span> atdata.Dataset[Sample](<span class="st">"data-{000000..000009}.tar"</span>)</span>
765<span id="cb8-2"><a href="#cb8-2" aria-hidden="true" tabindex="-1"></a>shards <span class="op">=</span> dataset.shard_list</span>
766<span id="cb8-3"><a href="#cb8-3" aria-hidden="true" tabindex="-1"></a><span class="co"># ['data-000000.tar', 'data-000001.tar', ..., 'data-000009.tar']</span></span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
767</div>
768</section>
769<section id="metadata" class="level3">
770<h3 class="anchored" data-anchor-id="metadata">Metadata</h3>
771<p>Datasets can have associated metadata from a URL:</p>
772<div id="129c7a63" class="cell">
773<div class="sourceCode cell-code" id="cb9"><pre class="sourceCode python code-with-copy"><code class="sourceCode python"><span id="cb9-1"><a href="#cb9-1" aria-hidden="true" tabindex="-1"></a>dataset <span class="op">=</span> atdata.Dataset[Sample](</span>
774<span id="cb9-2"><a href="#cb9-2" aria-hidden="true" tabindex="-1"></a> <span class="st">"data-{000000..000009}.tar"</span>,</span>
775<span id="cb9-3"><a href="#cb9-3" aria-hidden="true" tabindex="-1"></a> metadata_url<span class="op">=</span><span class="st">"https://example.com/metadata.msgpack"</span></span>
776<span id="cb9-4"><a href="#cb9-4" aria-hidden="true" tabindex="-1"></a>)</span>
777<span id="cb9-5"><a href="#cb9-5" aria-hidden="true" tabindex="-1"></a></span>
778<span id="cb9-6"><a href="#cb9-6" aria-hidden="true" tabindex="-1"></a><span class="co"># Fetched and cached on first access</span></span>
779<span id="cb9-7"><a href="#cb9-7" aria-hidden="true" tabindex="-1"></a>metadata <span class="op">=</span> dataset.metadata <span class="co"># dict or None</span></span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
780</div>
781</section>
782</section>
783<section id="writing-datasets" class="level2">
784<h2 class="anchored" data-anchor-id="writing-datasets">Writing Datasets</h2>
785<p>Use WebDataset’s <code>TarWriter</code> or <code>ShardWriter</code> to create datasets:</p>
786<div id="f29fbb2c" class="cell">
787<div class="sourceCode cell-code" id="cb10"><pre class="sourceCode python code-with-copy"><code class="sourceCode python"><span id="cb10-1"><a href="#cb10-1" aria-hidden="true" tabindex="-1"></a><span class="im">import</span> webdataset <span class="im">as</span> wds</span>
788<span id="cb10-2"><a href="#cb10-2" aria-hidden="true" tabindex="-1"></a><span class="im">import</span> numpy <span class="im">as</span> np</span>
789<span id="cb10-3"><a href="#cb10-3" aria-hidden="true" tabindex="-1"></a></span>
790<span id="cb10-4"><a href="#cb10-4" aria-hidden="true" tabindex="-1"></a>samples <span class="op">=</span> [</span>
791<span id="cb10-5"><a href="#cb10-5" aria-hidden="true" tabindex="-1"></a> ImageSample(image<span class="op">=</span>np.random.rand(<span class="dv">224</span>, <span class="dv">224</span>, <span class="dv">3</span>).astype(np.float32), label<span class="op">=</span><span class="st">"cat"</span>)</span>
792<span id="cb10-6"><a href="#cb10-6" aria-hidden="true" tabindex="-1"></a> <span class="cf">for</span> _ <span class="kw">in</span> <span class="bu">range</span>(<span class="dv">100</span>)</span>
793<span id="cb10-7"><a href="#cb10-7" aria-hidden="true" tabindex="-1"></a>]</span>
794<span id="cb10-8"><a href="#cb10-8" aria-hidden="true" tabindex="-1"></a></span>
795<span id="cb10-9"><a href="#cb10-9" aria-hidden="true" tabindex="-1"></a><span class="co"># Single tar file</span></span>
796<span id="cb10-10"><a href="#cb10-10" aria-hidden="true" tabindex="-1"></a><span class="cf">with</span> wds.writer.TarWriter(<span class="st">"data-000000.tar"</span>) <span class="im">as</span> sink:</span>
797<span id="cb10-11"><a href="#cb10-11" aria-hidden="true" tabindex="-1"></a> <span class="cf">for</span> i, sample <span class="kw">in</span> <span class="bu">enumerate</span>(samples):</span>
798<span id="cb10-12"><a href="#cb10-12" aria-hidden="true" tabindex="-1"></a> sink.write({<span class="op">**</span>sample.as_wds, <span class="st">"__key__"</span>: <span class="ss">f"sample_</span><span class="sc">{</span>i<span class="sc">:06d}</span><span class="ss">"</span>})</span>
799<span id="cb10-13"><a href="#cb10-13" aria-hidden="true" tabindex="-1"></a></span>
800<span id="cb10-14"><a href="#cb10-14" aria-hidden="true" tabindex="-1"></a><span class="co"># Multiple shards with automatic splitting</span></span>
801<span id="cb10-15"><a href="#cb10-15" aria-hidden="true" tabindex="-1"></a><span class="cf">with</span> wds.writer.ShardWriter(<span class="st">"data-</span><span class="sc">%06d</span><span class="st">.tar"</span>, maxcount<span class="op">=</span><span class="dv">1000</span>) <span class="im">as</span> sink:</span>
802<span id="cb10-16"><a href="#cb10-16" aria-hidden="true" tabindex="-1"></a> <span class="cf">for</span> i, sample <span class="kw">in</span> <span class="bu">enumerate</span>(samples):</span>
803<span id="cb10-17"><a href="#cb10-17" aria-hidden="true" tabindex="-1"></a> sink.write({<span class="op">**</span>sample.as_wds, <span class="st">"__key__"</span>: <span class="ss">f"sample_</span><span class="sc">{</span>i<span class="sc">:06d}</span><span class="ss">"</span>})</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
804</div>
805</section>
806<section id="parquet-export" class="level2">
807<h2 class="anchored" data-anchor-id="parquet-export">Parquet Export</h2>
808<p>Export dataset contents to Parquet format:</p>
809<div id="53060440" class="cell">
810<div class="sourceCode cell-code" id="cb11"><pre class="sourceCode python code-with-copy"><code class="sourceCode python"><span id="cb11-1"><a href="#cb11-1" aria-hidden="true" tabindex="-1"></a><span class="co"># Export entire dataset</span></span>
811<span id="cb11-2"><a href="#cb11-2" aria-hidden="true" tabindex="-1"></a>dataset.to_parquet(<span class="st">"output.parquet"</span>)</span>
812<span id="cb11-3"><a href="#cb11-3" aria-hidden="true" tabindex="-1"></a></span>
813<span id="cb11-4"><a href="#cb11-4" aria-hidden="true" tabindex="-1"></a><span class="co"># Export with custom field mapping</span></span>
814<span id="cb11-5"><a href="#cb11-5" aria-hidden="true" tabindex="-1"></a><span class="kw">def</span> extract_fields(sample):</span>
815<span id="cb11-6"><a href="#cb11-6" aria-hidden="true" tabindex="-1"></a> <span class="cf">return</span> {<span class="st">"label"</span>: sample.label, <span class="st">"score"</span>: sample.confidence}</span>
816<span id="cb11-7"><a href="#cb11-7" aria-hidden="true" tabindex="-1"></a></span>
817<span id="cb11-8"><a href="#cb11-8" aria-hidden="true" tabindex="-1"></a>dataset.to_parquet(<span class="st">"output.parquet"</span>, sample_map<span class="op">=</span>extract_fields)</span>
818<span id="cb11-9"><a href="#cb11-9" aria-hidden="true" tabindex="-1"></a></span>
819<span id="cb11-10"><a href="#cb11-10" aria-hidden="true" tabindex="-1"></a><span class="co"># Export in segments</span></span>
820<span id="cb11-11"><a href="#cb11-11" aria-hidden="true" tabindex="-1"></a>dataset.to_parquet(<span class="st">"output.parquet"</span>, maxcount<span class="op">=</span><span class="dv">10000</span>)</span>
821<span id="cb11-12"><a href="#cb11-12" aria-hidden="true" tabindex="-1"></a><span class="co"># Creates output-000000.parquet, output-000001.parquet, etc.</span></span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
822</div>
823</section>
824<section id="url-formats" class="level2">
825<h2 class="anchored" data-anchor-id="url-formats">URL Formats</h2>
826<p>When using string URLs (via <code>URLSource</code>), WebDataset supports various formats:</p>
827<table class="caption-top table">
828<colgroup>
829<col style="width: 47%">
830<col style="width: 52%">
831</colgroup>
832<thead>
833<tr class="header">
834<th>Format</th>
835<th>Example</th>
836</tr>
837</thead>
838<tbody>
839<tr class="odd">
840<td>Local files</td>
841<td><code>./data/file.tar</code>, <code>/absolute/path/file-{000000..000009}.tar</code></td>
842</tr>
843<tr class="even">
844<td>HTTP/HTTPS</td>
845<td><code>https://example.com/data-{000000..000009}.tar</code></td>
846</tr>
847<tr class="odd">
848<td>Google Cloud</td>
849<td><code>gs://bucket/path/file.tar</code></td>
850</tr>
851</tbody>
852</table>
853<p>For S3 with authentication, use <code>S3Source</code> instead of <code>s3://</code> URLs.</p>
854</section>
855<section id="dataset-properties-1" class="level2">
856<h2 class="anchored" data-anchor-id="dataset-properties-1">Dataset Properties</h2>
857<section id="source" class="level3">
858<h3 class="anchored" data-anchor-id="source">Source</h3>
859<p>Access the underlying <code>DataSource</code>:</p>
860<div id="e315e899" class="cell">
861<div class="sourceCode cell-code" id="cb12"><pre class="sourceCode python code-with-copy"><code class="sourceCode python"><span id="cb12-1"><a href="#cb12-1" aria-hidden="true" tabindex="-1"></a>dataset <span class="op">=</span> atdata.Dataset[Sample](<span class="st">"data.tar"</span>)</span>
862<span id="cb12-2"><a href="#cb12-2" aria-hidden="true" tabindex="-1"></a>source <span class="op">=</span> dataset.source <span class="co"># URLSource instance</span></span>
863<span id="cb12-3"><a href="#cb12-3" aria-hidden="true" tabindex="-1"></a><span class="bu">print</span>(source.shard_list) <span class="co"># ['data.tar']</span></span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
864</div>
865</section>
866<section id="sample-type" class="level3">
867<h3 class="anchored" data-anchor-id="sample-type">Sample Type</h3>
868<p>Get the type parameter used to create the dataset:</p>
869<div id="c77919f7" class="cell">
870<div class="sourceCode cell-code" id="cb13"><pre class="sourceCode python code-with-copy"><code class="sourceCode python"><span id="cb13-1"><a href="#cb13-1" aria-hidden="true" tabindex="-1"></a>dataset <span class="op">=</span> atdata.Dataset[ImageSample](<span class="st">"data.tar"</span>)</span>
871<span id="cb13-2"><a href="#cb13-2" aria-hidden="true" tabindex="-1"></a><span class="bu">print</span>(dataset.sample_type) <span class="co"># <class 'ImageSample'></span></span>
872<span id="cb13-3"><a href="#cb13-3" aria-hidden="true" tabindex="-1"></a><span class="bu">print</span>(dataset.batch_type) <span class="co"># SampleBatch[ImageSample]</span></span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
873</div>
874</section>
875</section>
876<section id="related" class="level2">
877<h2 class="anchored" data-anchor-id="related">Related</h2>
878<ul>
879<li><a href="../reference/packable-samples.html">Packable Samples</a> - Defining typed samples</li>
880<li><a href="../reference/lenses.html">Lenses</a> - Type transformations</li>
881<li><a href="../reference/load-dataset.html">load_dataset</a> - HuggingFace-style loading API</li>
882<li><a href="../reference/protocols.html">Protocols</a> - DataSource protocol details</li>
883</ul>
884
885
886</section>
887
888</main> <!-- /main -->
889<script id="quarto-html-after-body" type="application/javascript">
890 window.document.addEventListener("DOMContentLoaded", function (event) {
// Guarantee a color-scheme toggle exists: when the page did not render one,
// create a fallback link floated in the top-right corner.
if (window.document.querySelector('.quarto-color-scheme-toggle') === null) {
  const toggleLink = window.document.createElement('a');
  toggleLink.classList.add('top-right');
  toggleLink.classList.add('quarto-color-scheme-toggle');
  toggleLink.href = "";
  // Returning false cancels the navigation; an error thrown by the toggle
  // function is deliberately ignored.
  toggleLink.onclick = function() { try { window.quartoToggleColorScheme(); } catch {} return false; };
  const toggleIcon = window.document.createElement("i");
  toggleIcon.classList.add('bi');
  toggleLink.appendChild(toggleIcon);
  window.document.body.appendChild(toggleLink);
}
setColorSchemeToggle(hasAlternateSentinel())

// Heading anchor links. The icon is AnchorJS's link glyph (U+E9CB), spelled
// as an escape because the literal private-use character is invisible in most
// editors and is dropped by lossy re-encoding, which leaves an empty icon.
const icon = "\uE9CB";
const anchorJS = new window.AnchorJS();
anchorJS.options = {
  placement: 'right',
  icon: icon
};
anchorJS.add('.anchored');
// True when the element has at least one class starting with
// "code-annotation-".
const isCodeAnnotation = (el) =>
  Array.from(el.classList).some((className) => className.startsWith('code-annotation-'));
// ClipboardJS "success" handler: flash the checked state and a "Copied!"
// title (plus a Bootstrap tooltip when Bootstrap is loaded), then restore the
// button one second later.
const onCopySuccess = function(e) {
  const copyButton = e.trigger;
  // Drop focus so the button does not stay highlighted.
  copyButton.blur();
  copyButton.classList.add('code-copy-button-checked');
  const originalTitle = copyButton.getAttribute("title");
  copyButton.setAttribute("title", "Copied!");

  // The tooltip is optional: it is only created when window.bootstrap exists.
  let tooltip;
  if (window.bootstrap) {
    copyButton.setAttribute("data-bs-toggle", "tooltip");
    copyButton.setAttribute("data-bs-placement", "left");
    copyButton.setAttribute("data-bs-title", "Copied!");
    const tooltipOptions = {
      trigger: "manual",
      customClass: "code-copy-button-tooltip",
      offset: [0, -8]
    };
    tooltip = new bootstrap.Tooltip(copyButton, tooltipOptions);
    tooltip.show();
  }

  // Undo every temporary change made above.
  const restoreButton = () => {
    if (tooltip) {
      tooltip.hide();
      copyButton.removeAttribute("data-bs-title");
      copyButton.removeAttribute("data-bs-toggle");
      copyButton.removeAttribute("data-bs-placement");
    }
    copyButton.setAttribute("title", originalTitle);
    copyButton.classList.remove('code-copy-button-checked');
  };
  setTimeout(restoreButton, 1000);

  // Clear the text selection made for the copy.
  e.clearSelection();
}
// Return the text to place on the clipboard for a copy button: the text of
// the element immediately before the button, with code-annotation children
// removed. Works on a deep clone so the page itself is not modified.
const getTextToCopy = function(trigger) {
  const codeEl = trigger.previousElementSibling.cloneNode(true);
  // `children` is a live HTMLCollection: removing an element while iterating
  // it directly shifts the remaining items and skips the next sibling, so
  // iterate over a static snapshot instead.
  for (const childEl of Array.from(codeEl.children)) {
    if (isCodeAnnotation(childEl)) {
      childEl.remove();
    }
  }
  return codeEl.innerText;
}
// Attach ClipboardJS to every copy button outside the embedded-source modal.
const clipboard = new window.ClipboardJS('.code-copy-button:not([data-in-quarto-modal])', {
  text: getTextToCopy
});
clipboard.on('success', onCopySuccess);

// Copy buttons inside the embedded-source modal get their own instance, with
// the modal element passed as the ClipboardJS container.
const embeddedSourceModalEl = window.document.getElementById('quarto-embedded-source-code-modal');
if (embeddedSourceModalEl) {
  const clipboardModal = new window.ClipboardJS('.code-copy-button[data-in-quarto-modal]', {
    text: getTextToCopy,
    container: embeddedSourceModalEl
  });
  clipboardModal.on('success', onCopySuccess);
}
// Patterns for links treated as internal: localhost URLs, mailto: links and
// the project repository.
var localhostRegex = /^(?:http|https):\/\/localhost\:?[0-9]*\//;
var mailtoRegex = /^mailto:/;
// Written as a regex literal: inside a string literal "\." collapses to ".",
// which turned the dot in "github.com" into a match-any-character wildcard.
var filterRegex = /https:\/\/github\.com\/your-org\/atdata/;

// True when href matches any of the internal patterns above.
var isInternal = (href) => {
  return filterRegex.test(href) || localhostRegex.test(href) || mailtoRegex.test(href);
}

// Inspect non-navigation links and restore the original href of external ones.
var links = window.document.querySelectorAll('a[href]:not(.nav-link):not(.navbar-brand):not(.toc-action):not(.sidebar-link):not(.sidebar-item-toggle):not(.pagination-link):not(.no-external):not([aria-hidden]):not(.dropdown-item):not(.quarto-navigation-tool):not(.about-link)');
for (const link of links) {
  if (isInternal(link.href)) {
    continue;
  }
  // undo the damage that might have been done by quarto-nav.js in the case of
  // links that we want to consider external
  if (link.dataset.originalHref !== undefined) {
    link.href = link.dataset.originalHref;
  }
}
// Attach a tippy popup to `el` using the shared "quarto" configuration.
//   contentFn     - optional value for tippy's `content` option
//   onTriggerFn   - optional value for tippy's `onTrigger` option
//   onUntriggerFn - optional value for tippy's `onUntrigger` option
// Falsy arguments are left out of the configuration entirely.
function tippyHover(el, contentFn, onTriggerFn, onUntriggerFn) {
  const config = {
    allowHTML: true,
    maxWidth: 500,
    delay: 100,
    arrow: false,
    // Mount the popup inside the parent of the element passed to appendTo.
    appendTo: (reference) => reference.parentElement,
    interactive: true,
    interactiveBorder: 10,
    theme: 'quarto',
    placement: 'bottom-start',
    ...(contentFn ? { content: contentFn } : {}),
    ...(onTriggerFn ? { onTrigger: onTriggerFn } : {}),
    ...(onUntriggerFn ? { onUntrigger: onUntriggerFn } : {}),
  };
  window.tippy(el, config);
}
// Footnote references: the hover popup shows the HTML of the referenced note.
const noterefs = window.document.querySelectorAll('a[role="doc-noteref"]');
for (const ref of noterefs) {
  tippyHover(ref, () => {
    // use id or data attribute instead here
    let target = ref.getAttribute('data-footnote-href') || ref.getAttribute('href');
    // Reduce an absolute URL to its hash; anything URL() cannot parse is
    // used unchanged.
    try { target = new URL(target).hash; } catch {}
    const noteId = target.replace(/^#\/?/, "");
    const note = window.document.getElementById(noteId);
    return note ? note.innerHTML : "";
  });
}
const xrefs = window.document.querySelectorAll('a.quarto-xref');
// Build the popup HTML for a cross-reference target.
//   id   - target element id, or null for a whole-page reference
//   note - the target element; it is modified in place
// Returns an HTML string.
const processXRef = (id, note) => {
  // Run Quarto's math typesetting on `el` when that hook is available.
  const typeset = (el) => {
    if (window.Quarto?.typesetMath) {
      window.Quarto.typesetMath(el);
    }
  };

  // Strip column container classes from the element and all descendants.
  const dropColumnClasses = (el) => {
    el.classList.remove("page-full", "page-columns");
    for (const child of el.children || []) {
      dropColumnClasses(child);
    }
  };
  dropColumnClasses(note);

  const isSection = id === null || id.startsWith('sec-');
  if (!isSection) {
    // Remove any anchor link if one is present.
    const anchorLink = note.querySelector('a.anchorjs-link');
    if (anchorLink) {
      anchorLink.remove();
    }
    typeset(note);
    // Callouts keep their wrapper element; everything else contributes only
    // its contents.
    return note.classList.contains("callout") ? note.outerHTML : note.innerHTML;
  }

  // Sections with two or fewer children are shown whole.
  if (!(note.children && note.children.length > 2)) {
    typeset(note);
    return note.innerHTML;
  }

  // Longer sections are previewed: the first child plus the first following
  // child that is not an empty paragraph.
  const preview = document.createElement("div");
  const [firstChild, ...laterChildren] = Array.from(note.children);
  preview.appendChild(firstChild.cloneNode(true));
  const firstContent = laterChildren.find(
    (child) => !(child.tagName === "P" && child.innerText === "")
  );
  if (firstContent) {
    preview.appendChild(firstContent.cloneNode(true));
  }
  typeset(preview);
  return preview.innerHTML;
};
// Cross-reference links: when the popup is triggered, resolve the target
// (in this page, or by fetching the linked page) and use it as the content.
for (const xref of xrefs) {
  const onTrigger = (instance) => {
    // Keep the popup disabled until its content has been resolved.
    instance.disable();
    const reveal = () => {
      instance.enable();
      instance.show();
    };
    // Fetch a page, parse it as HTML and hand the document to `render`;
    // the popup is revealed whether or not that succeeds.
    const loadRemote = (target, render) =>
      fetch(target)
        .then((res) => res.text())
        .then((html) => {
          const parser = new DOMParser();
          render(parser.parseFromString(html, "text/html"));
        })
        .finally(reveal);

    const url = xref.getAttribute('href');
    let hash = undefined;
    if (url.startsWith('#')) {
      hash = url;
    } else {
      // URLs that URL() cannot parse leave the hash undefined.
      try { hash = new URL(url).hash; } catch {}
    }

    if (!hash) {
      // See if we can fetch a full url (with no hash to target)
      // This is a special case and we should probably do some content thinning / targeting
      loadRemote(url, (htmlDoc) => {
        const main = htmlDoc.querySelector('main.content');
        if (main === null) {
          return;
        }
        // This should only happen for chapter cross references
        // (since there is no id in the URL)
        // remove the first header
        if (main.children.length > 0 && main.children[0].tagName === "HEADER") {
          main.children[0].remove();
        }
        instance.setContent(processXRef(null, main));
      });
      return;
    }

    const id = hash.replace(/^#\/?/, "");
    const localNote = window.document.getElementById(id);
    if (localNote !== null) {
      // Target is in this page: process a clone so the page is left untouched.
      try {
        instance.setContent(processXRef(id, localNote.cloneNode(true)));
      } finally {
        reveal();
      }
      return;
    }

    // Target is not in this page: see if we can fetch it from the linked page.
    loadRemote(url.split('#')[0], (htmlDoc) => {
      const remoteNote = htmlDoc.getElementById(id);
      if (remoteNote !== null) {
        instance.setContent(processXRef(id, remoteNote));
      }
    });
  };
  // The untrigger callback is intentionally empty.
  tippyHover(xref, undefined, onTrigger, function(instance) {
  });
}
let selectedAnnoteEl;
// Build the CSS selector for the span carrying the given code cell and
// annotation values in its data-code-cell / data-code-annotation attributes.
const selectorForAnnotation = (cell, annotation) =>
  `span[data-code-cell="${cell}"][data-code-annotation="${annotation}"]`;
/**
 * Highlights the code lines that belong to an annotation entry.
 *
 * Reads data-target-cell / data-target-annotation from annoteEl, looks up the
 * matching annotation span and stretches two absolutely positioned overlay
 * divs (one over the code, one inside the cell's annotation gutter) from the
 * first to the last line listed in the span's data-code-lines attribute.
 * On success annoteEl is remembered in selectedAnnoteEl.
 *
 * When the annotation span, its line list, or the referenced line elements
 * cannot be found, the function returns without touching the DOM or the
 * selection instead of throwing. A missing gutter only skips the gutter
 * overlay.
 *
 * @param annoteEl - element carrying the data-target-* attributes
 */
const selectCodeLines = (annoteEl) => {
  const targetCell = annoteEl.getAttribute("data-target-cell");
  const targetAnnotation = annoteEl.getAttribute("data-target-annotation");
  const annoteSpan = window.document.querySelector(selectorForAnnotation(targetCell, targetAnnotation));
  const codeLines = annoteSpan === null ? null : annoteSpan.getAttribute("data-code-lines");
  if (!codeLines) {
    // No marker span, or no line list on it: nothing to highlight.
    return;
  }

  // Line elements are looked up by ids of the form "<cell>-<line>".
  const lineIds = codeLines.split(",").map((line) => targetCell + "-" + line);
  const firstEl = window.document.getElementById(lineIds[0]);
  const lastEl = window.document.getElementById(lineIds[lineIds.length - 1]);
  if (firstEl === null || lastEl === null) {
    return;
  }

  // The code overlay is attached two levels above the first line element.
  const lineContainer = firstEl.parentElement;
  const parent = lineContainer === null ? null : lineContainer.parentElement;
  if (parent === null) {
    return;
  }

  // Span from the top of the first line to the bottom of the last one
  // (for a single line this is just that line's height).
  const top = firstEl.offsetTop;
  const height = lastEl.offsetTop + lastEl.offsetHeight - top;

  // Cook up the code overlay (if necessary) and position it.
  let div = window.document.getElementById("code-annotation-line-highlight");
  if (div === null) {
    div = window.document.createElement("div");
    div.setAttribute("id", "code-annotation-line-highlight");
    div.style.position = 'absolute';
    parent.appendChild(div);
  }
  div.style.top = top - 2 + "px";
  div.style.height = height + 4 + "px";
  div.style.left = 0;

  // Same for the gutter overlay, which lives inside the target cell's gutter.
  let gutterDiv = window.document.getElementById("code-annotation-line-highlight-gutter");
  if (gutterDiv === null) {
    const codeCell = window.document.getElementById(targetCell);
    const gutter = codeCell === null ? null : codeCell.querySelector('.code-annotation-gutter');
    if (gutter !== null) {
      gutterDiv = window.document.createElement("div");
      gutterDiv.setAttribute("id", "code-annotation-line-highlight-gutter");
      gutterDiv.style.position = 'absolute';
      gutter.appendChild(gutterDiv);
    }
  }
  if (gutterDiv !== null) {
    gutterDiv.style.top = top - 2 + "px";
    gutterDiv.style.height = height + 4 + "px";
  }

  selectedAnnoteEl = annoteEl;
};
/**
 * Removes both highlight overlays (code and gutter) from the document, if
 * present, and clears the remembered selection.
 */
const unselectCodeLines = () => {
  for (const overlayId of ["code-annotation-line-highlight", "code-annotation-line-highlight-gutter"]) {
    const overlay = window.document.getElementById(overlayId);
    if (overlay) {
      overlay.remove();
    }
  }
  selectedAnnoteEl = undefined;
};
// On window resize (throttled), reset elRect and re-run the highlight for the
// currently selected annotation so the overlays follow the new layout.
window.addEventListener("resize", throttle(() => {
  elRect = undefined;
  if (!selectedAnnoteEl) {
    return;
  }
  selectCodeLines(selectedAnnoteEl);
}, 10));
/**
 * Wraps fn so that the first call runs immediately and every following call
 * is deferred: each one cancels the previously scheduled run and schedules a
 * new one ms milliseconds later with its own arguments. Once a deferred run
 * has fired, the next call runs immediately again.
 *
 * @param fn - function to invoke
 * @param ms - delay in milliseconds for deferred calls
 * @returns the wrapping function (it returns nothing itself)
 */
function throttle(fn, ms) {
  let engaged = false;
  let pending;
  return (...args) => {
    if (engaged) {
      // Already fired once: replace any scheduled run with a fresh one.
      if (pending) clearTimeout(pending);
      pending = setTimeout(() => {
        fn.apply(this, args);
        engaged = false;
        pending = false;
      }, ms);
      return;
    }
    // First call gets through right away.
    fn.apply(this, args);
    engaged = true;
  };
}
// Attach a click handler to every annotation <dt>: clicking an entry
// highlights its code lines, clicking the selected entry again clears the
// highlight.
const annoteDls = window.document.querySelectorAll('dt[data-target-cell]');
for (const annoteDlNode of annoteDls) {
  annoteDlNode.addEventListener('click', () => {
    // Use the <dt> the listener is bound to rather than event.target: the
    // target can be a descendant of the <dt>, which carries no data-target-*
    // attributes and would never compare equal to selectedAnnoteEl.
    const clickedEl = annoteDlNode;
    if (clickedEl !== selectedAnnoteEl) {
      unselectCodeLines();
      const activeEl = window.document.querySelector('dt[data-target-cell].code-annotation-active');
      if (activeEl) {
        activeEl.classList.remove('code-annotation-active');
      }
      selectCodeLines(clickedEl);
      clickedEl.classList.add('code-annotation-active');
    } else {
      // Unselect the line
      unselectCodeLines();
      clickedEl.classList.remove('code-annotation-active');
    }
  });
}
/**
 * Walks up from el until it reaches a node whose parent has a non-empty
 * data-cites attribute.
 *
 * @param el - element to start from
 * @returns { el, cites } where el is the child of the element carrying
 *   data-cites and cites is that attribute split on single spaces, or
 *   undefined when no ancestor carries the attribute
 */
const findCites = (el) => {
  for (let node = el; node.parentElement; node = node.parentElement) {
    const citeList = node.parentElement.dataset.cites;
    if (citeList) {
      return { el: node, cites: citeList.split(' ') };
    }
  }
  return undefined;
};
// For every citation link, register a tippyHover content callback that
// assembles one entry per cited key from the matching #ref-<key> element.
for (const ref of window.document.querySelectorAll('a[role="doc-biblioref"]')) {
  const citeInfo = findCites(ref);
  if (!citeInfo) {
    continue;
  }
  tippyHover(citeInfo.el, () => {
    const popup = window.document.createElement('div');
    for (const cite of citeInfo.cites) {
      const citeDiv = window.document.createElement('div');
      citeDiv.classList.add('hanging-indent');
      citeDiv.classList.add('csl-entry');
      const biblioDiv = window.document.getElementById('ref-' + cite);
      if (biblioDiv) {
        // Copy the rendered bibliography entry; keys with no entry stay empty.
        citeDiv.innerHTML = biblioDiv.innerHTML;
      }
      popup.appendChild(citeDiv);
    }
    return popup.innerHTML;
  });
}
1299 });
1300 </script>
1301</div> <!-- /content -->
<!-- Page footer: build credit (left), edit/report links shown on small screens (center), license (right). -->
<footer class="footer">
  <div class="nav-footer">
    <div class="nav-footer-left">
      <p>Built with <a href="https://quarto.org/">Quarto</a></p>
    </div>
    <div class="nav-footer-center">
      <!-- Icons are decorative; aria-hidden keeps the icon-font glyphs out of the accessible name of each link. -->
      <div class="toc-actions d-sm-block d-md-none"><ul><li><a href="https://github.com/your-org/atdata/edit/main/reference/datasets.qmd" class="toc-action"><i class="bi bi-github" aria-hidden="true"></i>Edit this page</a></li><li><a href="https://github.com/your-org/atdata/issues/new" class="toc-action"><i class="bi empty" aria-hidden="true"></i>Report an issue</a></li></ul></div>
    </div>
    <div class="nav-footer-right">
      <p>MIT License</p>
    </div>
  </div>
</footer>
1315
1316
1317
1318
1319</body></html>