Ionosphere.tv
3
fork

Configure Feed

Select the types of activity you want to include in your feed.

feat: vendor layers.pub lexicons for transcript enrichment

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

+447
+6
.gitignore
··· 17 17 *.DS_Store 18 18 .sprite 19 19 .claude/worktrees 20 + 21 + # Python NLP pipeline 22 + pipeline/.venv/ 23 + pipeline/data/ 24 + __pycache__/ 25 + *.pyc
+114
lexicons/pub/layers/annotation/annotationLayer.json
··· 1 + { 2 + "lexicon": 1, 3 + "$type": "com.atproto.lexicon.schema", 4 + "id": "pub.layers.annotation.annotationLayer", 5 + "revision": 1, 6 + "description": "An annotation layer attaching typed annotations to a pub.layers.expression.expression record. Covers token-tag, span, relation, tree, and other layer kinds. Each annotation has an anchor, a label, and optional confidence and features.", 7 + "defs": { 8 + "main": { 9 + "type": "record", 10 + "key": "any", 11 + "record": { 12 + "type": "object", 13 + "required": ["expression", "kind", "subkind", "sourceMethod", "annotations"], 14 + "properties": { 15 + "expression": { 16 + "type": "string", 17 + "format": "at-uri", 18 + "description": "AT URI of the pub.layers.expression.expression this layer annotates." 19 + }, 20 + "kindUri": { 21 + "type": "string", 22 + "format": "at-uri", 23 + "description": "AT URI of a record formally defining the annotation layer kind." 24 + }, 25 + "kind": { 26 + "type": "string", 27 + "description": "Layer kind: token-tag, span, relation, tree, etc." 28 + }, 29 + "subkindUri": { 30 + "type": "string", 31 + "format": "at-uri", 32 + "description": "AT URI of a record formally defining the annotation subkind (e.g. a specific tag set)." 33 + }, 34 + "subkind": { 35 + "type": "string", 36 + "description": "More specific categorisation within the layer kind (e.g. POS, NER, dependency)." 37 + }, 38 + "sourceMethodUri": { 39 + "type": "string", 40 + "format": "at-uri", 41 + "description": "AT URI of a record formally defining the source method." 42 + }, 43 + "sourceMethod": { 44 + "type": "string", 45 + "description": "How annotations were produced: manual-native, automatic, converted, etc." 46 + }, 47 + "annotations": { 48 + "type": "array", 49 + "items": { 50 + "type": "ref", 51 + "ref": "pub.layers.annotation.annotationLayer#annotation" 52 + }, 53 + "description": "The individual annotations that make up this layer." 
54 + }, 55 + "tokenizationId": { 56 + "type": "ref", 57 + "ref": "pub.layers.defs#uuid", 58 + "description": "UUID of the tokenization this layer is aligned to (for token-tag layers)." 59 + }, 60 + "rank": { 61 + "type": "integer", 62 + "description": "Ordering rank when multiple layers of the same kind exist on the same expression." 63 + }, 64 + "metadata": { 65 + "type": "ref", 66 + "ref": "pub.layers.defs#annotationMetadata", 67 + "description": "Provenance metadata for this layer." 68 + }, 69 + "createdAt": { 70 + "type": "string", 71 + "format": "datetime", 72 + "description": "ISO 8601 datetime when this record was created." 73 + } 74 + } 75 + } 76 + }, 77 + "annotation": { 78 + "type": "object", 79 + "description": "A single annotation within an annotation layer.", 80 + "required": ["label"], 81 + "properties": { 82 + "uuid": { 83 + "type": "ref", 84 + "ref": "pub.layers.defs#uuid", 85 + "description": "Stable UUID for this annotation." 86 + }, 87 + "anchor": { 88 + "type": "ref", 89 + "ref": "pub.layers.defs#anchor", 90 + "description": "Position this annotation targets (text span, token, or temporal span)." 91 + }, 92 + "label": { 93 + "type": "string", 94 + "description": "The annotation label (e.g. a POS tag, NER type, sentiment class)." 95 + }, 96 + "text": { 97 + "type": "string", 98 + "description": "The surface text covered by this annotation (optional, for readability)." 99 + }, 100 + "confidence": { 101 + "type": "integer", 102 + "description": "Confidence score in the range 0–1000 (0 = none, 1000 = certain).", 103 + "minimum": 0, 104 + "maximum": 1000 105 + }, 106 + "features": { 107 + "type": "ref", 108 + "ref": "pub.layers.defs#featureMap", 109 + "description": "Additional linguistic features for this annotation." 110 + } 111 + } 112 + } 113 + } 114 + }
+151
lexicons/pub/layers/defs.json
··· 1 + { 2 + "lexicon": 1, 3 + "$type": "com.atproto.lexicon.schema", 4 + "id": "pub.layers.defs", 5 + "revision": 1, 6 + "description": "Shared type definitions for the layers.pub v0.5.0 lexicon family. Covers spans, temporal spans, anchors, token references, annotation metadata, and feature maps.", 7 + "defs": { 8 + "span": { 9 + "type": "object", 10 + "description": "A byte-offset span into a text string, with optional character offsets.", 11 + "required": ["byteStart", "byteEnd"], 12 + "properties": { 13 + "byteStart": { 14 + "type": "integer", 15 + "description": "Inclusive start byte offset." 16 + }, 17 + "byteEnd": { 18 + "type": "integer", 19 + "description": "Exclusive end byte offset." 20 + }, 21 + "charStart": { 22 + "type": "integer", 23 + "description": "Inclusive start Unicode character offset (optional)." 24 + }, 25 + "charEnd": { 26 + "type": "integer", 27 + "description": "Exclusive end Unicode character offset (optional)." 28 + } 29 + } 30 + }, 31 + "temporalSpan": { 32 + "type": "object", 33 + "description": "A time range in milliseconds, e.g. within a media stream.", 34 + "required": ["start", "ending"], 35 + "properties": { 36 + "start": { 37 + "type": "integer", 38 + "description": "Start time in milliseconds." 39 + }, 40 + "ending": { 41 + "type": "integer", 42 + "description": "End time in milliseconds." 43 + } 44 + } 45 + }, 46 + "uuid": { 47 + "type": "object", 48 + "description": "A UUID value wrapper.", 49 + "required": ["value"], 50 + "properties": { 51 + "value": { 52 + "type": "string", 53 + "description": "UUID string (e.g. RFC 4122)." 54 + } 55 + } 56 + }, 57 + "tokenRef": { 58 + "type": "object", 59 + "description": "A reference to a specific token within a named tokenization.", 60 + "required": ["tokenizationId", "tokenIndex"], 61 + "properties": { 62 + "tokenizationId": { 63 + "type": "ref", 64 + "ref": "pub.layers.defs#uuid", 65 + "description": "UUID identifying the tokenization this token belongs to." 
66 + }, 67 + "tokenIndex": { 68 + "type": "integer", 69 + "description": "Zero-based index of the token within the tokenization." 70 + } 71 + } 72 + }, 73 + "anchor": { 74 + "type": "object", 75 + "description": "Polymorphic anchor pointing to a text span, a token, or a temporal position. At least one field should be set.", 76 + "properties": { 77 + "textSpan": { 78 + "type": "ref", 79 + "ref": "pub.layers.defs#span", 80 + "description": "Byte-offset span into the expression text." 81 + }, 82 + "tokenRef": { 83 + "type": "ref", 84 + "ref": "pub.layers.defs#tokenRef", 85 + "description": "Reference to a specific token in a tokenization." 86 + }, 87 + "temporalSpan": { 88 + "type": "ref", 89 + "ref": "pub.layers.defs#temporalSpan", 90 + "description": "Time range within a media stream." 91 + } 92 + } 93 + }, 94 + "annotationMetadata": { 95 + "type": "object", 96 + "description": "Provenance metadata for an annotation or layer.", 97 + "properties": { 98 + "agent": { 99 + "type": "string", 100 + "format": "at-uri", 101 + "description": "AT URI of the agent that produced this annotation (an at:// URI whose authority is the agent's DID or handle; a bare DID does not satisfy the at-uri format)." 102 + }, 103 + "tool": { 104 + "type": "string", 105 + "description": "Name or identifier of the tool or model used." 106 + }, 107 + "timestamp": { 108 + "type": "string", 109 + "format": "datetime", 110 + "description": "ISO 8601 datetime when the annotation was produced." 111 + }, 112 + "confidence": { 113 + "type": "integer", 114 + "description": "Confidence score in the range 0–1000 (0 = none, 1000 = certain).", 115 + "minimum": 0, 116 + "maximum": 1000 117 + } 118 + } 119 + }, 120 + "feature": { 121 + "type": "object", 122 + "description": "A key-value linguistic feature entry.", 123 + "required": ["key", "value"], 124 + "properties": { 125 + "key": { 126 + "type": "string", 127 + "description": "Feature name (e.g. POS, Gender, Number)." 128 + }, 129 + "value": { 130 + "type": "string", 131 + "description": "Feature value (e.g. NOUN, Masc, Sing)." 
132 + } 133 + } 134 + }, 135 + "featureMap": { 136 + "type": "object", 137 + "description": "A set of linguistic feature key-value pairs.", 138 + "required": ["entries"], 139 + "properties": { 140 + "entries": { 141 + "type": "array", 142 + "items": { 143 + "type": "ref", 144 + "ref": "pub.layers.defs#feature" 145 + }, 146 + "description": "List of feature entries." 147 + } 148 + } 149 + } 150 + } 151 + }
+75
lexicons/pub/layers/expression/expression.json
··· 1 + { 2 + "lexicon": 1, 3 + "$type": "com.atproto.lexicon.schema", 4 + "id": "pub.layers.expression.expression", 5 + "revision": 1, 6 + "description": "A linguistic expression record — a unit of text at any granularity (document, transcript, paragraph, sentence, word, morpheme). Expressions may be nested via parentRef and anchored into media via mediaRef and anchor.", 7 + "defs": { 8 + "main": { 9 + "type": "record", 10 + "key": "any", 11 + "record": { 12 + "type": "object", 13 + "required": ["id", "kind", "text", "language", "createdAt"], 14 + "properties": { 15 + "id": { 16 + "type": "string", 17 + "description": "Stable identifier for this expression (e.g. UUID)." 18 + }, 19 + "kindUri": { 20 + "type": "string", 21 + "format": "at-uri", 22 + "description": "AT URI of a record that formally defines the expression kind." 23 + }, 24 + "kind": { 25 + "type": "string", 26 + "description": "Expression granularity: document, transcript, paragraph, sentence, word, morpheme, etc." 27 + }, 28 + "text": { 29 + "type": "string", 30 + "description": "The full text content of this expression." 31 + }, 32 + "language": { 33 + "type": "string", 34 + "description": "BCP 47 language tag (e.g. en, fr, zh-Hant)." 35 + }, 36 + "mediaRef": { 37 + "type": "string", 38 + "format": "at-uri", 39 + "description": "AT URI of the media record this expression derives from (e.g. a video or audio record)." 40 + }, 41 + "sourceRef": { 42 + "type": "string", 43 + "format": "at-uri", 44 + "description": "AT URI of the source record from which this expression was produced (e.g. a transcript record)." 45 + }, 46 + "parentRef": { 47 + "type": "string", 48 + "format": "at-uri", 49 + "description": "AT URI of the parent expression, for nested structure (e.g. sentence within paragraph)." 50 + }, 51 + "anchor": { 52 + "type": "ref", 53 + "ref": "pub.layers.defs#anchor", 54 + "description": "Position of this expression within its parent text or media." 
55 + }, 56 + "metadata": { 57 + "type": "ref", 58 + "ref": "pub.layers.defs#annotationMetadata", 59 + "description": "Provenance metadata for this expression." 60 + }, 61 + "features": { 62 + "type": "ref", 63 + "ref": "pub.layers.defs#featureMap", 64 + "description": "Linguistic features associated with this expression." 65 + }, 66 + "createdAt": { 67 + "type": "string", 68 + "format": "datetime", 69 + "description": "ISO 8601 datetime when this record was created." 70 + } 71 + } 72 + } 73 + } 74 + } 75 + }
+101
lexicons/pub/layers/segmentation/segmentation.json
··· 1 + { 2 + "lexicon": 1, 3 + "$type": "com.atproto.lexicon.schema", 4 + "id": "pub.layers.segmentation.segmentation", 5 + "revision": 1, 6 + "description": "A segmentation record attaching one or more tokenizations to a pub.layers.expression.expression record. Each tokenization defines a scheme (whitespace, BPE, etc.) and a sequence of indexed tokens with text spans and optional temporal spans.", 7 + "defs": { 8 + "main": { 9 + "type": "record", 10 + "key": "any", 11 + "record": { 12 + "type": "object", 13 + "required": ["expression", "tokenizations", "createdAt"], 14 + "properties": { 15 + "expression": { 16 + "type": "string", 17 + "format": "at-uri", 18 + "description": "AT URI of the pub.layers.expression.expression this segmentation applies to." 19 + }, 20 + "tokenizations": { 21 + "type": "array", 22 + "items": { 23 + "type": "ref", 24 + "ref": "pub.layers.segmentation.segmentation#tokenization" 25 + }, 26 + "description": "One or more tokenization schemes over the expression text." 27 + }, 28 + "metadata": { 29 + "type": "ref", 30 + "ref": "pub.layers.defs#annotationMetadata", 31 + "description": "Provenance metadata for this segmentation." 32 + }, 33 + "features": { 34 + "type": "ref", 35 + "ref": "pub.layers.defs#featureMap", 36 + "description": "Linguistic features associated with this segmentation." 37 + }, 38 + "createdAt": { 39 + "type": "string", 40 + "format": "datetime", 41 + "description": "ISO 8601 datetime when this record was created." 42 + } 43 + } 44 + } 45 + }, 46 + "tokenization": { 47 + "type": "object", 48 + "description": "A single tokenization scheme with its token sequence.", 49 + "required": ["tokens"], 50 + "properties": { 51 + "uuid": { 52 + "type": "ref", 53 + "ref": "pub.layers.defs#uuid", 54 + "description": "Stable UUID identifying this tokenization, for use in tokenRef anchors." 55 + }, 56 + "kindUri": { 57 + "type": "string", 58 + "format": "at-uri", 59 + "description": "AT URI of a record formally defining the tokenization kind." 
60 + }, 61 + "kind": { 62 + "type": "string", 63 + "description": "Tokenization scheme: whitespace, bpe, sentencepiece, character, etc." 64 + }, 65 + "tokens": { 66 + "type": "array", 67 + "items": { 68 + "type": "ref", 69 + "ref": "pub.layers.segmentation.segmentation#token" 70 + }, 71 + "description": "Ordered sequence of tokens produced by this tokenization." 72 + } 73 + } 74 + }, 75 + "token": { 76 + "type": "object", 77 + "description": "A single token within a tokenization.", 78 + "required": ["tokenIndex", "text", "textSpan"], 79 + "properties": { 80 + "tokenIndex": { 81 + "type": "integer", 82 + "description": "Zero-based index of this token in the tokenization sequence." 83 + }, 84 + "text": { 85 + "type": "string", 86 + "description": "Surface text of this token." 87 + }, 88 + "textSpan": { 89 + "type": "ref", 90 + "ref": "pub.layers.defs#span", 91 + "description": "Byte-offset span of this token in the parent expression text." 92 + }, 93 + "temporalSpan": { 94 + "type": "ref", 95 + "ref": "pub.layers.defs#temporalSpan", 96 + "description": "Time range of this token within the associated media (optional)." 97 + } 98 + } 99 + } 100 + } 101 + }