feat: embedding service with WordPiece tokenizer and post text extractor

+13 -11

docs/tasks/phase-7.md

··· 12 12 #### ObjectBox Setup 13 13 14 14 - [x] Add `objectbox`, `objectbox_flutter_libs` to `pubspec.yaml`; add `objectbox_generator` to dev deps 15 - - [ ] `EmbeddedPost` entity - `postUri` (unique), `accountDid`, `source` (saved/liked), `indexedText`, `embedding` (384D float vector, HNSW cosine index), `embeddedAt` 16 - - [ ] Run `build_runner` to generate `objectbox.g.dart` and `objectbox-model.json` 17 - - [ ] `ObjectBoxStore` singleton - `openStore()` at app startup (after Drift init), expose via `RepositoryProvider` 18 - - [ ] `EmbeddingRepository` - CRUD operations on `EmbeddedPost`: `upsert`, `deleteByUri`, `queryByAccount`, `countByAccount` 15 + - [x] `EmbeddedPost` entity - `postUri` (unique), `accountDid`, `source` (saved/liked), `indexedText`, `embedding` (384D float vector, HNSW cosine index), `embeddedAt` 16 + - [x] Run `build_runner` to generate `objectbox.g.dart` and `objectbox-model.json` 17 + - [x] `ObjectBoxStore` singleton - `openStore()` at app startup (after Drift init), expose via `RepositoryProvider` 18 + - [x] `EmbeddingRepository` - CRUD operations on `EmbeddedPost`: `upsert`, `deleteByUri`, `queryByAccount`, `countByAccount` 19 19 20 20 #### TFLite Embedding Service 21 21 22 22 - [x] Add `tflite_flutter` to `pubspec.yaml` 23 23 - [x] Bundle `minilm_l6_v2_int8.tflite` and `vocab.txt` as Flutter assets 24 - - [ ] `WordPieceTokenizer` - load vocab, tokenize text, pad/truncate to 256 tokens, return `List<int>` 25 - - [ ] `EmbeddingService` - long-lived background `Isolate` with `ReceivePort`/`SendPort` message passing 26 - - [ ] `EmbeddingService.initialize()` - spawn isolate, load TFLite model + tokenizer in isolate 27 - - [ ] `EmbeddingService.embed(String text)` - send text to isolate, receive `Float32List[384]`, L2-normalize 28 - - [ ] `EmbeddingService.isAvailable` - flag gating UI entry points, false if model fails to load 29 - - [ ] `EmbeddingService.dispose()` - close isolate and interpreter 30 - - [ ] `PostTextExtractor` - 
concatenate post text + image alt texts + link card title/description into a single searchable string 24 + - [x] `WordPieceTokenizer` - load vocab, tokenize text, pad/truncate to 256 tokens, return `List<int>` 25 + - [x] `EmbeddingService` - long-lived background `Isolate` with `ReceivePort`/`SendPort` message passing 26 + - [x] `EmbeddingService.initialize()` - spawn isolate, load TFLite model + tokenizer in isolate 27 + - [x] `EmbeddingService.embed(String text)` - send text to isolate, receive `Float32List[384]`, L2-normalize 28 + - [x] `EmbeddingService.isAvailable` - flag gating UI entry points, false if model fails to load 29 + - [x] `EmbeddingService.dispose()` - close isolate and interpreter 30 + - [x] `PostTextExtractor` - concatenate post text + image alt texts + link card title/description into a single searchable string 31 31 32 32 #### Liked Posts Sync 33 33 ··· 37 37 - [ ] `LikedPostsRepository.getLikedPosts(accountDid, {limit, offset})` - paginated query 38 38 - [ ] `LikedPostsRepository.removeLike(accountDid, postUri)` - delete entry 39 39 - [ ] Eviction: drop oldest entries when count exceeds 1000 per account 40 + - [ ] Documentation update: move development information from README.md to a top-level DEVELOPMENT.md. 41 + Should be updated to reflect new architecture and patterns. 40 42 41 43 #### Indexing Pipeline 42 44

+186

lib/core/embedding/embedding_service.dart

import 'dart:async';
import 'dart:isolate';
import 'dart:math' show sqrt;

import 'package:flutter/foundation.dart';
import 'package:flutter/services.dart';
import 'package:lazurite/core/embedding/word_piece_tokenizer.dart';
import 'package:tflite_flutter/tflite_flutter.dart';

/// Number of components in the vector read from the model's output tensor 0.
const int _embeddingDim = 384;

// NOTE(review): the phase-7 task list names the bundled model
// `minilm_l6_v2_int8.tflite`, and the vocab below is loaded with an `assets/`
// prefix while the model is not. Confirm this key matches the asset declared
// in pubspec.yaml for the tflite_flutter version in use.
const String _modelAsset = 'all-MiniLM-L6-v2-quant.tflite';
const String _vocabAsset = 'assets/vocab.txt';

/// Bootstrap payload handed to the background isolate when it is spawned.
final class _SetupData {
  const _SetupData({required this.sendPort, required this.rootIsolateToken});

  /// Port on which the isolate reports its own [SendPort] and load status.
  final SendPort sendPort;

  /// Token that lets the isolate use platform channels (asset loading).
  final RootIsolateToken rootIsolateToken;
}

/// A request sent to the isolate: (text, replyPort) or null to dispose.
typedef _EmbedRequest = (String text, SendPort replyPort);

/// L2-normalize [vector], returning a new [Float32List].
///
/// If the norm is near zero the original vector is returned unchanged.
@visibleForTesting
Float32List l2Normalize(Float32List vector) {
  var sumOfSquares = 0.0;
  for (final component in vector) {
    sumOfSquares += component * component;
  }
  final norm = sqrt(sumOfSquares);
  if (norm < 1e-10) return vector;

  final out = Float32List(vector.length);
  for (var i = 0; i < vector.length; i++) {
    out[i] = vector[i] / norm;
  }
  return out;
}

/// Entry point of the background isolate.
///
/// Protocol on `setup.sendPort`: first this isolate's request [SendPort], then
/// `true` once the model and vocabulary are loaded, or `false` if loading
/// failed. Afterwards it serves [_EmbedRequest] messages until it receives
/// `null`, at which point it closes the interpreter and its port.
Future<void> _isolateEntry(_SetupData setup) async {
  BackgroundIsolateBinaryMessenger.ensureInitialized(setup.rootIsolateToken);

  final receivePort = ReceivePort();
  setup.sendPort.send(receivePort.sendPort);

  Interpreter? interpreter;
  WordPieceTokenizer? tokenizer;

  try {
    interpreter = await Interpreter.fromAsset(_modelAsset);
    final vocabText = await rootBundle.loadString(_vocabAsset);
    tokenizer = WordPieceTokenizer.fromString(vocabText);
    setup.sendPort.send(true);
  } catch (_) {
    // The model may have loaded before the vocabulary failed; release it.
    interpreter?.close();
    setup.sendPort.send(false);
    receivePort.close();
    return;
  }

  final model = interpreter;
  final wordPiece = tokenizer;

  try {
    await for (final message in receivePort) {
      if (message == null) break;
      final (text, replyPort) = message as _EmbedRequest;
      try {
        replyPort.send(_runInference(model, wordPiece, text));
      } catch (_) {
        // A null reply tells the caller that inference failed.
        replyPort.send(null);
      }
    }
  } finally {
    // Runs on the normal shutdown path and if the loop itself throws.
    model.close();
    receivePort.close();
  }
}

/// Tokenizes [text], runs the model once and returns the L2-normalized output.
///
/// The model is fed three inputs in this order: token ids, attention mask
/// (1 for every non-padding id) and all-zero token type ids.
Float32List _runInference(Interpreter interpreter, WordPieceTokenizer tokenizer, String text) {
  final tokenIds = tokenizer.tokenize(text);
  const seqLen = WordPieceTokenizer.maxTokens;

  final inputIds = [tokenIds];
  final attentionMask = [tokenIds.map((id) => id != 0 ? 1 : 0).toList()];
  final tokenTypeIds = [List<int>.filled(seqLen, 0)];

  final outputBuffer = [List<double>.filled(_embeddingDim, 0.0)];
  interpreter.runForMultipleInputs([inputIds, attentionMask, tokenTypeIds], {0: outputBuffer});

  return l2Normalize(Float32List.fromList(outputBuffer[0]));
}

/// On-device text embedding service backed by a long-lived background [Isolate].
///
/// Start with [initialize], shut down with [dispose]. Check [isAvailable]
/// before calling [embed]; the flag is false when the model fails to load or
/// when the service has not yet been initialised.
class EmbeddingService {
  /// Creates a real embedding service backed by TFLite + Isolate.
  EmbeddingService() : _mockEmbedFn = null;

  /// Creates a test double that bypasses the Isolate and TFLite entirely.
  ///
  /// [embedFn] is invoked directly on every [embed] call. [initialize]
  /// immediately sets [isAvailable] to true.
  @visibleForTesting
  EmbeddingService.forTesting(Future<Float32List> Function(String text) embedFn) : _mockEmbedFn = embedFn;

  final Future<Float32List> Function(String text)? _mockEmbedFn;

  bool _isAvailable = false;
  Isolate? _isolate;
  SendPort? _isolateSendPort;
  ReceivePort? _setupPort;

  /// In-flight initialization, shared by concurrent [initialize] callers.
  Future<void>? _initializing;

  /// Reply ports of [embed] calls that are still waiting for the isolate.
  final Set<ReceivePort> _pendingReplies = <ReceivePort>{};

  /// Whether the service is ready to produce embeddings.
  ///
  /// False until [initialize] completes successfully, and false again after
  /// [dispose] is called or if the model failed to load.
  bool get isAvailable => _isAvailable;

  /// Initialise the service.
  ///
  /// For the real implementation this spawns a background [Isolate], loads the
  /// TFLite model, and builds the [WordPieceTokenizer]. For the test double it
  /// only marks the service as available.
  ///
  /// Safe to call multiple times: calls made once the service is available
  /// are no-ops, and calls made while an initialization is in flight share
  /// that initialization instead of spawning a second isolate.
  Future<void> initialize() {
    if (_isAvailable) return Future<void>.value();
    return _initializing ??= _startBackend().whenComplete(() {
      _initializing = null;
    });
  }

  /// Spawns the isolate and performs the two-message handshake.
  ///
  /// Leaves [isAvailable] false when the model fails to load, when the isolate
  /// exits or errors before finishing the handshake, when [dispose] is called
  /// in the meantime, or when no [RootIsolateToken] exists.
  Future<void> _startBackend() async {
    if (_mockEmbedFn != null) {
      _isAvailable = true;
      return;
    }

    final token = RootIsolateToken.instance;
    if (token == null) return;

    final setupPort = ReceivePort();
    _setupPort = setupPort;
    final messages = StreamIterator<dynamic>(setupPort);

    Isolate? spawned;
    SendPort? requestPort;
    var ready = false;

    try {
      spawned = await Isolate.spawn(
        _isolateEntry,
        _SetupData(sendPort: setupPort.sendPort, rootIsolateToken: token),
        // Exit/error notifications arrive on the same port, so a dead isolate
        // shows up as an unexpected handshake message instead of a hang.
        onExit: setupPort.sendPort,
        onError: setupPort.sendPort,
        debugName: 'EmbeddingIsolate',
      );

      // moveNext() yields false when dispose() closed the setup port.
      if (!await messages.moveNext()) return;
      final first = messages.current;
      if (first is! SendPort) return;
      requestPort = first;

      if (!await messages.moveNext()) return;
      if (messages.current != true) return;

      // dispose() may have run while the handshake was being awaited.
      if (!identical(_setupPort, setupPort)) return;

      _isolate = spawned;
      _isolateSendPort = requestPort;
      _isAvailable = true;
      ready = true;
    } finally {
      await messages.cancel();
      setupPort.close();
      if (identical(_setupPort, setupPort)) _setupPort = null;

      if (!ready) {
        if (requestPort != null) {
          // Let the isolate close its interpreter once it reaches its loop.
          requestPort.send(null);
        } else {
          spawned?.kill(priority: Isolate.immediate);
        }
      }
    }
  }

  /// Embed [text] and return an L2-normalised [Float32List] of length 384.
  ///
  /// Throws [StateError] if the service is not available, if inference fails,
  /// or if [dispose] is called before the result arrives.
  Future<Float32List> embed(String text) async {
    if (!_isAvailable) {
      throw StateError('EmbeddingService is not available. Call initialize() first.');
    }

    final mock = _mockEmbedFn;
    if (mock != null) {
      return mock(text);
    }

    final responsePort = ReceivePort();
    _pendingReplies.add(responsePort);

    Object? result;
    var interrupted = false;
    try {
      _isolateSendPort!.send((text, responsePort.sendPort));
      result = await responsePort.first;
    } on StateError {
      // The port was closed by dispose() before any reply was delivered.
      interrupted = true;
    } finally {
      _pendingReplies.remove(responsePort);
      responsePort.close();
    }

    if (interrupted) {
      throw StateError('EmbeddingService was disposed before the embedding completed.');
    }
    if (result == null) throw StateError('Embedding inference failed for text: "$text"');
    return result as Float32List;
  }

  /// Shut down the background isolate and mark the service as unavailable.
  ///
  /// The isolate is asked to stop rather than killed, so that it can close the
  /// interpreter itself; requests already queued ahead of the stop message are
  /// still processed but their replies are discarded. Pending [embed] calls
  /// complete with a [StateError].
  ///
  /// Safe to call before [initialize] or after [dispose].
  void dispose() {
    _isAvailable = false;

    _isolateSendPort?.send(null);

    // Closing the setup port aborts a handshake that is still in flight; the
    // handshake's own cleanup then stops the isolate it spawned.
    _setupPort?.close();

    // Closing a reply port ends its stream, which fails the awaiting embed().
    for (final port in _pendingReplies.toList()) {
      port.close();
    }
    _pendingReplies.clear();

    _isolate = null;
    _isolateSendPort = null;
    _setupPort = null;
  }
}

+147

lib/core/embedding/word_piece_tokenizer.dart

/// BERT-style WordPiece tokenizer compatible with all-MiniLM-L6-v2.
///
/// Converts text to token IDs using the standard BERT uncased vocabulary.
/// The returned list always has exactly [maxTokens] elements.
///
/// Text is processed per Unicode code point. Accent stripping and CJK
/// character splitting are not performed.
///
/// Token ID conventions (BERT-base-uncased):
///   [PAD] = 0
///   [UNK] = 100
///   [CLS] = 101
///   [SEP] = 102
class WordPieceTokenizer {
  WordPieceTokenizer._(this._vocab);

  /// Constructs a tokenizer from the raw contents of a vocab file.
  ///
  /// Each line is one token; its line number (0-indexed) is its ID. Trailing
  /// whitespace (including `\r` from CRLF files) is stripped, and blank lines
  /// are skipped but still consume an ID.
  factory WordPieceTokenizer.fromString(String vocabText) {
    final vocab = <String, int>{};
    var index = 0;
    for (final line in vocabText.split('\n')) {
      final token = line.trimRight();
      if (token.isNotEmpty) {
        vocab[token] = index;
      }
      index++;
    }
    return WordPieceTokenizer._(vocab);
  }

  static const int padId = 0;
  static const int unkId = 100;
  static const int clsId = 101;
  static const int sepId = 102;
  static const int maxTokens = 256;

  /// Words longer than this many code points map straight to [unkId].
  ///
  /// Same limit as the reference BERT WordPiece tokenizer; it also bounds the
  /// greedy search, whose cost grows quadratically with word length.
  static const int maxCharsPerWord = 100;

  final Map<String, int> _vocab;

  /// Tokenize [text] into a list of token IDs padded/truncated to [maxTokens].
  ///
  /// Layout: `[CLS] token_ids... [SEP] [PAD]...`
  List<int> tokenize(String text) {
    final cleaned = _cleanText(text.toLowerCase());
    final basicTokens = _basicTokenize(cleaned);

    final ids = <int>[clsId];
    for (final word in basicTokens) {
      final pieces = _wordPiece(word);

      // Always keep one slot free for the closing [SEP].
      if (ids.length + pieces.length >= maxTokens) {
        ids.addAll(pieces.take(maxTokens - ids.length - 1));
        break;
      }
      ids.addAll(pieces);
    }
    ids.add(sepId);

    while (ids.length < maxTokens) {
      ids.add(padId);
    }

    return ids;
  }

  /// Drops NUL, U+FFFD and control/format characters, and rewrites every
  /// whitespace character as a plain space.
  String _cleanText(String text) {
    final buf = StringBuffer();
    for (final cp in text.runes) {
      if (cp == 0 || cp == 0xFFFD || _isControlChar(cp)) continue;
      if (_isWhitespace(cp)) {
        buf.write(' ');
      } else {
        buf.writeCharCode(cp);
      }
    }
    return buf.toString();
  }

  /// Split on whitespace and punctuation to produce basic tokens.
  ///
  /// Whitespace is discarded; each punctuation character is its own token.
  List<String> _basicTokenize(String text) {
    final tokens = <String>[];
    final buf = StringBuffer();

    void flush() {
      if (buf.isNotEmpty) {
        tokens.add(buf.toString());
        buf.clear();
      }
    }

    for (final cp in text.runes) {
      // Whitespace is tested first because some space separators fall inside
      // the punctuation blocks listed in _isPunctuation.
      if (_isWhitespace(cp)) {
        flush();
      } else if (_isPunctuation(cp)) {
        flush();
        tokens.add(String.fromCharCode(cp));
      } else {
        buf.writeCharCode(cp);
      }
    }
    flush();
    return tokens;
  }

  /// WordPiece sub-word tokenization for a single [word].
  ///
  /// Greedy longest-match-first: the first piece is looked up as-is and every
  /// following piece with a `##` prefix. Returns `[unkId]` if no valid
  /// segmentation exists or the word exceeds [maxCharsPerWord].
  List<int> _wordPiece(String word) {
    if (word.isEmpty) return [];

    // Work on code points so a piece never ends inside a surrogate pair.
    final chars = word.runes.toList(growable: false);
    if (chars.length > maxCharsPerWord) return [unkId];

    final whole = _vocab[word];
    if (whole != null) return [whole];

    final result = <int>[];
    var start = 0;

    while (start < chars.length) {
      var end = chars.length;
      int? foundId;

      while (start < end) {
        final piece = String.fromCharCodes(chars, start, end);
        foundId = _vocab[start == 0 ? piece : '##$piece'];
        if (foundId != null) break;
        end--;
      }

      if (foundId == null) return [unkId];

      result.add(foundId);
      start = end;
    }

    return result;
  }

  /// Space, tab, LF, CR, plus the Unicode space separators (category Zs).
  bool _isWhitespace(int cp) =>
      cp == 0x20 ||
      cp == 0x09 ||
      cp == 0x0A ||
      cp == 0x0D ||
      cp == 0xA0 ||
      cp == 0x1680 ||
      (cp >= 0x2000 && cp <= 0x200A) ||
      cp == 0x202F ||
      cp == 0x205F ||
      cp == 0x3000;

  /// C0/C1 control codes (except tab/LF/CR) and common invisible format
  /// characters such as the soft hyphen, zero-width space, bidi marks and BOM.
  bool _isControlChar(int cp) =>
      (cp < 0x20 && !_isWhitespace(cp)) ||
      (cp >= 0x7F && cp <= 0x9F) ||
      cp == 0xAD ||
      (cp >= 0x200B && cp <= 0x200F) ||
      (cp >= 0x202A && cp <= 0x202E) ||
      (cp >= 0x2060 && cp <= 0x206F) ||
      cp == 0xFEFF;

  /// ASCII punctuation/symbols plus the General Punctuation, Supplemental
  /// Punctuation and CJK Symbols and Punctuation blocks.
  bool _isPunctuation(int cp) =>
      (cp >= 33 && cp <= 47) ||
      (cp >= 58 && cp <= 64) ||
      (cp >= 91 && cp <= 96) ||
      (cp >= 123 && cp <= 126) ||
      (cp >= 0x2000 && cp <= 0x206F) ||
      (cp >= 0x2E00 && cp <= 0x2E7F) ||
      (cp >= 0x3000 && cp <= 0x303F);
}

+62

lib/features/search/data/post_text_extractor.dart

import 'package:bluesky/app_bsky_embed_recordwithmedia.dart';
import 'package:bluesky/app_bsky_feed_defs.dart';
import 'package:bluesky/app_bsky_feed_post.dart';

/// Builds the text that gets embedded for a [PostView].
///
/// The result is a space-separated concatenation, in this order, of:
///   1. the post body text,
///   2. the alt text of each image when the embed is an images embed,
///   3. the title and then the description when the embed is an external
///      link card.
///
/// For a record-with-media embed, items 2 and 3 are taken from its media
/// part. Every piece is trimmed and blank pieces are left out, so the result
/// is the empty string when nothing usable is found.
class PostTextExtractor {
  const PostTextExtractor();

  String extract(PostView post) {
    final parts = <String>[];
    _addNonBlank(parts, [_recordText(post.record)]);

    final embed = post.embed;
    if (embed == null) return parts.join(' ');

    if (embed.isEmbedImagesView) {
      _addNonBlank(parts, embed.embedImagesView!.images.map((image) => image.alt));
    } else if (embed.isEmbedExternalView) {
      final card = embed.embedExternalView!.external;
      _addNonBlank(parts, [card.title, card.description]);
    } else if (embed.isEmbedRecordWithMediaView) {
      final media = embed.embedRecordWithMediaView!.media;
      if (media.isEmbedImagesView) {
        _addNonBlank(parts, media.embedImagesView!.images.map((image) => image.alt));
      } else if (media.isEmbedExternalView) {
        final card = media.embedExternalView!.external;
        _addNonBlank(parts, [card.title, card.description]);
      }
    }

    return parts.join(' ');
  }

  /// Appends the trimmed form of each candidate to [parts], skipping blanks.
  void _addNonBlank(List<String> parts, Iterable<String> candidates) {
    for (final candidate in candidates) {
      final trimmed = candidate.trim();
      if (trimmed.isNotEmpty) parts.add(trimmed);
    }
  }

  /// Returns the trimmed `text` of [record] parsed as a feed post record, or
  /// an empty string when parsing throws.
  String _recordText(Map<String, dynamic> record) {
    try {
      return FeedPostRecord.fromJson(record).text.trim();
    } catch (_) {
      return '';
    }
  }
}

+156

test/core/embedding/embedding_service_test.dart

import 'dart:typed_data';

import 'package:flutter_test/flutter_test.dart';
import 'package:lazurite/core/embedding/embedding_service.dart';

/// Test double whose backend always yields a 384-element zero vector.
EmbeddingService _zeroVectorService() => EmbeddingService.forTesting((_) async => Float32List(384));

void main() {
  group('l2Normalize', () {
    test('unit vector is unchanged', () {
      final v = Float32List.fromList([1.0, 0.0, 0.0]);
      final result = l2Normalize(v);
      expect(result[0], closeTo(1.0, 1e-6));
      expect(result[1], closeTo(0.0, 1e-6));
      expect(result[2], closeTo(0.0, 1e-6));
    });

    test('scales vector to unit length', () {
      final v = Float32List.fromList([3.0, 4.0]);
      final result = l2Normalize(v);
      // norm = 5; normalised = [0.6, 0.8]
      expect(result[0], closeTo(0.6, 1e-6));
      expect(result[1], closeTo(0.8, 1e-6));
    });

    test('result has norm ≈ 1', () {
      final v = Float32List.fromList(List.generate(384, (i) => (i + 1).toDouble()));
      final result = l2Normalize(v);
      var norm = 0.0;
      for (final x in result) {
        norm += x * x;
      }
      expect(norm, closeTo(1.0, 1e-5));
    });

    test('near-zero vector is returned unchanged (no division by zero)', () {
      final v = Float32List.fromList([0.0, 0.0, 0.0]);
      final result = l2Normalize(v);
      expect(result, equals(v));
    });

    test('returns a new list, does not mutate input', () {
      final v = Float32List.fromList([3.0, 4.0]);
      l2Normalize(v);
      expect(v[0], equals(3.0));
      expect(v[1], equals(4.0));
    });
  });

  group('EmbeddingService', () {
    group('initial state', () {
      test('isAvailable is false before initialize', () {
        final service = _zeroVectorService();
        expect(service.isAvailable, isFalse);
      });
    });

    group('initialize / dispose', () {
      test('isAvailable is true after initialize with mock backend', () async {
        final service = _zeroVectorService();
        await service.initialize();
        expect(service.isAvailable, isTrue);
      });

      test('initialize is idempotent', () async {
        var calls = 0;
        final service = EmbeddingService.forTesting((_) async {
          calls++;
          return Float32List(384);
        });
        await service.initialize();
        await service.initialize(); // second call should be a no-op
        expect(service.isAvailable, isTrue);
        // Embed once to confirm the service still works after the second call.
        await service.embed('test');
        expect(calls, equals(1)); // the backend ran for the single embed only
      });

      test('dispose resets isAvailable to false', () async {
        final service = _zeroVectorService();
        await service.initialize();
        service.dispose();
        expect(service.isAvailable, isFalse);
      });

      test('dispose before initialize does not throw', () {
        final service = _zeroVectorService();
        expect(() => service.dispose(), returnsNormally);
      });

      test('dispose can be called multiple times safely', () async {
        final service = _zeroVectorService();
        await service.initialize();
        service.dispose();
        expect(() => service.dispose(), returnsNormally);
      });
    });

    group('embed', () {
      test('throws StateError when not initialized', () async {
        final service = _zeroVectorService();
        // Awaited so the failure is observed before the test body returns.
        await expectLater(() => service.embed('hello'), throwsStateError);
      });

      test('throws StateError after dispose', () async {
        final service = _zeroVectorService();
        await service.initialize();
        service.dispose();
        await expectLater(() => service.embed('hello'), throwsStateError);
      });

      test('returns the value produced by the mock backend', () async {
        final expected = Float32List.fromList(List.generate(384, (i) => i.toDouble()));
        final service = EmbeddingService.forTesting((_) async => expected);
        await service.initialize();

        final result = await service.embed('some text');
        expect(result, equals(expected));
      });

      test('result has length 384', () async {
        final service = _zeroVectorService();
        await service.initialize();

        final result = await service.embed('hello world');
        expect(result.length, equals(384));
      });

      test('forwards the exact text to the backend', () async {
        String? received;
        final service = EmbeddingService.forTesting((text) async {
          received = text;
          return Float32List(384);
        });
        await service.initialize();

        await service.embed('the quick brown fox');
        expect(received, equals('the quick brown fox'));
      });

      test('multiple concurrent embeds each receive correct results', () async {
        // The backend tags each vector with its input so that a result handed
        // to the wrong caller is detectable.
        final service = EmbeddingService.forTesting((text) async {
          final v = Float32List(384);
          v[0] = text.codeUnitAt(0).toDouble();
          return v;
        });
        await service.initialize();

        const inputs = ['a', 'b', 'c'];
        final results = await Future.wait(inputs.map(service.embed));

        expect(results.length, equals(inputs.length));
        for (var i = 0; i < inputs.length; i++) {
          expect(results[i].length, equals(384));
          expect(results[i][0], equals(inputs[i].codeUnitAt(0).toDouble()));
        }
      });
    });
  });
}

+171

test/core/embedding/word_piece_tokenizer_test.dart

import 'package:flutter_test/flutter_test.dart';
import 'package:lazurite/core/embedding/word_piece_tokenizer.dart';

/// Minimal synthetic vocabulary for fast, isolated tests.
/// Each line is one token; line number (0-indexed) is its ID.
///
///   ID  token
///    0  [PAD]
///    1  ##a
///    2  ##b
///    3  ##c
///    4  ##bc
///    5  ##un
///    6  ab
///    7  abc
///    8  hello
///    9  world
///   10–99   [unused_N_] placeholders
///   100 [UNK]
///   101 [CLS]
///   102 [SEP]
///
/// The placeholders exist only so that [UNK], [CLS] and [SEP] land on the
/// IDs the tokenizer hard-codes as unkId, clsId and sepId.
String _buildVocab() {
  final lines = <String>[
    '[PAD]',
    '##a',
    '##b',
    '##c',
    '##bc',
    '##un',
    'ab',
    'abc',
    'hello',
    'world',
  ];
  for (var i = 10; i < 100; i++) {
    lines.add('[unused_${i}_]');
  }

  lines.add('[UNK]');
  lines.add('[CLS]');
  lines.add('[SEP]');
  return lines.join('\n');
}

final _vocab = _buildVocab();

void main() {
  late WordPieceTokenizer tokenizer;

  setUp(() {
    tokenizer = WordPieceTokenizer.fromString(_vocab);
  });

  group('WordPieceTokenizer', () {
    group('structure', () {
      test('always returns exactly maxTokens (256) elements', () {
        final result = tokenizer.tokenize('hello');
        expect(result.length, equals(WordPieceTokenizer.maxTokens));
      });

      test('first token is always CLS (101)', () {
        expect(tokenizer.tokenize('hello').first, equals(WordPieceTokenizer.clsId));
        expect(tokenizer.tokenize('').first, equals(WordPieceTokenizer.clsId));
      });

      test('empty string → [CLS, SEP, PAD, PAD, ...]', () {
        final result = tokenizer.tokenize('');
        expect(result[0], equals(WordPieceTokenizer.clsId));
        expect(result[1], equals(WordPieceTokenizer.sepId));
        expect(result.sublist(2), everyElement(equals(WordPieceTokenizer.padId)));
      });

      test('padding fills remaining indices after [SEP] with PAD (0)', () {
        final result = tokenizer.tokenize('hello');
        expect(result.sublist(3), everyElement(equals(WordPieceTokenizer.padId)));
      });
    });

    group('whole-word tokens', () {
      test('known word → its vocab ID', () {
        final result = tokenizer.tokenize('hello');
        expect(result[1], equals(8));
        expect(result[2], equals(WordPieceTokenizer.sepId));
      });

      test('two known words → two IDs between CLS and SEP', () {
        final result = tokenizer.tokenize('hello world');
        expect(result[0], equals(WordPieceTokenizer.clsId));
        expect(result[1], equals(8));
        expect(result[2], equals(9));
        expect(result[3], equals(WordPieceTokenizer.sepId));
      });
    });

    group('wordpiece sub-word splitting', () {
      test('whole-word vocab entry wins over a split into pieces', () {
        // "abc" could also be read as "ab" + "##c", but it is in the vocab.
        final result = tokenizer.tokenize('abc');
        expect(result[1], equals(7));
        expect(result[2], equals(WordPieceTokenizer.sepId));
      });

      test('word not in vocab is split: abun → ab + ##un → [6, 5]', () {
        final result = tokenizer.tokenize('abun');
        expect(result[1], equals(6)); // "ab"
        expect(result[2], equals(5)); // "##un"
        expect(result[3], equals(WordPieceTokenizer.sepId));
      });

      test('longest continuation piece is preferred: abbc → ab + ##bc → [6, 4]', () {
        final result = tokenizer.tokenize('abbc');
        expect(result[1], equals(6)); // "ab"
        expect(result[2], equals(4)); // "##bc", not "##b" + "##c"
        expect(result[3], equals(WordPieceTokenizer.sepId));
      });

      test('word with no valid sub-token decomposition → UNK', () {
        final result = tokenizer.tokenize('xyz');
        expect(result[1], equals(WordPieceTokenizer.unkId));
        expect(result[2], equals(WordPieceTokenizer.sepId));
      });
    });

    group('truncation', () {
      test('very long text is truncated to maxTokens with SEP preserved', () {
        final longText = ('hello ' * 300).trim();
        final result = tokenizer.tokenize(longText);
        expect(result.length, equals(WordPieceTokenizer.maxTokens));

        final lastNonPad = result.lastIndexWhere((id) => id != WordPieceTokenizer.padId);
        expect(result[lastNonPad], equals(WordPieceTokenizer.sepId));
      });

      test('text of exactly maxTokens - 2 content tokens fits without truncation', () {
        final words = List.filled(254, 'hello');
        final result = tokenizer.tokenize(words.join(' '));
        expect(result[0], equals(WordPieceTokenizer.clsId));
        expect(result[255], equals(WordPieceTokenizer.sepId));
        expect(result.contains(WordPieceTokenizer.padId), isFalse);
      });
    });

    group('case', () {
      test('input is case-folded to lowercase before tokenisation', () {
        final lower = tokenizer.tokenize('hello');
        final upper = tokenizer.tokenize('HELLO');
        expect(lower, equals(upper));
      });
    });

    group('punctuation', () {
      test('punctuation is split as individual tokens', () {
        // "," is not in the synthetic vocab, so it surfaces as UNK.
        final result = tokenizer.tokenize('hello,world');
        expect(result[0], equals(WordPieceTokenizer.clsId));
        expect(result[1], equals(8));
        expect(result[2], equals(WordPieceTokenizer.unkId));
        expect(result[3], equals(9));
        expect(result[4], equals(WordPieceTokenizer.sepId));
      });
    });

    group('fromString factory', () {
      test('handles Windows-style CRLF line endings', () {
        final crlfVocab = _buildVocab().replaceAll('\n', '\r\n');
        final crlfTokenizer = WordPieceTokenizer.fromString(crlfVocab);
        final result = crlfTokenizer.tokenize('hello');
        expect(result[1], equals(8));
      });

      test('skips blank trailing line if present', () {
        final withTrailing = '${_buildVocab()}\n';
        final t = WordPieceTokenizer.fromString(withTrailing);
        final result = t.tokenize('hello');
        expect(result[1], equals(8));
      });
    });
  });
}

+207

test/features/search/data/post_text_extractor_test.dart

import 'package:atproto_core/atproto_core.dart';
import 'package:bluesky/app_bsky_actor_defs.dart';
import 'package:bluesky/app_bsky_embed_external.dart';
import 'package:bluesky/app_bsky_embed_images.dart';
import 'package:bluesky/app_bsky_embed_record.dart';
import 'package:bluesky/app_bsky_embed_recordwithmedia.dart';
import 'package:bluesky/app_bsky_feed_defs.dart';
import 'package:bluesky/app_bsky_feed_post.dart';
import 'package:flutter_test/flutter_test.dart';
import 'package:lazurite/features/search/data/post_text_extractor.dart';

const _author = ProfileViewBasic(did: 'did:plc:test', handle: 'test.bsky.social');
final _uri = AtUri.parse('at://did:plc:test/app.bsky.feed.post/xyz');

/// Builds a [PostView] whose record carries [text] and whose embed is [embed].
PostView _post({String text = '', UPostViewEmbed? embed}) {
  final record = FeedPostRecord(text: text, createdAt: DateTime.utc(2026, 1, 1));
  return PostView(
    uri: _uri,
    cid: 'cid-test',
    author: _author,
    record: record.toJson(),
    indexedAt: DateTime.utc(2026, 1, 1),
    embed: embed,
  );
}

/// One image view per entry of [altTexts], all sharing placeholder URLs.
List<EmbedImagesViewImage> _imageViews(List<String> altTexts) {
  return altTexts
      .map(
        (alt) => EmbedImagesViewImage(
          thumb: 'https://example.com/thumb.jpg',
          fullsize: 'https://example.com/full.jpg',
          alt: alt,
          aspectRatio: null,
        ),
      )
      .toList();
}

/// Images embed attached directly to the post.
UPostViewEmbed _imagesEmbed(List<String> altTexts) {
  return UPostViewEmbed.embedImagesView(data: EmbedImagesView(images: _imageViews(altTexts)));
}

/// External link-card embed attached directly to the post.
UPostViewEmbed _externalEmbed({
  required String title,
  required String description,
  String uri = 'https://example.com',
}) {
  return UPostViewEmbed.embedExternalView(
    data: EmbedExternalView(
      external: EmbedExternalViewExternal(uri: uri, title: title, description: description),
    ),
  );
}

/// Record-with-media embed whose media part is an images embed.
UPostViewEmbed _recordWithImagesEmbed(List<String> altTexts) {
  return UPostViewEmbed.embedRecordWithMediaView(
    data: EmbedRecordWithMediaView(
      record: const EmbedRecordView(record: UEmbedRecordViewRecord.unknown(data: {})),
      media: UEmbedRecordWithMediaViewMedia.embedImagesView(
        data: EmbedImagesView(images: _imageViews(altTexts)),
      ),
    ),
  );
}

/// Record-with-media embed whose media part is an external link card.
UPostViewEmbed _recordWithExternalEmbed({required String title, required String description}) {
  return UPostViewEmbed.embedRecordWithMediaView(
    data: EmbedRecordWithMediaView(
      record: const EmbedRecordView(record: UEmbedRecordViewRecord.unknown(data: {})),
      media: UEmbedRecordWithMediaViewMedia.embedExternalView(
        data: EmbedExternalView(
          external: EmbedExternalViewExternal(uri: 'https://example.com', title: title, description: description),
        ),
      ),
    ),
  );
}

void main() {
  late PostTextExtractor extractor;

  setUp(() {
    extractor = const PostTextExtractor();
  });

  group('PostTextExtractor', () {
    group('text-only posts', () {
      test('returns the post body text', () {
        final post = _post(text: 'Hello world');
        expect(extractor.extract(post), equals('Hello world'));
      });

      test('trims surrounding whitespace from post text', () {
        final post = _post(text: ' trimmed ');
        expect(extractor.extract(post), equals('trimmed'));
      });

      test('returns empty string for a post with no text and no embed', () {
        final post = _post(text: '');
        expect(extractor.extract(post), equals(''));
      });
    });

    group('image embeds', () {
      test('appends alt texts to post text', () {
        final post = _post(text: 'Check this out', embed: _imagesEmbed(['a cat', 'a dog']));
        expect(extractor.extract(post), equals('Check this out a cat a dog'));
      });

      test('skips images with empty alt text', () {
        final post = _post(text: 'Photo', embed: _imagesEmbed(['', 'nice view', '']));
        expect(extractor.extract(post), equals('Photo nice view'));
      });

      test('handles all-blank alt texts gracefully', () {
        final post = _post(text: 'Silent', embed: _imagesEmbed(['', ' ']));
        expect(extractor.extract(post), equals('Silent'));
      });

      test('returns only alt texts when post text is empty', () {
        final post = _post(embed: _imagesEmbed(['sunset photo']));
        expect(extractor.extract(post), equals('sunset photo'));
      });
    });

    group('external link-card embeds', () {
      test('appends title and description to post text', () {
        final post = _post(
          text: 'Read this',
          embed: _externalEmbed(title: 'Great Article', description: 'Very informative'),
        );
        expect(extractor.extract(post), equals('Read this Great Article Very informative'));
      });

      test('omits empty title', () {
        final post = _post(
          text: 'Link',
          embed: _externalEmbed(title: '', description: 'A description'),
        );
        expect(extractor.extract(post), equals('Link A description'));
      });

      test('omits empty description', () {
        final post = _post(
          text: 'Link',
          embed: _externalEmbed(title: 'Title', description: ''),
        );
        expect(extractor.extract(post), equals('Link Title'));
      });

      test('returns only title+description when post text is empty', () {
        final post = _post(
          embed: _externalEmbed(title: 'My Title', description: 'My Desc'),
        );
        expect(extractor.extract(post), equals('My Title My Desc'));
      });
    });

    group('record-with-media embeds (images)', () {
      test('appends image alt texts from media component', () {
        final post = _post(text: 'With quote', embed: _recordWithImagesEmbed(['alt one', 'alt two']));
        expect(extractor.extract(post), equals('With quote alt one alt two'));
      });

      test('skips empty alt texts in record-with-media', () {
        final post = _post(text: 'Post', embed: _recordWithImagesEmbed(['', 'valid']));
        expect(extractor.extract(post), equals('Post valid'));
      });
    });

    group('record-with-media embeds (external)', () {
      test('appends title and description from media external', () {
        final post = _post(
          text: 'Quoting with link',
          embed: _recordWithExternalEmbed(title: 'Card Title', description: 'Card Desc'),
        );
        expect(extractor.extract(post), equals('Quoting with link Card Title Card Desc'));
      });
    });

    group('combinations', () {
      test('multiple image alt texts are space-separated between them', () {
        final post = _post(embed: _imagesEmbed(['first', 'second', 'third']));
        expect(extractor.extract(post), equals('first second third'));
      });

      test('produces a single space-joined string with no leading or trailing space', () {
        final post = _post(
          text: 'Body',
          embed: _externalEmbed(title: 'T', description: 'D'),
        );
        final result = extractor.extract(post);
        expect(result.startsWith(' '), isFalse);
        expect(result.endsWith(' '), isFalse);
      });
    });
  });
}

Configure Feed

Configure Feed