A small service that creates embeddings of posts, profiles, and avatars and stores them in Qdrant
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

update collection parameters

Hailey d10cf3c7 042f51cc

+73 -23
+73 -23
database.py
··· 3 3 from typing import List, Optional 4 4 import uuid 5 5 6 + from qdrant_client.grpc import OptimizersConfigDiff 7 + from qdrant_client.http.models import BinaryQuantizationConfig 8 + 6 9 from config import CONFIG 7 10 from qdrant_client import QdrantClient 8 11 from qdrant_client.models import ( 12 + BinaryQuantization, 9 13 Distance, 10 14 FieldCondition, 11 15 Filter, 16 + HnswConfigDiff, 12 17 MatchValue, 13 18 Payload, 14 19 PayloadSchemaType, 15 20 PointStruct, 21 + ScalarQuantization, 22 + ScalarQuantizationConfig, 23 + ScalarType, 16 24 VectorParams, 17 25 ) 18 26 ··· 33 41 34 42 class QdrantService: 35 43 def __init__(self) -> None: 36 - self.client = None 44 + self._client = None 37 45 38 46 def initialized(self): 39 - return self.client is not None 47 + return self._client is not None 48 + 49 + def get_client(self): 50 + return self._client 40 51 41 52 def initialize(self) -> None: 42 53 logger.info(f"Connecting to Qdrant at {CONFIG.qdrant_url}") 43 54 44 - self.client = QdrantClient( 55 + self._client = QdrantClient( 45 56 url=CONFIG.qdrant_url, 46 57 ) 47 58 ··· 51 62 self._ensure_collections_exist() 52 63 53 64 def _ensure_collections_exist(self): 54 - profile_coll_exists = self.client.collection_exists( 65 + profile_coll_exists = self._client.collection_exists( 55 66 self.profile_collection_name 56 67 ) 57 - avatar_coll_exists = self.client.collection_exists(self.avatar_collection_name) 58 - post_coll_exists = self.client.collection_exists(self.post_collection_name) 68 + avatar_coll_exists = self._client.collection_exists(self.avatar_collection_name) 69 + post_coll_exists = self._client.collection_exists(self.post_collection_name) 59 70 60 71 if not profile_coll_exists: 61 72 logger.info(f"Creating profile collection: {self.profile_collection_name}") 62 - self.client.create_collection( 73 + self._client.create_collection( 63 74 collection_name=self.profile_collection_name, 64 - vectors_config=VectorParams( 65 - size=CONFIG.embedding_size, 66 - 
distance=Distance.COSINE, 75 + vectors_config=VectorParams(size=1024, distance=Distance.COSINE), 76 + hnsw_config=HnswConfigDiff(m=32, ef_construct=200), 77 + quantization_config=ScalarQuantization( 78 + scalar=ScalarQuantizationConfig( 79 + type=ScalarType.INT8, quantile=0.99, always_ram=True 80 + ) 67 81 ), 68 82 ) 69 - self.client.create_payload_index( 83 + self._client.create_payload_index( 70 84 collection_name=self.profile_collection_name, 71 85 field_name="did", 72 86 field_schema=PayloadSchemaType.KEYWORD, 73 87 ) 88 + self._client.create_payload_index( 89 + collection_name=self.avatar_collection_name, 90 + field_name="timestamp", 91 + field_schema=PayloadSchemaType.DATETIME, 92 + ) 74 93 logger.info("Collection created successfully") 75 94 76 95 if not avatar_coll_exists: 77 96 logger.info(f"Creating avatar collection: {self.avatar_collection_name}") 78 - self.client.create_collection( 97 + self._client.create_collection( 79 98 collection_name=self.avatar_collection_name, 80 99 vectors_config=VectorParams( 81 100 # PDQ vectors have a size of 256 ··· 84 103 # use the square root of the selected max distance for lookups 85 104 distance=Distance.EUCLID, 86 105 ), 106 + hnsw_config=HnswConfigDiff( 107 + m=16, # lower m for binary-like data 108 + ef_construct=100, 109 + ), 110 + quantization_config=BinaryQuantization( 111 + binary=BinaryQuantizationConfig(always_ram=True) 112 + ), 87 113 ) 88 - self.client.create_payload_index( 114 + self._client.create_payload_index( 89 115 collection_name=self.avatar_collection_name, 90 116 field_name="did", 91 117 field_schema=PayloadSchemaType.KEYWORD, 92 118 ) 119 + self._client.create_payload_index( 120 + collection_name=self.avatar_collection_name, 121 + field_name="timestamp", 122 + field_schema=PayloadSchemaType.DATETIME, 123 + ) 93 124 94 125 if not post_coll_exists: 95 126 logger.info(f"Creating post collection: {self.post_collection_name}") 96 - self.client.create_collection( 127 + self._client.create_collection( 97 
128 collection_name=self.post_collection_name, 98 129 vectors_config=VectorParams( 99 130 size=CONFIG.embedding_size, 100 131 distance=Distance.COSINE, 101 132 ), 133 + hnsw_config=HnswConfigDiff( 134 + m=48, 135 + ef_construct=256, 136 + ), 137 + quantization_config=ScalarQuantization( 138 + scalar=ScalarQuantizationConfig( 139 + type=ScalarType.INT8, 140 + quantile=0.99, 141 + always_ram=True, 142 + ), 143 + ), 144 + optimizers_config=OptimizersConfigDiff( 145 + indexing_threshold=50_000, 146 + ), 102 147 ) 103 - self.client.create_payload_index( 148 + self._client.create_payload_index( 104 149 collection_name=self.post_collection_name, 105 150 field_name="uri", 106 151 field_schema=PayloadSchemaType.KEYWORD, 107 152 ) 153 + self._client.create_payload_index( 154 + collection_name=self.avatar_collection_name, 155 + field_name="timestamp", 156 + field_schema=PayloadSchemaType.DATETIME, 157 + ) 108 158 logger.info("Collection created successfully") 109 159 110 160 def upsert_profile(self, did: str, description: str, vector: List[float]): ··· 114 164 "description": description, 115 165 } 116 166 117 - existing = self.client.scroll( 167 + existing = self._client.scroll( 118 168 collection_name=self.profile_collection_name, 119 169 scroll_filter=Filter( 120 170 must=[FieldCondition(key="did", match=MatchValue(value=did))] ··· 132 182 payload=payload, 133 183 ) 134 184 135 - self.client.upsert( 185 + self._client.upsert( 136 186 collection_name=self.profile_collection_name, 137 187 points=[point], 138 188 ) ··· 149 199 "cid": cid, 150 200 } 151 201 152 - existing = self.client.scroll( 202 + existing = self._client.scroll( 153 203 collection_name=self.avatar_collection_name, 154 204 scroll_filter=Filter( 155 205 must=[FieldCondition(key="did", match=MatchValue(value=did))] ··· 167 217 payload=payload, 168 218 ) 169 219 170 - self.client.upsert( 220 + self._client.upsert( 171 221 collection_name=self.avatar_collection_name, 172 222 points=[point], 173 223 ) ··· 194 244 
payload=payload, 195 245 ) 196 246 197 - self.client.upsert( 247 + self._client.upsert( 198 248 collection_name=self.post_collection_name, 199 249 points=[point], 200 250 ) ··· 213 263 filter_conditions: Optional[Filter] = None, 214 264 ) -> Optional[List[Result]]: 215 265 try: 216 - results = self.client.query_points( 266 + results = self._client.query_points( 217 267 collection_name=collection_name, 218 268 query=query_vector, 219 269 query_filter=filter_conditions, ··· 234 284 logger.error(f"Error searching for similar vectors: {e}") 235 285 236 286 def get_profile_by_did(self, did: str) -> Optional[ResultWithVector]: 237 - result = self.client.scroll( 287 + result = self._client.scroll( 238 288 collection_name=self.profile_collection_name, 239 289 scroll_filter=Filter( 240 290 must=[FieldCondition(key="did", match=MatchValue(value=did))] ··· 253 303 ) 254 304 255 305 def get_avatar_by_did(self, did: str) -> Optional[ResultWithVector]: 256 - result = self.client.scroll( 306 + result = self._client.scroll( 257 307 collection_name=self.avatar_collection_name, 258 308 scroll_filter=Filter( 259 309 must=[FieldCondition(key="did", match=MatchValue(value=did))]