···11+use async_trait::async_trait;
22+use zqa_rag::reranking::common::RerankProviderConfig;
33+44+use crate::utils::library::{ZoteroItem, ZoteroItemMetadata};
/// Usage statistics from a single vector search call, used for cost estimation.
///
/// The counts are *estimates*: the Lance-backed store currently fills both fields with UTF-8
/// byte lengths (`str::len`) rather than running a tokenizer, so treat them as a proxy for
/// token usage, not an exact figure.
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)]
pub struct VectorSearchStats {
    /// Estimated size of the query string that was embedded.
    pub(crate) embedding_tokens: usize,
    /// Estimated combined size of the documents and the query sent to the reranker.
    /// This is `0` when no reranking request was made.
    pub(crate) rerank_tokens: usize,
}
1313+1414+/// An application-level trait for Zotero store implementations.
1515+#[async_trait]
1616+pub trait ZoteroStore: Send + Sync {
1717+ /// The error type returned by store operations.
1818+ type StoreError: std::error::Error + Send + Sync;
1919+ /// The metadata type associated with the store.
2020+ type Metadata;
2121+2222+ /// Returns `true` if the store exists, `false` otherwise. Useful to check that the store is
2323+ /// configured correctly.
2424+ async fn exists(&self) -> bool;
2525+ /// Returns the metadata associated with the store.
2626+ async fn get_metadata(&self) -> Result<Self::Metadata, Self::StoreError>;
2727+ /// Returns the metadata for all existing items in the store. This is useful for operations
2828+ /// such as set differences (e.g., finding newly-added items).
2929+ async fn existing_item_metadata(&self) -> Result<Vec<ZoteroItemMetadata>, Self::StoreError>;
3030+ /// Performs a vector search on the store, returning the top `limit` results.
3131+ async fn vector_search(
3232+ &self,
3333+ query: String,
3434+ limit: usize,
3535+ reranker_config: Option<&RerankProviderConfig>,
3636+ ) -> Result<(Vec<ZoteroItem>, VectorSearchStats), Self::StoreError>;
3737+ /// Upserts the given items into the store.
3838+ async fn upsert_items(&self, items: Vec<ZoteroItem>) -> Result<(), Self::StoreError>;
3939+ /// Searches the store for items matching the given query, returning the top `limit` results.
4040+ /// This variant does not perform reranking.
4141+ async fn vector_search_raw(
4242+ &self,
4343+ query: &str,
4444+ limit: usize,
4545+ ) -> Result<Vec<ZoteroItem>, Self::StoreError>;
4646+ /// Returns the items with the given keys from the store. This is useful for retrieving
4747+ /// items by their library keys without performing a full text search.
4848+ async fn get_items_by_keys(&self, keys: &[String])
4949+ -> Result<Vec<ZoteroItem>, Self::StoreError>;
5050+ /// Deletes the items with the given keys from the store.
5151+ async fn delete_by_library_keys(&self, keys: &[String]) -> Result<(), Self::StoreError>;
5252+ /// Deletes duplicate items from the store based on their title. Returns the number of items
5353+ /// deleted.
5454+ async fn dedup_by_title(&self) -> Result<usize, Self::StoreError>;
5555+}
+310
zqa/src/store/lance.rs
···11+use std::sync::Arc;
22+33+use arrow_array::RecordBatch;
44+use arrow_schema::Schema;
55+use async_trait::async_trait;
66+use zqa_rag::{
77+ embedding::common::EmbeddingProviderConfig,
88+ reranking::common::{RerankProviderConfig, get_reranking_provider_with_config},
99+ vector::backends::{
1010+ backend::VectorBackend,
1111+ lance::{LanceBackend, LanceMetadata},
1212+ },
1313+};
1414+1515+use crate::store::common::VectorSearchStats;
1616+use crate::{
1717+ cli::errors::CLIError,
1818+ config::Config,
1919+ store::common::ZoteroStore,
2020+ utils::{
2121+ arrow::{DbFields, get_schema, library_to_arrow},
2222+ library::{ZoteroItem, ZoteroItemSet},
2323+ },
2424+};
2525+2626+/// Zotero-specific store backed by LanceDB.
2727+#[derive(Clone)]
2828+pub struct LanceZoteroStore {
2929+ backend: LanceBackend,
3030+ embedding_config: EmbeddingProviderConfig,
3131+}
3232+3333+impl LanceZoteroStore {
3434+ /// Create a new Lance-backed Zotero store from an existing backend and embedding config.
3535+ #[must_use]
3636+ fn new(backend: LanceBackend, embedding_config: EmbeddingProviderConfig) -> Self {
3737+ Self {
3838+ backend,
3939+ embedding_config,
4040+ }
4141+ }
4242+4343+ /// Create a Lance-backed Zotero store from an embedding config and Arrow schema.
4444+ #[must_use]
4545+ pub fn from_schema(embedding_config: EmbeddingProviderConfig, schema: Arc<Schema>) -> Self {
4646+ let backend = LanceBackend::new(
4747+ embedding_config.clone(),
4848+ schema,
4949+ DbFields::PdfText.as_ref().to_string(),
5050+ );
5151+5252+ Self::new(backend, embedding_config)
5353+ }
5454+5555+ /// Get a read-only embedding config
5656+ #[must_use]
5757+ pub fn get_embedding_config(&self) -> EmbeddingProviderConfig {
5858+ self.embedding_config.clone()
5959+ }
6060+6161+ /// Create a Lance-backed Zotero store from an embedding configuration.
6262+ pub async fn from_embedding_config(embedding_config: EmbeddingProviderConfig) -> Self {
6363+ let schema = Arc::new(get_schema(embedding_config.provider(), true).await);
6464+ Self::from_schema(embedding_config, schema)
6565+ }
6666+6767+ /// Create a Lance-backed Zotero store from the application config.
6868+ ///
6969+ /// # Errors
7070+ ///
7171+ /// Returns a [`CLIError`] if no embedding configuration is available.
7272+ pub(crate) async fn from_config(config: &Config) -> Result<Self, CLIError> {
7373+ let embedding_config = config.get_embedding_config().ok_or(CLIError::ConfigError(
7474+ "Could not get embedding config".into(),
7575+ ))?;
7676+7777+ Ok(Self::from_embedding_config(embedding_config).await)
7878+ }
7979+8080+ /// Upsert Arrow record batches into the LanceDB table by Zotero library key.
8181+ ///
8282+ /// TODO: We should probably deprecate this at some point in favor of the `upsert_items` from
8383+ /// the trait. I'm keeping this around for now to keep refactor scopes relatively manageable.
8484+ /// Ideally, we would not have any Lance-specific architecture, but currently, commands such as
8585+ /// `/process` rely on this.
8686+ ///
8787+ /// # Errors
8888+ ///
8989+ /// Returns a [`CLIError`] if LanceDB insertion fails.
9090+ pub(crate) async fn upsert_batches(&self, batches: Vec<RecordBatch>) -> Result<(), CLIError> {
9191+ self.backend
9292+ .insert_items(batches, Some(&[DbFields::LibraryKey.as_ref()]))
9393+ .await
9494+ .map_err(Into::into)
9595+ }
9696+9797+ /// Create or update retrieval indices for the LanceDB table.
9898+ ///
9999+ /// # Errors
100100+ ///
101101+ /// Returns a [`CLIError`] if index creation or update fails.
102102+ pub(crate) async fn create_or_update_indices(&self) -> Result<(), CLIError> {
103103+ self.backend
104104+ .create_or_update_indices(DbFields::PdfText.as_ref(), DbFields::Embeddings.as_ref())
105105+ .await
106106+ .map_err(Into::into)
107107+ }
108108+}
109109+110110+#[async_trait]
111111+impl ZoteroStore for LanceZoteroStore {
112112+ type StoreError = CLIError;
113113+ type Metadata = LanceMetadata;
114114+115115+ async fn exists(&self) -> bool {
116116+ self.backend.db_exists().await
117117+ }
118118+119119+ /// Perform vector search and optional reranking.
120120+ ///
121121+ /// # Errors
122122+ ///
123123+ /// Returns a [`CLIError`] if search or reranking fails.
124124+ async fn vector_search(
125125+ &self,
126126+ query: String,
127127+ limit: usize,
128128+ reranker_config: Option<&RerankProviderConfig>,
129129+ ) -> Result<(Vec<ZoteroItem>, VectorSearchStats), CLIError> {
130130+ let embedding_tokens = query.len();
131131+ let items = <Self as ZoteroStore>::vector_search_raw(self, &query, limit).await?;
132132+133133+ let filtered_items: Vec<ZoteroItem> = items
134134+ .into_iter()
135135+ .filter(|item| !item.text.trim().is_empty())
136136+ .collect();
137137+138138+ if filtered_items.is_empty() {
139139+ return Ok((
140140+ Vec::new(),
141141+ VectorSearchStats {
142142+ embedding_tokens,
143143+ rerank_tokens: 0,
144144+ },
145145+ ));
146146+ }
147147+148148+ let Some(reranker) = reranker_config else {
149149+ return Ok((
150150+ filtered_items,
151151+ VectorSearchStats {
152152+ embedding_tokens,
153153+ rerank_tokens: 0,
154154+ },
155155+ ));
156156+ };
157157+158158+ let rerank_provider = get_reranking_provider_with_config(reranker)?;
159159+ let item_strings = filtered_items
160160+ .iter()
161161+ .map(|f| f.text.as_str())
162162+ .collect::<Vec<_>>();
163163+164164+ let rerank_tokens = item_strings.iter().map(|s| s.len()).sum::<usize>() + query.len();
165165+ let indices = rerank_provider.rerank(&item_strings, &query).await?;
166166+167167+ let reranked_items = indices
168168+ .into_iter()
169169+ .filter_map(|idx| filtered_items.get(idx).cloned())
170170+ .collect();
171171+172172+ Ok((
173173+ reranked_items,
174174+ VectorSearchStats {
175175+ embedding_tokens,
176176+ rerank_tokens,
177177+ },
178178+ ))
179179+ }
180180+181181+ /// Return metadata for Zotero items that already exist in the store.
182182+ ///
183183+ /// # Errors
184184+ ///
185185+ /// Returns a [`CLIError`] if the existing rows cannot be fetched.
186186+ async fn existing_item_metadata(
187187+ &self,
188188+ ) -> Result<Vec<crate::utils::library::ZoteroItemMetadata>, CLIError> {
189189+ let db_items = self
190190+ .backend
191191+ .get_items(&[
192192+ DbFields::LibraryKey.into(),
193193+ DbFields::Title.into(),
194194+ DbFields::FilePath.into(),
195195+ ])
196196+ .await?;
197197+198198+ Ok(db_items
199199+ .iter()
200200+ .flat_map(|batch| {
201201+ let library_keys = crate::utils::library::get_column_from_batch(batch, 0);
202202+ let titles = crate::utils::library::get_column_from_batch(batch, 1);
203203+ let file_paths = crate::utils::library::get_column_from_batch(batch, 2);
204204+205205+ crate::izip!(library_keys, titles, file_paths)
206206+ .map(
207207+ |(key, title, path)| crate::utils::library::ZoteroItemMetadata {
208208+ library_key: key,
209209+ title,
210210+ file_path: std::path::PathBuf::from(path),
211211+ authors: None,
212212+ },
213213+ )
214214+ .collect::<Vec<_>>()
215215+ })
216216+ .collect())
217217+ }
218218+219219+ /// Return metadata for the underlying LanceDB table.
220220+ ///
221221+ /// # Errors
222222+ ///
223223+ /// Returns a [`CLIError`] if LanceDB metadata could not be read.
224224+ async fn get_metadata(&self) -> Result<LanceMetadata, CLIError> {
225225+ self.backend.get_metadata().await.map_err(Into::into)
226226+ }
227227+228228+ /// Upserts the given items into the store.
229229+ ///
230230+ /// # Arguments
231231+ ///
232232+ /// * `items` - The items to upsert.
233233+ ///
234234+ /// # Errors
235235+ ///
236236+ /// Returns a [`CLIError`] if the upsert fails.
237237+ async fn upsert_items(&self, items: Vec<ZoteroItem>) -> Result<(), Self::StoreError> {
238238+ let include_embeddings = self.exists().await;
239239+ let batch =
240240+ library_to_arrow(items, self.embedding_config.clone(), include_embeddings).await?;
241241+ self.upsert_batches(vec![batch]).await
242242+ }
243243+244244+ /// Performs a raw vector search on the store, returning the top `limit` results.
245245+ ///
246246+ /// # Arguments
247247+ ///
248248+ /// * `query` - The query string.
249249+ /// * `limit` - The maximum number of results to return.
250250+ ///
251251+ /// # Errors
252252+ ///
253253+ /// Returns a [`CLIError`] if the search fails.
254254+ async fn vector_search_raw(
255255+ &self,
256256+ query: &str,
257257+ limit: usize,
258258+ ) -> Result<Vec<ZoteroItem>, Self::StoreError> {
259259+ let batches = self.backend.vector_search(query.to_string(), limit).await?;
260260+ Ok(ZoteroItemSet::from(batches).into())
261261+ }
262262+263263+ /// Returns the items with the given keys from the store.
264264+ ///
265265+ /// # Arguments
266266+ ///
267267+ /// * `keys` - The keys of the items to return.
268268+ ///
269269+ /// # Errors
270270+ ///
271271+ /// Returns a [`CLIError`] if the search fails.
272272+ async fn get_items_by_keys(
273273+ &self,
274274+ keys: &[String],
275275+ ) -> Result<Vec<ZoteroItem>, Self::StoreError> {
276276+ let batches = self
277277+ .backend
278278+ .search_by_column(DbFields::LibraryKey.as_ref(), keys)
279279+ .await?;
280280+ Ok(ZoteroItemSet::from(batches).into())
281281+ }
282282+283283+ /// Deletes the items with the given keys from the store.
284284+ ///
285285+ /// # Arguments
286286+ ///
287287+ /// * `keys` - The keys of the items to delete.
288288+ ///
289289+ /// # Errors
290290+ ///
291291+ /// Returns a [`CLIError`] if the deletion fails.
292292+ async fn delete_by_library_keys(&self, keys: &[String]) -> Result<(), Self::StoreError> {
293293+ self.backend
294294+ .delete_rows(DbFields::LibraryKey.as_ref(), keys)
295295+ .await
296296+ .map_err(Into::into)
297297+ }
298298+299299+ /// Deduplicates items in the store by title, keeping the first occurrence.
300300+ ///
301301+ /// # Errors
302302+ ///
303303+ /// Returns a [`CLIError`] if the deduplication fails.
304304+ async fn dedup_by_title(&self) -> Result<usize, Self::StoreError> {
305305+ self.backend
306306+ .dedup_rows(DbFields::Title.as_ref(), DbFields::LibraryKey.as_ref())
307307+ .await
308308+ .map_err(Into::into)
309309+ }
310310+}
+4
zqa/src/store/mod.rs
···11+//! Zotero store implementations. This module acts as a bridge between this crate and the backends in `zqa_rag`.
22+33+pub mod common;
44+pub mod lance;
+40-35
zqa/src/tools/retrieval.rs
···99use schemars::{JsonSchema, schema_for};
1010use serde::Deserialize;
1111use serde_json::json;
1212-use zqa_rag::{
1313- llm::tools::Tool, reranking::common::RerankProviderConfig,
1414- vector::backends::lance::LanceBackend,
1515-};
1212+use zqa_rag::{llm::tools::Tool, reranking::common::RerankProviderConfig};
16131414+use crate::store::common::ZoteroStore;
1715use crate::utils::{
1818- arrow::vector_search,
1916 library::get_authors,
2017 terminal::{DIM_TEXT, RESET},
2118};
···24212522/// A tool to perform vector search and reranking.
2623#[derive(Debug)]
2727-pub(crate) struct RetrievalTool {
2828- /// The backend used for vector search.
2929- pub(crate) backend: LanceBackend,
2424+pub(crate) struct RetrievalTool<T>
2525+where
2626+ T: ZoteroStore,
2727+{
2828+ /// The vector store abstraction
2929+ pub(crate) store: Arc<T>,
3030 /// The reranker provider to use.
3131 pub(crate) reranker_config: Option<RerankProviderConfig>,
3232- /// Accumulated character count of text sent to the embedding API across all calls.
3333- pub(crate) embedding_chars: Arc<AtomicU64>,
3434- /// Accumulated character count of text sent to the reranker API across all calls.
3535- pub(crate) rerank_chars: Arc<AtomicU64>,
3232+ /// Accumulated token count of text sent to the embedding API across all calls.
3333+ pub(crate) embedding_tokens: Arc<AtomicU64>,
3434+ /// Accumulated token count of text sent to the reranker API across all calls.
3535+ pub(crate) rerank_tokens: Arc<AtomicU64>,
3636}
37373838-impl RetrievalTool {
3838+impl<T> RetrievalTool<T>
3939+where
4040+ T: ZoteroStore,
4141+{
3942 /// Create a new instance of the [`RetrievalTool`] given a backend and reranker config.
4040- pub(crate) fn new(
4141- backend: LanceBackend,
4242- reranker_provider: Option<RerankProviderConfig>,
4343- ) -> Self {
4343+ pub(crate) fn new(store: Arc<T>, reranker_provider: Option<RerankProviderConfig>) -> Self {
4444 Self {
4545- backend,
4545+ store,
4646 reranker_config: reranker_provider,
4747- embedding_chars: Arc::new(AtomicU64::new(0)),
4848- rerank_chars: Arc::new(AtomicU64::new(0)),
4747+ embedding_tokens: Arc::new(AtomicU64::new(0)),
4848+ rerank_tokens: Arc::new(AtomicU64::new(0)),
4949 }
5050 }
5151}
···5656 pub query: String,
5757}
58585959-impl Tool for RetrievalTool {
5959+impl<T> Tool for RetrievalTool<T>
6060+where
6161+ T: ZoteroStore + 'static,
6262+{
6063 fn name(&self) -> String {
6164 RETRIEVAL_TOOL_NAME.into()
6265 }
···8992 ) -> std::pin::Pin<Box<dyn Future<Output = Result<serde_json::Value, String>> + Send + '_>>
9093 {
9194 let start = Instant::now();
9292- let backend = self.backend.clone();
9395 let reranker_config = self.reranker_config.clone();
9494- let embedding_chars = Arc::clone(&self.embedding_chars);
9595- let rerank_chars = Arc::clone(&self.rerank_chars);
9696+ let embedding_tokens = Arc::clone(&self.embedding_tokens);
9797+ let rerank_tokens = Arc::clone(&self.rerank_tokens);
9898+ let store = Arc::clone(&self.store);
969997100 Box::pin(async move {
98101 let input: RetrievalToolInput =
99102 serde_json::from_value(args).map_err(|e| format!("Invalid arguments: {e}"))?;
100100- let (mut results, stats) =
101101- vector_search(input.query, &backend, reranker_config.as_ref())
102102- .await
103103- .map_err(|e| format!("Search failed: {e}"))?;
104104- embedding_chars.fetch_add(stats.embedding_chars as u64, Ordering::Relaxed);
105105- rerank_chars.fetch_add(stats.rerank_chars as u64, Ordering::Relaxed);
103103+ let (mut results, stats) = store
104104+ .vector_search(input.query, 10, reranker_config.as_ref())
105105+ .await
106106+ .map_err(|e| format!("Search failed: {e}"))?;
107107+ embedding_tokens.fetch_add(stats.embedding_tokens as u64, Ordering::Relaxed);
108108+ rerank_tokens.fetch_add(stats.rerank_tokens as u64, Ordering::Relaxed);
106109107110 get_authors(&mut results).map_err(|e| format!("Failed to get authors: {e}"))?;
108111 log::info!(
···142145 DEFAULT_VOYAGE_EMBEDDING_DIM, DEFAULT_VOYAGE_EMBEDDING_MODEL, DEFAULT_VOYAGE_RERANK_MODEL,
143146 };
144147 use zqa_rag::embedding::common::EmbeddingProviderConfig;
145145- use zqa_rag::vector::backends::lance::LanceBackend;
146148147149 use super::*;
150150+ use crate::LanceZoteroStore;
148151149149- fn make_tool() -> RetrievalTool {
152152+ fn make_tool() -> RetrievalTool<LanceZoteroStore> {
150153 let config = zqa_rag::config::VoyageAIConfig {
151154 api_key: String::new(),
152155 embedding_model: DEFAULT_VOYAGE_EMBEDDING_MODEL.into(),
···159162 arrow_schema::Field::new("file_path", arrow_schema::DataType::Utf8, false),
160163 arrow_schema::Field::new("pdf_text", arrow_schema::DataType::Utf8, false),
161164 ]));
162162- let backend = LanceBackend::new(
165165+ let store = LanceZoteroStore::from_schema(
163166 EmbeddingProviderConfig::VoyageAI(config.clone()),
164167 schema,
165165- "pdf_text".into(),
166168 );
167167- RetrievalTool::new(backend, Some(RerankProviderConfig::VoyageAI(config)))
169169+ RetrievalTool::new(
170170+ Arc::new(store),
171171+ Some(RerankProviderConfig::VoyageAI(config)),
172172+ )
168173 }
169174170175 #[test]
+41-39
zqa/src/tools/summarization.rs
···88use serde::Deserialize;
99use serde_json::json;
1010use tokio::task::JoinSet;
1111-use zqa_rag::{
1212- llm::{
1313- base::{ApiClient, ChatRequest, CompletionApiResponse},
1414- errors::LLMError,
1515- factory::LLMClient,
1616- tools::Tool,
1717- },
1818- vector::backends::{backend::VectorBackend, lance::LanceBackend},
1111+use zqa_rag::llm::{
1212+ base::{ApiClient, ChatRequest, CompletionApiResponse},
1313+ errors::LLMError,
1414+ factory::LLMClient,
1515+ tools::Tool,
1916};
20172118use crate::{
2219 cli::prompts::get_extraction_prompt,
2323- utils::{
2424- arrow::DbFields,
2525- library::{ZoteroItem, ZoteroItemSet},
2626- rag::ModelResponse,
2727- },
2020+ store::common::ZoteroStore,
2121+ utils::{library::ZoteroItem, rag::ModelResponse},
2822};
29233024pub(crate) const SUMMARIZATION_TOOL_NAME: &str = "summarization_tool";
31253226/// A tool to summarize Zotero papers with a specified ID.
3327#[derive(Debug, Clone)]
3434-pub(crate) struct SummarizationTool {
2828+pub(crate) struct SummarizationTool<T: ZoteroStore> {
3529 pub(crate) llm_client: LLMClient,
3630 /// Backend for searching stored Zotero papers.
3737- pub(crate) backend: LanceBackend,
3131+ pub(crate) store: Arc<T>,
3832 /// The input tokens used
3933 pub(crate) input_tokens: Arc<Mutex<u32>>,
4034 /// The output tokens used
4135 pub(crate) output_tokens: Arc<Mutex<u32>>,
4236}
43374444-impl SummarizationTool {
3838+impl<T> SummarizationTool<T>
3939+where
4040+ T: ZoteroStore,
4141+{
4542 /// Create a new [`SummarizationTool`] instance, given an LLM client and a backend.
4646- pub fn new(llm_client: LLMClient, backend: LanceBackend) -> Self {
4343+ pub fn new(llm_client: LLMClient, store: Arc<T>) -> Self {
4744 Self {
4845 llm_client,
4949- backend,
4646+ store,
5047 input_tokens: Arc::new(Mutex::new(0)),
5148 output_tokens: Arc::new(Mutex::new(0)),
5249 }
···6259 ids: Vec<String>,
6360}
64616565-impl Tool for SummarizationTool {
6262+impl<T> Tool for SummarizationTool<T>
6363+where
6464+ T: ZoteroStore + 'static,
6565+{
6666 fn name(&self) -> String {
6767 SUMMARIZATION_TOOL_NAME.into()
6868 }
···8888 /// A JSON object with a `"summaries"` key mapping to a list of summary strings,
8989 /// one per successfully processed paper, and an `"errors"` key mapping to a list
9090 /// of error messages for papers that failed to summarize.
9191- fn call<'a>(
9292- &'a self,
9191+ fn call(
9292+ &self,
9393 args: serde_json::Value,
9494- ) -> Pin<Box<dyn Future<Output = Result<serde_json::Value, String>> + Send + 'a>> {
9494+ ) -> Pin<Box<dyn Future<Output = Result<serde_json::Value, String>> + Send + '_>> {
9595+ let store = Arc::clone(&self.store);
9696+ let input_tokens = Arc::clone(&self.input_tokens);
9797+ let output_tokens = Arc::clone(&self.output_tokens);
9898+ let llm_client = self.llm_client.clone();
9599 Box::pin(async move {
96100 let input: SummarizationToolInput =
97101 serde_json::from_value(args).map_err(|e| format!("Invalid arguments: {e}"))?;
981029999- let results = self
100100- .backend
101101- .search_by_column(DbFields::LibraryKey.as_ref(), &input.ids)
103103+ let results: Vec<ZoteroItem> = store
104104+ .get_items_by_keys(&input.ids)
102105 .await
103106 .map_err(|e| format!("Search failed: {e}"))?;
104107105105- let batches: ZoteroItemSet = results.into();
106106- let items: Vec<ZoteroItem> = batches.into();
107107-108108 let mut set = JoinSet::new();
109109- for item in items {
110110- let client = self.llm_client.clone();
109109+ for item in results {
110110+ let client = llm_client.clone();
111111 let text = item.text;
112112 let metadata = item.metadata;
113113 let query_cloned = input.query.clone();
···140140 summaries.push(summary);
141141142142 // Update token counts (with error handling for mutex poisoning)
143143- if let Ok(mut input_tokens) = self.input_tokens.lock() {
144144- *input_tokens += response.input_tokens;
143143+ if let Ok(mut toks) = input_tokens.lock() {
144144+ *toks += response.input_tokens;
145145 }
146146- if let Ok(mut output_tokens) = self.output_tokens.lock() {
147147- *output_tokens += response.output_tokens;
146146+ if let Ok(mut toks) = output_tokens.lock() {
147147+ *toks += response.output_tokens;
148148 }
149149 }
150150 Err(e) => {
···175175 config::{AnthropicConfig, LLMClientConfig},
176176 constants::DEFAULT_ANTHROPIC_MODEL_SMALL,
177177 llm::factory::get_client_with_config,
178178- vector::backends::lance::LanceBackend,
179178 };
180179181180 use super::*;
182182- use crate::cli::app::tests::{create_test_context, get_config};
183181 use crate::cli::handlers::library::handle_process_cmd;
182182+ use crate::{
183183+ cli::app::tests::{create_test_context, get_config},
184184+ store::lance::LanceZoteroStore,
185185+ };
184186185185- fn make_tool() -> SummarizationTool {
187187+ fn make_tool() -> SummarizationTool<LanceZoteroStore> {
186188 let client = get_client_with_config(&LLMClientConfig::Anthropic(AnthropicConfig {
187189 api_key: env::var("ANTHROPIC_API_KEY").unwrap(),
188190 model: DEFAULT_ANTHROPIC_MODEL_SMALL.into(),
···199201 arrow_schema::Field::new("file_path", arrow_schema::DataType::Utf8, false),
200202 arrow_schema::Field::new("pdf_text", arrow_schema::DataType::Utf8, false),
201203 ]));
202202- let backend = LanceBackend::new(embedding_config, schema, "pdf_text".into());
203203- SummarizationTool::new(client, backend)
204204+ let store = LanceZoteroStore::from_schema(embedding_config, schema);
205205+ SummarizationTool::new(client, Arc::new(store))
204206 }
205207206208 #[test]
+35-122
zqa/src/utils/arrow.rs
···11-use std::sync::Arc;
11+use std::{path::PathBuf, sync::Arc};
2233use arrow_array::{ArrayRef, RecordBatch, StringArray, cast::AsArray};
44use arrow_schema;
···99 EmbeddingProviderConfig, get_embedding_dims_by_provider, get_embedding_provider_with_config,
1010 },
1111 llm::errors::LLMError,
1212- reranking::common::{RerankProviderConfig, get_reranking_provider_with_config},
1313- vector::backends::{
1414- backend::VectorBackend,
1515- lance::{LanceBackend, LanceError, db_exists as lancedb_exists},
1616- },
1212+ vector::backends::lance::{LANCE_TABLE_NAME, LanceError, get_db_uri},
1713};
18141915use super::library::{LibraryParsingError, parse_library};
2020-use crate::utils::library::{ZoteroItem, ZoteroItemSet};
1616+use crate::{store::lance::LanceZoteroStore, utils::library::ZoteroItem};
21172218/// An enum containing the fields stored by our application in `LanceDB`, in order. Implementations
2319/// `as_ref()` and `into()` are provided to convert this to `&str` and `String` respectively.
···9288 }
9389}
94909191+/// Checks whether the configured LanceDB database exists and contains the expected table.
9292+pub(crate) async fn lancedb_exists() -> bool {
9393+ let uri = get_db_uri();
9494+ if !PathBuf::from(&uri).exists() {
9595+ return false;
9696+ }
9797+9898+ if let Ok(db) = lancedb::connect(&uri).execute().await {
9999+ db.open_table(LANCE_TABLE_NAME).execute().await.is_ok()
100100+ } else {
101101+ false
102102+ }
103103+}
104104+95105/// Get the schema for our `LanceDB` table. This is required for both getting library items and
96106/// checkhealth.
97107///
98108/// # Arguments
99109///
100110/// * `embedding_provider` - The embedding used by the current DB.
111111+/// * `include_embeddings` - Whether to include the embeddings field in the schema.
101112///
102113/// # Returns
103114///
104115/// The schema in Arrow format.
105105-pub async fn get_schema(embedding_provider: EmbeddingProvider) -> arrow_schema::Schema {
116116+pub async fn get_schema(
117117+ embedding_provider: EmbeddingProvider,
118118+ include_embeddings: bool,
119119+) -> arrow_schema::Schema {
106120 // Convert ZoteroItemMetadata to something that can be converted to Arrow
107121 // Need to extract fields and create appropriate Arrow arrays
108122 let mut schema_fields = vec![
···112126 arrow_schema::Field::new(DbFields::PdfText, arrow_schema::DataType::Utf8, false),
113127 ];
114128115115- if lancedb_exists().await {
129129+ if include_embeddings {
116130 schema_fields.push(arrow_schema::Field::new(
117131 DbFields::Embeddings,
118132 arrow_schema::DataType::FixedSizeList(
···137151///
138152/// * `items` - The items to convert to a `RecordBatch`
139153/// * `embedding_config` - Configuration for the embedding provider to use when computing embeddings.
154154+/// * `include_embeddings` - Whether to include the embeddings field in the schema.
140155///
141156/// # Errors
142157///
···150165pub async fn library_to_arrow(
151166 items: Vec<ZoteroItem>,
152167 embedding_config: EmbeddingProviderConfig,
168168+ include_embeddings: bool,
153169) -> Result<RecordBatch, ArrowError> {
154154- let schema = Arc::new(get_schema(embedding_config.provider()).await);
170170+ let schema = Arc::new(get_schema(embedding_config.provider(), include_embeddings).await);
155171156172 // Convert ZoteroItemMetadata to Arrow arrays
157173 let library_keys = StringArray::from(
···194210 Arc::new(pdf_texts.clone()) as ArrayRef,
195211 ];
196212197197- if lancedb_exists().await {
213213+ if include_embeddings {
198214 let embedding_provider = get_embedding_provider_with_config(&embedding_config)?;
199215 let query_vec = embedding_provider.compute_source_embeddings(Arc::new(pdf_texts))?;
200216 let query_vec = query_vec.as_fixed_size_list();
···232248///
233249/// # Arguments
234250///
235235-/// * `config` - Configuration containing embedding provider information.
251251+/// * `store` - [`LanceZoteroStore`] with configuration
236252/// * `start_from` - An optional offset for the SQL query. Useful for debugging, pagination,
237253/// multi-threading, etc.
238254/// * `limit` - Optional limit, meant to be used in conjunction with `start_from`.
239255pub async fn full_library_to_arrow(
240240- backend: &LanceBackend,
256256+ store: &LanceZoteroStore,
241257 start_from: Option<usize>,
242258 limit: Option<usize>,
243259) -> Result<RecordBatch, ArrowError> {
244244- let lib_items = parse_library(backend, start_from, limit).await?;
260260+ let lib_items = parse_library(store, start_from, limit).await?;
245261 log::info!("Finished parsing library items.");
246262247247- library_to_arrow(lib_items, backend.embedding_config().clone()).await
248248-}
249249-250250-/// Statistics about the characters processed in a vector search call, used for cost estimation.
251251-pub struct VectorSearchStats {
252252- /// Number of characters in the query string that was embedded.
253253- pub embedding_chars: usize,
254254- /// Total characters of documents + query sent to the reranker (0 if no reranker was used).
255255- pub rerank_chars: usize,
256256-}
257257-258258-/// Perform vector search using a query and a specified embedding method.
259259-///
260260-/// This function is a Zotero-specific wrapper for the `vector_search` function in the `rag` crate.
261261-/// It is implemented here since the knowledge of which column is which in the `RecordBatch`es that
262262-/// we create is in this file, so there's better locality-of-behaviour; this also makes the
263263-/// underlying implementation of `vector_search` simpler and potentially allows other RAG
264264-/// applications to be built on top of it.
265265-///
266266-/// TODO: A limit of 10 results is currently returned, but this will be changed in a future version.
267267-///
268268-/// In some sense, this function is the reverse of the `library_to_arrow` function, which creates a
269269-/// `RecordBatch` from vectors after calling `parse_library`.
270270-///
271271-/// This function also uses a reranking provider to perform reranking of the vector search results.
272272-///
273273-/// # Arguments
274274-///
275275-/// * `query` - The query to search the `LanceDB` table for.
276276-/// * `embedding_config` - The embedding provider configuration. Note that this must be the same
277277-/// embedding provider used when initially creating the database.
278278-/// * `reranker_config` - The reranker provider to use.
279279-///
280280-/// # Returns
281281-///
282282-/// A tuple of the matching `ZoteroItem`s and [`VectorSearchStats`] with character counts used for
283283-/// cost estimation. Returns an `ArrowError` that wraps the underlying `LanceError` if the `rag`
284284-/// crate's `vector_search` is unsuccessful for any reason.
285285-///
286286-/// # Errors
287287-///
288288-/// * `ArrowError::LanceError` if vector search fails.
289289-/// * `ArrowError::LLMError` if reranking fails.
290290-pub async fn vector_search(
291291- query: String,
292292- backend: &LanceBackend,
293293- reranker_config: Option<&RerankProviderConfig>,
294294-) -> Result<(Vec<ZoteroItem>, VectorSearchStats), ArrowError> {
295295- let embedding_chars = query.len();
296296- let batches = backend.vector_search(query.clone(), 10).await?;
297297-298298- let items: ZoteroItemSet = batches.into();
299299- let items: Vec<ZoteroItem> = items.into();
300300-301301- let filtered_items: Vec<ZoteroItem> = items
302302- .into_iter()
303303- .filter(|item| !item.text.trim().is_empty())
304304- .collect();
305305-306306- if filtered_items.is_empty() {
307307- return Ok((
308308- Vec::new(),
309309- VectorSearchStats {
310310- embedding_chars,
311311- rerank_chars: 0,
312312- },
313313- ));
314314- }
315315-316316- let Some(reranker) = reranker_config else {
317317- return Ok((
318318- filtered_items,
319319- VectorSearchStats {
320320- embedding_chars,
321321- rerank_chars: 0,
322322- },
323323- ));
324324- };
325325-326326- let rerank_provider = get_reranking_provider_with_config(reranker)?;
327327- let item_strings = filtered_items
328328- .iter()
329329- .map(|f| f.text.as_str())
330330- .collect::<Vec<_>>();
331331-332332- let rerank_chars = item_strings.iter().map(|s| s.len()).sum::<usize>() + query.len();
333333-334334- let indices = rerank_provider.rerank(&item_strings, &query).await?;
335335-336336- let reranked_items = indices
337337- .into_iter()
338338- .filter_map(|idx| filtered_items.get(idx).cloned())
339339- .collect();
340340-341341- Ok((
342342- reranked_items,
343343- VectorSearchStats {
344344- embedding_chars,
345345- rerank_chars,
346346- },
347347- ))
263263+ let include_embeddings = lancedb_exists().await;
264264+ library_to_arrow(lib_items, store.get_embedding_config(), include_embeddings).await
348265}
349266350267#[cfg(test)]
···393310394311 let record_batch = temp_env::async_with_vars([("LANCEDB_URI", Some(&db_uri))], async {
395312 let embedding_config = config.get_embedding_config().unwrap();
396396- let schema = Arc::new(get_schema(embedding_config.provider()).await);
397397- let backend = LanceBackend::new(
398398- embedding_config,
399399- schema,
400400- DbFields::PdfText.as_ref().to_string(),
401401- );
402402- full_library_to_arrow(&backend, Some(0), Some(5)).await
313313+ let schema = Arc::new(get_schema(embedding_config.provider(), true).await);
314314+ let store = LanceZoteroStore::from_schema(embedding_config, schema);
315315+ full_library_to_arrow(&store, Some(0), Some(5)).await
403316 })
404317 .await;
405318
+27-48
zqa/src/utils/library.rs
···1010use std::thread;
1111use std::time::Instant;
12121313-use arrow_array::RecordBatch;
1313+use arrow_array::{RecordBatch, cast::AsArray};
1414use directories::UserDirs;
1515use indicatif::{MultiProgress, ProgressBar, ProgressStyle};
1616use rusqlite::Connection;
1717use serde::Serialize;
1818use thiserror::Error;
1919use zqa_pdftools::parse::extract_text;
2020-use zqa_rag::vector::backends::{
2121- backend::VectorBackend,
2222- lance::{LanceBackend, LanceError, get_column_from_batch},
2323-};
24202525-use crate::izip;
2626-use crate::utils::arrow::DbFields;
2121+use crate::store::common::ZoteroStore;
2222+use crate::{izip, utils::arrow::DbFields};
27232824/// Gets the Zotero library path. Works on Linux, macOS, and Windows systems.
2925/// On CI environments, returns a location to a toy library in assets/ instead.
···153149 }
154150}
155151156156-impl From<LanceError> for LibraryParsingError {
157157- fn from(value: LanceError) -> Self {
158158- LibraryParsingError::LanceDBError(value.to_string())
159159- }
160160-}
161161-162152impl From<Box<dyn std::error::Error>> for LibraryParsingError {
163153 fn from(value: Box<dyn std::error::Error>) -> Self {
164154 LibraryParsingError::PdfParsingError(value.to_string())
165155 }
166156}
167157158158+/// From a `RecordBatch`, return all values from a specified column as a `Vec<String>`.
159159+#[must_use]
160160+pub(crate) fn get_column_from_batch(batch: &RecordBatch, column: usize) -> Vec<String> {
161161+ let results = batch.column(column).as_string::<i32>();
162162+163163+ results
164164+ .iter()
165165+ .filter_map(|s| Some(s?.to_string()))
166166+ .collect()
167167+}
/// Assuming a `LanceDB` database already exists, returns a list of items present in the Zotero
/// library but not in the database. The primary use case for this is to update the DB with new
/// items. Note that this does not take into account removed items.
···182183/// * `LibraryParsingError::SqliteError` if the library path was not found, the query could not be prepared, or
183184/// columns from the result set could not be parsed, or `query_map` fails.
/// * `LibraryParsingError::LanceDBError` if fetching the existing item metadata from the store fails.
185185-pub async fn get_new_library_items(
186186- backend: &LanceBackend,
186186+pub async fn get_new_library_items<T: ZoteroStore>(
187187+ store: &T,
187188) -> Result<Vec<ZoteroItemMetadata>, LibraryParsingError> {
188188- let db_items = backend
189189- .get_items(&[
190190- DbFields::LibraryKey.into(),
191191- DbFields::Title.into(),
192192- DbFields::FilePath.into(),
193193- ])
194194- .await?;
195195-196196- let metadata_vecs = db_items
197197- .iter()
198198- .flat_map(|batch| {
199199- let library_keys = get_column_from_batch(batch, 0);
200200- let titles = get_column_from_batch(batch, 1);
201201- let file_paths = get_column_from_batch(batch, 2);
202202-203203- let zipped = izip!(library_keys, titles, file_paths).collect::<Vec<_>>();
204204- zipped
205205- .iter()
206206- .map(|(key, title, path)| ZoteroItemMetadata {
207207- library_key: key.clone(),
208208- title: title.clone(),
209209- file_path: PathBuf::from(path.clone()),
210210- authors: None,
211211- })
212212- .collect::<Vec<_>>()
213213- })
214214- .collect::<Vec<_>>();
189189+ let metadata_vecs = store
190190+ .existing_item_metadata()
191191+ .await
192192+ .map_err(|e| LibraryParsingError::LanceDBError(e.to_string()))?;
215193216194 let library_items = parse_library_metadata(None, None)?;
217195···425403/// * If a Mutex lock could not be acquired on the progress bar.
426404/// * If the threads could not be joined.
427405#[allow(clippy::too_many_lines)]
428428-pub async fn parse_library(
429429- backend: &LanceBackend,
406406+pub async fn parse_library<T: ZoteroStore>(
407407+ store: &T,
430408 start_from: Option<usize>,
431409 limit: Option<usize>,
432410) -> Result<Vec<ZoteroItem>, LibraryParsingError> {
433411 let start_time = Instant::now();
434412435435- let metadata = if backend.db_exists().await {
436436- get_new_library_items(backend).await?
413413+ let metadata = if store.exists().await {
414414+ get_new_library_items(store).await?
437415 } else {
438416 parse_library_metadata(start_from, limit)?
439417 };
···643621 };
644622645623 use super::*;
624624+ use crate::LanceZoteroStore;
646625 use crate::common::setup_logger;
647626648627 #[test]
···732711 arrow_schema::Field::new("file_path", arrow_schema::DataType::Utf8, false),
733712 arrow_schema::Field::new("pdf_text", arrow_schema::DataType::Utf8, false),
734713 ]));
735735- let backend = LanceBackend::new(embedding_config, schema, "pdf_text".into());
736736- let items = parse_library(&backend, Some(0), Some(7)).await;
714714+ let store = LanceZoteroStore::from_schema(embedding_config, schema);
715715+ let items = parse_library(&store, Some(0), Some(7)).await;
737716 test_ok!(items);
738717739718 // Two of the items in the toy library are HTML files, so we actually
+3-15
zqa/tests/new_library.rs
···44use log::LevelFilter;
55use zqa::common::setup_logger;
66use zqa::config::{AnthropicConfig, Config, VoyageAIConfig};
77-use zqa::full_library_to_arrow;
77+use zqa::{LanceZoteroStore, full_library_to_arrow};
88use zqa_macros::test_ok;
99use zqa_rag::capabilities::{EmbeddingProvider, ModelProvider, RerankerProvider};
1010use zqa_rag::constants::{
1111 DEFAULT_MAX_CONCURRENT_REQUESTS, DEFAULT_MAX_RETRIES, DEFAULT_VOYAGE_EMBEDDING_DIM,
1212};
1313-use zqa_rag::vector::backends::{backend::VectorBackend, lance::LanceBackend};
14131514#[tokio::test]
1615async fn test_integration_works() {
···5049 };
51505251 let embedding_config = config.get_embedding_config().unwrap();
5353- let schema = zqa::utils::arrow::get_schema(embedding_config.provider()).await;
5454- let backend = LanceBackend::new(
5555- embedding_config,
5656- std::sync::Arc::new(schema),
5757- "pdf_text".into(),
5858- );
5252+ let store = LanceZoteroStore::from_embedding_config(embedding_config).await;
59536060- let record_batch = full_library_to_arrow(&backend, None, None).await;
5454+ let record_batch = full_library_to_arrow(&store, None, None).await;
6155 test_ok!(record_batch);
6262-6363- let record_batch = record_batch.unwrap();
6464- let batches = vec![record_batch.clone()];
6565- let db = backend.insert_items(batches, None).await;
6666-6767- test_ok!(db);
6856}