update-doc

lightonai · Sep 11, 2024 · 4633bde · 4633bde
1 parent d2db110
commit 4633bde
Show file tree

Hide file tree

Showing 5 changed files with 137 additions and 109 deletions.
diff --git a/docs/documentation/graph.md b/docs/documentation/graph.md
@@ -0,0 +1,100 @@
+## Graph
+
+The `search.graphs` function can be used to search documents with a graph-based query. This function is useful if we have paired documents and queries. The search will retrieve the set of documents and queries that match the input query. Then it will build a graph and compute the weight of each document using a graph-based scoring function.
+
+The `search.graphs` function is much slower than the `search.documents` function, but it might provide better results when a decent amount of paired documents and queries is available.
+
+### Documents queries interactions
+
+We can upload document-query interactions in order to call the `search.graphs` function. The following example demonstrates how to upload document-query interactions:
+
+```python
+from ducksearch import search, upload
+
+documents = [
+    {
+        "id": 0,
+        "title": "Hotel California",
+        "style": "rock",
+        "date": "1977-02-22",
+        "popularity": 9,
+    },
+    {
+        "id": 1,
+        "title": "Here Comes the Sun",
+        "style": "rock",
+        "date": "1969-06-10",
+        "popularity": 10,
+    },
+    {
+        "id": 2,
+        "title": "Alive",
+        "style": "electro, punk",
+        "date": "2007-11-19",
+        "popularity": 9,
+    },
+]
+
+upload.documents(
+    database="ducksearch.duckdb",
+    key="id",
+    fields=["title", "style", "date", "popularity"],
+    documents=documents,
+    dtypes={
+        "date": "DATE",
+        "popularity": "INT",
+    },
+)
+
+# Mapping between documents ids and queries
+documents_queries = {
+    0: ["the beatles", "rock band"],
+    1: ["rock band", "california"],
+    2: ["daft"],
+}
+
+upload.queries(
+	database="ducksearch.duckdb",
+	documents_queries=documents_queries,
+)
+```
+
+???+ tip
+    We can also write the document-query mapping as a dictionary of dictionaries that gives the weight between each document and each query. The weight is used to compute the score in the `search.graphs` function:
+
+    ```python
+    documents_queries = {
+        0: {"the beatles": 30, "rock band": 10},
+        1: {"rock band": 10, "california": 1},
+        2: {"daft": 60},
+    }
+    ```
+
+    When the weight is not specified, the default value is 1.
+
+### Search Graphs
+
+The following example demonstrates how to search documents with a graph-based query:
+
+```python
+from ducksearch import search
+
+search.graphs(
+	database="ducksearch.duckdb",
+	queries="daft punk",
+	top_k=10,
+)
+```
+
+```python
+[
+    {
+        "id": "2",
+        "title": "Alive",
+        "style": "electro, punk",
+        "date": Timestamp("2007-11-19 00:00:00"),
+        "popularity": 9,
+        "score": 2.877532958984375,
+    }
+]
+```
diff --git a/docs/documentation/search.md b/docs/documentation/search.md
@@ -16,7 +16,7 @@ search.documents(
     queries=["daft punk", "rock"],
     top_k=10,
     top_k_token=10_000,
-    batch_size=30,
+    batch_size=32,
     n_jobs=-1,
 )
 ```
@@ -69,7 +69,7 @@ search.documents(
     queries=["rock", "california"],
     top_k=10,
     top_k_token=10_000,
-    batch_size=30,
+    batch_size=32,
     filters="YEAR(date) <= 1990 AND YEAR(date) >= 1970",
     n_jobs=-1,
 )
@@ -103,35 +103,3 @@ search.documents(
 ???+ info
     The filters are evaluated by DuckDB, so all DuckDB functions are available for use in the filters. You can find more information about DuckDB functions in the [DuckDB documentation](https://duckdb.org/docs/sql/functions/overview).
 
-### Graphs
-
-???+ info
-    To benefit from the `search.graphs` function, we need to upload documents and queries to DuckDB using the `upload.documents` and `upload.queries` functions.
-
-The `search.graphs` function retrieves the top documents. Then it retrieves the top queries indexed from `upload.queries`. Finally, it computes a graph-based ranking of the documents based on the queries.
-
-```python
-from ducksearch import search
-
-search.graphs(
-	database="ducksearch.duckdb",
-	queries="daft punk",
-	top_k=10,
-    top_k_token=10_000,
-    batch_size=30,
-    n_jobs=-1,
-)
-```
-
-```python
-[
-    {
-        "id": "2",
-        "title": "Alive",
-        "style": "electro, punk",
-        "date": Timestamp("2007-11-19 00:00:00"),
-        "popularity": 9,
-        "score": 0.17841622233390808,
-    }
-]
-```
diff --git a/docs/documentation/upload.md b/docs/documentation/upload.md
@@ -82,78 +82,3 @@ upload.documents(
 
 ???+ info
     More informations about DuckDB and HuggingFace compatibility can be found [here](https://huggingface.co/docs/hub/en/datasets-duckdb) and [here](https://duckdb.org/2024/05/29/access-150k-plus-datasets-from-hugging-face-with-duckdb.html).
-
-
-### Documents queries interactions
-
-We can upload documents queries interactions in order to call the `search.graphs` function. The following example demonstrates how to upload documents queries interactions:
-
-```python
-from ducksearch import search, upload
-
-documents = [
-    {
-        "id": 0,
-        "title": "Hotel California",
-        "style": "rock",
-        "date": "1977-02-22",
-        "popularity": 9,
-    },
-    {
-        "id": 1,
-        "title": "Here Comes the Sun",
-        "style": "rock",
-        "date": "1969-06-10",
-        "popularity": 10,
-    },
-    {
-        "id": 2,
-        "title": "Alive",
-        "style": "electro, punk",
-        "date": "2007-11-19",
-        "popularity": 9,
-    },
-]
-
-upload.documents(
-    database="ducksearch.duckdb",
-    key="id",
-    fields=["title", "style", "date", "popularity"],
-    documents=documents,
-    dtypes={
-        "date": "DATE",
-        "popularity": "INT",
-    },
-)
-
-# Mapping between documents ids and queries
-documents_queries = {
-    0: ["the beatles", "rock band"],
-    1: ["rock band", "california"],
-    2: ["daft"],
-}
-
-upload.queries(
-	database="ducksearch.duckdb",
-	documents_queries=documents_queries,
-)
-
-search.graphs(
-	database="ducksearch.duckdb",
-	queries="daft punk",
-	top_k=10,
-)
-```
-
-???+ tip
-    We can write documents queries mapping as a list of dict with the weight between the document and the query. The weight is used to compute the score in the `search.graphs` function:
-
-    ```python
-    documents_queries = {
-        0: {"the beatles": 30, "rock band": 10},
-        1: {"rock band": 10, "california": 1},
-        2: {"daft": 60},
-    }
-    ```
-
-    When the weight is not specified, the default value is 1.
diff --git a/ducksearch/delete/documents.py b/ducksearch/delete/documents.py
@@ -21,6 +21,13 @@ def _update_score() -> None:
     """Update the score after deleting documents."""
 
 
+@execute_with_duckdb(
+    relative_path="delete/update/df.sql",
+)
+def _update_df() -> None:
+    """Update the token frequency after deleting documents."""
+
+
 @execute_with_duckdb(
     relative_path="delete/delete/scores.sql",
 )
@@ -131,6 +138,12 @@ def documents(
         config=config,
     )
 
+    _update_df(
+        database=database,
+        parquet_file="_documents_ids.parquet",
+        config=config,
+    )
+
     _update_terms(
         database=database,
         parquet_file="_documents_ids.parquet",

diff --git a/ducksearch/delete/update/df.sql b/ducksearch/delete/update/df.sql
@@ -0,0 +1,22 @@
+WITH _docs_to_delete AS (
+    SELECT DISTINCT
+        bm25.docid
+    FROM parquet_scan('{parquet_file}') p
+    INNER JOIN bm25_documents.docs bm25
+        ON p.id = bm25.name
+),
+
+_tf AS (
+    SELECT
+        termid,
+        sum(tf) as df
+    FROM bm25_documents.terms
+    INNER JOIN _docs_to_delete
+        ON bm25_documents.terms.docid = _docs_to_delete.docid 
+    GROUP BY 1
+)
+
+UPDATE bm25_documents.dict _dict
+SET df = GREATEST(_dict.df - _tf.df, 0)
+FROM _tf
+WHERE _dict.termid = _tf.termid;