From 4633bde0cd6810b7ec94522dff3745ee981238f2 Mon Sep 17 00:00:00 2001 From: Raphael Sourty Date: Wed, 11 Sep 2024 00:53:41 +0200 Subject: [PATCH] update-doc --- docs/documentation/graph.md | 100 ++++++++++++++++++++++++++++++++ docs/documentation/search.md | 36 +----------- docs/documentation/upload.md | 75 ------------------------ ducksearch/delete/documents.py | 13 +++++ ducksearch/delete/update/df.sql | 22 +++++++ 5 files changed, 137 insertions(+), 109 deletions(-) create mode 100644 docs/documentation/graph.md create mode 100644 ducksearch/delete/update/df.sql diff --git a/docs/documentation/graph.md b/docs/documentation/graph.md new file mode 100644 index 0000000..4851d27 --- /dev/null +++ b/docs/documentation/graph.md @@ -0,0 +1,100 @@ +## Graph + +The `search.graphs` function can be used to search documents with a graph-based query. This function is useful if we have paired documents and queries. The search will retrieve the set of documents and queries that match the input query. Then it will build a graph and compute the weight of each document using a graph-based scoring function. + +The `search.graphs` function is much slower than the `search.documents` function, but might provide better results with a decent amount of paired documents / queries. + +### Documents queries interactions + +We can upload documents queries interactions in order to call the `search.graphs` function. 
The following example demonstrates how to upload documents queries interactions: + +```python +from ducksearch import search, upload + +documents = [ + { + "id": 0, + "title": "Hotel California", + "style": "rock", + "date": "1977-02-22", + "popularity": 9, + }, + { + "id": 1, + "title": "Here Comes the Sun", + "style": "rock", + "date": "1969-06-10", + "popularity": 10, + }, + { + "id": 2, + "title": "Alive", + "style": "electro, punk", + "date": "2007-11-19", + "popularity": 9, + }, +] + +upload.documents( + database="ducksearch.duckdb", + key="id", + fields=["title", "style", "date", "popularity"], + documents=documents, + dtypes={ + "date": "DATE", + "popularity": "INT", + }, +) + +# Mapping between documents ids and queries +documents_queries = { + 0: ["the beatles", "rock band"], + 1: ["rock band", "california"], + 2: ["daft"], +} + +upload.queries( + database="ducksearch.duckdb", + documents_queries=documents_queries, +) +``` + +???+ tip + We can also write the documents queries mapping as a dict of dicts that gives the weight between each document and query. The weight is used to compute the score in the `search.graphs` function: + + ```python + documents_queries = { + 0: {"the beatles": 30, "rock band": 10}, + 1: {"rock band": 10, "california": 1}, + 2: {"daft": 60}, + } + ``` + + When the weight is not specified, the default value is 1. 
+ +### Search Graphs + +The following example demonstrates how to search documents with a graph-based query: + +```python +from ducksearch import search + +search.graphs( + database="ducksearch.duckdb", + queries="daft punk", + top_k=10, +) +``` + +```python +[ + { + "id": "2", + "title": "Alive", + "style": "electro, punk", + "date": Timestamp("2007-11-19 00:00:00"), + "popularity": 9, + "score": 2.877532958984375, + } +] +``` \ No newline at end of file diff --git a/docs/documentation/search.md b/docs/documentation/search.md index 4eea581..dfe1ecd 100644 --- a/docs/documentation/search.md +++ b/docs/documentation/search.md @@ -16,7 +16,7 @@ search.documents( queries=["daft punk", "rock"], top_k=10, top_k_token=10_000, - batch_size=30, + batch_size=32, n_jobs=-1, ) ``` @@ -69,7 +69,7 @@ search.documents( queries=["rock", "california"], top_k=10, top_k_token=10_000, - batch_size=30, + batch_size=32, filters="YEAR(date) <= 1990 AND YEAR(date) >= 1970", n_jobs=-1, ) @@ -103,35 +103,3 @@ search.documents( ???+ info The filters are evaluated by DuckDB, so all DuckDB functions are available for use in the filters. You can find more information about DuckDB functions in the [DuckDB documentation](https://duckdb.org/docs/sql/functions/overview). -### Graphs - -???+ info - To benefit from the `search.graphs` function, we need to upload documents and queries to DuckDB using the `upload.documents` and `upload.queries` functions. - -The `search.graphs` function retrieves the top documents. Then it retrieves the top queries indexed from `upload.queries`. Finally, it computes a graph-based ranking of the documents based on the queries. 
- -```python -from ducksearch import search - -search.graphs( - database="ducksearch.duckdb", - queries="daft punk", - top_k=10, - top_k_token=10_000, - batch_size=30, - n_jobs=-1, -) -``` - -```python -[ - { - "id": "2", - "title": "Alive", - "style": "electro, punk", - "date": Timestamp("2007-11-19 00:00:00"), - "popularity": 9, - "score": 0.17841622233390808, - } -] -``` \ No newline at end of file diff --git a/docs/documentation/upload.md b/docs/documentation/upload.md index b0072ff..7a540df 100644 --- a/docs/documentation/upload.md +++ b/docs/documentation/upload.md @@ -82,78 +82,3 @@ upload.documents( ???+ info More informations about DuckDB and HuggingFace compatibility can be found [here](https://huggingface.co/docs/hub/en/datasets-duckdb) and [here](https://duckdb.org/2024/05/29/access-150k-plus-datasets-from-hugging-face-with-duckdb.html). - - -### Documents queries interactions - -We can upload documents queries interactions in order to call the `search.graphs` function. The following example demonstrates how to upload documents queries interactions: - -```python -from ducksearch import search, upload - -documents = [ - { - "id": 0, - "title": "Hotel California", - "style": "rock", - "date": "1977-02-22", - "popularity": 9, - }, - { - "id": 1, - "title": "Here Comes the Sun", - "style": "rock", - "date": "1969-06-10", - "popularity": 10, - }, - { - "id": 2, - "title": "Alive", - "style": "electro, punk", - "date": "2007-11-19", - "popularity": 9, - }, -] - -upload.documents( - database="ducksearch.duckdb", - key="id", - fields=["title", "style", "date", "popularity"], - documents=documents, - dtypes={ - "date": "DATE", - "popularity": "INT", - }, -) - -# Mapping between documents ids and queries -documents_queries = { - 0: ["the beatles", "rock band"], - 1: ["rock band", "california"], - 2: ["daft"], -} - -upload.queries( - database="ducksearch.duckdb", - documents_queries=documents_queries, -) - -search.graphs( - database="ducksearch.duckdb", - 
queries="daft punk", - top_k=10, -) -``` - -???+ tip - We can write documents queries mapping as a list of dict with the weight between the document and the query. The weight is used to compute the score in the `search.graphs` function: - - ```python - documents_queries = { - 0: {"the beatles": 30, "rock band": 10}, - 1: {"rock band": 10, "california": 1}, - 2: {"daft": 60}, - } - ``` - - When the weight is not specified, the default value is 1. diff --git a/ducksearch/delete/documents.py b/ducksearch/delete/documents.py index 8b7cf23..e8fdebc 100644 --- a/ducksearch/delete/documents.py +++ b/ducksearch/delete/documents.py @@ -21,6 +21,13 @@ def _update_score() -> None: """Update the score after deleting documents.""" +@execute_with_duckdb( + relative_path="delete/update/df.sql", +) +def _update_df() -> None: + """Update the term `df` values to account for the deleted documents.""" + + @execute_with_duckdb( relative_path="delete/delete/scores.sql", ) @@ -131,6 +138,12 @@ def documents( config=config, ) + _update_df( + database=database, + parquet_file="_documents_ids.parquet", + config=config, + ) + _update_terms( database=database, parquet_file="_documents_ids.parquet", diff --git a/ducksearch/delete/update/df.sql b/ducksearch/delete/update/df.sql new file mode 100644 index 0000000..60926d4 --- /dev/null +++ b/ducksearch/delete/update/df.sql @@ -0,0 +1,22 @@ +WITH _docs_to_delete AS ( + SELECT DISTINCT + bm25.docid + FROM parquet_scan('{parquet_file}') p + INNER JOIN bm25_documents.docs bm25 + ON p.id = bm25.name +), + +_tf AS ( + SELECT + termid, + sum(tf) as df + FROM bm25_documents.terms + INNER JOIN _docs_to_delete + ON bm25_documents.terms.docid = _docs_to_delete.docid + GROUP BY 1 +) + +UPDATE bm25_documents.dict _dict +SET df = GREATEST(_dict.df - _tf.df, 0) +FROM _tf +WHERE _dict.termid = _tf.termid; \ No newline at end of file