Merge pull request #2 from lightonai/retry-conn

1.0.2
lightonai · Oct 2, 2024 · 4e396e9 · 4e396e9
2 parents 1298851 + 218b327
commit 4e396e9
Show file tree

Hide file tree

Showing 56 changed files with 1,214 additions and 356 deletions.
diff --git a/README.md b/README.md
@@ -61,8 +61,8 @@ documents = [
 
 upload.documents(
     database="ducksearch.duckdb",
-    key="id", # unique document identifier
-    fields=["title", "style", "date", "popularity"], # list of fields to index
+    key="id", # Unique document identifier
+    fields=["title", "style"], # List of fields to use for search.
     documents=documents,
     dtypes={
         "date": "DATE",
@@ -73,7 +73,7 @@ upload.documents(
 
 ## Search
 
-`search.documents` returns a list of list of documents ordered by relevance. We can control the number of documents to return using the `top_k` parameter. The following example demonstrates how to search for documents with the queries "punk" and "california" while filtering the results to include only documents with a date after 1970 and a popularity score greater than 8.
+`search.documents` returns a list of lists of documents ordered by relevance. We can control the number of documents to return using the `top_k` parameter. The following example demonstrates how to search for documents with the queries "punk" and "california" while filtering the results to include only documents with a date in 1970 or later and a popularity score greater than 8. We will order the results by a weighted sum of the BM25 score and the popularity score provided in the document.
 
 ```python
 from ducksearch import search
@@ -83,6 +83,7 @@ search.documents(
     queries=["punk", "california"],
     top_k=10,
     filters="YEAR(date) >= 1970 AND popularity > 8",
+    order_by="0.8 * score + 0.2 * popularity DESC",
 )
 ```
 
@@ -113,6 +114,8 @@ search.documents(
 
 Filters are SQL expressions that are applied to the search results. We can use every filtering function DuckDB provides such as [date functions](https://duckdb.org/docs/sql/functions/date).
 
+Both the `filters` and `order_by` parameters are optional. If not provided, the results are ordered by BM25 relevance and no filters are applied.
+
 ## Delete and update index
 
 We can delete documents and update the BM25 weights accordingly using the `delete.documents` function.
@@ -132,62 +135,76 @@ To update the index, we should first delete the documents and then upload the up
 
 ### HuggingFace
 
-The `upload.documents` function can also index HuggingFace datasets directly from the url. 
-The following example demonstrates how to index the FineWeb dataset from HuggingFace:
+The `upload.documents` function can also index HuggingFace datasets directly from the URL. The following example demonstrates how to index the FineWeb dataset from HuggingFace. We will use the fields "text" and "url" for search. We will also specify the data types for the "date", "token_count", and "language_score" fields to be able to filter the results.
 
 ```python
 from ducksearch import upload
 
 upload.documents(
     database="fineweb.duckdb",
     key="id",
-    fields=["text", "url", "date", "language", "token_count", "language_score"],
+    fields=["text", "url"],
     documents="https://huggingface.co/datasets/HuggingFaceFW/fineweb/resolve/main/sample/10BT/000_00000.parquet",
     dtypes={
         "date": "DATE",
         "token_count": "INT",
         "language_score": "FLOAT",
     },
-    limit=1000, # demonstrate with a small dataset
+    limit=3000, # demonstrate with a small dataset
 )
 ```
 
-We can then search the FineWeb dataset with the `search.documents` function:
+We can then search the FineWeb dataset with the `search.documents` function. We order the results by BM25 score and then date.
 
 ```python
 from ducksearch import search
 
 search.documents(
     database="fineweb.duckdb",
-    queries="earth science",
+    queries=["earth science"],
     top_k=2,
+    order_by="score DESC, date DESC",
 )
 ```
 
 ```python
 [
-    {
-        "id": "<urn:uuid:1e6ae53b-e0d7-431b-8d46-290244e597e9>",
-        "text": "Earth Science Tutors in Rowland ...",
-        "date": Timestamp("2017-08-19 00:00:00"),
-        "language": "en",
-        "token_count": 313,
-        "language_score": 0.8718525171279907,
-        "score": 1.1588547229766846,
-    },
-    {
-        "score": 1.6727683544158936,
-        "id": "<urn:uuid:c732ce90-2fbf-41ad-8916-345f6c08e452>",
-        "text": "The existing atmosphere surrounding the earth contains ...",
-        "url": "http://www.accuracyingenesis.com/atmargon.html",
-        "date": Timestamp("2015-04-02 00:00:00"),
-        "language": "en",
-        "token_count": 1348,
-        "language_score": 0.9564403295516968,
-    },
+    [
+        {
+            "id": "<urn:uuid:1e6ae53b-e0d7-431b-8d46-290244e597e9>",
+            "text": "Earth Science Tutors in Rowland...",
+            "id_1": "<urn:uuid:1e6ae53b-e0d7-431b-8d46-290244e597e9>",
+            "dump": "CC-MAIN-2017-34",
+            "url": "http://rowland.universitytutor.com/rowland_earth-science-tutoring",
+            "date": Timestamp("2017-08-19 00:00:00"),
+            "file_path": "s3://commoncrawl/crawl-data/CC-MAIN-2017-34/segments/1502886105304.35/warc/CC-MAIN-20170819051034-20170819071034-00240.warc.gz",
+            "language": "en",
+            "language_score": 0.8718525171279907,
+            "token_count": 313,
+            "bm25id": 523,
+            "score": 2.3761106729507446,
+        },
+        {
+            "id": "<urn:uuid:cd94a04f-1632-4c8b-81d2-cb353163116e>",
+            "text": "- Geomagnetic field....",
+            "id_1": "<urn:uuid:cd94a04f-1632-4c8b-81d2-cb353163116e>",
+            "dump": "CC-MAIN-2022-21",
+            "url": "https://www.imperial.ac.uk/people/adrian.muxworthy/?respub-action=citation.html&id=1149861&noscript=noscript",
+            "date": Timestamp("2022-05-20 00:00:00"),
+            "file_path": "s3://commoncrawl/crawl-data/CC-MAIN-2022-21/segments/1652662530553.34/warc/CC-MAIN-20220519235259-20220520025259-00601.warc.gz",
+            "language": "en",
+            "language_score": 0.8225595951080322,
+            "token_count": 517,
+            "bm25id": 4783,
+            "score": 2.3569871187210083,
+        },
+    ]
 ]
+
 ```
 
+Note: by default, results are ordered by BM25 relevance.
+
 ## Tables
 
 Ducksearch creates two distinct schemas: `bm25_tables`, `bm25_documents`.

diff --git a/docs/api/decorators/connect-to-duckdb.md b/docs/api/decorators/connect-to-duckdb.md
@@ -1,6 +1,6 @@
 # connect_to_duckdb
 
-Establish a connection to the DuckDB database.
+Establish a connection to the DuckDB database. Retry connecting if an error occurs.
 
 
 
@@ -18,6 +18,16 @@ Establish a connection to the DuckDB database.
 
     Optional configuration settings for the DuckDB connection.
 
+- **max_retry** (*int*) – defaults to `20`
+
+    The maximum number of times to retry connecting to DuckDB.
+
+- **sleep_time** (*float*) – defaults to `0.1`
+
+    The time to sleep between retries.
+
+- **kwargs**
+
 
 
 
diff --git a/docs/api/hf/insert-documents.md b/docs/api/hf/insert-documents.md
@@ -18,10 +18,6 @@ Insert documents from a Hugging Face dataset into DuckDB.
 
     The key field that uniquely identifies each document (e.g., 'query_id').
 
-- **fields** (*str | list[str]*)
-
-    A list of fields to be inserted from the dataset. If a single field is provided as a string, it will be converted to a list.
-
 - **url** (*str*)
 
     The URL of the Hugging Face dataset in Parquet format.
@@ -32,6 +28,8 @@ Insert documents from a Hugging Face dataset into DuckDB.
 
 - **limit** (*int | None*) – defaults to `None`
 
+- **dtypes** (*dict | None*) – defaults to `None`
+
 
 
 ## Examples
@@ -41,13 +39,24 @@ Insert documents from a Hugging Face dataset into DuckDB.
 
 >>> upload.documents(
 ...     database="test.duckdb",
-...     documents="hf://datasets/lightonai/lighton-ms-marco-mini/train.parquet",
-...     fields=["document_ids", "scores"],
+...     documents="hf://datasets/lightonai/lighton-ms-marco-mini/queries.parquet",
 ...     key="query_id",
+...     fields=["query_id", "text"],
 ... )
 | Table          | Size |
 |----------------|------|
 | documents      | 19   |
 | bm25_documents | 19   |
+
+>>> upload.documents(
+...     database="test.duckdb",
+...     documents="hf://datasets/lightonai/lighton-ms-marco-mini/documents.parquet",
+...     key="document_id",
+...     fields=["document_id", "text"],
+... )
+| Table          | Size |
+|----------------|------|
+| documents      | 51   |
+| bm25_documents | 51   |
 ```
 
diff --git a/docs/api/overview.md b/docs/api/overview.md
@@ -25,6 +25,7 @@
 
 ## tables
 
+- [add_columns_documents](../tables/add-columns-documents)
 - [create_documents](../tables/create-documents)
 - [create_documents_queries](../tables/create-documents-queries)
 - [create_queries](../tables/create-queries)
@@ -43,6 +44,15 @@
 
 ## utils
 
+
+**Classes**
+
+- [ParallelTqdm](../utils/ParallelTqdm)
+
+**Functions**
+
 - [batchify](../utils/batchify)
+- [generate_random_hash](../utils/generate-random-hash)
+- [get_list_columns_df](../utils/get-list-columns-df)
 - [plot](../utils/plot)
 
diff --git a/docs/api/search/documents.md b/docs/api/search/documents.md
@@ -38,6 +38,12 @@ Search for documents in the documents table using specified queries.
 
     Optional SQL filters to apply during the search.
 
+- **order_by** (*str | None*) – defaults to `None`
+
+- **tqdm_bar** (*bool*) – defaults to `True`
+
+    Whether to display a progress bar when searching.
+
 
 
 ## Examples

diff --git a/docs/api/search/graphs.md b/docs/api/search/graphs.md
@@ -38,6 +38,8 @@ Search for graphs in DuckDB using the provided queries.
 
     Optional SQL filters to apply during the search.
 
+- **tqdm_bar** (*bool*) – defaults to `True`
+
 
 
 ## Examples
@@ -53,17 +55,22 @@ Search for graphs in DuckDB using the provided queries.
 ...     fields=["title", "text"],
 ...     documents=documents,
 ... )
+| Table          | Size |
+|----------------|------|
+| documents      | 5183 |
+| bm25_documents | 5183 |
 
 >>> upload.queries(
 ...     database="test.duckdb",
 ...     queries=queries,
 ...     documents_queries=qrels,
 ... )
-
->>> scores = search.graphs(
-...     database="test.duckdb",
-...     queries=queries,
-...     top_k=10,
-... )
+| Table             | Size |
+|-------------------|------|
+| documents         | 5183 |
+| queries           | 807  |
+| bm25_documents    | 5183 |
+| bm25_queries      | 807  |
+| documents_queries | 916  |
 ```
 
diff --git a/docs/api/search/queries.md b/docs/api/search/queries.md
@@ -38,6 +38,8 @@ Search for queries in the queries table using specified queries.
 
     Optional SQL filters to apply during the search.
 
+- **tqdm_bar** (*bool*) – defaults to `True`
+
 
 
 ## Examples

diff --git a/docs/api/search/search.md b/docs/api/search/search.md
@@ -50,6 +50,12 @@ Run the search for documents or queries in parallel.
 
     Optional SQL filters to apply during the search.
 
+- **order_by** (*str | None*) – defaults to `None`
+
+- **tqdm_bar** (*bool*) – defaults to `True`
+
+    Whether to display a progress bar when searching.
+
 
 
 ## Examples
@@ -67,7 +73,6 @@ Run the search for documents or queries in parallel.
 ...     top_k=10,
 ... )
 
->>> assert len(documents) == 1
->>> assert len(documents[0]) == 10
+>>> assert len(documents) == 10
 ```
 
diff --git a/docs/api/search/update-index-documents.md b/docs/api/search/update-index-documents.md
@@ -10,6 +10,10 @@ Update the BM25 search index for documents.
 
     The name of the DuckDB database.
 
+- **fields** (*list[str]*)
+
+    The fields to index for each document.
+
 - **k1** (*float*) – defaults to `1.5`
 
     The BM25 k1 parameter, controls term saturation.

diff --git a/docs/api/tables/add-columns-documents.md b/docs/api/tables/add-columns-documents.md
@@ -0,0 +1,21 @@
+# add_columns_documents
+
+Add columns to the documents table in the DuckDB database.
+
+
+
+## Parameters
+
+- **database** (*str*)
+
+- **schema** (*str*)
+
+- **columns** (*list[str] | str*)
+
+- **dtypes** (*dict*) – defaults to `None`
+
+- **config** (*dict*) – defaults to `None`
+
+
+
+
diff --git a/docs/api/tables/create-documents.md b/docs/api/tables/create-documents.md
@@ -10,7 +10,7 @@ Create the documents table in the DuckDB database.
 
 - **schema** (*str*)
 
-- **fields** (*str | list[str]*)
+- **columns** (*str | list[str]*)
 
 - **dtypes** (*dict[str, str] | None*) – defaults to `None`
 
@@ -31,7 +31,7 @@ Create the documents table in the DuckDB database.
 >>> tables.create_documents(
 ...     database="test.duckdb",
 ...     schema="bm25_tables",
-...     fields=["title", "text"],
+...     columns=["title", "text"],
 ...     dtypes={"text": "VARCHAR", "title": "VARCHAR"},
 ... )
 
@@ -46,7 +46,7 @@ Create the documents table in the DuckDB database.
 ...     schema="bm25_tables",
 ...     key="id",
 ...     df=df,
-...     fields=["title", "text"],
+...     columns=["title", "text"],
 ... )
 ```
 
diff --git a/docs/api/tables/insert-documents.md b/docs/api/tables/insert-documents.md
@@ -22,7 +22,7 @@ Insert documents into the documents table with optional multi-threading.
 
     The field that uniquely identifies each document (e.g., 'id').
 
-- **fields** (*list[str] | str*)
+- **columns** (*list[str] | str*)
 
     The list of document fields to insert. Can be a string if inserting a single field.
 
@@ -61,7 +61,7 @@ Insert documents into the documents table with optional multi-threading.
 ...     database="test.duckdb",
 ...     schema="bm25_tables",
 ...     key="id",
-...     fields=["title", "text"],
+...     columns=["title", "text"],
 ...     df=df
 ... )
 ```