From 2bcaae48cb5f248e5c133c6371cc34a6b999b576 Mon Sep 17 00:00:00 2001
From: xcharleslin <4212216+xcharleslin@users.noreply.github.com>
Date: Wed, 21 Jun 2023 14:32:44 -0700
Subject: [PATCH] [DOCS] Embeddings tutorial: Temporarily remove full dataset
 (#1039)

Co-authored-by: Xiayue Charles Lin
---
 .../embeddings/daft_tutorial_embeddings_stackexchange.ipynb | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tutorials/embeddings/daft_tutorial_embeddings_stackexchange.ipynb b/tutorials/embeddings/daft_tutorial_embeddings_stackexchange.ipynb
index 41bd8dfeb4..2ffc9a5b72 100644
--- a/tutorials/embeddings/daft_tutorial_embeddings_stackexchange.ipynb
+++ b/tutorials/embeddings/daft_tutorial_embeddings_stackexchange.ipynb
@@ -52,6 +52,8 @@
     "\n",
     "We will use the **StackExchange crawl from the [RedPajamas dataset](https://huggingface.co/datasets/togethercomputer/RedPajama-Data-1T)**. It is 75GB of `jsonl` files. \n",
     "\n",
+    "*EDIT (June 2023): Our hosted version of the full dataset is temporarily unavailable. Please enjoy the demo with the sample dataset for now.*\n",
+    "\n",
     "**Note:** This demo runs best on a cluster with many GPUs available. Information on how to connect Daft to a cluster is available [here](https://www.getdaft.io/projects/docs/en/stable/learn/user_guides/scaling-up.html). \n",
     "\n",
     "If running on a single node, you can use the provided subsample of the data, which is 75MB in size. If you like, you can also truncate either dataset to a desired number of rows using `df.limit`."
@@ -87,7 +89,6 @@
    "source": [
     "import daft\n",
     "\n",
-    "FULL_DATA_PATH = \"s3://daft-public-data/redpajama-1t/stackexchange/*\"\n",
     "SAMPLE_DATA_PATH = \"s3://daft-public-data/redpajama-1t-sample/stackexchange_sample.jsonl\"\n",
     "\n",
     "df = daft.read_json(SAMPLE_DATA_PATH)\n",
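
For reference, after this change the notebook's setup cell reduces to roughly the following standalone Python. This is a minimal sketch assembled only from the diff above; the `df.limit(10)` line is the optional truncation step the tutorial text mentions, with 10 as an arbitrary example value, not part of the patch.

    import daft

    # Sample dataset kept by this patch; the FULL_DATA_PATH line is removed above.
    SAMPLE_DATA_PATH = "s3://daft-public-data/redpajama-1t-sample/stackexchange_sample.jsonl"

    # Read the 75MB sample of the RedPajama StackExchange crawl from S3.
    df = daft.read_json(SAMPLE_DATA_PATH)

    # Optional: truncate to a desired number of rows while experimenting.
    df = df.limit(10)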