From 2bcaae48cb5f248e5c133c6371cc34a6b999b576 Mon Sep 17 00:00:00 2001
From: xcharleslin <4212216+xcharleslin@users.noreply.github.com>
Date: Wed, 21 Jun 2023 14:32:44 -0700
Subject: [PATCH] [DOCS] Embeddings tutorial: Temporarily remove full dataset
 (#1039)

Co-authored-by: Xiayue Charles Lin
---
 .../embeddings/daft_tutorial_embeddings_stackexchange.ipynb | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tutorials/embeddings/daft_tutorial_embeddings_stackexchange.ipynb b/tutorials/embeddings/daft_tutorial_embeddings_stackexchange.ipynb
index 41bd8dfeb4..2ffc9a5b72 100644
--- a/tutorials/embeddings/daft_tutorial_embeddings_stackexchange.ipynb
+++ b/tutorials/embeddings/daft_tutorial_embeddings_stackexchange.ipynb
@@ -52,6 +52,8 @@
     "\n",
     "We will use the **StackExchange crawl from the [RedPajamas dataset](https://huggingface.co/datasets/togethercomputer/RedPajama-Data-1T)**. It is 75GB of `jsonl` files. \n",
     "\n",
+    "*EDIT (June 2023): Our hosted version of the full dataset is temporarily unavailable. Please enjoy the demo with the sample dataset for now.*\n",
+    "\n",
     "**Note:** This demo runs best on a cluster with many GPUs available. Information on how to connect Daft to a cluster is available [here](https://www.getdaft.io/projects/docs/en/stable/learn/user_guides/scaling-up.html). \n",
     "\n",
     "If running on a single node, you can use the provided subsample of the data, which is 75MB in size. If you like, you can also truncate either dataset to a desired number of rows using `df.limit`."
@@ -87,7 +89,6 @@
    "source": [
     "import daft\n",
     "\n",
-    "FULL_DATA_PATH = \"s3://daft-public-data/redpajama-1t/stackexchange/*\"\n",
     "SAMPLE_DATA_PATH = \"s3://daft-public-data/redpajama-1t-sample/stackexchange_sample.jsonl\"\n",
     "\n",
     "df = daft.read_json(SAMPLE_DATA_PATH)\n",
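
For reference, after this change the notebook's setup cell reduces to roughly the following standalone Python. This is a minimal sketch assembled only from the diff above; the `df.limit(10)` line is the optional truncation step the tutorial text mentions, with 10 as an arbitrary example value, not part of the patch.

    import daft

    # Sample dataset kept by this patch; the FULL_DATA_PATH line is removed above.
    SAMPLE_DATA_PATH = "s3://daft-public-data/redpajama-1t-sample/stackexchange_sample.jsonl"

    # Read the 75MB sample of the RedPajama StackExchange crawl from S3.
    df = daft.read_json(SAMPLE_DATA_PATH)

    # Optional: truncate to a desired number of rows while experimenting.
    df = df.limit(10)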