feat: Add Dockerfile template (#27)
leehuwuj authored Mar 28, 2024
1 parent 4f10840 commit 78ded9e
Showing 11 changed files with 209 additions and 75 deletions.
5 changes: 5 additions & 0 deletions .changeset/healthy-insects-check.md
@@ -0,0 +1,5 @@
---
"create-llama": patch
---

Add Dockerfile template
32 changes: 26 additions & 6 deletions helpers/python.ts
@@ -216,9 +216,10 @@ export const installPythonTemplate = async ({
     },
   });
 
+  const compPath = path.join(templatesDir, "components");
+
   if (dataSources.length > 0) {
     const enginePath = path.join(root, "app", "engine");
-    const compPath = path.join(templatesDir, "components");
 
     const vectorDbDirName = vectorDb ?? "none";
     const VectorDBPath = path.join(
@@ -265,7 +266,19 @@ export const installPythonTemplate = async ({
     // Generate loaders config
     // Web loader config
     if (dataSources.some((ds) => ds.type === "web")) {
-      const webLoaderConfig = dataSources
+      const webLoaderConfig = new Document({});
+
+      // Create config for browser driver arguments
+      const driverArgNodeValue = webLoaderConfig.createNode([
+        "--no-sandbox",
+        "--disable-dev-shm-usage",
+      ]);
+      driverArgNodeValue.commentBefore =
+        " The arguments to pass to the webdriver. E.g.: add --headless to run in headless mode";
+      webLoaderConfig.set("driver_arguments", driverArgNodeValue);
+
+      // Create config for urls
+      const urlConfigs = dataSources
         .filter((ds) => ds.type === "web")
         .map((ds) => {
           const dsConfig = ds.config as WebSourceConfig;
@@ -275,13 +288,15 @@ export const installPythonTemplate = async ({
           depth: dsConfig.depth,
         };
       });
-      // Add documentation to web loader config
-      const node = loaderConfig.createNode(webLoaderConfig);
-      node.commentBefore = ` base_url: The URL to start crawling with
+      const urlConfigNode = webLoaderConfig.createNode(urlConfigs);
+      urlConfigNode.commentBefore = ` base_url: The URL to start crawling with
  prefix: Only crawl URLs matching the specified prefix
  depth: The maximum depth for BFS traversal
  You can add more websites by adding more entries (don't forget the - prefix from YAML)`;
-      loaderConfig.set("web", node);
+      webLoaderConfig.set("urls", urlConfigNode);
+
+      // Add web config to the loaders config
+      loaderConfig.set("web", webLoaderConfig);
     }
     // File loader config
     if (dataSources.some((ds) => ds.type === "file")) {
@@ -308,4 +323,9 @@ export const installPythonTemplate = async ({
   if (postInstallAction === "runApp" || postInstallAction === "dependencies") {
     installPythonDependencies();
   }
+
+  // Copy deployment files for python
+  await copy("**", root, {
+    cwd: path.join(compPath, "deployments", "python"),
+  });
 };
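The net effect: instead of writing the list of URL entries directly under the `web` key, the generator now nests a commented `driver_arguments` list and a `urls` list inside it. A sketch of the YAML this would produce (the example site is a placeholder; the keys and comments are taken from the code above, and the exact comment placement depends on the yaml library's rendering):

```yaml
web:
  # The arguments to pass to the webdriver. E.g.: add --headless to run in headless mode
  driver_arguments:
    - "--no-sandbox"
    - "--disable-dev-shm-usage"
  # base_url: The URL to start crawling with
  # prefix: Only crawl URLs matching the specified prefix
  # depth: The maximum depth for BFS traversal
  # You can add more websites by adding more entries (don't forget the - prefix from YAML)
  urls:
    - base_url: https://www.llamaindex.ai
      prefix: https://www.llamaindex.ai
      depth: 1
```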
5 changes: 5 additions & 0 deletions helpers/typescript.ts
@@ -295,4 +295,9 @@ export const installTSTemplate = async ({
   if (postInstallAction === "runApp" || postInstallAction === "dependencies") {
     await installTSDependencies(packageJson, packageManager, isOnline);
   }
+
+  // Copy deployment files for typescript
+  await copy("**", root, {
+    cwd: path.join(compPath, "deployments", "typescript"),
+  });
 };
26 changes: 26 additions & 0 deletions templates/components/deployments/python/Dockerfile
@@ -0,0 +1,26 @@
FROM python:3.11 as build

WORKDIR /app

ENV PYTHONPATH=/app

# Install Poetry
RUN curl -sSL https://install.python-poetry.org | POETRY_HOME=/opt/poetry python && \
cd /usr/local/bin && \
ln -s /opt/poetry/bin/poetry && \
poetry config virtualenvs.create false

# Install Chromium for web loader
# You can remove this if you don't use the web loader, to reduce the image size
RUN apt update && apt install -y chromium chromium-driver

# Install dependencies
COPY ./pyproject.toml ./poetry.lock* /app/
RUN poetry install --no-root --no-cache --only main

# ====================================
FROM build as release

COPY . .

CMD ["python", "main.py"]
22 changes: 22 additions & 0 deletions templates/components/deployments/typescript/Dockerfile
@@ -0,0 +1,22 @@
FROM node:20-alpine as build

WORKDIR /app

# Install dependencies
COPY package.json pnpm-lock.yaml* /app/
RUN npm install

# Build the application
COPY . .
RUN npm run build

# ====================================
FROM build as release

# Copy built output from the previous stage
COPY --from=build /app/.next* ./.next
COPY --from=build /app/public* ./public
COPY --from=build /app/package.json ./package.json
COPY --from=build /app/node_modules ./node_modules

CMD ["npm", "start"]
5 changes: 2 additions & 3 deletions templates/components/loaders/python/__init__.py
@@ -26,8 +26,7 @@ def get_documents():
             document = get_file_documents(FileLoaderConfig(**loader_config))
             documents.extend(document)
         elif loader_type == "web":
-            for entry in loader_config:
-                document = get_web_documents(WebLoaderConfig(**entry))
-                documents.extend(document)
+            document = get_web_documents(WebLoaderConfig(**loader_config))
+            documents.extend(document)
 
     return documents
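With this change, the `web` section of the loader config is consumed as a single mapping rather than a list of entries. Roughly, and with illustrative values only, the `loader_config` passed in now has this shape:

```python
# Illustrative shape only: before, config["web"] was a list of crawl entries
# (one WebLoaderConfig per entry); now it is a single mapping consumed whole
# by WebLoaderConfig(**loader_config), with the crawl targets nested under "urls".
loader_config = {
    "driver_arguments": ["--no-sandbox", "--disable-dev-shm-usage"],
    "urls": [
        {"base_url": "https://www.llamaindex.ai", "prefix": "https://www.llamaindex.ai"},
    ],
}
```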
29 changes: 23 additions & 6 deletions templates/components/loaders/python/web.py
@@ -3,17 +3,34 @@
 from pydantic import BaseModel, Field
 
 
-class WebLoaderConfig(BaseModel):
+class CrawlUrl(BaseModel):
     base_url: str
     prefix: str
     max_depth: int = Field(default=1, ge=0)
 
 
+class WebLoaderConfig(BaseModel):
+    driver_arguments: list[str] = Field(default=None)
+    urls: list[CrawlUrl]
+
+
 def get_web_documents(config: WebLoaderConfig):
     from llama_index.readers.web import WholeSiteReader
+    from selenium import webdriver
+    from selenium.webdriver.chrome.options import Options
+
+    options = Options()
+    driver_arguments = config.driver_arguments or []
+    for arg in driver_arguments:
+        options.add_argument(arg)
 
-    scraper = WholeSiteReader(
-        prefix=config.prefix,
-        max_depth=config.max_depth,
-    )
-    return scraper.load_data(config.base_url)
+    docs = []
+    for url in config.urls:
+        scraper = WholeSiteReader(
+            prefix=url.prefix,
+            max_depth=url.max_depth,
+            driver=webdriver.Chrome(options=options),
+        )
+        docs.extend(scraper.load_data(url.base_url))
+
+    return docs
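For reference, a minimal sketch of driving the new loader directly; the import path and URL are placeholders, and running it needs Chromium plus chromedriver (which the Python Dockerfile above installs):

```python
# Hypothetical import path; the module lives at templates/components/loaders/python/web.py
from web import CrawlUrl, WebLoaderConfig, get_web_documents

config = WebLoaderConfig(
    driver_arguments=["--no-sandbox", "--disable-dev-shm-usage"],
    urls=[
        CrawlUrl(
            base_url="https://www.llamaindex.ai",  # placeholder site
            prefix="https://www.llamaindex.ai",
            max_depth=1,
        )
    ],
)

documents = get_web_documents(config)  # one Chrome-driven BFS crawl per CrawlUrl
print(f"Loaded {len(documents)} documents")
```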
60 changes: 0 additions & 60 deletions templates/types/simple/fastapi/README-template.md

This file was deleted.

32 changes: 32 additions & 0 deletions templates/types/streaming/express/README-template.md
@@ -60,6 +60,32 @@ NODE_ENV=production npm run start

> Note that the `NODE_ENV` environment variable is set to `production`. This disables CORS for all origins.

## Using Docker

1. Build an image for the Express app:

```
docker build -t <your_backend_image_name> .
```

2. Start the app:

- Generate index data:

```
# Mount .env and config to use ENV variables and configuration from your file system;
# mount cache to persist the vector database on your file system
docker run --rm \
  -v $(pwd)/.env:/app/.env \
  -v $(pwd)/config:/app/config \
  -v $(pwd)/cache:/app/cache \
  <your_backend_image_name> \
  npm run generate
```

- Start the API:

```
# Mount .env and config to use ENV variables and configuration from your file system;
# mount cache to persist the vector database on your file system
docker run \
  -v $(pwd)/.env:/app/.env \
  -v $(pwd)/config:/app/config \
  -v $(pwd)/cache:/app/cache \
  -p 8000:8000 \
  <your_backend_image_name>
```

## Learn More

To learn more about LlamaIndex, take a look at the following resources:
33 changes: 33 additions & 0 deletions templates/types/streaming/fastapi/README-template.md
@@ -64,6 +64,39 @@ The API allows CORS for all origins to simplify development. You can change this
ENVIRONMENT=prod python main.py
```

## Using Docker

1. Build an image for the FastAPI app:

```
docker build -t <your_backend_image_name> .
```

2. Start the app:

- Generate embeddings for the index data:

```
# Mount .env and config to use ENV variables and configuration from your file system;
# mount storage to persist the vector database on your file system
docker run --rm \
  -v $(pwd)/.env:/app/.env \
  -v $(pwd)/config:/app/config \
  -v $(pwd)/storage:/app/storage \
  <your_backend_image_name> \
  python app/engine/generate.py
```

- Start the API:

```
# Mount .env and config to use ENV variables and configuration from your file system;
# mount storage to persist the vector database on your file system
docker run \
  -v $(pwd)/.env:/app/.env \
  -v $(pwd)/config:/app/config \
  -v $(pwd)/storage:/app/storage \
  -p 8000:8000 \
  <your_backend_image_name>
```

## Learn More

To learn more about LlamaIndex, take a look at the following resources:
35 changes: 35 additions & 0 deletions templates/types/streaming/nextjs/README-template.md
@@ -26,6 +26,41 @@ You can start editing the page by modifying `app/page.tsx`. The page auto-update

This project uses [`next/font`](https://nextjs.org/docs/basic-features/font-optimization) to automatically optimize and load Inter, a custom Google Font.

## Using Docker

1. Build an image for the Next.js app:

```
docker build -t <your_app_image_name> .
```

2. Generate embeddings:

Parse the data and generate the vector embeddings if the `./data` folder exists; otherwise, skip this step:

```
# Mount .env and config to use ENV variables and configuration from your file system;
# mount cache to persist the vector database on your file system
docker run --rm \
  -v $(pwd)/.env:/app/.env \
  -v $(pwd)/config:/app/config \
  -v $(pwd)/cache:/app/cache \
  -p 3000:3000 \
  <your_app_image_name> \
  npm run generate
```

3. Start the app:

```
# Mount .env and config to use ENV variables and configuration from your file system;
# mount cache to persist the vector database on your file system
docker run --rm \
  -v $(pwd)/.env:/app/.env \
  -v $(pwd)/config:/app/config \
  -v $(pwd)/cache:/app/cache \
  -p 3000:3000 \
  <your_app_image_name>
```

## Learn More

To learn more about LlamaIndex, take a look at the following resources: