Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: Add multiple data sources #19

Merged
merged 17 commits into from
Mar 27, 2024
58 changes: 25 additions & 33 deletions helpers/python.ts
Original file line number Diff line number Diff line change
Expand Up @@ -257,46 +257,38 @@ export const installPythonTemplate = async ({
});
}

if (dataSources.length === 0) {
if (dataSources.length > 0) {
const loaderConfigs: Record<string, any> = {};
const loaderPath = path.join(enginePath, "loaders");

// Copy loader.py file to enginePath
await copy("loader.py", enginePath, {
// Copy loaders to enginePath
await copy("**", loaderPath, {
parents: true,
cwd: path.join(compPath, "loaders", "python"),
});

for (const dataSource of dataSources) {
const sourceType = dataSource.type;
switch (sourceType) {
case "file":
case "folder": {
const loaderFolder = useLlamaParse ? "llama_parse" : "file";
await copy("**", loaderPath, {
parents: true,
cwd: path.join(compPath, "loaders", "python", loaderFolder),
});
break;
}
case "web": {
const config = dataSource.config as WebSourceConfig[];
// Append web loader config
const webLoaderConfig = config.map((c) => {
return {
base_url: c.baseUrl,
prefix: c.prefix || c.baseUrl,
depth: c.depth || 1,
};
});
loaderConfigs["web"] = webLoaderConfig;
await copy("**", loaderPath, {
parents: true,
cwd: path.join(compPath, "loaders", "python", sourceType),
});
break;
}
}
// Generate loaders config
// Web loader config
if (dataSources.some((ds) => ds.type === "web")) {
const webLoaderConfig = dataSources
.filter((ds) => ds.type === "web")
.map((ds) => {
const dsConfig = ds.config as WebSourceConfig;
return {
base_url: dsConfig.baseUrl,
prefix: dsConfig.prefix,
depth: dsConfig.depth,
};
});
loaderConfigs["web"] = webLoaderConfig;
}
// File loader config
if (
dataSources.some((ds) => ds.type === "file" || ds.type === "folder")
) {
loaderConfigs["file"] = {
use_llama_parse: useLlamaParse,
};
}
// Write loaders config
if (Object.keys(loaderConfigs).length > 0) {
Expand Down
3 changes: 1 addition & 2 deletions helpers/types.ts
Original file line number Diff line number Diff line change
Expand Up @@ -20,15 +20,14 @@ export type TemplateObservability = "none" | "opentelemetry";
// Config for both file and folder
export type FileSourceConfig = {
paths?: string[];
useLlamaParse?: boolean;
};
export type WebSourceConfig = {
baseUrl?: string;
prefix?: string;
depth?: number;
};

export type TemplateDataSourceConfig = FileSourceConfig | WebSourceConfig[];
export type TemplateDataSourceConfig = FileSourceConfig | WebSourceConfig;

export type CommunityProjectConfig = {
owner: string;
Expand Down
127 changes: 41 additions & 86 deletions questions.ts
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,6 @@ import {
TemplateDataSource,
TemplateDataSourceType,
TemplateFramework,
WebSourceConfig,
} from "./helpers";
import { COMMUNITY_OWNER, COMMUNITY_REPO } from "./helpers/constant";
import { templatesDir } from "./helpers/dir";
Expand Down Expand Up @@ -143,27 +142,21 @@ export const getDataSourceChoices = (
});
}

if (!selectedDataSource.some((ds) => ds.type === "file")) {
choices.push({
choices.push(
{
title: `Use local files (${supportedContextFileTypes.join(", ")})`,
value: "file",
});
}

if (!selectedDataSource.some((ds) => ds.type === "folder")) {
choices.push({
},
{
title:
process.platform === "win32"
? "Use a local folder"
: "Use local folders",
value: "folder",
});
}
},
);

if (
!selectedDataSource.some((ds) => ds.type === "web") &&
framework === "fastapi"
) {
if (framework === "fastapi") {
choices.push({
title: "Use website content (requires Chrome)",
value: "web",
Expand Down Expand Up @@ -636,7 +629,7 @@ export const askQuestions = async (

if (program.files) {
// If user specified files option, then the program should use context engine
program.engine == "context";
program.engine = "context";
if (!fs.existsSync(program.files)) {
console.log("File or folder not found");
process.exit(1);
Expand All @@ -658,14 +651,13 @@ export const askQuestions = async (
program.dataSources = getPrefOrDefault("dataSources");
} else {
program.dataSources = [];
const numberDataSources = program.framework === "fastapi" ? 2 : 1;
for (let i = 0; i < numberDataSources; i++) {
while (true) {
const { selectedSource } = await prompts(
{
type: "select",
name: "selectedSource",
message:
i === 0
program.dataSources.length === 0
? "Which data source would you like to use?"
: "Would you like to add another data source?",
choices: getDataSourceChoices(
Expand Down Expand Up @@ -707,80 +699,43 @@ export const askQuestions = async (
program.dataSources.push(dataSource);
} else if (selectedSource === "web") {
// Selected web data source
const validateUrl = (value: string) => {
for (let url of value.split(",")) {
if (!url.includes("://")) {
url = `https://${url}`;
}
const urlObj = new URL(url);
if (urlObj.protocol !== "https:" && urlObj.protocol !== "http:") {
return `URL=${url} has invalid protocol, only allow http or https`;
}
}
return true;
};
const { baseUrl } = await prompts(
{
type: "text",
name: "baseUrl",
message: "Please provide base URL of the website: ",
initial: "https://www.llamaindex.ai",
validate: (value: string) => {
if (!value.includes("://")) {
value = `https://${value}`;
}
const urlObj = new URL(value);
if (
urlObj.protocol !== "https:" &&
urlObj.protocol !== "http:"
) {
return `URL=${value} has invalid protocol, only allow http or https`;
}
return true;
},
},
handlers,
);

const dataSource: TemplateDataSource = {
program.dataSources.push({
type: "web",
config: [] as WebSourceConfig[],
};

while (true) {
const questions: any[] = [
{
type: "text",
name: "baseUrl",
message: "Please provide base URL of the website: ",
initial: "https://www.llamaindex.ai",
validate: (value: string) => {
if (!value.includes("://")) {
value = `https://${value}`;
}
const urlObj = new URL(value);
if (
urlObj.protocol !== "https:" &&
urlObj.protocol !== "http:"
) {
return `URL=${value} has invalid protocol, only allow http or https`;
}
// Check duplicated URL
if (
(dataSource.config as WebSourceConfig[]).some(
(c) => c.baseUrl === value,
)
) {
return `URL=${value} is already added. Please provide a different URL.`;
}
return true;
},
},
{
type: "toggle",
name: "shouldContinue",
message: "Would you like to add another website?",
initial: false,
active: "Yes",
inactive: "No",
},
];
let { shouldContinue, baseUrl } = await prompts(
questions,
handlers,
);
(dataSource.config as WebSourceConfig[]).push({
baseUrl: baseUrl,
config: {
baseUrl,
prefix: baseUrl,
depth: 1,
});
if (shouldContinue !== undefined && !shouldContinue) {
break;
}
}
program.dataSources.push(dataSource);
},
});
}

// No need to ask for another data source if user selected example data
if (selectedSource === "exampleFile") {
if (
program.framework !== "fastapi" ||
marcusschiesser marked this conversation as resolved.
Show resolved Hide resolved
selectedSource === "exampleFile"
) {
break;
}
}
Expand Down
33 changes: 33 additions & 0 deletions templates/components/loaders/python/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
import os
marcusschiesser marked this conversation as resolved.
Show resolved Hide resolved
import json
import importlib
import logging
from typing import Dict
from app.engine.loaders.file import FileLoaderConfig, get_file_documents
from app.engine.loaders.web import WebLoaderConfig, get_web_documents

logger = logging.getLogger(__name__)


def load_configs():
    """Read the loader configuration from ``config/loaders.json``.

    The path is relative to the current working directory.

    Returns:
        The parsed JSON content of the file.

    Raises:
        FileNotFoundError: If ``config/loaders.json`` does not exist.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    # Decode explicitly as UTF-8 instead of relying on the platform's locale
    # default, so non-ASCII values (e.g. in URLs) are read consistently.
    with open("config/loaders.json", encoding="utf-8") as f:
        return json.load(f)


def get_documents():
    """Collect documents from every loader listed in the loaders config.

    The config returned by ``load_configs()`` maps a loader type to its
    settings:

    * ``"file"`` - a single mapping, expanded into ``FileLoaderConfig``.
    * ``"web"`` - a sequence of mappings, each expanded into its own
      ``WebLoaderConfig``.

    Any other loader type is logged but contributes no documents.

    Returns:
        A flat list with the documents produced by all loaders, in the
        order the loaders appear in the config.
    """
    collected = []
    for loader_type, loader_config in load_configs().items():
        logger.info(
            f"Loading documents from loader: {loader_type}, config: {loader_config}"
        )
        if loader_type == "web":
            # One web config entry per website.
            for entry in loader_config:
                collected.extend(get_web_documents(WebLoaderConfig(**entry)))
        elif loader_type == "file":
            collected.extend(get_file_documents(FileLoaderConfig(**loader_config)))

    return collected
37 changes: 37 additions & 0 deletions templates/components/loaders/python/file.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
import os
from llama_parse import LlamaParse
from pydantic import BaseModel, validator


class FileLoaderConfig(BaseModel):
    """Settings for the file loader."""

    # Directory the documents are read from.
    data_dir: str = "data"
    # When true, ".pdf" files are extracted with the LlamaParse parser.
    use_llama_parse: bool = False

    @validator("data_dir")
    def data_dir_must_exist(cls, v):
        """Reject a ``data_dir`` value that is not an existing directory."""
        if os.path.isdir(v):
            return v
        raise ValueError(f"Directory '{v}' does not exist")


def llama_parse_parser():
    """Build a LlamaParse parser that returns markdown for English documents.

    Returns:
        A ``LlamaParse`` instance created with ``result_type="markdown"``,
        ``verbose=True`` and ``language="en"``.

    Raises:
        ValueError: If ``LLAMA_CLOUD_API_KEY`` is missing or empty.
    """
    # An empty value (for example a blank `LLAMA_CLOUD_API_KEY=` placeholder
    # in a .env file) is as unusable as a missing one, so reject both here
    # instead of letting the parser be created without a usable key.
    if not os.getenv("LLAMA_CLOUD_API_KEY"):
        raise ValueError(
            "LLAMA_CLOUD_API_KEY environment variable is not set. "
            "Please set it in .env file or in your shell environment then run again!"
        )
    return LlamaParse(result_type="markdown", verbose=True, language="en")


def get_file_documents(config: FileLoaderConfig):
    """Load documents from ``config.data_dir``, recursing into subfolders.

    Args:
        config: File loader settings. ``data_dir`` is the directory to read;
            ``use_llama_parse`` switches ".pdf" extraction to LlamaParse.

    Returns:
        Whatever ``SimpleDirectoryReader.load_data()`` returns for the
        directory.
    """
    # Imported lazily, inside the function, as in the original code.
    from llama_index.core.readers import SimpleDirectoryReader

    directory_reader = SimpleDirectoryReader(config.data_dir, recursive=True)
    if not config.use_llama_parse:
        return directory_reader.load_data()

    # The reader is created first, then the parser; only ".pdf" files are
    # routed to LlamaParse.
    directory_reader.file_extractor = {".pdf": llama_parse_parser()}
    return directory_reader.load_data()
10 changes: 0 additions & 10 deletions templates/components/loaders/python/file/file.py

This file was deleted.

19 changes: 0 additions & 19 deletions templates/components/loaders/python/llama_parse/llama_parse.py

This file was deleted.

13 changes: 0 additions & 13 deletions templates/components/loaders/python/loader.py

This file was deleted.

Loading
Loading