Skip to content

Commit

Permalink
Merge pull request #70 from upstash/add-llama-parse
Browse files Browse the repository at this point in the history
feat: add llama parse
  • Loading branch information
ogzhanolguncu authored Sep 4, 2024
2 parents 10c90ea + 76e6891 commit d15f1a3
Show file tree
Hide file tree
Showing 4 changed files with 89 additions and 49 deletions.
Binary file modified bun.lockb
Binary file not shown.
1 change: 1 addition & 0 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,7 @@
"d3-dsv": "^3.0.1",
"html-to-text": "^9.0.5",
"langchain": "^0.2.0",
"llamaindex": "^0.5.20",
"nanoid": "^5.0.7",
"pdf-parse": "^1.1.1",
"unstructured-client": "^0.15.1"
Expand Down
11 changes: 7 additions & 4 deletions src/database.ts
Original file line number Diff line number Diff line change
Expand Up @@ -6,14 +6,17 @@ import { DEFAULT_SIMILARITY_THRESHOLD, DEFAULT_TOP_K } from "./constants";
import { FileDataLoader } from "./file-loader";
import type { AddContextOptions } from "./types";
import type { UnstructuredLoaderOptions } from "@langchain/community/document_loaders/fs/unstructured";
import type { LlamaParseReader } from "llamaindex";

/** Source given as a string; presumably a local filesystem path — confirm against callers. */
export type FilePath = string;
/**
 * Source given as a string address.
 * NOTE(review): in type positions this alias takes precedence over the global
 * `URL` type inside any module that declares or imports it.
 */
export type URL = string;

export type ProcessorType = {
name: "unstructured";
options: UnstructuredLoaderOptions;
};
/**
 * External processor selection, discriminated by `name`.
 *
 * - `"unstructured"`: `options` has the shape of `UnstructuredLoaderOptions`.
 * - `"llama-parse"`: `options` is any subset of `LlamaParseReader`'s members.
 */
export type ProcessorType =
| {
name: "unstructured";
options: UnstructuredLoaderOptions;
}
| { name: "llama-parse"; options: Partial<LlamaParseReader> };

export type DatasWithFileSource =
| {
Expand Down
126 changes: 81 additions & 45 deletions src/file-loader.ts
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ import { RecursiveCharacterTextSplitter } from "langchain/text_splitter";
import { nanoid } from "nanoid";
import { UnstructuredClient } from "unstructured-client";
import type { DatasWithFileSource, FilePath, ProcessorType, URL } from "./database";
import { LlamaParseReader } from "llamaindex";

type Element = {
type: string;
Expand All @@ -37,51 +38,7 @@ export class FileDataLoader {

private async createLoader(args: any) {
if (hasProcessor(this.config)) {
const client = new UnstructuredClient({
serverURL: "https://api.unstructuredapp.io",
security: {
apiKeyAuth: this.config.processor.options.apiKey,
},
});

//@ts-expect-error TS can't pick up the correct type due to complex union
const fileData = await Bun.file(this.config.fileSource).text();
const response = await client.general.partition({
//@ts-expect-error Will be fixed soon
partitionParameters: {
files: {
content: fileData,
//@ts-expect-error TS can't pick up the correct type due to complex union
fileName: this.config.fileSource,
},
...this.config.processor.options,
},
});
const elements = response.elements?.filter(
(element) => typeof element.text === "string"
) as Element[];

return {
// eslint-disable-next-line @typescript-eslint/require-await
load: async (): Promise<Document[]> => {
const documents: Document[] = [];
for (const element of elements) {
const { metadata, text } = element;
if (typeof text === "string" && text !== "") {
documents.push(
new Document({
pageContent: text,
metadata: {
...metadata,
category: element.type,
},
})
);
}
}
return documents;
},
};
return await this.createLoaderForProcessors();
}
switch (this.config.type) {
case "pdf": {
Expand Down Expand Up @@ -115,6 +72,84 @@ export class FileDataLoader {
}
}

/**
 * Builds a loader for configs that delegate parsing to an external processor.
 *
 * The remote parsing call happens eagerly inside this method; the returned
 * object's `load()` only converts the already-fetched results into
 * `Document` instances, skipping entries whose text is missing or empty.
 *
 * @returns An object exposing `load()`, which resolves to the parsed documents.
 * @throws Error if the config has no processor, or if the processor name is
 *   not one of the handled cases.
 */
private async createLoaderForProcessors() {
  // Without this check typescript complains about types because of unions
  if (!hasProcessor(this.config)) throw new Error("Only processors are allowed");

  // Captured before the switch so the fallback branch can still report it.
  const processorName: string = this.config.processor.name;

  switch (this.config.processor.name) {
    case "unstructured": {
      const client = new UnstructuredClient({
        serverURL: "https://api.unstructuredapp.io",
        security: {
          apiKeyAuth: this.config.processor.options.apiKey,
        },
      });

      //@ts-expect-error TS can't pick up the correct type due to complex union
      const fileData = await Bun.file(this.config.fileSource).text();
      const response = await client.general.partition({
        //@ts-expect-error Will be fixed soon
        partitionParameters: {
          files: {
            content: fileData,
            //@ts-expect-error TS can't pick up the correct type due to complex union
            fileName: this.config.fileSource,
          },
          ...this.config.processor.options,
        },
      });
      // `response.elements` is optional: fall back to an empty list so that
      // `load()` never tries to iterate `undefined`.
      const elements = (response.elements ?? []).filter(
        (element) => typeof element.text === "string"
      ) as Element[];

      return {
        // eslint-disable-next-line @typescript-eslint/require-await
        load: async (): Promise<Document[]> => {
          const documents: Document[] = [];
          for (const element of elements) {
            const { metadata, text } = element;
            if (typeof text === "string" && text !== "") {
              documents.push(
                new Document({
                  pageContent: text,
                  // Keep the element type alongside the processor's metadata.
                  metadata: {
                    ...metadata,
                    category: element.type,
                  },
                })
              );
            }
          }
          return documents;
        },
      };
    }
    case "llama-parse": {
      const reader = new LlamaParseReader(this.config.processor.options);
      //@ts-expect-error TS can't pick up the correct type due to complex union
      const parsedDocuments = await reader.loadData(this.config.fileSource);
      return {
        // eslint-disable-next-line @typescript-eslint/require-await
        load: async (): Promise<Document[]> => {
          const documents: Document[] = [];
          for (const element of parsedDocuments) {
            const { metadata, text } = element;
            if (typeof text === "string" && text !== "") {
              documents.push(
                new Document({
                  pageContent: text,
                  metadata,
                })
              );
            }
          }
          return documents;
        },
      };
    }
    default: {
      // Fail loudly instead of silently resolving to `undefined` when a
      // processor name slips past the type system (e.g. untyped JS callers).
      throw new Error(`Unsupported processor: ${processorName}`);
    }
  }
}

/**
 * Type guard that treats a source as a URL when it is a string beginning
 * with "http". Blobs and all other strings are rejected.
 */
private isURL(source: FilePath | Blob): source is URL {
  if (typeof source !== "string") {
    return false;
  }
  return source.startsWith("http");
}
Expand Down Expand Up @@ -158,6 +193,7 @@ export class FileDataLoader {
return mapDocumentsIntoInsertPayload(newDocuments);
}

// Processors will be handled here. E.g. "unstructured", "llama-parse"
case undefined: {
const documents_ = documents.map(
(item) => new Document({ pageContent: item.pageContent, metadata: item.metadata })
Expand Down

0 comments on commit d15f1a3

Please sign in to comment.