From d9b0e62618b8fe669015c0e718ae38fb9b1364cb Mon Sep 17 00:00:00 2001 From: Stan Girard Date: Thu, 12 Sep 2024 17:22:24 +0200 Subject: [PATCH 01/13] feat(uptime): check if connection to db works (#3199) # Description Please include a summary of the changes and the related issue. Please also include relevant motivation and context. ## Checklist before requesting a review Please delete options that are not relevant. - [ ] My code follows the style guidelines of this project - [ ] I have performed a self-review of my code - [ ] I have commented hard-to-understand areas - [ ] I have ideally added tests that prove my fix is effective or that my feature works - [ ] New and existing unit tests pass locally with my changes - [ ] Any dependent changes have been merged ## Screenshots (if appropriate): --- .../modules/misc/controller/misc_routes.py | 20 +++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/backend/api/quivr_api/modules/misc/controller/misc_routes.py b/backend/api/quivr_api/modules/misc/controller/misc_routes.py index 53b8ab08f13..590b3cd0e3a 100644 --- a/backend/api/quivr_api/modules/misc/controller/misc_routes.py +++ b/backend/api/quivr_api/modules/misc/controller/misc_routes.py @@ -1,4 +1,11 @@ -from fastapi import APIRouter + +from fastapi import APIRouter, Depends, HTTPException +from quivr_api.logger import get_logger +from quivr_api.modules.dependencies import get_async_session +from sqlmodel.ext.asyncio.session import AsyncSession +from sqlmodel import text + +logger = get_logger(__name__) misc_router = APIRouter() @@ -12,5 +19,14 @@ async def root(): @misc_router.get("/healthz", tags=["Health"]) -async def healthz(): +async def healthz(session: AsyncSession = Depends(get_async_session)): + + try: + result = await session.execute(text("SELECT 1")) + if not result: + raise HTTPException(status_code=500, detail="Database is not healthy") + except Exception as e: + logger.error(f"Error checking database health: {e}") + raise 
HTTPException(status_code=500, detail="Database is not healthy") + return {"status": "ok"} From 8fb488771643a407292f6b9a0e17c39057b12dbf Mon Sep 17 00:00:00 2001 From: Stan Girard Date: Thu, 12 Sep 2024 18:42:00 +0200 Subject: [PATCH 02/13] chore(main): release 0.0.311 (#3194) :robot: I have created a release *beep* *boop* --- ## 0.0.311 (2024-09-12) ## What's Changed * chore(embeddings): added tests for embeddings by @StanGirard in https://github.com/QuivrHQ/quivr/pull/3183 * feat(uptime): check if connection to db works by @StanGirard in https://github.com/QuivrHQ/quivr/pull/3199 **Full Changelog**: https://github.com/QuivrHQ/quivr/compare/v0.0.310...v0.0.311 --- This PR was generated with [Release Please](https://github.com/googleapis/release-please). See [documentation](https://github.com/googleapis/release-please#release-please). --- .release-please-manifest.json | 2 +- CHANGELOG.md | 9 +++++++++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/.release-please-manifest.json b/.release-please-manifest.json index 260d79a5a70..ba12f1bbb89 100644 --- a/.release-please-manifest.json +++ b/.release-please-manifest.json @@ -1,4 +1,4 @@ { "backend/core": "0.0.14", - ".": "0.0.310" + ".": "0.0.311" } \ No newline at end of file diff --git a/CHANGELOG.md b/CHANGELOG.md index 6409b0689ec..77cbbd24109 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,14 @@ # Changelog +## 0.0.311 (2024-09-12) + +## What's Changed +* chore(embeddings): added tests for embeddings by @StanGirard in https://github.com/QuivrHQ/quivr/pull/3183 +* feat(uptime): check if connection to db works by @StanGirard in https://github.com/QuivrHQ/quivr/pull/3199 + + +**Full Changelog**: https://github.com/QuivrHQ/quivr/compare/v0.0.310...v0.0.311 + ## 0.0.310 (2024-09-10) ## What's Changed From 13ed225b172407ee9826b9c01b2f7b124a8b5a10 Mon Sep 17 00:00:00 2001 From: Stan Girard Date: Fri, 13 Sep 2024 10:59:41 +0200 Subject: [PATCH 03/13] fix: Update LLMEndpoint to include max_tokens 
parameter (#3201) # Description Please include a summary of the changes and the related issue. Please also include relevant motivation and context. ## Checklist before requesting a review Please delete options that are not relevant. - [ ] My code follows the style guidelines of this project - [ ] I have performed a self-review of my code - [ ] I have commented hard-to-understand areas - [ ] I have ideally added tests that prove my fix is effective or that my feature works - [ ] New and existing unit tests pass locally with my changes - [ ] Any dependent changes have been merged ## Screenshots (if appropriate): --- backend/core/quivr_core/llm/llm_endpoint.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/backend/core/quivr_core/llm/llm_endpoint.py b/backend/core/quivr_core/llm/llm_endpoint.py index bd636878479..54ed51faf06 100644 --- a/backend/core/quivr_core/llm/llm_endpoint.py +++ b/backend/core/quivr_core/llm/llm_endpoint.py @@ -42,6 +42,7 @@ def from_config(cls, config: LLMEndpointConfig = LLMEndpointConfig()): if config.llm_api_key else None, azure_endpoint=azure_endpoint, + max_tokens=config.max_tokens ) elif config.model.startswith("claude"): _llm = ChatAnthropic( @@ -50,6 +51,7 @@ def from_config(cls, config: LLMEndpointConfig = LLMEndpointConfig()): if config.llm_api_key else None, base_url=config.llm_base_url, + max_tokens=config.max_tokens ) else: _llm = ChatOpenAI( @@ -58,6 +60,7 @@ def from_config(cls, config: LLMEndpointConfig = LLMEndpointConfig()): if config.llm_api_key else None, base_url=config.llm_base_url, + max_tokens=config.max_tokens ) return cls(llm=_llm, llm_config=config) From 06f72eb451699254a1a9aaf3b632cd4b2336b0a2 Mon Sep 17 00:00:00 2001 From: Stan Girard Date: Fri, 13 Sep 2024 11:17:28 +0200 Subject: [PATCH 04/13] chore(main): release 0.0.312 (#3204) :robot: I have created a release *beep* *boop* --- ## 0.0.312 (2024-09-13) ## What's Changed * fix: Update LLMEndpoint to include max_tokens parameter by @StanGirard in 
https://github.com/QuivrHQ/quivr/pull/3201 **Full Changelog**: https://github.com/QuivrHQ/quivr/compare/v0.0.311...v0.0.312 --- This PR was generated with [Release Please](https://github.com/googleapis/release-please). See [documentation](https://github.com/googleapis/release-please#release-please). --- .release-please-manifest.json | 2 +- CHANGELOG.md | 8 ++++++++ 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/.release-please-manifest.json b/.release-please-manifest.json index ba12f1bbb89..33f13453d19 100644 --- a/.release-please-manifest.json +++ b/.release-please-manifest.json @@ -1,4 +1,4 @@ { "backend/core": "0.0.14", - ".": "0.0.311" + ".": "0.0.312" } \ No newline at end of file diff --git a/CHANGELOG.md b/CHANGELOG.md index 77cbbd24109..bdbfc85220d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,13 @@ # Changelog +## 0.0.312 (2024-09-13) + +## What's Changed +* fix: Update LLMEndpoint to include max_tokens parameter by @StanGirard in https://github.com/QuivrHQ/quivr/pull/3201 + + +**Full Changelog**: https://github.com/QuivrHQ/quivr/compare/v0.0.311...v0.0.312 + ## 0.0.311 (2024-09-12) ## What's Changed From eda619f4547921ab4c50458b2d44c6b5c10e40d1 Mon Sep 17 00:00:00 2001 From: AmineDiro Date: Fri, 13 Sep 2024 15:35:28 +0200 Subject: [PATCH 05/13] feat: save and load brain (#3202) # Description - Save and load brain to disk: ```python async def main(): with tempfile.NamedTemporaryFile(mode="w", suffix=".txt") as temp_file: temp_file.write("Gold is a liquid of blue-like colour.") temp_file.flush() brain = await Brain.afrom_files(name="test_brain", file_paths=[temp_file.name]) save_path = await brain.save("/home/amine/.local/quivr") brain_loaded = Brain.load(save_path) brain_loaded.print_info() ``` # TODO: - Loading all chat history - Loading from other vector stores, PG for example can be great ... 
--- backend/core/examples/save_load_brain.py | 22 ++++ backend/core/examples/simple_question.py | 21 +-- .../examples/simple_question_streaming.py | 23 ++-- backend/core/quivr_core/brain/brain.py | 121 +++++++++++++++++- backend/core/quivr_core/brain/info.py | 1 - .../core/quivr_core/brain/serialization.py | 55 ++++++++ backend/core/quivr_core/files/file.py | 39 +++++- backend/core/quivr_core/llm/llm_endpoint.py | 6 +- backend/core/quivr_core/processor/registry.py | 2 - .../core/quivr_core/quivr_rag_langgraph.py | 64 +++++---- .../core/quivr_core/storage/local_storage.py | 17 ++- backend/core/tests/test_brain.py | 1 - 12 files changed, 310 insertions(+), 62 deletions(-) create mode 100644 backend/core/examples/save_load_brain.py create mode 100644 backend/core/quivr_core/brain/serialization.py diff --git a/backend/core/examples/save_load_brain.py b/backend/core/examples/save_load_brain.py new file mode 100644 index 00000000000..336db58dac1 --- /dev/null +++ b/backend/core/examples/save_load_brain.py @@ -0,0 +1,22 @@ +import asyncio +import tempfile + +from quivr_core import Brain + + +async def main(): + with tempfile.NamedTemporaryFile(mode="w", suffix=".txt") as temp_file: + temp_file.write("Gold is a liquid of blue-like colour.") + temp_file.flush() + + brain = await Brain.afrom_files(name="test_brain", file_paths=[temp_file.name]) + + save_path = await brain.save("/home/amine/.local/quivr") + + brain_loaded = Brain.load(save_path) + brain_loaded.print_info() + + +if __name__ == "__main__": + # Run the main function in the existing event loop + asyncio.run(main()) diff --git a/backend/core/examples/simple_question.py b/backend/core/examples/simple_question.py index 46c40b6ff75..3b6ce3b5d4b 100644 --- a/backend/core/examples/simple_question.py +++ b/backend/core/examples/simple_question.py @@ -1,22 +1,23 @@ import tempfile + from quivr_core import Brain -from quivr_core.quivr_rag_langgraph import QuivrQARAGLangGraph from quivr_core.quivr_rag import QuivrQARAG 
+from quivr_core.quivr_rag_langgraph import QuivrQARAGLangGraph if __name__ == "__main__": with tempfile.NamedTemporaryFile(mode="w", suffix=".txt") as temp_file: temp_file.write("Gold is a liquid of blue-like colour.") temp_file.flush() - brain = Brain.from_files(name="test_brain", - file_paths=[temp_file.name], - ) + brain = Brain.from_files( + name="test_brain", + file_paths=[temp_file.name], + ) - answer = brain.ask("what is gold? asnwer in french", - rag_pipeline=QuivrQARAGLangGraph) + answer = brain.ask( + "what is gold? asnwer in french", rag_pipeline=QuivrQARAGLangGraph + ) print("answer QuivrQARAGLangGraph :", answer.answer) - - answer = brain.ask("what is gold? asnwer in french", - rag_pipeline=QuivrQARAG) - print("answer QuivrQARAG :", answer.answer) \ No newline at end of file + answer = brain.ask("what is gold? asnwer in french", rag_pipeline=QuivrQARAG) + print("answer QuivrQARAG :", answer.answer) diff --git a/backend/core/examples/simple_question_streaming.py b/backend/core/examples/simple_question_streaming.py index 1cd72f4fa73..acd75880c0c 100644 --- a/backend/core/examples/simple_question_streaming.py +++ b/backend/core/examples/simple_question_streaming.py @@ -1,29 +1,34 @@ -from dotenv import load_dotenv -import tempfile import asyncio +import tempfile + +from dotenv import load_dotenv from quivr_core import Brain -from quivr_core.quivr_rag_langgraph import QuivrQARAGLangGraph from quivr_core.quivr_rag import QuivrQARAG +from quivr_core.quivr_rag_langgraph import QuivrQARAGLangGraph async def main(): dotenv_path = "/Users/jchevall/Coding/QuivrHQ/quivr/.env" load_dotenv(dotenv_path) - + with tempfile.NamedTemporaryFile(mode="w", suffix=".txt") as temp_file: temp_file.write("Gold is a liquid of blue-like colour.") temp_file.flush() - brain = await Brain.afrom_files(name="test_brain", - file_paths=[temp_file.name]) + brain = await Brain.afrom_files(name="test_brain", file_paths=[temp_file.name]) + + await brain.save("~/.local/quivr") question = 
"what is gold? answer in french" async for chunk in brain.ask_streaming(question, rag_pipeline=QuivrQARAG): - print("answer QuivrQARAG:", chunk.answer) + print("answer QuivrQARAG:", chunk.answer) - async for chunk in brain.ask_streaming(question, rag_pipeline=QuivrQARAGLangGraph): + async for chunk in brain.ask_streaming( + question, rag_pipeline=QuivrQARAGLangGraph + ): print("answer QuivrQARAGLangGraph:", chunk.answer) + if __name__ == "__main__": # Run the main function in the existing event loop - asyncio.run(main()) \ No newline at end of file + asyncio.run(main()) diff --git a/backend/core/quivr_core/brain/brain.py b/backend/core/quivr_core/brain/brain.py index d1082fbace8..b1175b6be31 100644 --- a/backend/core/quivr_core/brain/brain.py +++ b/backend/core/quivr_core/brain/brain.py @@ -1,18 +1,27 @@ import asyncio import logging +import os from pathlib import Path from pprint import PrettyPrinter -from typing import Any, AsyncGenerator, Callable, Dict, Self, Union, Type +from typing import Any, AsyncGenerator, Callable, Dict, Self, Type, Union from uuid import UUID, uuid4 from langchain_core.documents import Document from langchain_core.embeddings import Embeddings from langchain_core.messages import AIMessage, HumanMessage from langchain_core.vectorstores import VectorStore +from langchain_openai import OpenAIEmbeddings from rich.console import Console from rich.panel import Panel from quivr_core.brain.info import BrainInfo, ChatHistoryInfo +from quivr_core.brain.serialization import ( + BrainSerialized, + EmbedderConfig, + FAISSConfig, + LocalStorageConfig, + TransparentStorageConfig, +) from quivr_core.chat import ChatHistory from quivr_core.config import RAGConfig from quivr_core.files.file import load_qfile @@ -20,8 +29,8 @@ from quivr_core.models import ParsedRAGChunkResponse, ParsedRAGResponse, SearchResult from quivr_core.processor.registry import get_processor_class from quivr_core.quivr_rag import QuivrQARAG -from quivr_core.quivr_rag_langgraph 
import QuivrQARAGLangGraph -from quivr_core.storage.local_storage import TransparentStorage +from quivr_core.quivr_rag_langgraph import QuivrQARAGLangGraph +from quivr_core.storage.local_storage import LocalStorage, TransparentStorage from quivr_core.storage.storage_base import StorageBase from .brain_defaults import build_default_vectordb, default_embedder, default_llm @@ -90,6 +99,108 @@ def print_info(self): panel = Panel(tree, title="Brain Info", expand=False, border_style="bold") console.print(panel) + @classmethod + def load(cls, folder_path: str | Path) -> Self: + if isinstance(folder_path, str): + folder_path = Path(folder_path) + if not folder_path.exists(): + raise ValueError(f"path {folder_path} doesn't exist") + + # Load brainserialized + with open(os.path.join(folder_path, "config.json"), "r") as f: + bserialized = BrainSerialized.model_validate_json(f.read()) + + # Loading storage + if bserialized.storage_config.storage_type == "transparent_storage": + storage: StorageBase = TransparentStorage.load(bserialized.storage_config) + elif bserialized.storage_config.storage_type == "local_storage": + storage: StorageBase = LocalStorage.load(bserialized.storage_config) + else: + raise ValueError("unknown storage") + + # Load Embedder + if bserialized.embedding_config.embedder_type == "openai_embedding": + from langchain_openai import OpenAIEmbeddings + + embedder = OpenAIEmbeddings(**bserialized.embedding_config.config) + else: + raise ValueError("unknown embedder") + + # Load vector db + if bserialized.vectordb_config.vectordb_type == "faiss": + from langchain_community.vectorstores import FAISS + + vector_db = FAISS.load_local( + folder_path=bserialized.vectordb_config.vectordb_folder_path, + embeddings=embedder, + allow_dangerous_deserialization=True, + ) + else: + raise ValueError("Unsupported vectordb") + + return cls( + id=bserialized.id, + name=bserialized.name, + embedder=embedder, + llm=LLMEndpoint.from_config(bserialized.llm_config), + 
storage=storage, + vector_db=vector_db, + ) + + async def save(self, folder_path: str | Path): + if isinstance(folder_path, str): + folder_path = Path(folder_path) + + brain_path = os.path.join(folder_path, f"brain_{self.id}") + os.makedirs(brain_path, exist_ok=True) + + from langchain_community.vectorstores import FAISS + + if isinstance(self.vector_db, FAISS): + vectordb_path = os.path.join(brain_path, "vector_store") + os.makedirs(vectordb_path, exist_ok=True) + self.vector_db.save_local(folder_path=vectordb_path) + vector_store = FAISSConfig(vectordb_folder_path=vectordb_path) + else: + raise Exception("can't serialize other vector stores for now") + + if isinstance(self.embedder, OpenAIEmbeddings): + embedder_config = EmbedderConfig( + config=self.embedder.dict(exclude={"openai_api_key"}) + ) + else: + raise Exception("can't serialize embedder other than openai for now") + + # TODO : each instance should know how to serialize/deserialize itself + if isinstance(self.storage, LocalStorage): + serialized_files = { + f.id: f.serialize() for f in await self.storage.get_files() + } + storage_config = LocalStorageConfig( + storage_path=self.storage.dir_path, files=serialized_files + ) + elif isinstance(self.storage, TransparentStorage): + serialized_files = { + f.id: f.serialize() for f in await self.storage.get_files() + } + storage_config = TransparentStorageConfig(files=serialized_files) + else: + raise Exception("can't serialize storage. 
not supported for now") + + bserialized = BrainSerialized( + id=self.id, + name=self.name, + chat_history=self.chat_history.get_chat_history(), + llm_config=self.llm.get_config(), + vectordb_config=vector_store, + embedding_config=embedder_config, + storage_config=storage_config, + ) + + with open(os.path.join(brain_path, "config.json"), "w") as f: + f.write(bserialized.model_dump_json()) + return brain_path + def info(self) -> BrainInfo: # TODO: dim of embedding # "embedder": {}, @@ -177,7 +288,7 @@ def from_files( storage: StorageBase = TransparentStorage(), llm: LLMEndpoint | None = None, embedder: Embeddings | None = None, - skip_file_error: bool = False + skip_file_error: bool = False, ) -> Self: loop = asyncio.get_event_loop() return loop.run_until_complete( @@ -223,7 +334,7 @@ async def afrom_langchain_documents( storage=storage, llm=llm, embedder=embedder, - vector_db=vector_db + vector_db=vector_db, ) async def asearch( diff --git a/backend/core/quivr_core/brain/info.py b/backend/core/quivr_core/brain/info.py index 862eec36935..bb0747f32ea 100644 --- a/backend/core/quivr_core/brain/info.py +++ b/backend/core/quivr_core/brain/info.py @@ -33,7 +33,6 @@ def add_to_tree(self, llm_tree: Tree): llm_tree.add(f"Base URL: [underline]{self.llm_base_url}[/underline]") llm_tree.add(f"Temperature: [bold]{self.temperature}[/bold]") llm_tree.add(f"Max Tokens: [bold]{self.max_tokens}[/bold]") - func_call_color = "green" if self.supports_function_calling else "red" llm_tree.add( f"Supports Function Calling: [bold {func_call_color}]{self.supports_function_calling}[/bold {func_call_color}]" diff --git a/backend/core/quivr_core/brain/serialization.py b/backend/core/quivr_core/brain/serialization.py new file mode 100644 index 00000000000..7b2764a1f9d --- /dev/null +++ b/backend/core/quivr_core/brain/serialization.py @@ -0,0 +1,55 @@ +from pathlib import Path +from typing import Any, Dict, Literal, Union +from uuid import UUID + +from pydantic import BaseModel, Field, SecretStr 
+ +from quivr_core.config import LLMEndpointConfig +from quivr_core.files.file import QuivrFileSerialized +from quivr_core.models import ChatMessage + + +class EmbedderConfig(BaseModel): + embedder_type: Literal["openai_embedding"] = "openai_embedding" + # TODO: type this correctly + config: Dict[str, Any] + + +class PGVectorConfig(BaseModel): + vectordb_type: Literal["pgvector"] = "pgvector" + pg_url: str + pg_user: str + pg_psswd: SecretStr + table_name: str + vector_dim: int + + +class FAISSConfig(BaseModel): + vectordb_type: Literal["faiss"] = "faiss" + vectordb_folder_path: str + + +class LocalStorageConfig(BaseModel): + storage_type: Literal["local_storage"] = "local_storage" + storage_path: Path + files: dict[UUID, QuivrFileSerialized] + + +class TransparentStorageConfig(BaseModel): + storage_type: Literal["transparent_storage"] = "transparent_storage" + files: dict[UUID, QuivrFileSerialized] + + +class BrainSerialized(BaseModel): + id: UUID + name: str + chat_history: list[ChatMessage] + vectordb_config: Union[FAISSConfig, PGVectorConfig] = Field( + ..., discriminator="vectordb_type" + ) + storage_config: Union[TransparentStorageConfig, LocalStorageConfig] = Field( + ..., discriminator="storage_type" + ) + + llm_config: LLMEndpointConfig + embedding_config: EmbedderConfig diff --git a/backend/core/quivr_core/files/file.py b/backend/core/quivr_core/files/file.py index fd9d5935aa9..9f4089b103f 100644 --- a/backend/core/quivr_core/files/file.py +++ b/backend/core/quivr_core/files/file.py @@ -5,10 +5,22 @@ from contextlib import asynccontextmanager from enum import Enum from pathlib import Path -from typing import Any, AsyncGenerator, AsyncIterable +from typing import Any, AsyncGenerator, AsyncIterable, Self from uuid import UUID, uuid4 import aiofiles +from openai import BaseModel + + +class QuivrFileSerialized(BaseModel): + id: UUID + brain_id: UUID + path: Path + original_filename: str + file_size: int | None + file_extension: str + file_sha1: str + 
additional_metadata: dict[str, Any] class FileExtension(str, Enum): @@ -137,3 +149,28 @@ def metadata(self) -> dict[str, Any]: "file_size": self.file_size, **self.additional_metadata, } + + def serialize(self) -> QuivrFileSerialized: + return QuivrFileSerialized( + id=self.id, + brain_id=self.brain_id, + path=self.path.absolute(), + original_filename=self.original_filename, + file_size=self.file_size, + file_extension=self.file_extension, + file_sha1=self.file_sha1, + additional_metadata=self.additional_metadata, + ) + + @classmethod + def deserialize(cls, serialized: QuivrFileSerialized) -> Self: + return cls( + id=serialized.id, + brain_id=serialized.brain_id, + path=serialized.path, + original_filename=serialized.original_filename, + file_size=serialized.file_size, + file_extension=serialized.file_extension, + file_sha1=serialized.file_sha1, + metadata=serialized.additional_metadata, + ) diff --git a/backend/core/quivr_core/llm/llm_endpoint.py b/backend/core/quivr_core/llm/llm_endpoint.py index 54ed51faf06..51b83419502 100644 --- a/backend/core/quivr_core/llm/llm_endpoint.py +++ b/backend/core/quivr_core/llm/llm_endpoint.py @@ -1,10 +1,10 @@ import logging from urllib.parse import parse_qs, urlparse +from langchain_anthropic import ChatAnthropic from langchain_core.language_models.chat_models import BaseChatModel -from pydantic.v1 import SecretStr from langchain_openai import AzureChatOpenAI, ChatOpenAI -from langchain_anthropic import ChatAnthropic +from pydantic.v1 import SecretStr from quivr_core.brain.info import LLMInfo from quivr_core.config import LLMEndpointConfig @@ -27,8 +27,6 @@ def get_config(self): @classmethod def from_config(cls, config: LLMEndpointConfig = LLMEndpointConfig()): try: - - if config.model.startswith("azure/"): # Parse the URL parsed_url = urlparse(config.llm_base_url) diff --git a/backend/core/quivr_core/processor/registry.py b/backend/core/quivr_core/processor/registry.py index d37b098f5c2..697268a1e9c 100644 --- 
a/backend/core/quivr_core/processor/registry.py +++ b/backend/core/quivr_core/processor/registry.py @@ -147,8 +147,6 @@ def get_processor_class(file_extension: FileExtension | str) -> Type[ProcessorBa if file_extension not in known_processors: raise ValueError(f"Extension not known: {file_extension}") entries = known_processors[file_extension] - if file_extension == FileExtension.txt: - print(entries) while entries: proc_entry = heappop(entries) try: diff --git a/backend/core/quivr_core/quivr_rag_langgraph.py b/backend/core/quivr_core/quivr_rag_langgraph.py index d1ce14d4f43..f856e52ceac 100644 --- a/backend/core/quivr_core/quivr_rag_langgraph.py +++ b/backend/core/quivr_core/quivr_rag_langgraph.py @@ -1,15 +1,15 @@ import logging -from typing import AsyncGenerator, Optional, Sequence, Annotated, Sequence, TypedDict +from typing import Annotated, AsyncGenerator, Optional, Sequence, TypedDict + # TODO(@aminediro): this is the only dependency to langchain package, we should remove it from langchain.retrievers import ContextualCompressionRetriever from langchain_core.callbacks import Callbacks from langchain_core.documents import BaseDocumentCompressor, Document -from langchain_core.messages import AIMessage, HumanMessage, BaseMessage +from langchain_core.messages import AIMessage, BaseMessage, HumanMessage from langchain_core.messages.ai import AIMessageChunk from langchain_core.vectorstores import VectorStore - -from langgraph.graph.message import add_messages from langgraph.graph import END, StateGraph +from langgraph.graph.message import add_messages from quivr_core.chat import ChatHistory from quivr_core.config import RAGConfig @@ -19,19 +19,20 @@ ParsedRAGResponse, QuivrKnowledge, RAGResponseMetadata, - cited_answer + cited_answer, ) -from quivr_core.prompts import CONDENSE_QUESTION_PROMPT, ANSWER_PROMPT +from quivr_core.prompts import ANSWER_PROMPT, CONDENSE_QUESTION_PROMPT from quivr_core.utils import ( + combine_documents, format_file_list, 
get_chunk_metadata, parse_chunk_response, - combine_documents, - parse_response + parse_response, ) logger = logging.getLogger("quivr_core") + class AgentState(TypedDict): # The add_messages function defines how an update should be processed # Default is to replace. add_messages says "append" @@ -43,6 +44,7 @@ class AgentState(TypedDict): files: str final_response: dict + class IdempotentCompressor(BaseDocumentCompressor): def compress_documents( self, @@ -50,7 +52,6 @@ def compress_documents( query: str, callbacks: Optional[Callbacks] = None, ) -> Sequence[Document]: - """ A no-op document compressor that simply returns the documents it is given. @@ -59,6 +60,7 @@ def compress_documents( """ return documents + class QuivrQARAGLangGraph: def __init__( self, @@ -96,18 +98,15 @@ def retriever(self): """ return self.vector_store.as_retriever() - def filter_history( - self, - state - ): + def filter_history(self, state): """ Filter out the chat history to only include the messages that are relevant to the current question - Takes in a chat_history= [HumanMessage(content='Qui est Chloé ? '), - AIMessage(content="Chloé est une salariée travaillant pour l'entreprise Quivr en tant qu'AI Engineer, - sous la direction de son supérieur hiérarchique, Stanislas Girard."), - HumanMessage(content='Dis moi en plus sur elle'), AIMessage(content=''), - HumanMessage(content='Dis moi en plus sur elle'), + Takes in a chat_history= [HumanMessage(content='Qui est Chloé ? 
'), + AIMessage(content="Chloé est une salariée travaillant pour l'entreprise Quivr en tant qu'AI Engineer, + sous la direction de son supérieur hiérarchique, Stanislas Girard."), + HumanMessage(content='Dis moi en plus sur elle'), AIMessage(content=''), + HumanMessage(content='Dis moi en plus sur elle'), AIMessage(content="Désolé, je n'ai pas d'autres informations sur Chloé à partir des fichiers fournis.")] Returns a filtered chat_history with in priority: first max_tokens, then max_history where a Human message and an AI message count as one pair a token is 4 characters @@ -131,7 +130,6 @@ def filter_history( return {"filtered_chat_history": filtered_chat_history} - ### Nodes def rewrite(self, state): """ @@ -145,7 +143,10 @@ def rewrite(self, state): """ # Grader - msg = CONDENSE_QUESTION_PROMPT.format(chat_history=state['filtered_chat_history'], question=state["messages"][0].content) + msg = CONDENSE_QUESTION_PROMPT.format( + chat_history=state["filtered_chat_history"], + question=state["messages"][0].content, + ) model = self.llm_endpoint._llm response = model.invoke(msg) @@ -179,7 +180,7 @@ def generate(self, state): question = messages[0].content files = state["files"] - docs = state['docs'] + docs = state["docs"] # Prompt prompt = self.rag_config.prompt @@ -206,11 +207,10 @@ def generate(self, state): response = rag_chain.invoke(final_inputs) formatted_response = { "answer": response, # Assuming the last message contains the final answer - "docs": docs + "docs": docs, } return {"messages": [response], "final_response": formatted_response} - def build_langgraph_chain(self): """ Builds the langchain chain for the given configuration. 
@@ -247,7 +247,7 @@ def create_graph(self): workflow.add_node("filter_history", self.filter_history) workflow.add_node("rewrite", self.rewrite) # Re-writing the question workflow.add_node("retrieve", self.retrieve) # retrieval - workflow.add_node("generate", self.generate) + workflow.add_node("generate", self.generate) # Add node for filtering history @@ -293,7 +293,9 @@ def answer( inputs, config={"metadata": metadata}, ) - response = parse_response(raw_llm_response["final_response"], self.rag_config.llm_config.model) + response = parse_response( + raw_llm_response["final_response"], self.rag_config.llm_config.model + ) return response async def answer_astream( @@ -303,7 +305,6 @@ async def answer_astream( list_files: list[QuivrKnowledge], metadata: dict[str, str] = {}, ) -> AsyncGenerator[ParsedRAGChunkResponse, ParsedRAGChunkResponse]: - """ Answer a question using the langgraph chain and yield each chunk of the answer separately. @@ -337,10 +338,17 @@ async def answer_astream( ): kind = event["event"] - if not sources and "output" in event["data"] and "docs" in event["data"]["output"]: + if ( + not sources + and "output" in event["data"] + and "docs" in event["data"]["output"] + ): sources = event["data"]["output"]["docs"] - if kind == "on_chat_model_stream" and event["metadata"]["langgraph_node"] == "generate": + if ( + kind == "on_chat_model_stream" + and event["metadata"]["langgraph_node"] == "generate" + ): chunk = event["data"]["chunk"] rolling_message, answer_str = parse_chunk_response( diff --git a/backend/core/quivr_core/storage/local_storage.py b/backend/core/quivr_core/storage/local_storage.py index 51d41f12960..146c0f725bb 100644 --- a/backend/core/quivr_core/storage/local_storage.py +++ b/backend/core/quivr_core/storage/local_storage.py @@ -1,9 +1,10 @@ import os import shutil from pathlib import Path -from typing import Set +from typing import Self, Set from uuid import UUID +from quivr_core.brain.serialization import LocalStorageConfig, 
TransparentStorageConfig from quivr_core.files.file import QuivrFile from quivr_core.storage.storage_base import StorageBase @@ -57,6 +58,12 @@ async def get_files(self) -> list[QuivrFile]: async def remove_file(self, file_id: UUID) -> None: raise NotImplementedError + @classmethod + def load(cls, config: LocalStorageConfig) -> Self: + tstorage = cls(dir_path=config.storage_path) + tstorage.files = [QuivrFile.deserialize(f) for f in config.files.values()] + return tstorage + class TransparentStorage(StorageBase): """Transparent Storage.""" @@ -77,3 +84,11 @@ async def remove_file(self, file_id: UUID) -> None: async def get_files(self) -> list[QuivrFile]: return list(self.id_files.values()) + + @classmethod + def load(cls, config: TransparentStorageConfig) -> Self: + tstorage = cls() + tstorage.id_files = { + i: QuivrFile.deserialize(f) for i, f in config.files.items() + } + return tstorage diff --git a/backend/core/tests/test_brain.py b/backend/core/tests/test_brain.py index 93eb1c350df..367df9e0704 100644 --- a/backend/core/tests/test_brain.py +++ b/backend/core/tests/test_brain.py @@ -4,7 +4,6 @@ import pytest from langchain_core.documents import Document from langchain_core.embeddings import Embeddings - from quivr_core.brain import Brain from quivr_core.chat import ChatHistory from quivr_core.llm import LLMEndpoint From edc4118ba15c73853838278dcb56dfcc3838e517 Mon Sep 17 00:00:00 2001 From: Stan Girard Date: Sun, 15 Sep 2024 12:05:28 +0200 Subject: [PATCH 06/13] chore(main): release 0.0.313 (#3205) :robot: I have created a release *beep* *boop* --- ## 0.0.313 (2024-09-13) ## What's Changed * feat: save and load brain by @AmineDiro in https://github.com/QuivrHQ/quivr/pull/3202 **Full Changelog**: https://github.com/QuivrHQ/quivr/compare/v0.0.312...v0.0.313 --- This PR was generated with [Release Please](https://github.com/googleapis/release-please). See [documentation](https://github.com/googleapis/release-please#release-please). 
--- .release-please-manifest.json | 2 +- CHANGELOG.md | 8 ++++++++ 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/.release-please-manifest.json b/.release-please-manifest.json index 33f13453d19..3ae474fcc93 100644 --- a/.release-please-manifest.json +++ b/.release-please-manifest.json @@ -1,4 +1,4 @@ { "backend/core": "0.0.14", - ".": "0.0.312" + ".": "0.0.313" } \ No newline at end of file diff --git a/CHANGELOG.md b/CHANGELOG.md index bdbfc85220d..9f43c6e8cf8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,13 @@ # Changelog +## 0.0.313 (2024-09-13) + +## What's Changed +* feat: save and load brain by @AmineDiro in https://github.com/QuivrHQ/quivr/pull/3202 + + +**Full Changelog**: https://github.com/QuivrHQ/quivr/compare/v0.0.312...v0.0.313 + ## 0.0.312 (2024-09-13) ## What's Changed From 71edca572ffd2901ed582005ac4b2803d9d95e57 Mon Sep 17 00:00:00 2001 From: AmineDiro Date: Mon, 16 Sep 2024 13:31:09 +0200 Subject: [PATCH 07/13] feat: CRUD KMS (no syncs) (#3162) # Description closes #3056. closes #3198 - Create knowledge route - Get knowledge route - List knowledge route : accepts knowledge_id | None. 
None to list root knowledge for user - Update (patch) knowledge to rename and move knowledge - Remove knowledge: Cascade if parent_id in knowledge and cleanup storage - Link storage upload to knowledge_service - Relax sha1 file constraint - Tests for all repository / service --------- Co-authored-by: Stan Girard --- .github/workflows/backend-tests.yml | 6 +- .pre-commit-config.yaml | 7 - .../quivr_api/middlewares/auth/auth_bearer.py | 9 +- .../modules/brain/entity/brain_entity.py | 1 + .../brain/service/brain_vector_service.py | 8 +- backend/api/quivr_api/modules/conftest.py | 3 - backend/api/quivr_api/modules/dependencies.py | 4 +- .../knowledge/controller/knowledge_routes.py | 175 ++- .../quivr_api/modules/knowledge/dto/inputs.py | 16 +- .../modules/knowledge/dto/outputs.py | 4 +- .../modules/knowledge/entity/knowledge.py | 84 +- .../knowledge/repository/knowledges.py | 125 +- .../modules/knowledge/repository/storage.py | 84 +- .../knowledge/repository/storage_interface.py | 24 +- .../knowledge/service/knowledge_exceptions.py | 34 + .../knowledge/service/knowledge_service.py | 150 ++- .../modules/knowledge/tests/conftest.py | 67 ++ .../tests/test_knowledge_controller.py | 74 ++ .../knowledge/tests/test_knowledge_entity.py | 229 ++++ .../knowledge/tests/test_knowledge_service.py | 1019 +++++++++++++++++ .../knowledge/tests/test_knowledges.py | 450 -------- .../modules/sync/tests/test_syncutils.py | 9 +- .../upload/controller/upload_routes.py | 6 +- backend/api/quivr_api/routes/crawl_routes.py | 2 +- backend/api/quivr_api/utils/partial.py | 50 + backend/core/quivr_core/models.py | 1 + .../20240905153004_knowledge-folders.sql | 31 + backend/worker/quivr_worker/celery_worker.py | 40 +- .../quivr_worker/process/process_s3_file.py | 25 +- .../syncs/process_active_syncs.py | 2 +- .../worker/quivr_worker/syncs/store_notion.py | 2 + backend/worker/quivr_worker/syncs/utils.py | 7 +- 32 files changed, 2157 insertions(+), 591 deletions(-) create mode 100644 
backend/api/quivr_api/modules/knowledge/service/knowledge_exceptions.py create mode 100644 backend/api/quivr_api/modules/knowledge/tests/conftest.py create mode 100644 backend/api/quivr_api/modules/knowledge/tests/test_knowledge_controller.py create mode 100644 backend/api/quivr_api/modules/knowledge/tests/test_knowledge_entity.py create mode 100644 backend/api/quivr_api/modules/knowledge/tests/test_knowledge_service.py delete mode 100644 backend/api/quivr_api/modules/knowledge/tests/test_knowledges.py create mode 100644 backend/api/quivr_api/utils/partial.py create mode 100644 backend/supabase/migrations/20240905153004_knowledge-folders.sql diff --git a/.github/workflows/backend-tests.yml b/.github/workflows/backend-tests.yml index 013d207c3e7..cc77192c634 100644 --- a/.github/workflows/backend-tests.yml +++ b/.github/workflows/backend-tests.yml @@ -9,7 +9,9 @@ on: jobs: test: runs-on: ubuntu-latest - + strategy: + matrix: + project: [quivr-api, quivr-worker] steps: - name: 👀 Checkout code uses: actions/checkout@v2 @@ -65,4 +67,4 @@ jobs: supabase start rye run python -c "from unstructured.nlp.tokenize import download_nltk_packages; download_nltk_packages()" rye run python -c "import nltk;nltk.download('punkt_tab'); nltk.download('averaged_perceptron_tagger_eng')" - rye test -p quivr-api -p quivr-worker + rye test -p ${{ matrix.project }} diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index aabdcae300f..9496988b0b7 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -32,13 +32,6 @@ repos: - id: mypy name: mypy additional_dependencies: ["types-aiofiles"] - - repo: https://github.com/python-poetry/poetry - rev: "1.8.0" - hooks: - - id: poetry-check - args: ["-C", "./backend/core"] - - id: poetry-lock - args: ["-C", "./backend/core"] ci: autofix_commit_msg: | [pre-commit.ci] auto fixes from pre-commit.com hooks diff --git a/backend/api/quivr_api/middlewares/auth/auth_bearer.py b/backend/api/quivr_api/middlewares/auth/auth_bearer.py 
index 3001b7f45ba..73e3867cf79 100644 --- a/backend/api/quivr_api/middlewares/auth/auth_bearer.py +++ b/backend/api/quivr_api/middlewares/auth/auth_bearer.py @@ -3,6 +3,7 @@ from fastapi import Depends, HTTPException, Request from fastapi.security import HTTPAuthorizationCredentials, HTTPBearer + from quivr_api.middlewares.auth.jwt_token_handler import ( decode_access_token, verify_token, @@ -57,9 +58,13 @@ async def authenticate( def get_test_user(self) -> UserIdentity: return UserIdentity( - email="admin@quivr.app", id="39418e3b-0258-4452-af60-7acfcc1263ff" # type: ignore + email="admin@quivr.app", + id="39418e3b-0258-4452-af60-7acfcc1263ff", # type: ignore ) # replace with test user information -def get_current_user(user: UserIdentity = Depends(AuthBearer())) -> UserIdentity: +auth_bearer = AuthBearer() + + +def get_current_user(user: UserIdentity = Depends(auth_bearer)) -> UserIdentity: return user diff --git a/backend/api/quivr_api/modules/brain/entity/brain_entity.py b/backend/api/quivr_api/modules/brain/entity/brain_entity.py index 6a722bda732..0b8e3460c39 100644 --- a/backend/api/quivr_api/modules/brain/entity/brain_entity.py +++ b/backend/api/quivr_api/modules/brain/entity/brain_entity.py @@ -69,6 +69,7 @@ class Brain(AsyncAttrs, SQLModel, table=True): back_populates="brains", link_model=KnowledgeBrain ) + # TODO : add # "meaning" "public"."vector", # "tags" "public"."tags"[] diff --git a/backend/api/quivr_api/modules/brain/service/brain_vector_service.py b/backend/api/quivr_api/modules/brain/service/brain_vector_service.py index 4016b8a7ed8..ec514cdd75c 100644 --- a/backend/api/quivr_api/modules/brain/service/brain_vector_service.py +++ b/backend/api/quivr_api/modules/brain/service/brain_vector_service.py @@ -2,7 +2,7 @@ from quivr_api.logger import get_logger from quivr_api.modules.brain.repository.brains_vectors import BrainsVectors -from quivr_api.modules.knowledge.repository.storage import Storage +from quivr_api.modules.knowledge.repository.storage 
import SupabaseS3Storage logger = get_logger(__name__) @@ -11,7 +11,7 @@ class BrainVectorService: def __init__(self, brain_id: UUID): self.repository = BrainsVectors() self.brain_id = brain_id - self.storage = Storage() + self.storage = SupabaseS3Storage() def create_brain_vector(self, vector_id: str, file_sha1: str): return self.repository.create_brain_vector(self.brain_id, vector_id, file_sha1) # type: ignore @@ -26,10 +26,10 @@ def update_brain_with_file(self, file_sha1: str): for vector_id in vector_ids: self.create_brain_vector(vector_id, file_sha1) - def delete_file_from_brain(self, file_name: str, only_vectors: bool = False): + async def delete_file_from_brain(self, file_name: str, only_vectors: bool = False): file_name_with_brain_id = f"{self.brain_id}/{file_name}" if not only_vectors: - self.storage.remove_file(file_name_with_brain_id) + await self.storage.remove_file(file_name_with_brain_id) return self.repository.delete_file_from_brain(self.brain_id, file_name) # type: ignore def delete_file_url_from_brain(self, file_name: str): diff --git a/backend/api/quivr_api/modules/conftest.py b/backend/api/quivr_api/modules/conftest.py index 721eacaeb99..d9def549ca0 100644 --- a/backend/api/quivr_api/modules/conftest.py +++ b/backend/api/quivr_api/modules/conftest.py @@ -24,9 +24,6 @@ "postgresql+asyncpg://" + pg_database_base_url, echo=True if os.getenv("ORM_DEBUG") else False, future=True, - pool_pre_ping=True, - pool_size=10, - pool_recycle=0.1, ) diff --git a/backend/api/quivr_api/modules/dependencies.py b/backend/api/quivr_api/modules/dependencies.py index edb6f728b60..fd71696cd15 100644 --- a/backend/api/quivr_api/modules/dependencies.py +++ b/backend/api/quivr_api/modules/dependencies.py @@ -7,8 +7,7 @@ from langchain_community.embeddings.ollama import OllamaEmbeddings # from langchain_community.vectorstores.supabase import SupabaseVectorStore -from langchain_openai import OpenAIEmbeddings -from langchain_openai import AzureOpenAIEmbeddings +from 
langchain_openai import AzureOpenAIEmbeddings, OpenAIEmbeddings # from quivr_api.modules.vector.service.vector_service import VectorService # from quivr_api.modules.vectorstore.supabase import CustomSupabaseVectorStore @@ -22,7 +21,6 @@ from quivr_api.models.settings import BrainSettings from supabase.client import AsyncClient, Client, create_async_client, create_client - # Global variables to store the Supabase client and database instances _supabase_client: Optional[Client] = None _supabase_async_client: Optional[AsyncClient] = None diff --git a/backend/api/quivr_api/modules/knowledge/controller/knowledge_routes.py b/backend/api/quivr_api/modules/knowledge/controller/knowledge_routes.py index 5003eb8fb38..68d01afb0c5 100644 --- a/backend/api/quivr_api/modules/knowledge/controller/knowledge_routes.py +++ b/backend/api/quivr_api/modules/knowledge/controller/knowledge_routes.py @@ -1,8 +1,8 @@ from http import HTTPStatus -from typing import Annotated +from typing import Annotated, List, Optional from uuid import UUID -from fastapi import APIRouter, Depends, HTTPException, Query +from fastapi import APIRouter, Depends, File, HTTPException, Query, UploadFile, status from quivr_api.logger import get_logger from quivr_api.middlewares.auth import AuthBearer, get_current_user @@ -12,6 +12,14 @@ validate_brain_authorization, ) from quivr_api.modules.dependencies import get_service +from quivr_api.modules.knowledge.dto.inputs import AddKnowledge +from quivr_api.modules.knowledge.entity.knowledge import Knowledge, KnowledgeUpdate +from quivr_api.modules.knowledge.service.knowledge_exceptions import ( + KnowledgeDeleteError, + KnowledgeForbiddenAccess, + KnowledgeNotFoundException, + UploadError, +) from quivr_api.modules.knowledge.service.knowledge_service import KnowledgeService from quivr_api.modules.upload.service.generate_file_signed_url import ( generate_file_signed_url, @@ -21,9 +29,8 @@ knowledge_router = APIRouter() logger = get_logger(__name__) -KnowledgeServiceDep 
= Annotated[ - KnowledgeService, Depends(get_service(KnowledgeService)) -] +get_km_service = get_service(KnowledgeService) +KnowledgeServiceDep = Annotated[KnowledgeService, Depends(get_km_service)] @knowledge_router.get( @@ -53,7 +60,7 @@ async def list_knowledge_in_brain_endpoint( ], tags=["Knowledge"], ) -async def delete_endpoint( +async def delete_knowledge_brain( knowledge_id: UUID, knowledge_service: KnowledgeServiceDep, current_user: UserIdentity = Depends(get_current_user), @@ -65,7 +72,7 @@ async def delete_endpoint( knowledge = await knowledge_service.get_knowledge(knowledge_id) file_name = knowledge.file_name if knowledge.file_name else knowledge.url - await knowledge_service.remove_knowledge(brain_id, knowledge_id) + await knowledge_service.remove_knowledge_brain(brain_id, knowledge_id) return { "message": f"{file_name} of brain {brain_id} has been deleted by user {current_user.email}." @@ -88,13 +95,13 @@ async def generate_signed_url_endpoint( knowledge = await knowledge_service.get_knowledge(knowledge_id) - if len(knowledge.brain_ids) == 0: + if len(knowledge.brains) == 0: raise HTTPException( status_code=HTTPStatus.NOT_FOUND, detail="knowledge not associated with brains yet.", ) - brain_id = knowledge.brain_ids[0] + brain_id = knowledge.brains[0]["brain_id"] validate_brain_authorization(brain_id=brain_id, user_id=current_user.id) @@ -108,3 +115,153 @@ async def generate_signed_url_endpoint( file_signed_url = generate_file_signed_url(file_path_in_storage) return file_signed_url + + +@knowledge_router.post( + "/knowledge/", + tags=["Knowledge"], + response_model=Knowledge, +) +async def create_knowledge( + knowledge_data: str = File(...), + file: Optional[UploadFile] = None, + knowledge_service: KnowledgeService = Depends(get_km_service), + current_user: UserIdentity = Depends(get_current_user), +): + knowledge = AddKnowledge.model_validate_json(knowledge_data) + if not knowledge.file_name and not knowledge.url: + raise HTTPException( + 
status_code=status.HTTP_400_BAD_REQUEST, + detail="Either file_name or url must be provided", + ) + try: + km = await knowledge_service.create_knowledge( + knowledge_to_add=knowledge, upload_file=file, user_id=current_user.id + ) + km_dto = await km.to_dto() + return km_dto + except ValueError: + raise HTTPException( + status_code=status.HTTP_422_UNPROCESSABLE_ENTITY, + detail="Unprocessable knowledge ", + ) + except FileExistsError: + raise HTTPException( + status_code=status.HTTP_409_CONFLICT, detail="Existing knowledge" + ) + except UploadError: + raise HTTPException( + status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, + detail="Error occured uploading knowledge", + ) + except Exception: + raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR) + + +@knowledge_router.get( + "/knowledge/children", + response_model=List[Knowledge] | None, + tags=["Knowledge"], +) +async def list_knowledge( + parent_id: UUID | None = None, + knowledge_service: KnowledgeService = Depends(get_km_service), + current_user: UserIdentity = Depends(get_current_user), +): + try: + # TODO: Returns one level of children + children = await knowledge_service.list_knowledge(parent_id, current_user.id) + return [await c.to_dto(get_children=False) for c in children] + except KnowledgeNotFoundException as e: + raise HTTPException( + status_code=status.HTTP_403_FORBIDDEN, detail=f"{e.message}" + ) + except KnowledgeForbiddenAccess as e: + raise HTTPException( + status_code=status.HTTP_404_NOT_FOUND, detail=f"{e.message}" + ) + except Exception: + raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR) + + +@knowledge_router.get( + "/knowledge/{knowledge_id}", + response_model=Knowledge, + tags=["Knowledge"], +) +async def get_knowledge( + knowledge_id: UUID, + knowledge_service: KnowledgeService = Depends(get_km_service), + current_user: UserIdentity = Depends(get_current_user), +): + try: + km = await knowledge_service.get_knowledge(knowledge_id) + if km.user_id != 
current_user.id: + raise HTTPException( + status_code=status.HTTP_403_FORBIDDEN, + detail="You do not have permission to access this knowledge.", + ) + return await km.to_dto() + except KnowledgeNotFoundException as e: + raise HTTPException( + status_code=status.HTTP_404_NOT_FOUND, detail=f"{e.message}" + ) + except Exception: + raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR) + + +@knowledge_router.patch( + "/knowledge/{knowledge_id}", + status_code=status.HTTP_202_ACCEPTED, + response_model=Knowledge, + tags=["Knowledge"], +) +async def update_knowledge( + knowledge_id: UUID, + payload: KnowledgeUpdate, + knowledge_service: KnowledgeService = Depends(get_km_service), + current_user: UserIdentity = Depends(get_current_user), +): + try: + km = await knowledge_service.get_knowledge(knowledge_id) + if km.user_id != current_user.id: + raise HTTPException( + status_code=status.HTTP_403_FORBIDDEN, + detail="You do not have permission to access this knowledge.", + ) + km = await knowledge_service.update_knowledge(km, payload) + return km + except KnowledgeNotFoundException as e: + raise HTTPException( + status_code=status.HTTP_404_NOT_FOUND, detail=f"{e.message}" + ) + except Exception: + raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR) + + +@knowledge_router.delete( + "/knowledge/{knowledge_id}", + status_code=status.HTTP_202_ACCEPTED, + tags=["Knowledge"], +) +async def delete_knowledge( + knowledge_id: UUID, + knowledge_service: KnowledgeService = Depends(get_km_service), + current_user: UserIdentity = Depends(get_current_user), +): + try: + km = await knowledge_service.get_knowledge(knowledge_id) + + if km.user_id != current_user.id: + raise HTTPException( + status_code=status.HTTP_403_FORBIDDEN, + detail="You do not have permission to remove this knowledge.", + ) + delete_response = await knowledge_service.remove_knowledge(km) + return delete_response + except KnowledgeNotFoundException as e: + raise HTTPException( + 
status_code=status.HTTP_404_NOT_FOUND, detail=f"{e.message}" + ) + except KnowledgeDeleteError: + raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR) diff --git a/backend/api/quivr_api/modules/knowledge/dto/inputs.py b/backend/api/quivr_api/modules/knowledge/dto/inputs.py index a943ee6b4c4..85a2438e920 100644 --- a/backend/api/quivr_api/modules/knowledge/dto/inputs.py +++ b/backend/api/quivr_api/modules/knowledge/dto/inputs.py @@ -16,8 +16,16 @@ class CreateKnowledgeProperties(BaseModel): file_size: Optional[int] = None file_sha1: Optional[str] = None metadata: Optional[Dict[str, str]] = None + is_folder: bool = False + parent_id: Optional[UUID] = None - def dict(self, *args, **kwargs): - knowledge_dict = super().dict(*args, **kwargs) - knowledge_dict["brain_id"] = str(knowledge_dict.get("brain_id")) - return knowledge_dict + +class AddKnowledge(BaseModel): + file_name: Optional[str] = None + url: Optional[str] = None + extension: str = ".txt" + source: str = "local" + source_link: Optional[str] = None + metadata: Optional[Dict[str, str]] = None + is_folder: bool = False + parent_id: Optional[UUID] = None diff --git a/backend/api/quivr_api/modules/knowledge/dto/outputs.py b/backend/api/quivr_api/modules/knowledge/dto/outputs.py index a020dbece6e..20218dfce3e 100644 --- a/backend/api/quivr_api/modules/knowledge/dto/outputs.py +++ b/backend/api/quivr_api/modules/knowledge/dto/outputs.py @@ -4,6 +4,6 @@ class DeleteKnowledgeResponse(BaseModel): - file_name: str - status: str = "delete" + file_name: str | None = None + status: str = "DELETED" knowledge_id: UUID diff --git a/backend/api/quivr_api/modules/knowledge/entity/knowledge.py b/backend/api/quivr_api/modules/knowledge/entity/knowledge.py index def4e42f5aa..d890ee42d1c 100644 --- a/backend/api/quivr_api/modules/knowledge/entity/knowledge.py +++ b/backend/api/quivr_api/modules/knowledge/entity/knowledge.py @@ -1,5 +1,6 @@ from datetime import datetime -from typing import Dict, List, Optional 
+from enum import Enum +from typing import Any, Dict, List, Optional from uuid import UUID from pydantic import BaseModel @@ -12,20 +13,44 @@ from quivr_api.modules.knowledge.entity.knowledge_brain import KnowledgeBrain +class KnowledgeSource(str, Enum): + LOCAL = "local" + WEB = "web" + GDRIVE = "google drive" + DROPBOX = "dropbox" + SHAREPOINT = "sharepoint" + + class Knowledge(BaseModel): id: UUID + file_size: int = 0 + status: KnowledgeStatus file_name: Optional[str] = None url: Optional[str] = None extension: str = ".txt" - status: str + is_folder: bool = False + updated_at: datetime + created_at: datetime source: Optional[str] = None source_link: Optional[str] = None - file_size: Optional[int] = None file_sha1: Optional[str] = None - updated_at: Optional[datetime] = None - created_at: Optional[datetime] = None metadata: Optional[Dict[str, str]] = None - brain_ids: list[UUID] + user_id: UUID + brains: List[Dict[str, Any]] + parent: Optional["Knowledge"] + children: Optional[list["Knowledge"]] + + +class KnowledgeUpdate(BaseModel): + file_name: Optional[str] = None + status: Optional[KnowledgeStatus] = None + url: Optional[str] = None + file_sha1: Optional[str] = None + extension: Optional[str] = None + parent_id: Optional[UUID] = None + source: Optional[str] = None + source_link: Optional[str] = None + metadata: Optional[Dict[str, str]] = None class KnowledgeDB(AsyncAttrs, SQLModel, table=True): @@ -49,23 +74,25 @@ class KnowledgeDB(AsyncAttrs, SQLModel, table=True): file_sha1: Optional[str] = Field( max_length=40 ) # FIXME: Should not be optional @chloedia - updated_at: datetime | None = Field( + created_at: datetime | None = Field( default=None, sa_column=Column( TIMESTAMP(timezone=False), server_default=text("CURRENT_TIMESTAMP"), ), ) - created_at: datetime | None = Field( + updated_at: datetime | None = Field( default=None, sa_column=Column( TIMESTAMP(timezone=False), server_default=text("CURRENT_TIMESTAMP"), + onupdate=datetime.utcnow, ), ) metadata_: 
Optional[Dict[str, str]] = Field( default=None, sa_column=Column("metadata", JSON) ) + is_folder: bool = Field(default=False) user_id: UUID = Field(foreign_key="users.id", nullable=False) brains: List["Brain"] = Relationship( back_populates="knowledges", @@ -73,10 +100,35 @@ class KnowledgeDB(AsyncAttrs, SQLModel, table=True): sa_relationship_kwargs={"lazy": "select"}, ) - async def to_dto(self) -> Knowledge: + parent_id: UUID | None = Field( + default=None, foreign_key="knowledge.id", ondelete="CASCADE" + ) + parent: Optional["KnowledgeDB"] = Relationship( + back_populates="children", + sa_relationship_kwargs={"remote_side": "KnowledgeDB.id"}, + ) + children: list["KnowledgeDB"] = Relationship( + back_populates="parent", + sa_relationship_kwargs={ + "cascade": "all, delete-orphan", + }, + ) + + # TODO: nested folder search + async def to_dto(self, get_children: bool = True) -> Knowledge: + assert ( + self.updated_at + ), "knowledge should be inserted before transforming to dto" + assert ( + self.created_at + ), "knowledge should be inserted before transforming to dto" brains = await self.awaitable_attrs.brains - size = self.file_size if self.file_size else 0 - sha1 = self.file_sha1 if self.file_sha1 else "" + children: list[KnowledgeDB] = ( + await self.awaitable_attrs.children if get_children else [] + ) + parent = await self.awaitable_attrs.parent + parent = await parent.to_dto(get_children=False) if parent else None + return Knowledge( id=self.id, # type: ignore file_name=self.file_name, @@ -85,10 +137,14 @@ async def to_dto(self) -> Knowledge: status=KnowledgeStatus(self.status), source=self.source, source_link=self.source_link, - file_size=size, - file_sha1=sha1, + is_folder=self.is_folder, + file_size=self.file_size or 0, + file_sha1=self.file_sha1, updated_at=self.updated_at, created_at=self.created_at, metadata=self.metadata_, # type: ignore - brain_ids=[brain.brain_id for brain in brains], + brains=[b.model_dump() for b in brains], + parent=parent, + 
children=[await c.to_dto(get_children=False) for c in children], + user_id=self.user_id, ) diff --git a/backend/api/quivr_api/modules/knowledge/repository/knowledges.py b/backend/api/quivr_api/modules/knowledge/repository/knowledges.py index 436e240610a..427b3be063f 100644 --- a/backend/api/quivr_api/modules/knowledge/repository/knowledges.py +++ b/backend/api/quivr_api/modules/knowledge/repository/knowledges.py @@ -1,9 +1,10 @@ -from typing import Sequence +from typing import Any, Sequence from uuid import UUID from fastapi import HTTPException from quivr_core.models import KnowledgeStatus from sqlalchemy.exc import IntegrityError, NoResultFound +from sqlalchemy.orm import joinedload from sqlmodel import select, text from sqlmodel.ext.asyncio.session import AsyncSession @@ -11,7 +12,15 @@ from quivr_api.modules.brain.entity.brain_entity import Brain from quivr_api.modules.dependencies import BaseRepository, get_supabase_client from quivr_api.modules.knowledge.dto.outputs import DeleteKnowledgeResponse -from quivr_api.modules.knowledge.entity.knowledge import KnowledgeDB +from quivr_api.modules.knowledge.entity.knowledge import ( + Knowledge, + KnowledgeDB, + KnowledgeUpdate, +) +from quivr_api.modules.knowledge.service.knowledge_exceptions import ( + KnowledgeNotFoundException, + KnowledgeUpdateError, +) logger = get_logger(__name__) @@ -22,7 +31,43 @@ def __init__(self, session: AsyncSession): supabase_client = get_supabase_client() self.db = supabase_client - async def insert_knowledge( + async def create_knowledge(self, knowledge: KnowledgeDB) -> KnowledgeDB: + try: + self.session.add(knowledge) + await self.session.commit() + await self.session.refresh(knowledge) + except IntegrityError: + await self.session.rollback() + raise + except Exception: + await self.session.rollback() + raise + return knowledge + + async def update_knowledge( + self, + knowledge: KnowledgeDB, + payload: Knowledge | KnowledgeUpdate | dict[str, Any], + ) -> KnowledgeDB: + try: + 
logger.debug(f"updating {knowledge.id} with payload {payload}") + if isinstance(payload, dict): + update_data = payload + else: + update_data = payload.model_dump(exclude_unset=True) + for field in update_data: + setattr(knowledge, field, update_data[field]) + + self.session.add(knowledge) + await self.session.commit() + await self.session.refresh(knowledge) + return knowledge + except IntegrityError as e: + await self.session.rollback() + logger.error(f"Error updating knowledge {e}") + raise KnowledgeUpdateError + + async def insert_knowledge_brain( self, knowledge: KnowledgeDB, brain_id: UUID ) -> KnowledgeDB: logger.debug(f"Inserting knowledge {knowledge}") @@ -69,6 +114,14 @@ async def remove_knowledge_from_brain( await self.session.refresh(knowledge) return knowledge + async def remove_knowledge(self, knowledge: KnowledgeDB) -> DeleteKnowledgeResponse: + assert knowledge.id + await self.session.delete(knowledge) + await self.session.commit() + return DeleteKnowledgeResponse( + status="deleted", knowledge_id=knowledge.id, file_name=knowledge.file_name + ) + async def remove_knowledge_by_id( self, knowledge_id: UUID ) -> DeleteKnowledgeResponse: @@ -126,14 +179,70 @@ async def get_knowledge_by_sha1(self, sha1: str) -> KnowledgeDB: return knowledge - async def get_knowledge_by_id(self, knowledge_id: UUID) -> KnowledgeDB: - query = select(KnowledgeDB).where(KnowledgeDB.id == knowledge_id) + async def get_all_children(self, parent_id: UUID) -> list[KnowledgeDB]: + query = text(""" + WITH RECURSIVE knowledge_tree AS ( + SELECT * + FROM knowledge + WHERE parent_id = :parent_id + UNION ALL + SELECT k.* + FROM knowledge k + JOIN knowledge_tree kt ON k.parent_id = kt.id + ) + SELECT * FROM knowledge_tree + """) + + result = await self.session.execute(query, params={"parent_id": parent_id}) + rows = result.fetchall() + knowledge_list = [] + for row in rows: + knowledge = KnowledgeDB( + id=row.id, + parent_id=row.parent_id, + file_name=row.file_name, + url=row.url, + 
extension=row.extension, + status=row.status, + source=row.source, + source_link=row.source_link, + file_size=row.file_size, + file_sha1=row.file_sha1, + created_at=row.created_at, + updated_at=row.updated_at, + metadata_=row.metadata, + is_folder=row.is_folder, + user_id=row.user_id, + ) + knowledge_list.append(knowledge) + + return knowledge_list + + async def get_root_knowledge_user(self, user_id: UUID) -> list[KnowledgeDB]: + query = ( + select(KnowledgeDB) + .where(KnowledgeDB.parent_id.is_(None)) # type: ignore + .where(KnowledgeDB.user_id == user_id) + .options(joinedload(KnowledgeDB.parent), joinedload(KnowledgeDB.children)) # type: ignore + ) result = await self.session.exec(query) - knowledge = result.first() + kms = result.unique().all() + return list(kms) + async def get_knowledge_by_id( + self, knowledge_id: UUID, user_id: UUID | None = None + ) -> KnowledgeDB: + query = ( + select(KnowledgeDB) + .where(KnowledgeDB.id == knowledge_id) + .options(joinedload(KnowledgeDB.parent), joinedload(KnowledgeDB.children)) # type: ignore + ) + if user_id: + query = query.where(KnowledgeDB.user_id == user_id) + result = await self.session.exec(query) + knowledge = result.first() if not knowledge: - raise NoResultFound("Knowledge not found") - + raise KnowledgeNotFoundException("Knowledge not found") return knowledge async def get_brain_by_id(self, brain_id: UUID) -> Brain: diff --git a/backend/api/quivr_api/modules/knowledge/repository/storage.py b/backend/api/quivr_api/modules/knowledge/repository/storage.py index 47120ba5bc5..0e58e25d9e4 100644 --- a/backend/api/quivr_api/modules/knowledge/repository/storage.py +++ b/backend/api/quivr_api/modules/knowledge/repository/storage.py @@ -1,29 +1,87 @@ +import mimetypes +from io import BufferedReader, FileIO + from quivr_api.logger import get_logger -from quivr_api.modules.dependencies import get_supabase_client +from quivr_api.modules.dependencies import get_supabase_async_client +from 
quivr_api.modules.knowledge.entity.knowledge import KnowledgeDB from quivr_api.modules.knowledge.repository.storage_interface import StorageInterface logger = get_logger(__name__) -class Storage(StorageInterface): +class SupabaseS3Storage(StorageInterface): def __init__(self): - supabase_client = get_supabase_client() - self.db = supabase_client + self.client = None - def upload_file(self, file_name: str): - """ - Upload file to storage - """ - self.db.storage.from_("quivr").download(file_name) + async def _set_client(self): + if self.client is None: + self.client = await get_supabase_async_client() + + def get_storage_path( + self, + knowledge: KnowledgeDB, + ) -> str: + if knowledge.id is None: + raise ValueError("knowledge should have a valid id") + return str(knowledge.id) + + async def upload_file_storage( + self, + knowledge: KnowledgeDB, + knowledge_data: FileIO | BufferedReader | bytes, + upsert: bool = False, + ): + await self._set_client() + assert self.client - def remove_file(self, file_name: str): + mime_type = "application/html" + if knowledge.file_name: + guessed_mime_type, _ = mimetypes.guess_type(knowledge.file_name) + mime_type = guessed_mime_type or mime_type + + storage_path = self.get_storage_path(knowledge) + logger.info( + f"Uploading file to s3://quivr/{storage_path} using supabase. 
upsert={upsert}, mimetype={mime_type}" + ) + + if upsert: + _ = await self.client.storage.from_("quivr").update( + storage_path, + knowledge_data, + file_options={ + "content-type": mime_type, + "upsert": "true", + "cache-control": "3600", + }, + ) + return storage_path + else: + # check if file sha1 is already in storage + try: + _ = await self.client.storage.from_("quivr").upload( + storage_path, + knowledge_data, + file_options={ + "content-type": mime_type, + "upsert": "false", + "cache-control": "3600", + }, + ) + return storage_path + + except Exception as e: + if "The resource already exists" in str(e) and not upsert: + raise FileExistsError(f"File {storage_path} already exists") + raise e + + async def remove_file(self, storage_path: str): """ Remove file from storage """ + await self._set_client() + assert self.client try: - response = self.db.storage.from_("quivr").remove([file_name]) + response = await self.client.storage.from_("quivr").remove([storage_path]) return response except Exception as e: logger.error(e) - # raise e - diff --git a/backend/api/quivr_api/modules/knowledge/repository/storage_interface.py b/backend/api/quivr_api/modules/knowledge/repository/storage_interface.py index 228c998276e..bd5a3debc03 100644 --- a/backend/api/quivr_api/modules/knowledge/repository/storage_interface.py +++ b/backend/api/quivr_api/modules/knowledge/repository/storage_interface.py @@ -1,10 +1,26 @@ from abc import ABC, abstractmethod +from io import BufferedReader, FileIO + +from quivr_api.modules.knowledge.entity.knowledge import KnowledgeDB class StorageInterface(ABC): @abstractmethod - def remove_file(self, file_name: str): - """ - Remove file from storage - """ + def get_storage_path( + self, + knowledge: KnowledgeDB, + ) -> str: + pass + + @abstractmethod + async def upload_file_storage( + self, + knowledge: KnowledgeDB, + knowledge_data: FileIO | BufferedReader | bytes, + upsert: bool = False, + ): + pass + + @abstractmethod + async def remove_file(self, 
storage_path: str): pass diff --git a/backend/api/quivr_api/modules/knowledge/service/knowledge_exceptions.py b/backend/api/quivr_api/modules/knowledge/service/knowledge_exceptions.py new file mode 100644 index 00000000000..c95cefa4558 --- /dev/null +++ b/backend/api/quivr_api/modules/knowledge/service/knowledge_exceptions.py @@ -0,0 +1,34 @@ +class KnowledgeException(Exception): + def __init__(self, message="A knowledge-related error occurred"): + self.message = message + super().__init__(self.message) + + +class UploadError(KnowledgeException): + def __init__(self, message="An error occurred while uploading"): + super().__init__(message) + + +class KnowledgeCreationError(KnowledgeException): + def __init__(self, message="An error occurred while creating"): + super().__init__(message) + + +class KnowledgeUpdateError(KnowledgeException): + def __init__(self, message="An error occurred while updating"): + super().__init__(message) + + +class KnowledgeDeleteError(KnowledgeException): + def __init__(self, message="An error occurred while deleting"): + super().__init__(message) + + +class KnowledgeForbiddenAccess(KnowledgeException): + def __init__(self, message="You do not have permission to access this knowledge."): + super().__init__(message) + + +class KnowledgeNotFoundException(KnowledgeException): + def __init__(self, message="The requested knowledge was not found"): + super().__init__(message) diff --git a/backend/api/quivr_api/modules/knowledge/service/knowledge_service.py b/backend/api/quivr_api/modules/knowledge/service/knowledge_service.py index 987bff7d1c8..cfc88884b98 100644 --- a/backend/api/quivr_api/modules/knowledge/service/knowledge_service.py +++ b/backend/api/quivr_api/modules/knowledge/service/knowledge_service.py @@ -1,18 +1,33 @@ -from typing import List +import asyncio +import io +from typing import Any, List from uuid import UUID +from fastapi import UploadFile from quivr_core.models import KnowledgeStatus from sqlalchemy.exc import 
NoResultFound from quivr_api.logger import get_logger from quivr_api.modules.dependencies import BaseService from quivr_api.modules.knowledge.dto.inputs import ( + AddKnowledge, CreateKnowledgeProperties, ) from quivr_api.modules.knowledge.dto.outputs import DeleteKnowledgeResponse -from quivr_api.modules.knowledge.entity.knowledge import Knowledge, KnowledgeDB +from quivr_api.modules.knowledge.entity.knowledge import ( + Knowledge, + KnowledgeDB, + KnowledgeSource, + KnowledgeUpdate, +) from quivr_api.modules.knowledge.repository.knowledges import KnowledgeRepository -from quivr_api.modules.knowledge.repository.storage import Storage +from quivr_api.modules.knowledge.repository.storage import SupabaseS3Storage +from quivr_api.modules.knowledge.repository.storage_interface import StorageInterface +from quivr_api.modules.knowledge.service.knowledge_exceptions import ( + KnowledgeDeleteError, + KnowledgeForbiddenAccess, + UploadError, +) from quivr_api.modules.sync.entity.sync_models import ( DBSyncFile, DownloadedSyncFile, @@ -26,9 +41,13 @@ class KnowledgeService(BaseService[KnowledgeRepository]): repository_cls = KnowledgeRepository - def __init__(self, repository: KnowledgeRepository): + def __init__( + self, + repository: KnowledgeRepository, + storage: StorageInterface = SupabaseS3Storage(), + ): self.repository = repository - self.storage = Storage() + self.storage = storage async def get_knowledge_sync(self, sync_id: int) -> Knowledge: km = await self.repository.get_knowledge_by_sync_id(sync_id) @@ -54,19 +73,37 @@ async def get_knowledge_storage_path( except NoResultFound: raise FileNotFoundError(f"No knowledge for file_name: {file_name}") - async def get_knowledge(self, knowledge_id: UUID) -> Knowledge: - inserted_knowledge_db_instance = await self.repository.get_knowledge_by_id( - knowledge_id - ) - assert inserted_knowledge_db_instance.id, "Knowledge ID not generated" - km = await inserted_knowledge_db_instance.to_dto() - return km + async def 
list_knowledge( + self, knowledge_id: UUID | None, user_id: UUID | None = None + ) -> list[KnowledgeDB]: + if knowledge_id is not None: + km = await self.repository.get_knowledge_by_id(knowledge_id, user_id) + return km.children + else: + if user_id is None: + raise KnowledgeForbiddenAccess( + "can't get root knowledges without user_id" + ) + return await self.repository.get_root_knowledge_user(user_id) + + async def get_knowledge( + self, knowledge_id: UUID, user_id: UUID | None = None + ) -> KnowledgeDB: + return await self.repository.get_knowledge_by_id(knowledge_id, user_id) + async def update_knowledge( + self, + knowledge: KnowledgeDB, + payload: Knowledge | KnowledgeUpdate | dict[str, Any], + ): + return await self.repository.update_knowledge(knowledge, payload) + + # TODO: Remove all of this # TODO (@aminediro): Replace with ON CONFLICT smarter query... # there is a chance of race condition but for now we let it crash in worker # the tasks will be dealt with on retry async def update_sha1_conflict( - self, knowledge: Knowledge, brain_id: UUID, file_sha1: str + self, knowledge: KnowledgeDB, brain_id: UUID, file_sha1: str ) -> bool: assert knowledge.id knowledge.file_sha1 = file_sha1 @@ -89,12 +126,12 @@ async def update_sha1_conflict( ) else: await self.repository.link_to_brain(existing_knowledge, brain_id) - await self.remove_knowledge(brain_id, knowledge.id) + await self.remove_knowledge_brain(brain_id, knowledge.id) return False else: logger.debug(f"Removing previous errored file {existing_knowledge.id}") assert existing_knowledge.id - await self.remove_knowledge(brain_id, existing_knowledge.id) + await self.remove_knowledge_brain(brain_id, existing_knowledge.id) await self.update_file_sha1_knowledge(knowledge.id, knowledge.file_sha1) return True except NoResultFound: @@ -104,7 +141,47 @@ async def update_sha1_conflict( await self.update_file_sha1_knowledge(knowledge.id, knowledge.file_sha1) return True - async def insert_knowledge( + async def 
create_knowledge( + self, + user_id: UUID, + knowledge_to_add: AddKnowledge, + upload_file: UploadFile | None = None, + ) -> KnowledgeDB: + knowledgedb = KnowledgeDB( + user_id=user_id, + file_name=knowledge_to_add.file_name, + is_folder=knowledge_to_add.is_folder, + url=knowledge_to_add.url, + extension=knowledge_to_add.extension, + source=knowledge_to_add.source, + source_link=knowledge_to_add.source_link, + file_size=upload_file.size if upload_file else 0, + metadata_=knowledge_to_add.metadata, # type: ignore + status=KnowledgeStatus.RESERVED, + parent_id=knowledge_to_add.parent_id, + ) + knowledge_db = await self.repository.create_knowledge(knowledgedb) + try: + if knowledgedb.source == KnowledgeSource.LOCAL and upload_file: + # NOTE(@aminediro): Unnecessary mem buffer because supabase doesnt accept FileIO.. + buff_reader = io.BufferedReader(upload_file.file) # type: ignore + storage_path = await self.storage.upload_file_storage( + knowledgedb, buff_reader + ) + knowledgedb.source_link = storage_path + knowledge_db = await self.repository.update_knowledge( + knowledge_db, + KnowledgeUpdate(status=KnowledgeStatus.UPLOADED), # type: ignore + ) + return knowledge_db + except Exception as e: + logger.exception( + f"Error uploading knowledge {knowledgedb.id} to storage : {e}" + ) + await self.repository.remove_knowledge(knowledge=knowledge_db) + raise UploadError() + + async def insert_knowledge_brain( self, user_id: UUID, knowledge_to_add: CreateKnowledgeProperties, # FIXME: (later) @Amine brain id should not be in CreateKnowledgeProperties but since storage is brain_id/file_name @@ -122,7 +199,7 @@ async def insert_knowledge( user_id=user_id, ) - knowledge_db = await self.repository.insert_knowledge( + knowledge_db = await self.repository.insert_knowledge_brain( knowledge, brain_id=knowledge_to_add.brain_id ) @@ -150,7 +227,7 @@ async def update_status_knowledge( assert isinstance(knowledge.file_name, str), "file_name should be a string" file_name_with_brain_id = 
f"{brain_id}/{knowledge.file_name}" try: - self.storage.remove_file(file_name_with_brain_id) + await self.storage.remove_file(file_name_with_brain_id) except Exception as e: logger.error( f"Error while removing file {file_name_with_brain_id}: {e}" @@ -161,29 +238,52 @@ async def update_status_knowledge( async def update_file_sha1_knowledge(self, knowledge_id: UUID, file_sha1: str): return await self.repository.update_file_sha1_knowledge(knowledge_id, file_sha1) - async def remove_knowledge( + async def remove_knowledge(self, knowledge: KnowledgeDB) -> DeleteKnowledgeResponse: + assert knowledge.id + + try: + # TODO: + # - Notion folders are special, they are themselves files and should be removed from storage + children = await self.repository.get_all_children(knowledge.id) + km_paths = [ + self.storage.get_storage_path(k) for k in children if not k.is_folder + ] + if not knowledge.is_folder: + km_paths.append(self.storage.get_storage_path(knowledge)) + + # recursively deletes files + deleted_km = await self.repository.remove_knowledge(knowledge) + await asyncio.gather(*[self.storage.remove_file(p) for p in km_paths]) + + return deleted_km + except Exception as e: + logger.error(f"Error while remove knowledge : {e}") + raise KnowledgeDeleteError + + async def remove_knowledge_brain( self, brain_id: UUID, knowledge_id: UUID, # FIXME: @amine when name in storage change no need for brain id ) -> DeleteKnowledgeResponse: # TODO: fix KMS # REDO ALL THIS - knowledge = await self.get_knowledge(knowledge_id) - if len(knowledge.brain_ids) > 1: + knowledge = await self.repository.get_knowledge_by_id(knowledge_id) + km_brains = await knowledge.awaitable_attrs.brains + if len(km_brains) > 1: km = await self.repository.remove_knowledge_from_brain( knowledge_id, brain_id ) + assert km.id return DeleteKnowledgeResponse(file_name=km.file_name, knowledge_id=km.id) else: message = await self.repository.remove_knowledge_by_id(knowledge_id) file_name_with_brain_id = 
f"{brain_id}/{message.file_name}" try: - self.storage.remove_file(file_name_with_brain_id) + await self.storage.remove_file(file_name_with_brain_id) except Exception as e: logger.error( f"Error while removing file {file_name_with_brain_id}: {e}" ) - return message async def remove_all_knowledges_from_brain(self, brain_id: UUID) -> None: @@ -210,7 +310,7 @@ async def update_or_create_knowledge_sync( # TODO: THIS IS A HACK!! Remove all of this if prev_sync_file: prev_knowledge = await self.get_knowledge_sync(sync_id=prev_sync_file.id) - if len(prev_knowledge.brain_ids) > 1: + if len(prev_knowledge.brains) > 1: await self.repository.remove_knowledge_from_brain( prev_knowledge.id, brain_id ) @@ -231,7 +331,7 @@ async def update_or_create_knowledge_sync( file_sha1=None, metadata={"sync_file_id": str(sync_id)}, ) - added_knowledge = await self.insert_knowledge( + added_knowledge = await self.insert_knowledge_brain( knowledge_to_add=knowledge_to_add, user_id=user_id ) return added_knowledge diff --git a/backend/api/quivr_api/modules/knowledge/tests/conftest.py b/backend/api/quivr_api/modules/knowledge/tests/conftest.py new file mode 100644 index 00000000000..2074110f6b5 --- /dev/null +++ b/backend/api/quivr_api/modules/knowledge/tests/conftest.py @@ -0,0 +1,67 @@ +from io import BufferedReader, FileIO + +from quivr_api.modules.knowledge.entity.knowledge import Knowledge, KnowledgeDB +from quivr_api.modules.knowledge.repository.storage_interface import StorageInterface + + +class ErrorStorage(StorageInterface): + async def upload_file_storage( + self, + knowledge: KnowledgeDB, + knowledge_data: FileIO | BufferedReader | bytes, + upsert: bool = False, + ): + raise SystemError + + def get_storage_path( + self, + knowledge: KnowledgeDB | Knowledge, + ) -> str: + if knowledge.id is None: + raise ValueError("knowledge should have a valid id") + return str(knowledge.id) + + async def remove_file(self, storage_path: str): + raise SystemError + + +class 
FakeStorage(StorageInterface): + def __init__(self): + self.storage = {} + + def get_storage_path( + self, + knowledge: KnowledgeDB | Knowledge, + ) -> str: + if knowledge.id is None: + raise ValueError("knowledge should have a valid id") + return str(knowledge.id) + + async def upload_file_storage( + self, + knowledge: KnowledgeDB, + knowledge_data: FileIO | BufferedReader | bytes, + upsert: bool = False, + ): + storage_path = f"{knowledge.id}" + if not upsert and storage_path in self.storage: + raise ValueError(f"File already exists at {storage_path}") + self.storage[storage_path] = knowledge_data + return storage_path + + async def remove_file(self, storage_path: str): + if storage_path not in self.storage: + raise FileNotFoundError(f"File not found at {storage_path}") + del self.storage[storage_path] + + # Additional helper methods for testing + def get_file(self, storage_path: str) -> FileIO | BufferedReader | bytes: + if storage_path not in self.storage: + raise FileNotFoundError(f"File not found at {storage_path}") + return self.storage[storage_path] + + def knowledge_exists(self, knowledge: KnowledgeDB | Knowledge) -> bool: + return self.get_storage_path(knowledge) in self.storage + + def clear_storage(self): + self.storage.clear() diff --git a/backend/api/quivr_api/modules/knowledge/tests/test_knowledge_controller.py b/backend/api/quivr_api/modules/knowledge/tests/test_knowledge_controller.py new file mode 100644 index 00000000000..cf6313e97a1 --- /dev/null +++ b/backend/api/quivr_api/modules/knowledge/tests/test_knowledge_controller.py @@ -0,0 +1,74 @@ +import json + +import pytest +import pytest_asyncio +from httpx import ASGITransport, AsyncClient +from sqlmodel import select +from sqlmodel.ext.asyncio.session import AsyncSession + +from quivr_api.main import app +from quivr_api.middlewares.auth.auth_bearer import get_current_user +from quivr_api.modules.knowledge.controller.knowledge_routes import get_km_service +from 
quivr_api.modules.knowledge.repository.knowledges import KnowledgeRepository +from quivr_api.modules.knowledge.service.knowledge_service import KnowledgeService +from quivr_api.modules.knowledge.tests.conftest import FakeStorage +from quivr_api.modules.user.entity.user_identity import User, UserIdentity + + +@pytest_asyncio.fixture(scope="function") +async def user(session: AsyncSession) -> User: + user_1 = ( + await session.exec(select(User).where(User.email == "admin@quivr.app")) + ).one() + assert user_1.id + return user_1 + + +@pytest_asyncio.fixture(scope="function") +async def test_client(session: AsyncSession, user: User): + def default_current_user() -> UserIdentity: + assert user.id + return UserIdentity(email=user.email, id=user.id) + + async def test_service(): + storage = FakeStorage() + repository = KnowledgeRepository(session) + return KnowledgeService(repository, storage) + + app.dependency_overrides[get_current_user] = default_current_user + app.dependency_overrides[get_km_service] = test_service + # app.dependency_overrides[get_async_session] = lambda: session + + async with AsyncClient( + transport=ASGITransport(app=app), base_url="http://test" + ) as ac: + yield ac + app.dependency_overrides = {} + + +@pytest.mark.asyncio(loop_scope="session") +async def test_post_knowledge(test_client: AsyncClient): + km_data = { + "file_name": "test_file.txt", + "source": "local", + "is_folder": False, + "parent_id": None, + } + + multipart_data = { + "knowledge_data": (None, json.dumps(km_data), "application/json"), + "file": ("test_file.txt", b"Test file content", "application/octet-stream"), + } + + response = await test_client.post( + "/knowledge/", + files=multipart_data, + ) + + assert response.status_code == 200 + + +@pytest.mark.asyncio(loop_scope="session") +async def test_add_knowledge_invalid_input(test_client): + response = await test_client.post("/knowledge/", files={}) + assert response.status_code == 422 diff --git 
a/backend/api/quivr_api/modules/knowledge/tests/test_knowledge_entity.py b/backend/api/quivr_api/modules/knowledge/tests/test_knowledge_entity.py new file mode 100644 index 00000000000..7376559ebc3 --- /dev/null +++ b/backend/api/quivr_api/modules/knowledge/tests/test_knowledge_entity.py @@ -0,0 +1,229 @@ +from typing import List, Tuple +from uuid import uuid4 + +import pytest +import pytest_asyncio +from quivr_core.models import KnowledgeStatus +from sqlmodel import select, text +from sqlmodel.ext.asyncio.session import AsyncSession + +from quivr_api.modules.brain.entity.brain_entity import Brain, BrainType +from quivr_api.modules.knowledge.entity.knowledge import KnowledgeDB +from quivr_api.modules.user.entity.user_identity import User + +TestData = Tuple[Brain, List[KnowledgeDB]] + + +@pytest_asyncio.fixture(scope="function") +async def other_user(session: AsyncSession): + sql = text( + """ + INSERT INTO "auth"."users" ("instance_id", "id", "aud", "role", "email", "encrypted_password", "email_confirmed_at", "invited_at", "confirmation_token", "confirmation_sent_at", "recovery_token", "recovery_sent_at", "email_change_token_new", "email_change", "email_change_sent_at", "last_sign_in_at", "raw_app_meta_data", "raw_user_meta_data", "is_super_admin", "created_at", "updated_at", "phone", "phone_confirmed_at", "phone_change", "phone_change_token", "phone_change_sent_at", "email_change_token_current", "email_change_confirm_status", "banned_until", "reauthentication_token", "reauthentication_sent_at", "is_sso_user", "deleted_at") VALUES + ('00000000-0000-0000-0000-000000000000', :id , 'authenticated', 'authenticated', 'other@quivr.app', '$2a$10$vwKX0eMLlrOZvxQEA3Vl4e5V4/hOuxPjGYn9QK1yqeaZxa.42Uhze', '2024-01-22 22:27:00.166861+00', NULL, '', NULL, 'e91d41043ca2c83c3be5a6ee7a4abc8a4f4fb1afc0a8453c502af931', '2024-03-05 16:22:13.780421+00', '', '', NULL, '2024-03-30 23:21:12.077887+00', '{"provider": "email", "providers": ["email"]}', '{}', NULL, '2024-01-22 
22:27:00.158026+00', '2024-04-01 17:40:15.332205+00', NULL, NULL, '', '', NULL, '', 0, NULL, '', NULL, false, NULL); + """ + ) + await session.execute(sql, params={"id": uuid4()}) + + other_user = ( + await session.exec(select(User).where(User.email == "other@quivr.app")) + ).one() + return other_user + + +@pytest_asyncio.fixture(scope="function") +async def user(session): + user_1 = ( + await session.exec(select(User).where(User.email == "admin@quivr.app")) + ).one() + return user_1 + + +@pytest_asyncio.fixture(scope="function") +async def brain(session): + brain_1 = Brain( + name="test_brain", + description="this is a test brain", + brain_type=BrainType.integration, + ) + session.add(brain_1) + await session.commit() + return brain_1 + + +@pytest_asyncio.fixture(scope="function") +async def folder(session, user): + folder = KnowledgeDB( + file_name="folder_1", + extension="", + status="UPLOADED", + source="local", + source_link="local", + file_size=4, + file_sha1=None, + brains=[], + children=[], + user_id=user.id, + is_folder=True, + ) + + session.add(folder) + await session.commit() + await session.refresh(folder) + return folder + + +@pytest.mark.asyncio(loop_scope="session") +async def test_knowledge_default_file(session, folder, user): + km = KnowledgeDB( + file_name="test_file_1.txt", + extension=".txt", + status="UPLOADED", + source="test_source", + source_link="test_source_link", + file_size=100, + file_sha1="test_sha1", + brains=[], + user_id=user.id, + parent_id=folder.id, + ) + session.add(km) + await session.commit() + await session.refresh(km) + + assert not km.is_folder + + +@pytest.mark.asyncio(loop_scope="session") +async def test_knowledge_parent(session: AsyncSession, user: User): + assert user.id + + km = KnowledgeDB( + file_name="test_file_1.txt", + extension=".txt", + status="UPLOADED", + source="test_source", + source_link="test_source_link", + file_size=100, + file_sha1="test_sha1", + brains=[], + user_id=user.id, + ) + + folder = 
KnowledgeDB( + file_name="folder_1", + extension="", + is_folder=True, + status="UPLOADED", + source="local", + source_link="local", + file_size=-1, + file_sha1=None, + brains=[], + children=[km], + user_id=user.id, + ) + + session.add(folder) + await session.commit() + await session.refresh(folder) + await session.refresh(km) + + parent = await km.awaitable_attrs.parent + assert km.parent_id == folder.id, "parent_id isn't set to folder id" + assert parent.id == folder.id, "parent_id isn't set to folder id" + assert parent.is_folder + + query = select(KnowledgeDB).where(KnowledgeDB.id == folder.id) + folder = (await session.exec(query)).first() + assert folder + + children = await folder.awaitable_attrs.children + assert len(children) > 0 + + assert children[0].id == km.id + + +@pytest.mark.asyncio(loop_scope="session") +async def test_knowledge_remove_folder_cascade( + session: AsyncSession, + folder: KnowledgeDB, + user, +): + km = KnowledgeDB( + file_name="test_file_1.txt", + extension=".txt", + status="UPLOADED", + source="test_source", + source_link="test_source_link", + file_size=100, + file_sha1="test_sha1", + brains=[], + user_id=user.id, + parent_id=folder.id, + ) + session.add(km) + await session.commit() + await session.refresh(km) + + # Check all removed + await session.delete(folder) + await session.commit() + + statement = select(KnowledgeDB) + results = (await session.exec(statement)).all() + assert results == [] + + +@pytest.mark.asyncio(loop_scope="session") +async def test_knowledge_dto(session, user, brain): + # add folder in brain + folder = KnowledgeDB( + file_name="folder_1", + extension="", + status="UPLOADED", + source="local", + source_link="local", + file_size=4, + file_sha1=None, + brains=[brain], + children=[], + user_id=user.id, + is_folder=True, + ) + km = KnowledgeDB( + file_name="test_file_1.txt", + extension=".txt", + status="UPLOADED", + source="test_source", + source_link="test_source_link", + file_size=100, + 
file_sha1="test_sha1",
+ user_id=user.id,
+ brains=[brain],
+ parent=folder,
+ )
+ session.add(km)
+ session.add(km)
+ await session.commit()
+ await session.refresh(km)
+
+ km_dto = await km.to_dto()
+
+ assert km_dto.file_name == km.file_name
+ assert km_dto.url == km.url
+ assert km_dto.extension == km.extension
+ assert km_dto.status == KnowledgeStatus(km.status)
+ assert km_dto.source == km.source
+ assert km_dto.source_link == km.source_link
+ assert km_dto.is_folder == km.is_folder
+ assert km_dto.file_size == km.file_size
+ assert km_dto.file_sha1 == km.file_sha1
+ assert km_dto.updated_at == km.updated_at
+ assert km_dto.created_at == km.created_at
+ assert km_dto.metadata == km.metadata_ # type: ignore
+ assert km_dto.parent
+ assert km_dto.parent.id == folder.id
+
+ folder_dto = await folder.to_dto()
+ assert folder_dto.brains[0] == brain.model_dump()
+ assert folder_dto.children == [await km.to_dto()] diff --git a/backend/api/quivr_api/modules/knowledge/tests/test_knowledge_service.py b/backend/api/quivr_api/modules/knowledge/tests/test_knowledge_service.py new file mode 100644 index 00000000000..169b9bef2ca --- /dev/null +++ b/backend/api/quivr_api/modules/knowledge/tests/test_knowledge_service.py @@ -0,0 +1,1019 @@ +import os +from io import BytesIO +from typing import List, Tuple +from uuid import uuid4 + +import pytest +import pytest_asyncio +from fastapi import UploadFile +from sqlalchemy.exc import NoResultFound +from sqlmodel import select, text +from sqlmodel.ext.asyncio.session import AsyncSession + +from quivr_api.modules.brain.entity.brain_entity import Brain, BrainType +from quivr_api.modules.knowledge.dto.inputs import AddKnowledge, KnowledgeStatus +from quivr_api.modules.knowledge.entity.knowledge import KnowledgeDB, KnowledgeUpdate +from quivr_api.modules.knowledge.entity.knowledge_brain import KnowledgeBrain +from quivr_api.modules.knowledge.repository.knowledges import KnowledgeRepository +from 
quivr_api.modules.knowledge.service.knowledge_exceptions import ( + KnowledgeNotFoundException, + KnowledgeUpdateError, + UploadError, +) +from quivr_api.modules.knowledge.service.knowledge_service import KnowledgeService +from quivr_api.modules.knowledge.tests.conftest import ErrorStorage, FakeStorage +from quivr_api.modules.upload.service.upload_file import upload_file_storage +from quivr_api.modules.user.entity.user_identity import User +from quivr_api.modules.vector.entity.vector import Vector + +TestData = Tuple[Brain, List[KnowledgeDB]] + + +@pytest_asyncio.fixture(scope="function") +async def other_user(session: AsyncSession): + sql = text( + """ + INSERT INTO "auth"."users" ("instance_id", "id", "aud", "role", "email", "encrypted_password", "email_confirmed_at", "invited_at", "confirmation_token", "confirmation_sent_at", "recovery_token", "recovery_sent_at", "email_change_token_new", "email_change", "email_change_sent_at", "last_sign_in_at", "raw_app_meta_data", "raw_user_meta_data", "is_super_admin", "created_at", "updated_at", "phone", "phone_confirmed_at", "phone_change", "phone_change_token", "phone_change_sent_at", "email_change_token_current", "email_change_confirm_status", "banned_until", "reauthentication_token", "reauthentication_sent_at", "is_sso_user", "deleted_at") VALUES + ('00000000-0000-0000-0000-000000000000', :id , 'authenticated', 'authenticated', 'other@quivr.app', '$2a$10$vwKX0eMLlrOZvxQEA3Vl4e5V4/hOuxPjGYn9QK1yqeaZxa.42Uhze', '2024-01-22 22:27:00.166861+00', NULL, '', NULL, 'e91d41043ca2c83c3be5a6ee7a4abc8a4f4fb1afc0a8453c502af931', '2024-03-05 16:22:13.780421+00', '', '', NULL, '2024-03-30 23:21:12.077887+00', '{"provider": "email", "providers": ["email"]}', '{}', NULL, '2024-01-22 22:27:00.158026+00', '2024-04-01 17:40:15.332205+00', NULL, NULL, '', '', NULL, '', 0, NULL, '', NULL, false, NULL); + """ + ) + await session.execute(sql, params={"id": uuid4()}) + + other_user = ( + await session.exec(select(User).where(User.email == 
"other@quivr.app")) + ).one() + return other_user + + +@pytest_asyncio.fixture(scope="function") +async def user(session: AsyncSession) -> User: + user_1 = ( + await session.exec(select(User).where(User.email == "admin@quivr.app")) + ).one() + assert user_1.id + return user_1 + + +@pytest_asyncio.fixture(scope="function") +async def test_data(session: AsyncSession) -> TestData: + user_1 = ( + await session.exec(select(User).where(User.email == "admin@quivr.app")) + ).one() + assert user_1.id + # Brain data + brain_1 = Brain( + name="test_brain", + description="this is a test brain", + brain_type=BrainType.integration, + ) + + knowledge_brain_1 = KnowledgeDB( + file_name="test_file_1.txt", + extension=".txt", + status="UPLOADED", + source="test_source", + source_link="test_source_link", + file_size=100, + file_sha1="test_sha1", + brains=[brain_1], + user_id=user_1.id, + ) + + knowledge_brain_2 = KnowledgeDB( + file_name="test_file_2.txt", + extension=".txt", + status="UPLOADED", + source="test_source", + source_link="test_source_link", + file_size=100, + file_sha1="test_sha2", + brains=[], + user_id=user_1.id, + ) + + session.add(brain_1) + session.add(knowledge_brain_1) + session.add(knowledge_brain_2) + await session.commit() + return brain_1, [knowledge_brain_1, knowledge_brain_2] + + +@pytest_asyncio.fixture(scope="function") +async def folder_km_nested(session: AsyncSession, user: User): + assert user.id + + nested_folder = KnowledgeDB( + file_name="folder_1", + extension="", + status="UPLOADED", + source="local", + source_link="local", + file_size=4, + file_sha1=None, + brains=[], + children=[], + user_id=user.id, + is_folder=True, + ) + folder = KnowledgeDB( + file_name="folder_2", + extension="", + status="UPLOADED", + source="local", + source_link="local", + file_size=4, + file_sha1=None, + brains=[], + children=[], + user_id=user.id, + is_folder=True, + parent=nested_folder, + ) + + knowledge_folder = KnowledgeDB( + file_name="file.txt", + 
extension=".txt", + status="UPLOADED", + source="test_source", + source_link="test_source_link", + file_size=100, + file_sha1="test_sha2", + brains=[], + user_id=user.id, + parent=folder, + ) + + session.add(nested_folder) + session.add(folder) + session.add(knowledge_folder) + await session.commit() + await session.refresh(folder) + return nested_folder + + +@pytest_asyncio.fixture(scope="function") +async def folder_km(session: AsyncSession, user: User): + assert user.id + folder = KnowledgeDB( + file_name="folder_1", + extension="", + status="UPLOADED", + source="local", + source_link="local", + file_size=4, + file_sha1=None, + brains=[], + children=[], + user_id=user.id, + is_folder=True, + ) + + knowledge_folder = KnowledgeDB( + file_name="file.txt", + extension=".txt", + status="UPLOADED", + source="test_source", + source_link="test_source_link", + file_size=100, + file_sha1="test_sha2", + brains=[], + user_id=user.id, + parent=folder, + ) + + session.add(folder) + session.add(knowledge_folder) + await session.commit() + await session.refresh(folder) + return folder + + +@pytest.mark.asyncio(loop_scope="session") +async def test_updates_knowledge_status(session: AsyncSession, test_data: TestData): + brain, knowledges = test_data + assert brain.brain_id + assert knowledges[0].id + repo = KnowledgeRepository(session) + await repo.update_status_knowledge(knowledges[0].id, KnowledgeStatus.ERROR) + knowledge = await repo.get_knowledge_by_id(knowledges[0].id) + assert knowledge.status == KnowledgeStatus.ERROR + + +@pytest.mark.asyncio(loop_scope="session") +async def test_updates_knowledge_status_no_knowledge( + session: AsyncSession, test_data: TestData +): + brain, knowledges = test_data + assert brain.brain_id + assert knowledges[0].id + repo = KnowledgeRepository(session) + with pytest.raises(NoResultFound): + await repo.update_status_knowledge(uuid4(), KnowledgeStatus.UPLOADED) + + +@pytest.mark.asyncio(loop_scope="session") +async def 
test_update_knowledge_source_link(session: AsyncSession, test_data: TestData): + brain, knowledges = test_data + assert brain.brain_id + assert knowledges[0].id + repo = KnowledgeRepository(session) + await repo.update_source_link_knowledge(knowledges[0].id, "new_source_link") + knowledge = await repo.get_knowledge_by_id(knowledges[0].id) + assert knowledge.source_link == "new_source_link" + + +@pytest.mark.asyncio(loop_scope="session") +async def test_remove_knowledge_from_brain(session: AsyncSession, test_data: TestData): + brain, knowledges = test_data + assert brain.brain_id + assert knowledges[0].id + repo = KnowledgeRepository(session) + knowledge = await repo.remove_knowledge_from_brain(knowledges[0].id, brain.brain_id) + assert brain.brain_id not in [ + b.brain_id for b in await knowledge.awaitable_attrs.brains + ] + + +@pytest.mark.asyncio(loop_scope="session") +async def test_cascade_remove_knowledge_by_id( + session: AsyncSession, test_data: TestData +): + brain, knowledges = test_data + assert brain.brain_id + assert knowledges[0].id + repo = KnowledgeRepository(session) + await repo.remove_knowledge_by_id(knowledges[0].id) + with pytest.raises(KnowledgeNotFoundException): + await repo.get_knowledge_by_id(knowledges[0].id) + + query = select(KnowledgeBrain).where( + KnowledgeBrain.knowledge_id == knowledges[0].id + ) + result = await session.exec(query) + knowledge_brain = result.first() + assert knowledge_brain is None + + query = select(Vector).where(Vector.knowledge_id == knowledges[0].id) + result = await session.exec(query) + vector = result.first() + assert vector is None + + +@pytest.mark.asyncio(loop_scope="session") +async def test_remove_all_knowledges_from_brain( + session: AsyncSession, test_data: TestData +): + brain, knowledges = test_data + assert brain.brain_id + + # supabase_client = get_supabase_client() + # db = supabase_client + # storage = db.storage.from_("quivr") + + # storage.upload(f"{brain.brain_id}/test_file_1", 
b"test_content") + + repo = KnowledgeRepository(session) + service = KnowledgeService(repo) + await repo.remove_all_knowledges_from_brain(brain.brain_id) + knowledges = await service.get_all_knowledge_in_brain(brain.brain_id) + assert len(knowledges) == 0 + + # response = storage.list(path=f"{brain.brain_id}") + # assert response == [] + # FIXME @aminediro &chloedia raise an error when trying to interact with storage UnboundLocalError: cannot access local variable 'response' where it is not associated with a value + + +@pytest.mark.asyncio(loop_scope="session") +async def test_duplicate_sha1_knowledge_same_user( + session: AsyncSession, test_data: TestData +): + brain, [existing_knowledge, _] = test_data + assert brain.brain_id + assert existing_knowledge.id + assert existing_knowledge.file_sha1 + repo = KnowledgeRepository(session) + knowledge = KnowledgeDB( + file_name="test_file_2", + extension="txt", + status="UPLOADED", + source="test_source", + source_link="test_source_link", + file_size=100, + file_sha1=existing_knowledge.file_sha1, + brains=[brain], + user_id=existing_knowledge.user_id, + ) + + await repo.insert_knowledge_brain(knowledge, brain.brain_id) + + +@pytest.mark.asyncio(loop_scope="session") +async def test_duplicate_sha1_knowledge_diff_user( + session: AsyncSession, test_data: TestData, other_user: User +): + brain, knowledges = test_data + assert other_user.id + assert brain.brain_id + assert knowledges[0].id + repo = KnowledgeRepository(session) + knowledge = KnowledgeDB( + file_name="test_file_2", + extension="txt", + status="UPLOADED", + source="test_source", + source_link="test_source_link", + file_size=100, + file_sha1=knowledges[0].file_sha1, + brains=[brain], + user_id=other_user.id, # random user id + ) + + result = await repo.insert_knowledge_brain(knowledge, brain.brain_id) + assert result + + +@pytest.mark.asyncio(loop_scope="session") +async def test_add_knowledge_to_brain(session: AsyncSession, test_data: TestData): + brain, 
knowledges = test_data + assert brain.brain_id + assert knowledges[1].id + repo = KnowledgeRepository(session) + await repo.link_to_brain(knowledges[1], brain.brain_id) + knowledge = await repo.get_knowledge_by_id(knowledges[1].id) + brains_of_knowledge = [b.brain_id for b in await knowledge.awaitable_attrs.brains] + assert brain.brain_id in brains_of_knowledge + + query = select(KnowledgeBrain).where( + KnowledgeBrain.knowledge_id == knowledges[0].id + and KnowledgeBrain.brain_id == brain.brain_id + ) + result = await session.exec(query) + knowledge_brain = result.first() + assert knowledge_brain + + +# Knowledge Service +@pytest.mark.asyncio(loop_scope="session") +async def test_get_knowledge_in_brain(session: AsyncSession, test_data: TestData): + brain, knowledges = test_data + assert brain.brain_id + repo = KnowledgeRepository(session) + service = KnowledgeService(repo) + list_knowledge = await service.get_all_knowledge_in_brain(brain.brain_id) + assert len(list_knowledge) == 1 + brains_of_knowledge = [ + b.brain_id for b in await knowledges[0].awaitable_attrs.brains + ] + assert list_knowledge[0].id == knowledges[0].id + assert list_knowledge[0].file_name == knowledges[0].file_name + assert brain.brain_id in brains_of_knowledge + + +@pytest.mark.asyncio(loop_scope="session") +async def test_should_process_knowledge_exists( + session: AsyncSession, test_data: TestData +): + brain, [existing_knowledge, _] = test_data + assert brain.brain_id + new = KnowledgeDB( + file_name="new", + extension="txt", + status="PROCESSING", + source="test_source", + source_link="test_source_link", + file_size=100, + file_sha1=None, + brains=[brain], + user_id=existing_knowledge.user_id, + ) + session.add(new) + await session.commit() + await session.refresh(new) + repo = KnowledgeRepository(session) + service = KnowledgeService(repo) + assert existing_knowledge.file_sha1 + with pytest.raises(FileExistsError): + await service.update_sha1_conflict( + new, brain.brain_id, 
file_sha1=existing_knowledge.file_sha1 + ) + + +@pytest.mark.asyncio(loop_scope="session") +async def test_should_process_knowledge_link_brain( + session: AsyncSession, test_data: TestData +): + repo = KnowledgeRepository(session) + service = KnowledgeService(repo) + brain, [existing_knowledge, _] = test_data + user_id = existing_knowledge.user_id + assert brain.brain_id + prev = KnowledgeDB( + file_name="prev", + extension=".txt", + status=KnowledgeStatus.UPLOADED, + source="test_source", + source_link="test_source_link", + file_size=100, + file_sha1="test1", + brains=[brain], + user_id=user_id, + ) + brain_2 = Brain( + name="test_brain", + description="this is a test brain", + brain_type=BrainType.integration, + ) + session.add(brain_2) + session.add(prev) + await session.commit() + await session.refresh(prev) + await session.refresh(brain_2) + + assert prev.id + assert brain_2.brain_id + + new = KnowledgeDB( + file_name="new", + extension="txt", + status="PROCESSING", + source="test_source", + source_link="test_source_link", + file_size=100, + file_sha1=None, + brains=[brain_2], + user_id=user_id, + ) + session.add(new) + await session.commit() + await session.refresh(new) + + incoming_knowledge = await new.to_dto() + assert prev.file_sha1 + + should_process = await service.update_sha1_conflict( + incoming_knowledge, brain_2.brain_id, file_sha1=prev.file_sha1 + ) + assert not should_process + + # Check prev knowledge was linked + assert incoming_knowledge.file_sha1 + prev_knowledge = await service.repository.get_knowledge_by_id(prev.id) + prev_brains = await prev_knowledge.awaitable_attrs.brains + assert {b.brain_id for b in prev_brains} == { + brain.brain_id, + brain_2.brain_id, + } + # Check new knowledge was removed + assert new.id + with pytest.raises(KnowledgeNotFoundException): + await service.repository.get_knowledge_by_id(new.id) + + +@pytest.mark.asyncio(loop_scope="session") +async def test_should_process_knowledge_prev_error( + session: AsyncSession, 
test_data: TestData +): + repo = KnowledgeRepository(session) + service = KnowledgeService(repo) + brain, [existing_knowledge, _] = test_data + user_id = existing_knowledge.user_id + assert brain.brain_id + prev = KnowledgeDB( + file_name="prev", + extension="txt", + status=KnowledgeStatus.ERROR, + source="test_source", + source_link="test_source_link", + file_size=100, + file_sha1="test1", + brains=[brain], + user_id=user_id, + ) + session.add(prev) + await session.commit() + await session.refresh(prev) + + assert prev.id + + new = KnowledgeDB( + file_name="new", + extension="txt", + status="PROCESSING", + source="test_source", + source_link="test_source_link", + file_size=100, + file_sha1=None, + brains=[brain], + user_id=user_id, + ) + session.add(new) + await session.commit() + await session.refresh(new) + + incoming_knowledge = await new.to_dto() + assert prev.file_sha1 + should_process = await service.update_sha1_conflict( + incoming_knowledge, brain.brain_id, file_sha1=prev.file_sha1 + ) + + # Checks we should process this file + assert should_process + # Previous errored file is cleaned up + with pytest.raises(KnowledgeNotFoundException): + await service.repository.get_knowledge_by_id(prev.id) + + assert new.id + new = await service.repository.get_knowledge_by_id(new.id) + assert new.file_sha1 + + +@pytest.mark.asyncio(loop_scope="session") +async def test_get_knowledge_storage_path(session: AsyncSession, test_data: TestData): + _, [knowledge, _] = test_data + assert knowledge.file_name + repository = KnowledgeRepository(session) + service = KnowledgeService(repository) + brain_2 = Brain( + name="test_brain", + description="this is a test brain", + brain_type=BrainType.integration, + ) + session.add(brain_2) + await session.commit() + await session.refresh(brain_2) + assert brain_2.brain_id + km_data = os.urandom(128) + km_path = f"{str(knowledge.brains[0].brain_id)}/{knowledge.file_name}" + await upload_file_storage(km_data, km_path) + # Link knowledge to 
two brains + await repository.link_to_brain(knowledge, brain_2.brain_id) + storage_path = await service.get_knowledge_storage_path( + knowledge.file_name, brain_2.brain_id + ) + assert storage_path == km_path + + +@pytest.mark.asyncio(loop_scope="session") +async def test_create_knowledge_file(session: AsyncSession, user: User): + assert user.id + storage = FakeStorage() + repository = KnowledgeRepository(session) + service = KnowledgeService(repository, storage) + + km_to_add = AddKnowledge( + file_name="test", + source="local", + is_folder=False, + parent_id=None, + ) + km_data = BytesIO(os.urandom(128)) + + km = await service.create_knowledge( + user_id=user.id, + knowledge_to_add=km_to_add, + upload_file=UploadFile(file=km_data, size=128, filename=km_to_add.file_name), + ) + + assert km.file_name == km_to_add.file_name + assert km.id + assert km.status == KnowledgeStatus.UPLOADED + assert not km.is_folder + # km in storage + storage.knowledge_exists(km) + + +@pytest.mark.asyncio(loop_scope="session") +async def test_create_knowledge_folder(session: AsyncSession, user: User): + assert user.id + storage = FakeStorage() + repository = KnowledgeRepository(session) + service = KnowledgeService(repository, storage) + + km_to_add = AddKnowledge( + file_name="test", + source="local", + is_folder=True, + parent_id=None, + ) + km_data = BytesIO(os.urandom(128)) + + km = await service.create_knowledge( + user_id=user.id, + knowledge_to_add=km_to_add, + upload_file=UploadFile(file=km_data, size=128, filename=km_to_add.file_name), + ) + + assert km.file_name == km_to_add.file_name + assert km.id + # Knowledge properties + assert km.file_name == km_to_add.file_name + assert km.is_folder == km_to_add.is_folder + assert km.url == km_to_add.url + assert km.extension == km_to_add.extension + assert km.source == km_to_add.source + assert km.file_size == 128 + assert km.metadata_ == km_to_add.metadata + assert km.is_folder == km_to_add.is_folder + assert km.status == 
KnowledgeStatus.UPLOADED + # Knowledge was saved + assert storage.knowledge_exists(km) + + +@pytest.mark.asyncio(loop_scope="session") +async def test_create_knowledge_upload_error(session: AsyncSession, user: User): + assert user.id + storage = ErrorStorage() + repository = KnowledgeRepository(session) + service = KnowledgeService(repository, storage) + + km_to_add = AddKnowledge( + file_name="test", + source="local", + is_folder=True, + parent_id=None, + ) + km_data = BytesIO(os.urandom(128)) + + with pytest.raises(UploadError): + await service.create_knowledge( + user_id=user.id, + knowledge_to_add=km_to_add, + upload_file=UploadFile( + file=km_data, size=128, filename=km_to_add.file_name + ), + ) + # Check removed knowledge + statement = select(KnowledgeDB) + results = (await session.exec(statement)).all() + assert results == [] + + +@pytest.mark.asyncio(loop_scope="session") +async def test_get_knowledge(session: AsyncSession, folder_km: KnowledgeDB, user: User): + assert user.id + assert folder_km.id + storage = ErrorStorage() + repository = KnowledgeRepository(session) + service = KnowledgeService(repository, storage) + + result = await service.get_knowledge(folder_km.id) + assert result.id == folder_km.id + assert result.children + assert len(result.children) > 0 + assert result.children[0] == folder_km.children[0] + + +@pytest.mark.asyncio(loop_scope="session") +async def test_get_knowledge_nested( + session: AsyncSession, folder_km_nested: KnowledgeDB, user: User +): + assert user.id + assert folder_km_nested.id + storage = ErrorStorage() + repository = KnowledgeRepository(session) + service = KnowledgeService(repository, storage) + + result = await service.get_knowledge(folder_km_nested.id) + assert result.id == folder_km_nested.id + assert result.children + assert len(result.children) > 0 + assert result.children[0].is_folder + assert result.children[0] == folder_km_nested.children[0] + + +@pytest.mark.asyncio(loop_scope="session") +async def 
test_update_knowledge_rename( + session: AsyncSession, folder_km: KnowledgeDB, user: User +): + assert user.id + assert folder_km.id + storage = ErrorStorage() + repository = KnowledgeRepository(session) + service = KnowledgeService(repository, storage) + + new_km = await service.update_knowledge( + folder_km, + KnowledgeUpdate(file_name="change_name"), # type: ignore + ) + assert new_km.file_name == "change_name" + + +@pytest.mark.asyncio(loop_scope="session") +async def test_update_knowledge_move( + session: AsyncSession, folder_km: KnowledgeDB, user: User +): + assert user.id + assert folder_km.id + folder_2 = KnowledgeDB( + file_name="folder_2", + extension="", + status="UPLOADED", + source="local", + source_link="local", + file_size=4, + file_sha1=None, + brains=[], + children=[], + user_id=user.id, + is_folder=True, + ) + session.add(folder_2) + await session.commit() + await session.refresh(folder_2) + + storage = FakeStorage() + repository = KnowledgeRepository(session) + service = KnowledgeService(repository, storage) + + new_km = await service.update_knowledge( + folder_km, + KnowledgeUpdate(parent_id=folder_2.id), # type: ignore + ) + assert new_km.parent_id == folder_2.id + + +@pytest.mark.asyncio(loop_scope="session") +async def test_update_knowledge_move_error(session: AsyncSession, user: User): + assert user.id + file_1 = KnowledgeDB( + file_name="file_1", + extension="", + status="UPLOADED", + source="local", + source_link="local", + file_size=4, + file_sha1=None, + brains=[], + children=[], + user_id=user.id, + is_folder=False, + ) + file_2 = KnowledgeDB( + file_name="file_2", + extension="", + status="UPLOADED", + source="local", + source_link="local", + file_size=4, + file_sha1=None, + brains=[], + children=[], + user_id=user.id, + is_folder=False, + ) + session.add(file_1) + session.add(file_2) + await session.commit() + await session.refresh(file_1) + await session.refresh(file_2) + + storage = FakeStorage() + repository = 
KnowledgeRepository(session) + service = KnowledgeService(repository, storage) + + with pytest.raises(KnowledgeUpdateError): + await service.update_knowledge( + file_2, + KnowledgeUpdate(parent_id=file_1.id), # type: ignore + ) + + +@pytest.mark.asyncio(loop_scope="session") +async def test_update_knowledge_multiple(session: AsyncSession, user: User): + assert user.id + file = KnowledgeDB( + file_name="file", + extension="", + status="UPLOADED", + source="local", + source_link="local", + file_size=None, + file_sha1=None, + user_id=user.id, + ) + folder = KnowledgeDB( + file_name="folder_2", + extension="", + status="UPLOADED", + source="local", + source_link="local", + file_size=4, + file_sha1=None, + brains=[], + children=[], + user_id=user.id, + is_folder=True, + ) + session.add(file) + session.add(folder) + await session.commit() + await session.refresh(folder) + + storage = ErrorStorage() + repository = KnowledgeRepository(session) + service = KnowledgeService(repository, storage) + + await service.update_knowledge( + file, + KnowledgeUpdate(parent_id=folder.id, status="UPLOADED", file_sha1="sha1"), # type: ignore + ) + + km = ( + await session.exec(select(KnowledgeDB).where(KnowledgeDB.id == file.id)) + ).first() + assert km + assert km.parent_id == folder.id + assert km.status == "UPLOADED" + assert km.file_sha1 == "sha1" + + +@pytest.mark.asyncio(loop_scope="session") +async def test_remove_knowledge(session: AsyncSession, user: User): + assert user.id + storage = FakeStorage() + repository = KnowledgeRepository(session) + service = KnowledgeService(repository, storage) + + km_to_add = AddKnowledge( + file_name="test", + source="local", + is_folder=False, + parent_id=None, + ) + km_data = BytesIO(os.urandom(128)) + + # Create the knowledge + km = await service.create_knowledge( + user_id=user.id, + knowledge_to_add=km_to_add, + upload_file=UploadFile(file=km_data, size=128, filename=km_to_add.file_name), + ) + + # Remove knowledge + response = await 
service.remove_knowledge(knowledge=km) + + assert response.knowledge_id == km.id + assert response.file_name == km.file_name + + assert not storage.knowledge_exists(km) + assert ( + await session.exec(select(KnowledgeDB).where(KnowledgeDB.id == km.id)) + ).first() is None + + +@pytest.mark.asyncio(loop_scope="session") +async def test_remove_knowledge_folder(session: AsyncSession, user: User): + assert user.id + storage = FakeStorage() + repository = KnowledgeRepository(session) + service = KnowledgeService(repository, storage) + + folder_add = AddKnowledge( + file_name="folder", + source="local", + is_folder=True, + parent_id=None, + ) + + # Create the knowledge + folder = await service.create_knowledge( + user_id=user.id, knowledge_to_add=folder_add, upload_file=None + ) + file_add = AddKnowledge( + file_name="file", + source="local", + is_folder=False, + parent_id=folder.id, + ) + + km_data = BytesIO(os.urandom(128)) + file = await service.create_knowledge( + user_id=user.id, + knowledge_to_add=file_add, + upload_file=UploadFile(file=km_data, size=128, filename=file_add.file_name), + ) + assert storage.knowledge_exists(file) + + # Remove knowledge + await service.remove_knowledge(knowledge=folder) + + assert not storage.knowledge_exists(folder) + assert not storage.knowledge_exists(file) + assert ( + await session.exec(select(KnowledgeDB).where(KnowledgeDB.id == folder.id)) + ).first() is None + assert ( + await session.exec(select(KnowledgeDB).where(KnowledgeDB.id == file.id)) + ).first() is None + + +@pytest.mark.asyncio(loop_scope="session") +async def test_list_knowledge_root(session: AsyncSession, user: User): + assert user.id + root_file = KnowledgeDB( + file_name="file_1", + extension="", + status="UPLOADED", + source="local", + source_link="local", + file_size=None, + file_sha1=None, + user_id=user.id, + ) + + root_folder = KnowledgeDB( + file_name="folder", + extension="", + status="UPLOADED", + source="local", + source_link="local", + file_size=4, + 
file_sha1=None, + brains=[], + children=[], + user_id=user.id, + is_folder=True, + ) + nested_file = KnowledgeDB( + file_name="file_2", + extension="", + status="UPLOADED", + source="local", + source_link="local", + file_size=10, + file_sha1=None, + user_id=user.id, + parent=root_folder, + ) + session.add(nested_file) + session.add(root_file) + session.add(root_folder) + await session.commit() + await session.refresh(root_folder) + await session.refresh(root_file) + await session.refresh(nested_file) + + storage = FakeStorage() + repository = KnowledgeRepository(session) + service = KnowledgeService(repository, storage) + + root_kms = await service.list_knowledge(knowledge_id=None, user_id=user.id) + + assert len(root_kms) == 2 + assert {k.id for k in root_kms} == {root_folder.id, root_file.id} + + +@pytest.mark.asyncio(loop_scope="session") +async def test_list_knowledge(session: AsyncSession, user: User): + assert user.id + root_file = KnowledgeDB( + file_name="file_1", + extension="", + status="UPLOADED", + source="local", + source_link="local", + file_size=None, + file_sha1=None, + user_id=user.id, + ) + + root_folder = KnowledgeDB( + file_name="folder", + extension="", + status="UPLOADED", + source="local", + source_link="local", + file_size=4, + file_sha1=None, + brains=[], + children=[], + user_id=user.id, + is_folder=True, + ) + nested_file = KnowledgeDB( + file_name="file_2", + extension="", + status="UPLOADED", + source="local", + source_link="local", + file_size=10, + file_sha1=None, + user_id=user.id, + parent=root_folder, + ) + session.add(nested_file) + session.add(root_file) + session.add(root_folder) + await session.commit() + await session.refresh(root_folder) + await session.refresh(root_file) + await session.refresh(nested_file) + + storage = FakeStorage() + repository = KnowledgeRepository(session) + service = KnowledgeService(repository, storage) + + kms = await service.list_knowledge(knowledge_id=root_folder.id, user_id=user.id) + + assert 
len(kms) == 1 + assert kms[0].id == nested_file.id diff --git a/backend/api/quivr_api/modules/knowledge/tests/test_knowledges.py b/backend/api/quivr_api/modules/knowledge/tests/test_knowledges.py deleted file mode 100644 index 749bc0199f3..00000000000 --- a/backend/api/quivr_api/modules/knowledge/tests/test_knowledges.py +++ /dev/null @@ -1,450 +0,0 @@ -import os -from typing import List, Tuple -from uuid import uuid4 - -import pytest -import pytest_asyncio -from sqlalchemy.exc import IntegrityError, NoResultFound -from sqlmodel import select, text -from sqlmodel.ext.asyncio.session import AsyncSession - -from quivr_api.modules.brain.entity.brain_entity import Brain, BrainType -from quivr_api.modules.knowledge.dto.inputs import KnowledgeStatus -from quivr_api.modules.knowledge.entity.knowledge import KnowledgeDB -from quivr_api.modules.knowledge.entity.knowledge_brain import KnowledgeBrain -from quivr_api.modules.knowledge.repository.knowledges import KnowledgeRepository -from quivr_api.modules.knowledge.service.knowledge_service import KnowledgeService -from quivr_api.modules.upload.service.upload_file import upload_file_storage -from quivr_api.modules.user.entity.user_identity import User -from quivr_api.modules.vector.entity.vector import Vector - -pg_database_base_url = "postgres:postgres@localhost:54322/postgres" - -TestData = Tuple[Brain, List[KnowledgeDB]] - - -@pytest_asyncio.fixture(scope="function") -async def other_user(session: AsyncSession): - sql = text( - """ - INSERT INTO "auth"."users" ("instance_id", "id", "aud", "role", "email", "encrypted_password", "email_confirmed_at", "invited_at", "confirmation_token", "confirmation_sent_at", "recovery_token", "recovery_sent_at", "email_change_token_new", "email_change", "email_change_sent_at", "last_sign_in_at", "raw_app_meta_data", "raw_user_meta_data", "is_super_admin", "created_at", "updated_at", "phone", "phone_confirmed_at", "phone_change", "phone_change_token", "phone_change_sent_at", 
"email_change_token_current", "email_change_confirm_status", "banned_until", "reauthentication_token", "reauthentication_sent_at", "is_sso_user", "deleted_at") VALUES - ('00000000-0000-0000-0000-000000000000', :id , 'authenticated', 'authenticated', 'other@quivr.app', '$2a$10$vwKX0eMLlrOZvxQEA3Vl4e5V4/hOuxPjGYn9QK1yqeaZxa.42Uhze', '2024-01-22 22:27:00.166861+00', NULL, '', NULL, 'e91d41043ca2c83c3be5a6ee7a4abc8a4f4fb1afc0a8453c502af931', '2024-03-05 16:22:13.780421+00', '', '', NULL, '2024-03-30 23:21:12.077887+00', '{"provider": "email", "providers": ["email"]}', '{}', NULL, '2024-01-22 22:27:00.158026+00', '2024-04-01 17:40:15.332205+00', NULL, NULL, '', '', NULL, '', 0, NULL, '', NULL, false, NULL); - """ - ) - await session.execute(sql, params={"id": uuid4()}) - - other_user = ( - await session.exec(select(User).where(User.email == "other@quivr.app")) - ).one() - return other_user - - -@pytest_asyncio.fixture(scope="function") -async def test_data(session: AsyncSession) -> TestData: - user_1 = ( - await session.exec(select(User).where(User.email == "admin@quivr.app")) - ).one() - assert user_1.id - # Brain data - brain_1 = Brain( - name="test_brain", - description="this is a test brain", - brain_type=BrainType.integration, - ) - - knowledge_brain_1 = KnowledgeDB( - file_name="test_file_1.txt", - extension=".txt", - status="UPLOADED", - source="test_source", - source_link="test_source_link", - file_size=100, - file_sha1="test_sha1", - brains=[brain_1], - user_id=user_1.id, - ) - - knowledge_brain_2 = KnowledgeDB( - file_name="test_file_2.txt", - extension=".txt", - status="UPLOADED", - source="test_source", - source_link="test_source_link", - file_size=100, - file_sha1="test_sha2", - brains=[], - user_id=user_1.id, - ) - - session.add(brain_1) - session.add(knowledge_brain_1) - session.add(knowledge_brain_2) - await session.commit() - return brain_1, [knowledge_brain_1, knowledge_brain_2] - - -@pytest.mark.asyncio(loop_scope="session") -async def 
test_updates_knowledge_status(session: AsyncSession, test_data: TestData): - brain, knowledges = test_data - assert brain.brain_id - assert knowledges[0].id - repo = KnowledgeRepository(session) - await repo.update_status_knowledge(knowledges[0].id, KnowledgeStatus.ERROR) - knowledge = await repo.get_knowledge_by_id(knowledges[0].id) - assert knowledge.status == KnowledgeStatus.ERROR - - -@pytest.mark.asyncio(loop_scope="session") -async def test_updates_knowledge_status_no_knowledge( - session: AsyncSession, test_data: TestData -): - brain, knowledges = test_data - assert brain.brain_id - assert knowledges[0].id - repo = KnowledgeRepository(session) - with pytest.raises(NoResultFound): - await repo.update_status_knowledge(uuid4(), KnowledgeStatus.UPLOADED) - - -@pytest.mark.asyncio(loop_scope="session") -async def test_update_knowledge_source_link(session: AsyncSession, test_data: TestData): - brain, knowledges = test_data - assert brain.brain_id - assert knowledges[0].id - repo = KnowledgeRepository(session) - await repo.update_source_link_knowledge(knowledges[0].id, "new_source_link") - knowledge = await repo.get_knowledge_by_id(knowledges[0].id) - assert knowledge.source_link == "new_source_link" - - -@pytest.mark.asyncio(loop_scope="session") -async def test_remove_knowledge_from_brain(session: AsyncSession, test_data: TestData): - brain, knowledges = test_data - assert brain.brain_id - assert knowledges[0].id - repo = KnowledgeRepository(session) - knowledge = await repo.remove_knowledge_from_brain(knowledges[0].id, brain.brain_id) - assert brain.brain_id not in [ - b.brain_id for b in await knowledge.awaitable_attrs.brains - ] - - -@pytest.mark.asyncio(loop_scope="session") -async def test_cascade_remove_knowledge_by_id( - session: AsyncSession, test_data: TestData -): - brain, knowledges = test_data - assert brain.brain_id - assert knowledges[0].id - repo = KnowledgeRepository(session) - await repo.remove_knowledge_by_id(knowledges[0].id) - with 
pytest.raises(NoResultFound): - await repo.get_knowledge_by_id(knowledges[0].id) - - query = select(KnowledgeBrain).where( - KnowledgeBrain.knowledge_id == knowledges[0].id - ) - result = await session.exec(query) - knowledge_brain = result.first() - assert knowledge_brain is None - - query = select(Vector).where(Vector.knowledge_id == knowledges[0].id) - result = await session.exec(query) - vector = result.first() - assert vector is None - - -@pytest.mark.asyncio(loop_scope="session") -async def test_remove_all_knowledges_from_brain( - session: AsyncSession, test_data: TestData -): - brain, knowledges = test_data - assert brain.brain_id - - # supabase_client = get_supabase_client() - # db = supabase_client - # storage = db.storage.from_("quivr") - - # storage.upload(f"{brain.brain_id}/test_file_1", b"test_content") - - repo = KnowledgeRepository(session) - service = KnowledgeService(repo) - await repo.remove_all_knowledges_from_brain(brain.brain_id) - knowledges = await service.get_all_knowledge_in_brain(brain.brain_id) - assert len(knowledges) == 0 - - # response = storage.list(path=f"{brain.brain_id}") - # assert response == [] - # FIXME @aminediro &chloedia raise an error when trying to interact with storage UnboundLocalError: cannot access local variable 'response' where it is not associated with a value - - -@pytest.mark.asyncio(loop_scope="session") -async def test_duplicate_sha1_knowledge_same_user( - session: AsyncSession, test_data: TestData -): - brain, [existing_knowledge, _] = test_data - assert brain.brain_id - assert existing_knowledge.id - assert existing_knowledge.file_sha1 - repo = KnowledgeRepository(session) - knowledge = KnowledgeDB( - file_name="test_file_2", - extension="txt", - status="UPLOADED", - source="test_source", - source_link="test_source_link", - file_size=100, - file_sha1=existing_knowledge.file_sha1, - brains=[brain], - user_id=existing_knowledge.user_id, - ) - - with pytest.raises(IntegrityError): # FIXME: Should raise 
IntegrityError - await repo.insert_knowledge(knowledge, brain.brain_id) - - -@pytest.mark.asyncio(loop_scope="session") -async def test_duplicate_sha1_knowledge_diff_user( - session: AsyncSession, test_data: TestData, other_user: User -): - brain, knowledges = test_data - assert other_user.id - assert brain.brain_id - assert knowledges[0].id - repo = KnowledgeRepository(session) - knowledge = KnowledgeDB( - file_name="test_file_2", - extension="txt", - status="UPLOADED", - source="test_source", - source_link="test_source_link", - file_size=100, - file_sha1=knowledges[0].file_sha1, - brains=[brain], - user_id=other_user.id, # random user id - ) - - result = await repo.insert_knowledge(knowledge, brain.brain_id) - assert result - - -@pytest.mark.asyncio(loop_scope="session") -async def test_add_knowledge_to_brain(session: AsyncSession, test_data: TestData): - brain, knowledges = test_data - assert brain.brain_id - assert knowledges[1].id - repo = KnowledgeRepository(session) - await repo.link_to_brain(knowledges[1], brain.brain_id) - knowledge = await repo.get_knowledge_by_id(knowledges[1].id) - brains_of_knowledge = [b.brain_id for b in await knowledge.awaitable_attrs.brains] - assert brain.brain_id in brains_of_knowledge - - query = select(KnowledgeBrain).where( - KnowledgeBrain.knowledge_id == knowledges[0].id - and KnowledgeBrain.brain_id == brain.brain_id - ) - result = await session.exec(query) - knowledge_brain = result.first() - assert knowledge_brain - - -# Knowledge Service -@pytest.mark.asyncio(loop_scope="session") -async def test_get_knowledge_in_brain(session: AsyncSession, test_data: TestData): - brain, knowledges = test_data - assert brain.brain_id - repo = KnowledgeRepository(session) - service = KnowledgeService(repo) - list_knowledge = await service.get_all_knowledge_in_brain(brain.brain_id) - assert len(list_knowledge) == 1 - brains_of_knowledge = [ - b.brain_id for b in await knowledges[0].awaitable_attrs.brains - ] - assert list_knowledge[0].id 
== knowledges[0].id - assert list_knowledge[0].file_name == knowledges[0].file_name - assert brain.brain_id in brains_of_knowledge - - -@pytest.mark.asyncio(loop_scope="session") -async def test_should_process_knowledge_exists( - session: AsyncSession, test_data: TestData -): - brain, [existing_knowledge, _] = test_data - assert brain.brain_id - new = KnowledgeDB( - file_name="new", - extension="txt", - status="PROCESSING", - source="test_source", - source_link="test_source_link", - file_size=100, - file_sha1=None, - brains=[brain], - user_id=existing_knowledge.user_id, - ) - session.add(new) - await session.commit() - await session.refresh(new) - incoming_knowledge = await new.to_dto() - repo = KnowledgeRepository(session) - service = KnowledgeService(repo) - assert existing_knowledge.file_sha1 - with pytest.raises(FileExistsError): - await service.update_sha1_conflict( - incoming_knowledge, brain.brain_id, file_sha1=existing_knowledge.file_sha1 - ) - - -@pytest.mark.asyncio(loop_scope="session") -async def test_should_process_knowledge_link_brain( - session: AsyncSession, test_data: TestData -): - repo = KnowledgeRepository(session) - service = KnowledgeService(repo) - brain, [existing_knowledge, _] = test_data - user_id = existing_knowledge.user_id - assert brain.brain_id - prev = KnowledgeDB( - file_name="prev", - extension=".txt", - status=KnowledgeStatus.UPLOADED, - source="test_source", - source_link="test_source_link", - file_size=100, - file_sha1="test1", - brains=[brain], - user_id=user_id, - ) - brain_2 = Brain( - name="test_brain", - description="this is a test brain", - brain_type=BrainType.integration, - ) - session.add(brain_2) - session.add(prev) - await session.commit() - await session.refresh(prev) - await session.refresh(brain_2) - - assert prev.id - assert brain_2.brain_id - - new = KnowledgeDB( - file_name="new", - extension="txt", - status="PROCESSING", - source="test_source", - source_link="test_source_link", - file_size=100, - 
file_sha1=None, - brains=[brain_2], - user_id=user_id, - ) - session.add(new) - await session.commit() - await session.refresh(new) - - incoming_knowledge = await new.to_dto() - assert prev.file_sha1 - - should_process = await service.update_sha1_conflict( - incoming_knowledge, brain_2.brain_id, file_sha1=prev.file_sha1 - ) - assert not should_process - - # Check prev knowledge was linked - assert incoming_knowledge.file_sha1 - prev_knowledge = await service.repository.get_knowledge_by_id(prev.id) - prev_brains = await prev_knowledge.awaitable_attrs.brains - assert {b.brain_id for b in prev_brains} == { - brain.brain_id, - brain_2.brain_id, - } - # Check new knowledge was removed - assert new.id - with pytest.raises(NoResultFound): - await service.repository.get_knowledge_by_id(new.id) - - -@pytest.mark.asyncio(loop_scope="session") -async def test_should_process_knowledge_prev_error( - session: AsyncSession, test_data: TestData -): - repo = KnowledgeRepository(session) - service = KnowledgeService(repo) - brain, [existing_knowledge, _] = test_data - user_id = existing_knowledge.user_id - assert brain.brain_id - prev = KnowledgeDB( - file_name="prev", - extension="txt", - status=KnowledgeStatus.ERROR, - source="test_source", - source_link="test_source_link", - file_size=100, - file_sha1="test1", - brains=[brain], - user_id=user_id, - ) - session.add(prev) - await session.commit() - await session.refresh(prev) - - assert prev.id - - new = KnowledgeDB( - file_name="new", - extension="txt", - status="PROCESSING", - source="test_source", - source_link="test_source_link", - file_size=100, - file_sha1=None, - brains=[brain], - user_id=user_id, - ) - session.add(new) - await session.commit() - await session.refresh(new) - - incoming_knowledge = await new.to_dto() - assert prev.file_sha1 - should_process = await service.update_sha1_conflict( - incoming_knowledge, brain.brain_id, file_sha1=prev.file_sha1 - ) - - # Checks we should process this file - assert should_process - 
# Previous errored file is cleaned up - with pytest.raises(NoResultFound): - await service.repository.get_knowledge_by_id(prev.id) - - assert new.id - new = await service.repository.get_knowledge_by_id(new.id) - assert new.file_sha1 - - -@pytest.mark.asyncio(loop_scope="session") -async def test_get_knowledge_storage_path(session: AsyncSession, test_data: TestData): - brain, [knowledge, _] = test_data - assert knowledge.file_name - repository = KnowledgeRepository(session) - service = KnowledgeService(repository) - brain_2 = Brain( - name="test_brain", - description="this is a test brain", - brain_type=BrainType.integration, - ) - session.add(brain_2) - await session.commit() - await session.refresh(brain_2) - assert brain_2.brain_id - km_data = os.urandom(128) - km_path = f"{str(knowledge.brains[0].brain_id)}/{knowledge.file_name}" - await upload_file_storage(km_data, km_path) - # Link knowledge to two brains - await repository.link_to_brain(knowledge, brain_2.brain_id) - storage_path = await service.get_knowledge_storage_path( - knowledge.file_name, brain_2.brain_id - ) - assert storage_path == km_path diff --git a/backend/api/quivr_api/modules/sync/tests/test_syncutils.py b/backend/api/quivr_api/modules/sync/tests/test_syncutils.py index 0c16ad09d4e..63b212128e1 100644 --- a/backend/api/quivr_api/modules/sync/tests/test_syncutils.py +++ b/backend/api/quivr_api/modules/sync/tests/test_syncutils.py @@ -317,10 +317,11 @@ def _send_task(*args, **kwargs): created_km = all_km[0] assert created_km.file_name == sync_file.name assert created_km.extension == ".txt" - assert created_km.file_sha1 is not None + assert created_km.file_sha1 is None assert created_km.created_at is not None assert created_km.metadata == {"sync_file_id": "1"} - assert created_km.brain_ids == [brain_1.brain_id] + assert len(created_km.brains)> 0 + assert created_km.brains[0]["brain_id"]== brain_1.brain_id # Assert celery task in correct assert task["args"] == ("process_file_task",) @@ -409,12 
+410,12 @@ def _send_task(*args, **kwargs): created_km = all_km[0] assert created_km.file_name == sync_file.name assert created_km.extension == ".txt" - assert created_km.file_sha1 is not None + assert created_km.file_sha1 is None assert created_km.updated_at assert created_km.created_at assert created_km.updated_at == created_km.created_at # new line assert created_km.metadata == {"sync_file_id": str(dbfiles[0].id)} - assert created_km.brain_ids == [brain_1.brain_id] + assert created_km.brains[0]["brain_id"]== brain_1.brain_id # Check file content changed assert check_file_exists(str(brain_1.brain_id), sync_file.name) diff --git a/backend/api/quivr_api/modules/upload/controller/upload_routes.py b/backend/api/quivr_api/modules/upload/controller/upload_routes.py index 0f614c2af82..0bf6e952dc5 100644 --- a/backend/api/quivr_api/modules/upload/controller/upload_routes.py +++ b/backend/api/quivr_api/modules/upload/controller/upload_routes.py @@ -53,12 +53,10 @@ @upload_router.post("/upload", dependencies=[Depends(AuthBearer())], tags=["Upload"]) async def upload_file( uploadFile: UploadFile, - client: AsyncClientDep, - background_tasks: BackgroundTasks, knowledge_service: KnowledgeServiceDep, + background_tasks: BackgroundTasks, bulk_id: Optional[UUID] = Query(None, description="The ID of the bulk upload"), brain_id: UUID = Query(..., description="The ID of the brain"), - chat_id: Optional[UUID] = Query(None, description="The ID of the chat"), current_user: UserIdentity = Depends(get_current_user), integration: Optional[str] = None, integration_link: Optional[str] = None, @@ -121,7 +119,7 @@ async def upload_file( file_size=uploadFile.size, file_sha1=None, ) - knowledge = await knowledge_service.insert_knowledge( + knowledge = await knowledge_service.insert_knowledge_brain( user_id=current_user.id, knowledge_to_add=knowledge_to_add ) # type: ignore diff --git a/backend/api/quivr_api/routes/crawl_routes.py b/backend/api/quivr_api/routes/crawl_routes.py index 
e4d06d61a0d..804c379afe1 100644 --- a/backend/api/quivr_api/routes/crawl_routes.py +++ b/backend/api/quivr_api/routes/crawl_routes.py @@ -87,7 +87,7 @@ async def crawl_endpoint( source_link=crawl_website.url, ) - added_knowledge = await knowledge_service.insert_knowledge( + added_knowledge = await knowledge_service.insert_knowledge_brain( knowledge_to_add=knowledge_to_add, user_id=current_user.id ) logger.info(f"Knowledge {added_knowledge} added successfully") diff --git a/backend/api/quivr_api/utils/partial.py b/backend/api/quivr_api/utils/partial.py new file mode 100644 index 00000000000..138a36c7198 --- /dev/null +++ b/backend/api/quivr_api/utils/partial.py @@ -0,0 +1,50 @@ +from copy import deepcopy +from typing import Any, Callable, Optional, Type, TypeVar +from uuid import UUID + +from pydantic import BaseModel, create_model +from pydantic.fields import FieldInfo + +Model = TypeVar("Model", bound=Type[BaseModel]) + + +def all_optional(without_fields: list[str] | None = None) -> Callable[[Model], Model]: + if without_fields is None: + without_fields = [] + + def wrapper(model: Type[Model]) -> Type[Model]: + base_model: Type[Model] = model + + def make_field_optional( + field: FieldInfo, default: Any = None + ) -> tuple[Any, FieldInfo]: + new = deepcopy(field) + new.default = default + new.annotation = Optional[field.annotation] + return new.annotation, new + + if without_fields: + base_model = BaseModel + + return create_model( + model.__name__, + __base__=base_model, + __module__=model.__module__, + **{ + field_name: make_field_optional(field_info) + for field_name, field_info in model.model_fields.items() + if field_name not in without_fields + }, + ) + + return wrapper + + +class Test(BaseModel): + id: UUID + name: Optional[str] = None + + +@all_optional() +class TestUpdate(Test): + pass diff --git a/backend/core/quivr_core/models.py b/backend/core/quivr_core/models.py index 053642c2163..8ebf2bbe23b 100644 --- a/backend/core/quivr_core/models.py +++ 
b/backend/core/quivr_core/models.py @@ -42,6 +42,7 @@ class KnowledgeStatus(str, Enum): PROCESSING = "PROCESSING" UPLOADED = "UPLOADED" ERROR = "ERROR" + RESERVED = "RESERVED" class Source(BaseModel): diff --git a/backend/supabase/migrations/20240905153004_knowledge-folders.sql b/backend/supabase/migrations/20240905153004_knowledge-folders.sql new file mode 100644 index 00000000000..5b2ac3165d1 --- /dev/null +++ b/backend/supabase/migrations/20240905153004_knowledge-folders.sql @@ -0,0 +1,31 @@ +ALTER USER postgres +SET idle_session_timeout = '3min'; +ALTER USER postgres +SET idle_in_transaction_session_timeout = '3min'; +-- Drop previous contraint +alter table "public"."knowledge" drop constraint "unique_file_sha1_user_id"; +alter table "public"."knowledge" +add column "is_folder" boolean default false; +-- Update the knowledge to backfill knowledge to is_folder = false +UPDATE "public"."knowledge" +SET is_folder = false; +-- Add parent_id -> folder +alter table "public"."knowledge" +add column "parent_id" uuid; +alter table "public"."knowledge" +add constraint "public_knowledge_parent_id_fkey" FOREIGN KEY (parent_id) REFERENCES knowledge(id) ON DELETE CASCADE; +-- Add constraint must be folder for parent_id +CREATE FUNCTION is_parent_folder(folder_id uuid) RETURNS boolean AS $$ BEGIN RETURN ( + SELECT k.is_folder + FROM public.knowledge k + WHERE k.id = folder_id +); +END; +$$ LANGUAGE plpgsql; +ALTER TABLE public.knowledge +ADD CONSTRAINT check_parent_is_folder CHECK ( + parent_id IS NULL + OR is_parent_folder(parent_id) + ); +-- Index on parent_id +CREATE INDEX knowledge_parent_id_idx ON public.knowledge USING btree (parent_id); diff --git a/backend/worker/quivr_worker/celery_worker.py b/backend/worker/quivr_worker/celery_worker.py index c438c742d27..ceb1632c8ce 100644 --- a/backend/worker/quivr_worker/celery_worker.py +++ b/backend/worker/quivr_worker/celery_worker.py @@ -13,7 +13,7 @@ from quivr_api.modules.brain.service.brain_service import BrainService from 
quivr_api.modules.dependencies import get_supabase_client from quivr_api.modules.knowledge.repository.knowledges import KnowledgeRepository -from quivr_api.modules.knowledge.repository.storage import Storage +from quivr_api.modules.knowledge.repository.storage import SupabaseS3Storage from quivr_api.modules.knowledge.service.knowledge_service import KnowledgeService from quivr_api.modules.notification.service.notification_service import ( NotificationService, @@ -58,7 +58,7 @@ sync_files_repo_service = SyncFilesRepository() brain_service = BrainService() brain_vectors = BrainsVectors() -storage = Storage() +storage = SupabaseS3Storage() notion_service: SyncNotionService | None = None async_engine: AsyncEngine | None = None engine: Engine | None = None @@ -170,6 +170,8 @@ async def aprocess_file_task( integration_link=source_link, delete_file=delete_file, ) + session.commit() + await async_session.commit() except Exception as e: session.rollback() await async_session.rollback() @@ -196,19 +198,29 @@ def process_crawl_task( ) global engine assert engine - with Session(engine, expire_on_commit=False, autoflush=False) as session: - vector_repository = VectorRepository(session) - vector_service = VectorService(vector_repository) - loop = asyncio.get_event_loop() - loop.run_until_complete( - process_url_func( - url=crawl_website_url, - brain_id=brain_id, - knowledge_id=knowledge_id, - brain_service=brain_service, - vector_service=vector_service, + try: + with Session(engine, expire_on_commit=False, autoflush=False) as session: + session.execute( + text("SET SESSION idle_in_transaction_session_timeout = '5min';") ) - ) + vector_repository = VectorRepository(session) + vector_service = VectorService(vector_repository) + loop = asyncio.get_event_loop() + loop.run_until_complete( + process_url_func( + url=crawl_website_url, + brain_id=brain_id, + knowledge_id=knowledge_id, + brain_service=brain_service, + vector_service=vector_service, + ) + ) + session.commit() + except 
Exception as e: + session.rollback() + raise e + finally: + session.close() @celery.task(name="NotionConnectorLoad") diff --git a/backend/worker/quivr_worker/process/process_s3_file.py b/backend/worker/quivr_worker/process/process_s3_file.py index a8546579440..99bc4e7360d 100644 --- a/backend/worker/quivr_worker/process/process_s3_file.py +++ b/backend/worker/quivr_worker/process/process_s3_file.py @@ -2,6 +2,7 @@ from quivr_api.logger import get_logger from quivr_api.modules.brain.service.brain_service import BrainService +from quivr_api.modules.knowledge.entity.knowledge import KnowledgeUpdate from quivr_api.modules.knowledge.service.knowledge_service import KnowledgeService from quivr_api.modules.vector.service.vector_service import VectorService @@ -41,17 +42,15 @@ async def process_uploaded_file( # If we have some knowledge with error with build_file(file_data, knowledge_id, file_name) as file_instance: knowledge = await knowledge_service.get_knowledge(knowledge_id=knowledge_id) - should_process = await knowledge_service.update_sha1_conflict( - knowledge=knowledge, - brain_id=brain.brain_id, - file_sha1=file_instance.file_sha1, + await knowledge_service.update_knowledge( + knowledge, + KnowledgeUpdate(file_sha1=file_instance.file_sha1), # type: ignore + ) + await process_file( + file_instance=file_instance, + brain=brain, + brain_service=brain_service, + vector_service=vector_service, + integration=integration, + integration_link=integration_link, ) - if should_process: - await process_file( - file_instance=file_instance, - brain=brain, - brain_service=brain_service, - vector_service=vector_service, - integration=integration, - integration_link=integration_link, - ) diff --git a/backend/worker/quivr_worker/syncs/process_active_syncs.py b/backend/worker/quivr_worker/syncs/process_active_syncs.py index 299d9f2b0bc..d190c219166 100644 --- a/backend/worker/quivr_worker/syncs/process_active_syncs.py +++ b/backend/worker/quivr_worker/syncs/process_active_syncs.py @@ 
-141,7 +141,7 @@ async def process_notion_sync( UUID(user_id), notion_client, # type: ignore ) - + await session.commit() except Exception as e: await session.rollback() raise e diff --git a/backend/worker/quivr_worker/syncs/store_notion.py b/backend/worker/quivr_worker/syncs/store_notion.py index 82925e77340..821de887409 100644 --- a/backend/worker/quivr_worker/syncs/store_notion.py +++ b/backend/worker/quivr_worker/syncs/store_notion.py @@ -40,6 +40,8 @@ async def fetch_and_store_notion_files_async( else: logger.warn("No notion page fetched") + # Commit all before exiting + await session.commit() except Exception as e: await session.rollback() raise e diff --git a/backend/worker/quivr_worker/syncs/utils.py b/backend/worker/quivr_worker/syncs/utils.py index e1523b29bd0..bbc3c75f858 100644 --- a/backend/worker/quivr_worker/syncs/utils.py +++ b/backend/worker/quivr_worker/syncs/utils.py @@ -6,7 +6,7 @@ from quivr_api.logger import get_logger from quivr_api.modules.brain.repository.brains_vectors import BrainsVectors from quivr_api.modules.knowledge.repository.knowledges import KnowledgeRepository -from quivr_api.modules.knowledge.repository.storage import Storage +from quivr_api.modules.knowledge.repository.storage import SupabaseS3Storage from quivr_api.modules.knowledge.service.knowledge_service import KnowledgeService from quivr_api.modules.notification.service.notification_service import ( NotificationService, @@ -42,7 +42,7 @@ class SyncServices: sync_files_repo_service: SyncFilesRepository notification_service: NotificationService brain_vectors: BrainsVectors - storage: Storage + storage: SupabaseS3Storage @asynccontextmanager @@ -56,7 +56,6 @@ async def build_syncs_utils( await session.execute( text("SET SESSION idle_in_transaction_session_timeout = '5min';") ) - # TODO pass services from celery_worker notion_repository = NotionRepository(session) notion_service = SyncNotionService(notion_repository) knowledge_service = 
KnowledgeService(KnowledgeRepository(session)) @@ -84,7 +83,7 @@ async def build_syncs_utils( mapping_sync_utils[provider_name] = provider_sync_util yield mapping_sync_utils - + await session.commit() except Exception as e: await session.rollback() raise e From 1c60a9f5383b58e1f0d8643f219c4f1519d646d5 Mon Sep 17 00:00:00 2001 From: Stan Girard Date: Mon, 16 Sep 2024 14:52:15 +0200 Subject: [PATCH 08/13] chore(main): release 0.0.314 (#3210) :robot: I have created a release *beep* *boop* --- ## 0.0.314 (2024-09-16) ## What's Changed * feat: CRUD KMS (no syncs) by @AmineDiro in https://github.com/QuivrHQ/quivr/pull/3162 **Full Changelog**: https://github.com/QuivrHQ/quivr/compare/v0.0.313...v0.0.314 --- This PR was generated with [Release Please](https://github.com/googleapis/release-please). See [documentation](https://github.com/googleapis/release-please#release-please). --- .release-please-manifest.json | 2 +- CHANGELOG.md | 8 ++++++++ 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/.release-please-manifest.json b/.release-please-manifest.json index 3ae474fcc93..990640406a0 100644 --- a/.release-please-manifest.json +++ b/.release-please-manifest.json @@ -1,4 +1,4 @@ { "backend/core": "0.0.14", - ".": "0.0.313" + ".": "0.0.314" } \ No newline at end of file diff --git a/CHANGELOG.md b/CHANGELOG.md index 9f43c6e8cf8..8574ce71a55 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,13 @@ # Changelog +## 0.0.314 (2024-09-16) + +## What's Changed +* feat: CRUD KMS (no syncs) by @AmineDiro in https://github.com/QuivrHQ/quivr/pull/3162 + + +**Full Changelog**: https://github.com/QuivrHQ/quivr/compare/v0.0.313...v0.0.314 + ## 0.0.313 (2024-09-13) ## What's Changed From 348c1a71b061c073dc2ed49ff1f3b476997eb190 Mon Sep 17 00:00:00 2001 From: Stan Girard Date: Mon, 16 Sep 2024 14:54:29 +0200 Subject: [PATCH 09/13] chore(main): release core 0.0.15 (#3203) :robot: I have created a release *beep* *boop* --- ## 
[0.0.15](https://github.com/QuivrHQ/quivr/compare/core-0.0.14...core-0.0.15) (2024-09-16) ### Features * CRUD KMS (no syncs) ([#3162](https://github.com/QuivrHQ/quivr/issues/3162)) ([71edca5](https://github.com/QuivrHQ/quivr/commit/71edca572ffd2901ed582005ac4b2803d9d95e57)) * save and load brain ([#3202](https://github.com/QuivrHQ/quivr/issues/3202)) ([eda619f](https://github.com/QuivrHQ/quivr/commit/eda619f4547921ab4c50458b2d44c6b5c10e40d1)) ### Bug Fixes * Update LLMEndpoint to include max_tokens parameter ([#3201](https://github.com/QuivrHQ/quivr/issues/3201)) ([13ed225](https://github.com/QuivrHQ/quivr/commit/13ed225b172407ee9826b9c01b2f7b124a8b5a10)) --- This PR was generated with [Release Please](https://github.com/googleapis/release-please). See [documentation](https://github.com/googleapis/release-please#release-please). --- .release-please-manifest.json | 2 +- backend/core/CHANGELOG.md | 13 +++++++++++++ backend/core/pyproject.toml | 2 +- 3 files changed, 15 insertions(+), 2 deletions(-) diff --git a/.release-please-manifest.json b/.release-please-manifest.json index 990640406a0..7cf5f52a0e2 100644 --- a/.release-please-manifest.json +++ b/.release-please-manifest.json @@ -1,4 +1,4 @@ { - "backend/core": "0.0.14", + "backend/core": "0.0.15", ".": "0.0.314" } \ No newline at end of file diff --git a/backend/core/CHANGELOG.md b/backend/core/CHANGELOG.md index b3de918cf60..883f269ddd5 100644 --- a/backend/core/CHANGELOG.md +++ b/backend/core/CHANGELOG.md @@ -1,5 +1,18 @@ # Changelog +## [0.0.15](https://github.com/QuivrHQ/quivr/compare/core-0.0.14...core-0.0.15) (2024-09-16) + + +### Features + +* CRUD KMS (no syncs) ([#3162](https://github.com/QuivrHQ/quivr/issues/3162)) ([71edca5](https://github.com/QuivrHQ/quivr/commit/71edca572ffd2901ed582005ac4b2803d9d95e57)) +* save and load brain ([#3202](https://github.com/QuivrHQ/quivr/issues/3202)) ([eda619f](https://github.com/QuivrHQ/quivr/commit/eda619f4547921ab4c50458b2d44c6b5c10e40d1)) + + +### Bug Fixes + +* 
Update LLMEndpoint to include max_tokens parameter ([#3201](https://github.com/QuivrHQ/quivr/issues/3201)) ([13ed225](https://github.com/QuivrHQ/quivr/commit/13ed225b172407ee9826b9c01b2f7b124a8b5a10)) + ## [0.0.14](https://github.com/QuivrHQ/quivr/compare/core-0.0.13...core-0.0.14) (2024-09-09) diff --git a/backend/core/pyproject.toml b/backend/core/pyproject.toml index 8a2c41841d8..70b3e2a9518 100644 --- a/backend/core/pyproject.toml +++ b/backend/core/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "quivr-core" -version = "0.0.14" +version = "0.0.15" description = "Quivr core RAG package" authors = [ { name = "Stan Girard", email = "stan@quivr.app" } From 6a0dbfe8339fbb51cddb818e5830f0f257186d07 Mon Sep 17 00:00:00 2001 From: AmineDiro Date: Tue, 17 Sep 2024 19:06:04 +0200 Subject: [PATCH 10/13] fix: knowledge user_id fix (#3216) --- backend/api/quivr_api/modules/knowledge/entity/knowledge.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backend/api/quivr_api/modules/knowledge/entity/knowledge.py b/backend/api/quivr_api/modules/knowledge/entity/knowledge.py index d890ee42d1c..7f9d10d9574 100644 --- a/backend/api/quivr_api/modules/knowledge/entity/knowledge.py +++ b/backend/api/quivr_api/modules/knowledge/entity/knowledge.py @@ -35,7 +35,7 @@ class Knowledge(BaseModel): source_link: Optional[str] = None file_sha1: Optional[str] = None metadata: Optional[Dict[str, str]] = None - user_id: UUID + user_id: Optional[UUID] = None brains: List[Dict[str, Any]] parent: Optional["Knowledge"] children: Optional[list["Knowledge"]] From 9063b10132e80fb15f6ecbb68aab76da7ff18306 Mon Sep 17 00:00:00 2001 From: Stan Girard Date: Tue, 17 Sep 2024 22:57:24 +0200 Subject: [PATCH 11/13] chore(main): release 0.0.315 (#3212) :robot: I have created a release *beep* *boop* --- ## 0.0.315 (2024-09-17) ## What's Changed * chore(main): release core 0.0.15 by @StanGirard in https://github.com/QuivrHQ/quivr/pull/3203 * fix: knowledge user_id fix by @AmineDiro in 
https://github.com/QuivrHQ/quivr/pull/3216 **Full Changelog**: https://github.com/QuivrHQ/quivr/compare/v0.0.314...v0.0.315 --- This PR was generated with [Release Please](https://github.com/googleapis/release-please). See [documentation](https://github.com/googleapis/release-please#release-please). --- .release-please-manifest.json | 2 +- CHANGELOG.md | 9 +++++++++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/.release-please-manifest.json b/.release-please-manifest.json index 7cf5f52a0e2..1d14b140f9c 100644 --- a/.release-please-manifest.json +++ b/.release-please-manifest.json @@ -1,4 +1,4 @@ { "backend/core": "0.0.15", - ".": "0.0.314" + ".": "0.0.315" } \ No newline at end of file diff --git a/CHANGELOG.md b/CHANGELOG.md index 8574ce71a55..69198f0e471 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,14 @@ # Changelog +## 0.0.315 (2024-09-17) + +## What's Changed +* chore(main): release core 0.0.15 by @StanGirard in https://github.com/QuivrHQ/quivr/pull/3203 +* fix: knowledge user_id fix by @AmineDiro in https://github.com/QuivrHQ/quivr/pull/3216 + + +**Full Changelog**: https://github.com/QuivrHQ/quivr/compare/v0.0.314...v0.0.315 + ## 0.0.314 (2024-09-16) ## What's Changed From 4bb4800a76942ee31a939d3cacc94f057682177a Mon Sep 17 00:00:00 2001 From: Stan Girard Date: Wed, 18 Sep 2024 00:17:11 +0200 Subject: [PATCH 12/13] fix(core): enforce langchain <0.3 for pydantic v1 (#3217) # Description Please include a summary of the changes and the related issue. Please also include relevant motivation and context. ## Checklist before requesting a review Please delete options that are not relevant. 
- [ ] My code follows the style guidelines of this project - [ ] I have performed a self-review of my code - [ ] I have commented hard-to-understand areas - [ ] I have ideally added tests that prove my fix is effective or that my feature works - [ ] New and existing unit tests pass locally with my changes - [ ] Any dependent changes have been merged ## Screenshots (if appropriate): --- backend/api/pyproject.toml | 2 +- backend/core/pyproject.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/backend/api/pyproject.toml b/backend/api/pyproject.toml index 62cc9d4372b..62ce71e6f9c 100644 --- a/backend/api/pyproject.toml +++ b/backend/api/pyproject.toml @@ -33,7 +33,7 @@ dependencies = [ "markdownify>=0.13.1", "langchain-openai>=0.1.21", "resend>=2.4.0", - "langchain>=0.2.14", + "langchain>=0.2.14,<0.3.0", "litellm>=1.43.15", "openai>=1.40.8", "tiktoken>=0.7.0", diff --git a/backend/core/pyproject.toml b/backend/core/pyproject.toml index 70b3e2a9518..c06b5f7225a 100644 --- a/backend/core/pyproject.toml +++ b/backend/core/pyproject.toml @@ -8,7 +8,7 @@ authors = [ dependencies = [ "pydantic>=2.8.2", "langchain-core>=0.2.38", - "langchain>=0.2.14", + "langchain>=0.2.14,<0.3.0", "langgraph>=0.2.14", "httpx>=0.27.0", "rich>=13.7.1", From 4390d318a22e9902617b0ab8263cd433b2439089 Mon Sep 17 00:00:00 2001 From: Stan Girard Date: Wed, 18 Sep 2024 00:18:00 +0200 Subject: [PATCH 13/13] chore(main): release core 0.0.16 (#3218) :robot: I have created a release *beep* *boop* --- ## [0.0.16](https://github.com/QuivrHQ/quivr/compare/core-0.0.15...core-0.0.16) (2024-09-17) ### Bug Fixes * **core:** enforce langchain <0.3 for pydantic v1 ([#3217](https://github.com/QuivrHQ/quivr/issues/3217)) ([4bb4800](https://github.com/QuivrHQ/quivr/commit/4bb4800a76942ee31a939d3cacc94f057682177a)) --- This PR was generated with [Release Please](https://github.com/googleapis/release-please). See [documentation](https://github.com/googleapis/release-please#release-please). 
--- .release-please-manifest.json | 2 +- backend/core/CHANGELOG.md | 7 +++++++ backend/core/pyproject.toml | 2 +- 3 files changed, 9 insertions(+), 2 deletions(-) diff --git a/.release-please-manifest.json b/.release-please-manifest.json index 1d14b140f9c..bdadd57e825 100644 --- a/.release-please-manifest.json +++ b/.release-please-manifest.json @@ -1,4 +1,4 @@ { - "backend/core": "0.0.15", + "backend/core": "0.0.16", ".": "0.0.315" } \ No newline at end of file diff --git a/backend/core/CHANGELOG.md b/backend/core/CHANGELOG.md index 883f269ddd5..c15ac876ccc 100644 --- a/backend/core/CHANGELOG.md +++ b/backend/core/CHANGELOG.md @@ -1,5 +1,12 @@ # Changelog +## [0.0.16](https://github.com/QuivrHQ/quivr/compare/core-0.0.15...core-0.0.16) (2024-09-17) + + +### Bug Fixes + +* **core:** enforce langchain <0.3 for pydantic v1 ([#3217](https://github.com/QuivrHQ/quivr/issues/3217)) ([4bb4800](https://github.com/QuivrHQ/quivr/commit/4bb4800a76942ee31a939d3cacc94f057682177a)) + ## [0.0.15](https://github.com/QuivrHQ/quivr/compare/core-0.0.14...core-0.0.15) (2024-09-16) diff --git a/backend/core/pyproject.toml b/backend/core/pyproject.toml index c06b5f7225a..b8b07ba71e4 100644 --- a/backend/core/pyproject.toml +++ b/backend/core/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "quivr-core" -version = "0.0.15" +version = "0.0.16" description = "Quivr core RAG package" authors = [ { name = "Stan Girard", email = "stan@quivr.app" }