diff --git a/core_backend/app/contents/routers.py b/core_backend/app/contents/routers.py
index eb1903edf..347b48f68 100644
--- a/core_backend/app/contents/routers.py
+++ b/core_backend/app/contents/routers.py
@@ -1,7 +1,9 @@
 from typing import Annotated, List
 
-from fastapi import APIRouter, Depends
+import pandas as pd
+from fastapi import APIRouter, Depends, UploadFile
 from fastapi.exceptions import HTTPException
+from pandas.errors import EmptyDataError, ParserError
 from sqlalchemy.ext.asyncio import AsyncSession
 
 from ..auth.dependencies import get_current_user
@@ -17,7 +19,12 @@
     save_content_to_db,
     update_content_in_db,
 )
-from .schemas import ContentCreate, ContentRetrieve
+from .schemas import (
+    ContentCreate,
+    ContentRetrieve,
+    CustomError,
+    CustomErrorList,
+)
 
 router = APIRouter(prefix="/content", tags=["Content Management"])
 logger = setup_logger()
@@ -86,7 +93,7 @@ async def edit_content(
     return _convert_record_to_schema(updated_content)
 
 
-@router.get("/", response_model=list[ContentRetrieve])
+@router.get("/", response_model=List[ContentRetrieve])
 async def retrieve_content(
     user_db: Annotated[UserDB, Depends(get_current_user)],
     skip: int = 0,
@@ -158,6 +165,216 @@ async def retrieve_content_by_id(
     return _convert_record_to_schema(record)
 
 
+@router.post("/csv-upload", response_model=List[ContentRetrieve])
+async def bulk_upload_contents(
+    file: UploadFile,
+    user_db: Annotated[UserDB, Depends(get_current_user)],
+    asession: AsyncSession = Depends(get_async_session),
+) -> List[ContentRetrieve] | None:
+    """
+    Upload, check, and ingest contents in bulk from a CSV file.
+
+    Note: If there are any issues with the CSV, the endpoint will return a 400 error
+    with the list of issues under detail in the response body.
+    """
+
+    # TODO: deal with tags!
+
+    # Ensure the file is a CSV
+    if file.filename is None or not file.filename.endswith(".csv"):
+        error_list_model = CustomErrorList(
+            errors=[
+                CustomError(
+                    type="invalid_format",
+                    description="Please upload a CSV file.",
+                )
+            ]
+        )
+        raise HTTPException(status_code=400, detail=error_list_model.dict())
+
+    df = _load_csv(file)
+    await _csv_checks(df=df, user_id=user_db.user_id, asession=asession)
+
+    # Add each row to the content database
+    content_list = []
+    for _, row in df.iterrows():
+        content = ContentCreate(
+            content_title=row["content_title"],
+            content_text=row["content_text"],
+            content_language="ENGLISH",
+            content_tags=[],
+            content_metadata={},
+        )
+        content_db = await save_content_to_db(
+            user_id=user_db.user_id, content=content, asession=asession
+        )
+        content_retrieve = _convert_record_to_schema(content_db)
+        content_list.append(content_retrieve)
+
+    return content_list
+
+
+def _load_csv(file: UploadFile) -> pd.DataFrame:
+    """
+    Load the CSV file into a pandas DataFrame
+    """
+
+    try:
+        df = pd.read_csv(file.file, dtype=str)
+    except EmptyDataError as e:
+        error_list_model = CustomErrorList(
+            errors=[
+                CustomError(
+                    type="empty_data",
+                    description="The CSV file is empty",
+                )
+            ]
+        )
+        raise HTTPException(status_code=400, detail=error_list_model.dict()) from e
+    except ParserError as e:
+        error_list_model = CustomErrorList(
+            errors=[
+                CustomError(
+                    type="parse_error",
+                    description="CSV is unreadable (parsing error)",
+                )
+            ]
+        )
+        raise HTTPException(status_code=400, detail=error_list_model.dict()) from e
+    except UnicodeDecodeError as e:
+        error_list_model = CustomErrorList(
+            errors=[
+                CustomError(
+                    type="encoding_error",
+                    description="CSV is unreadable (encoding error)",
+                )
+            ]
+        )
+        raise HTTPException(status_code=400, detail=error_list_model.dict()) from e
+    if df.empty:
+        error_list_model = CustomErrorList(
+            errors=[
+                CustomError(
+                    type="no_rows_csv",
+                    description="The CSV file has no rows",
+                )
+            ]
+        )
+        raise HTTPException(status_code=400, detail=error_list_model.dict())
+
+    return df
+
+
+async def _csv_checks(df: pd.DataFrame, user_id: int, asession: AsyncSession) -> None:
+    """
+    Perform checks on the CSV file to ensure it meets the requirements
+    """
+
+    # check if content_title and content_text columns are present
+    cols = df.columns
+    error_list = []
+    if "content_title" not in cols or "content_text" not in cols:
+        error_list.append(
+            CustomError(
+                type="missing_columns",
+                description=(
+                    "File must have 'content_title' and 'content_text' columns."
+                ),
+            )
+        )
+        # if either of these columns is missing, skip further checks
+        error_list_model = CustomErrorList(errors=error_list)
+        raise HTTPException(status_code=400, detail=error_list_model.dict())
+    else:
+        # strip columns to catch duplicates better and empty cells
+        df["content_title"] = df["content_title"].str.strip()
+        df["content_text"] = df["content_text"].str.strip()
+
+        # set any empty strings to None
+        df = df.replace("", None)
+
+        # check if there are any empty values in either column
+        if df["content_title"].isnull().any():
+            error_list.append(
+                CustomError(
+                    type="empty_title",
+                    description=(
+                        "One or more empty content titles found in the CSV file."
+                    ),
+                )
+            )
+        if df["content_text"].isnull().any():
+            error_list.append(
+                CustomError(
+                    type="empty_text",
+                    description=(
+                        "One or more empty content texts found in the CSV file."
+ ), + ) + ) + # check if any title exceeds 150 characters + if df["content_title"].str.len().max() > 150: + error_list.append( + CustomError( + type="title_too_long", + description="One or more content titles exceed 150 characters.", + ) + ) + # check if any text exceeds 2000 characters + if df["content_text"].str.len().max() > 2000: + error_list.append( + CustomError( + type="texts_too_long", + description="One or more content texts exceed 150 characters.", + ) + ) + + # check if there are duplicates in either column + if df.duplicated(subset=["content_title"]).any(): + error_list.append( + CustomError( + type="duplicate_titles", + description="Duplicate content titles found in the CSV file.", + ) + ) + if df.duplicated(subset=["content_text"]).any(): + error_list.append( + CustomError( + type="duplicate_texts", + description="Duplicate content texts found in the CSV file.", + ) + ) + + # check for duplicate titles and texts between the CSV and the database + contents_in_db = await get_list_of_content_from_db( + user_id, offset=0, limit=None, asession=asession + ) + content_titles_in_db = [c.content_title.strip() for c in contents_in_db] + content_texts_in_db = [c.content_text.strip() for c in contents_in_db] + if df["content_title"].isin(content_titles_in_db).any(): + error_list.append( + CustomError( + type="title_in_db", + description=( + "One or more content titles already exist in the database." + ), + ) + ) + if df["content_text"].isin(content_texts_in_db).any(): + error_list.append( + CustomError( + type="text_in_db", + description=( + "One or more content texts already exist in the database." + ), + ) + ) + + if error_list: + error_list_model = CustomErrorList(errors=error_list) + raise HTTPException(status_code=400, detail=error_list_model.dict()) + + def _convert_record_to_schema(record: ContentDB) -> ContentRetrieve: """ Convert models.ContentDB models to ContentRetrieve schema diff --git a/core_backend/app/contents/schemas.py b/core_backend/app/contents/schemas.py index 48e744033..8db5b98df 100644 --- a/core_backend/app/contents/schemas.py +++ b/core_backend/app/contents/schemas.py @@ -1,5 +1,5 @@ from datetime import datetime -from typing import Annotated +from typing import Annotated, List from pydantic import BaseModel, ConfigDict, StringConstraints, validator @@ -59,3 +59,20 @@ class ContentDelete(BaseModel): """ content_id: int + + +class CustomError(BaseModel): + """ + Pydantic model for custom error + """ + + type: str + description: str + + +class CustomErrorList(BaseModel): + """ + Pydantic model for list of custom errors + """ + + errors: List[CustomError] diff --git a/core_backend/requirements.txt b/core_backend/requirements.txt index 431225dce..3b8d3bc0e 100644 --- a/core_backend/requirements.txt +++ b/core_backend/requirements.txt @@ -13,3 +13,6 @@ prometheus_client==0.19.0 google-api-python-client==2.129.0 google-api-python-client-stubs==1.25.0 langfuse==2.27.3 +pandas==2.2.2 +pandas-stubs==2.2.2.240603 +types-openpyxl==3.1.4.20240621 diff --git a/core_backend/tests/api/test_import_content.py b/core_backend/tests/api/test_import_content.py new file mode 100644 index 000000000..2cccda500 --- /dev/null +++ b/core_backend/tests/api/test_import_content.py @@ -0,0 +1,244 @@ +from io import BytesIO +from typing import Generator + +import pandas as pd +import pytest +from fastapi.testclient import TestClient + + +def _dict_to_csv_bytes(data: dict) -> BytesIO: + """ + Convert a dictionary to a CSV file in bytes + """ + + df = pd.DataFrame(data) + csv_bytes = BytesIO() + 
+    df.to_csv(csv_bytes, index=False)
+    csv_bytes.seek(0)
+
+    return csv_bytes
+
+
+class TestImportContent:
+    @pytest.fixture
+    def data_valid(self) -> BytesIO:
+        data = {
+            "content_title": ["csv title 1", "csv title 2"],
+            "content_text": ["csv text 1", "csv text 2"],
+        }
+        return _dict_to_csv_bytes(data)
+
+    @pytest.fixture
+    def data_empty_csv(self) -> BytesIO:
+        data: dict = {}
+        return _dict_to_csv_bytes(data)
+
+    @pytest.fixture
+    def data_no_rows(self) -> BytesIO:
+        data: dict = {
+            "content_title": [],
+            "content_text": [],
+        }
+        return _dict_to_csv_bytes(data)
+
+    @pytest.fixture
+    def data_title_spaces_only(self) -> BytesIO:
+        data: dict = {
+            "content_title": [" "],
+            "content_text": ["csv text 1"],
+        }
+        return _dict_to_csv_bytes(data)
+
+    @pytest.fixture
+    def data_text_spaces_only(self) -> BytesIO:
+        data: dict = {
+            "content_title": ["csv title 1"],
+            "content_text": [" "],
+        }
+        return _dict_to_csv_bytes(data)
+
+    @pytest.fixture
+    def data_missing_columns(self) -> BytesIO:
+        data = {
+            "wrong_column_1": ["Value 1", "Value 2"],
+            "wrong_column_2": ["Value 3", "Value 4"],
+        }
+        return _dict_to_csv_bytes(data)
+
+    @pytest.fixture
+    def data_title_missing(self) -> BytesIO:
+        data = {
+            "content_title": ["", "csv title 2"],
+            "content_text": ["csv text 1", "csv text 2"],
+        }
+        return _dict_to_csv_bytes(data)
+
+    @pytest.fixture
+    def data_text_missing(self) -> BytesIO:
+        data = {
+            "content_title": ["csv title 1", "csv title 2"],
+            "content_text": ["", "csv text 2"],
+        }
+        return _dict_to_csv_bytes(data)
+
+    @pytest.fixture
+    def data_long_title(self) -> BytesIO:
+        data = {
+            "content_title": ["a" * 151],
+            "content_text": ["Valid text"],
+        }
+        return _dict_to_csv_bytes(data)
+
+    @pytest.fixture
+    def data_long_text(self) -> BytesIO:
+        data = {
+            "content_title": ["Valid title"],
+            "content_text": ["a" * 2001],
+        }
+        return _dict_to_csv_bytes(data)
+
+    @pytest.fixture
+    def data_duplicate_titles(self) -> BytesIO:
+        data = {
+            "content_title": ["Duplicate title", "Duplicate title"],
+            "content_text": ["Text 1", "Text 2"],
+        }
+        return _dict_to_csv_bytes(data)
+
+    @pytest.fixture
+    def data_duplicate_texts(self) -> BytesIO:
+        data = {
+            "content_title": ["Title 1", "Title 2"],
+            "content_text": ["Duplicate text", "Duplicate text"],
+        }
+        return _dict_to_csv_bytes(data)
+
+    test_data = [
+        ("data_empty_csv", "empty_data"),
+        ("data_no_rows", "no_rows_csv"),
+        ("data_title_spaces_only", "empty_title"),
+        ("data_text_spaces_only", "empty_text"),
+        ("data_missing_columns", "missing_columns"),
+        ("data_title_missing", "empty_title"),
+        ("data_text_missing", "empty_text"),
+        ("data_long_title", "title_too_long"),
+        ("data_long_text", "texts_too_long"),
+        ("data_duplicate_titles", "duplicate_titles"),
+        ("data_duplicate_texts", "duplicate_texts"),
+    ]
+
+    async def test_csv_import_success(
+        self,
+        client: TestClient,
+        data_valid: BytesIO,
+        fullaccess_token: str,
+    ) -> None:
+        response = client.post(
+            "/content/csv-upload",
+            headers={"Authorization": f"Bearer {fullaccess_token}"},
+            files={"file": ("test.csv", data_valid, "text/csv")},
+        )
+        assert response.status_code == 200
+
+        json_response = response.json()
+        for content in json_response:
+            content_id = content["content_id"]
+            response = client.delete(
+                f"/content/{content_id}",
+                headers={"Authorization": f"Bearer {fullaccess_token}"},
+            )
+            assert response.status_code == 200
+
+    @pytest.mark.parametrize("mock_csv_data, expected_error_type", test_data)
+    async def test_csv_import_checks(
+        self,
+        client: TestClient,
+        mock_csv_data: str,
+        expected_error_type: str,
+        request: pytest.FixtureRequest,
+        fullaccess_token: str,
+    ) -> None:
+        # fetch data from the fixture
+        mock_csv_file = request.getfixturevalue(mock_csv_data)
+
+        response = client.post(
+            "/content/csv-upload",
+            headers={"Authorization": f"Bearer {fullaccess_token}"},
+            files={"file": ("test.csv", mock_csv_file, "text/csv")},
+        )
+        assert response.status_code == 400
+        assert response.json()["detail"]["errors"][0]["type"] == expected_error_type
+
+
+class TestDBDuplicates:
+    @pytest.fixture(scope="function")
+    def existing_content_in_db(
+        self,
+        client: TestClient,
+        fullaccess_token: str,
+    ) -> Generator[str, None, None]:
+        response = client.post(
+            "/content",
+            headers={"Authorization": f"Bearer {fullaccess_token}"},
+            json={
+                "content_title": "Title in DB",
+                "content_text": "Text in DB",
+                "content_language": "ENGLISH",
+                "content_tags": [],
+                "content_metadata": {},
+            },
+        )
+        content_id = response.json()["content_id"]
+        yield content_id
+        client.delete(
+            f"/content/{content_id}",
+            headers={"Authorization": f"Bearer {fullaccess_token}"},
+        )
+
+    @pytest.fixture
+    def data_title_in_db(self) -> BytesIO:
+        # "Title in DB" matches the title created by existing_content_in_db
+        data = {
+            "content_title": ["Title in DB"],
+            "content_text": ["New text"],
+        }
+        return _dict_to_csv_bytes(data)
+
+    @pytest.fixture
+    def data_text_in_db(self) -> BytesIO:
+        # "Text in DB" matches the text created by existing_content_in_db
+        data = {
+            "content_title": ["New title"],
+            "content_text": ["Text in DB"],
+        }
+        return _dict_to_csv_bytes(data)
+
+    @pytest.mark.parametrize(
+        "mock_csv_data, expected_error_type",
+        [("data_title_in_db", "title_in_db"), ("data_text_in_db", "text_in_db")],
+    )
+    async def test_csv_import_db_duplicates(
+        self,
+        client: TestClient,
+        fullaccess_token: str,
+        mock_csv_data: str,
+        expected_error_type: str,
+        request: pytest.FixtureRequest,
+        existing_content_in_db: str,
+    ) -> None:
+        """
+        This test uses the existing_content_in_db fixture to create a content
+        entry in the database, then tries to import a CSV file whose title or
+        text already exists in the database.
+        """
+        mock_csv_file = request.getfixturevalue(mock_csv_data)
+        response_text_dupe = client.post(
+            "/content/csv-upload",
+            headers={"Authorization": f"Bearer {fullaccess_token}"},
+            files={"file": ("test.csv", mock_csv_file, "text/csv")},
+        )
+        assert response_text_dupe.status_code == 400
+        assert (
+            response_text_dupe.json()["detail"]["errors"][0]["type"]
+            == expected_error_type
+        )
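Usage sketch (illustrative, not part of the diff): one way a client could exercise the new /content/csv-upload endpoint, mirroring the multipart upload used in the tests above. The base URL and bearer token below are assumptions for a local deployment, not values defined in this PR; on success the endpoint returns the created ContentRetrieve records, and on validation failure it returns a 400 whose detail carries the CustomErrorList payload from schemas.py.

# Hypothetical client for the bulk-upload endpoint; BASE_URL and TOKEN are
# placeholders, not values defined anywhere in this PR.
import requests

BASE_URL = "http://localhost:8000"    # assumed local deployment
TOKEN = "<full-access bearer token>"  # assumed credential

with open("contents.csv", "rb") as f:
    response = requests.post(
        f"{BASE_URL}/content/csv-upload",
        headers={"Authorization": f"Bearer {TOKEN}"},
        files={"file": ("contents.csv", f, "text/csv")},
    )

if response.status_code == 200:
    # Success: one ContentRetrieve record per ingested CSV row
    for content in response.json():
        print(content["content_id"], content["content_title"])
else:
    # Failure: detail holds the CustomErrorList payload checked in the tests
    for error in response.json()["detail"]["errors"]:
        print(f"{error['type']}: {error['description']}")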