Skip to content

Commit

Permalink
mimetype without libmagic (#327)
Browse files Browse the repository at this point in the history
mimetype without libmagic
  • Loading branch information
mkorpela authored May 3, 2024
1 parent 9f5a41f commit 4b688f0
Show file tree
Hide file tree
Showing 8 changed files with 64 additions and 41 deletions.
2 changes: 1 addition & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ ARG TARGETARCH
ARG TARGETVARIANT

# Install system dependencies
RUN apt-get update && apt-get install -y libmagic1 && rm -rf /var/lib/apt/lists/*
RUN apt-get update && rm -rf /var/lib/apt/lists/*
RUN wget -O golang-migrate.deb https://github.com/golang-migrate/migrate/releases/download/v4.17.0/migrate.${TARGETOS}-${TARGETARCH}${TARGETVARIANT}.deb \
&& dpkg -i golang-migrate.deb \
&& rm golang-migrate.deb
Expand Down
4 changes: 0 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -109,14 +109,10 @@ pyenv activate opengpts
Once your Python environment is set up, you can install the project dependencies:

The backend service uses [poetry](https://python-poetry.org/docs/#installation) to manage dependencies.
It assumes libmagic to be [installed](https://github.com/ahupp/python-magic?tab=readme-ov-file#installation) in your
host system.

```shell
pip install poetry
pip install libmagic
pip install langchain-community
brew install libmagic
```

**Install Postgres and the Postgres Vector Extension**
Expand Down
2 changes: 1 addition & 1 deletion backend/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ ARG TARGETARCH
ARG TARGETVARIANT

# Install system dependencies
RUN apt-get update && apt-get install -y libmagic1 && rm -rf /var/lib/apt/lists/*
RUN apt-get update && rm -rf /var/lib/apt/lists/*
RUN wget -O golang-migrate.deb https://github.com/golang-migrate/migrate/releases/download/v4.17.0/migrate.${TARGETOS}-${TARGETARCH}${TARGETVARIANT}.deb \
&& dpkg -i golang-migrate.deb \
&& rm golang-migrate.deb
Expand Down
5 changes: 3 additions & 2 deletions backend/app/server.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
from app.api import router as api_router
from app.auth.handlers import AuthedUser
from app.lifespan import lifespan
from app.upload import ingest_runnable
from app.upload import convert_ingestion_input_to_blob, ingest_runnable

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -44,7 +44,8 @@ async def ingest_files(
if thread is None:
raise HTTPException(status_code=404, detail="Thread not found.")

return ingest_runnable.batch([file.file for file in files], config)
file_blobs = [convert_ingestion_input_to_blob(file) for file in files]
return ingest_runnable.batch(file_blobs, config)


@app.get("/health")
Expand Down
62 changes: 44 additions & 18 deletions backend/app/upload.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,11 @@

from __future__ import annotations

import mimetypes
import os
from typing import BinaryIO, List, Optional

from fastapi import UploadFile
from langchain_community.vectorstores.pgvector import PGVector
from langchain_core.document_loaders.blob_loaders import Blob
from langchain_core.runnables import (
Expand All @@ -27,25 +29,52 @@
from app.parsing import MIMETYPE_BASED_PARSER


def _decode_text_sample(file_bytes: bytes, sample_size: int = 1024) -> Optional[str]:
    """Strictly decode the first ``sample_size`` bytes of a file as UTF-8.

    Args:
        file_bytes: Raw content of the file.
        sample_size: Number of leading bytes to inspect.

    Returns:
        The decoded sample, or ``None`` when the sample is not valid UTF-8.
    """
    import codecs

    decoder = codecs.getincrementaldecoder("utf-8")()
    # A multi-byte character cut in half at the sample boundary is not a sign
    # of binary data, so it is only treated as an error when the sample is the
    # whole file.
    is_whole_file = len(file_bytes) <= sample_size
    try:
        return decoder.decode(file_bytes[:sample_size], final=is_whole_file)
    except UnicodeDecodeError:
        return None


def _guess_zip_mimetype(file_bytes: bytes) -> str:
    """Guess the mime-type of content that starts with a ZIP signature.

    Office Open XML documents are recognised by their ``[Content_Types].xml``
    member plus the ``word/``, ``xl/`` or ``ppt/`` directory. Any other archive,
    including one whose directory cannot be read, is reported as
    ``application/zip``.
    """
    import io
    import zipfile

    ooxml_prefix = "application/vnd.openxmlformats-officedocument."
    ooxml_by_directory = (
        ("word/", ooxml_prefix + "wordprocessingml.document"),
        ("xl/", ooxml_prefix + "spreadsheetml.sheet"),
        ("ppt/", ooxml_prefix + "presentationml.presentation"),
    )

    try:
        with zipfile.ZipFile(io.BytesIO(file_bytes)) as archive:
            # Only the member names are needed; nothing is decompressed.
            names = archive.namelist()
    except (zipfile.BadZipFile, NotImplementedError, ValueError, OSError):
        # The content is an upload we do not control: a damaged or unsupported
        # archive must not abort mime-type detection.
        return "application/zip"

    if "[Content_Types].xml" in names:
        for directory, mime_type in ooxml_by_directory:
            if any(name.startswith(directory) for name in names):
                return mime_type
    return "application/zip"


def _guess_mimetype(file_name: str, file_bytes: bytes) -> str:
    """Guess the mime-type of a file based on its name or bytes.

    The file extension is tried first via ``mimetypes.guess_type``. When the
    name gives no answer, the leading bytes are compared with a few known
    signatures, and finally the first 1024 bytes are inspected as UTF-8 text.

    Args:
        file_name: Name of the file, used for extension-based detection.
        file_bytes: Raw content of the file.

    Returns:
        The guessed mime-type, or ``"application/octet-stream"`` when nothing
        matches.
    """
    # Guess based on the file extension.
    mime_type, _ = mimetypes.guess_type(file_name)
    if mime_type:
        return mime_type

    # Signature-based detection for common types.
    if file_bytes.startswith(b"%PDF"):
        return "application/pdf"
    if file_bytes.startswith((b"PK\x03\x04", b"PK\x05\x06", b"PK\x07\x08")):
        # docx, xlsx, pptx and plain archives all share the ZIP signature, so
        # the archive directory decides which one this is.
        return _guess_zip_mimetype(file_bytes)
    if file_bytes.startswith(b"\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1"):
        return "application/msword"
    if file_bytes.startswith(b"\x09\x00\xff\x00\x06\x00"):
        return "application/vnd.ms-excel"

    # Text detection. Decoding is strict so that binary content is not
    # silently reduced to whatever valid characters it happens to contain.
    text = _decode_text_sample(file_bytes)
    if text is None:
        return "application/octet-stream"

    # Heuristic: any text with a newline plus a comma or a tab counts as CSV.
    if "\n" in text and ("," in text or "\t" in text):
        return "text/csv"

    # str.isprintable() rejects newlines and tabs, so ordinary whitespace is
    # allowed explicitly; an empty sample is plain text as well.
    if all(char.isprintable() or char in "\t\n\r\x0b\x0c" for char in text):
        return "text/plain"

    return "application/octet-stream"


def _convert_ingestion_input_to_blob(data: BinaryIO) -> Blob:
def convert_ingestion_input_to_blob(file: UploadFile) -> Blob:
"""Convert ingestion input to blob."""
file_data = data.read()
mimetype = _guess_mimetype(file_data)
file_name = data.name
file_data = file.file.read()
file_name = file.filename

# Check if file_name is a valid string
if not isinstance(file_name, str):
raise TypeError(f"Expected string for file name, got {type(file_name)}")

mimetype = _guess_mimetype(file_name, file_data)
return Blob.from_data(
data=file_data,
path=file_name,
Expand Down Expand Up @@ -104,10 +133,7 @@ def namespace(self) -> str:
)
return self.assistant_id if self.assistant_id is not None else self.thread_id

def invoke(
self, input: BinaryIO, config: Optional[RunnableConfig] = None
) -> List[str]:
blob = _convert_ingestion_input_to_blob(input)
def invoke(self, blob: Blob, config: Optional[RunnableConfig] = None) -> List[str]:
out = ingest_blob(
blob,
MIMETYPE_BASED_PARSER,
Expand Down
9 changes: 2 additions & 7 deletions backend/poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 0 additions & 1 deletion backend/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,6 @@ tiktoken = "^0.5.1"
langchain = ">=0.0.338"
langgraph = "^0.0.38"
pydantic = "<2.0"
python-magic = "^0.4.27"
langchain-openai = "^0.1.3"
beautifulsoup4 = "^4.12.3"
boto3 = "^1.34.28"
Expand Down
20 changes: 13 additions & 7 deletions backend/tests/unit_tests/agent_executor/test_upload.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
from io import BytesIO

from langchain.text_splitter import RecursiveCharacterTextSplitter

from app.upload import IngestRunnable, _guess_mimetype
from fastapi import UploadFile
from app.upload import IngestRunnable, _guess_mimetype, convert_ingestion_input_to_blob
from tests.unit_tests.fixtures import get_sample_paths
from tests.unit_tests.utils import InMemoryVectorStore

Expand All @@ -17,9 +17,15 @@ def test_ingestion_runnable() -> None:
input_key="file_contents",
assistant_id="TheParrot",
)
data = BytesIO(b"test")
data.name = "filename"
ids = runnable.invoke(data)
# Simulate file data
file_data = BytesIO(b"test data")
file_data.seek(0)
# Create UploadFile object
file = UploadFile(filename="testfile.txt", file=file_data)

# Convert the file to blob
blob = convert_ingestion_input_to_blob(file)
ids = runnable.invoke(blob)
assert len(ids) == 1


Expand All @@ -28,7 +34,7 @@ def test_mimetype_guessing() -> None:
name_to_mime = {}
for file in sorted(get_sample_paths()):
data = file.read_bytes()
name_to_mime[file.name] = _guess_mimetype(data)
name_to_mime[file.name] = _guess_mimetype(file.name, data)

assert {
"sample.docx": (
Expand All @@ -38,6 +44,6 @@ def test_mimetype_guessing() -> None:
"sample.html": "text/html",
"sample.odt": "application/vnd.oasis.opendocument.text",
"sample.pdf": "application/pdf",
"sample.rtf": "text/rtf",
"sample.rtf": "application/rtf",
"sample.txt": "text/plain",
} == name_to_mime

0 comments on commit 4b688f0

Please sign in to comment.