Skip to content

Commit

Permalink
fix: fixing pdf parsing (#3349)
Browse files Browse the repository at this point in the history
Our default PDF parser is Unstructured, for which we were using the
'fast' strategy, which fails to parse some PDFs. We now use the
'auto' strategy instead, which is more flexible and robust.

Closes TICK-86

# Description

This PR switches the Unstructured PDF parsing strategy from 'fast' to
'auto' (with a fallback retry when 'fast' produces near-empty output),
sanitizes uploaded/synced filenames before storage, and guards the
Celery task inspection against a missing active-task list.

## Checklist before requesting a review

Please delete options that are not relevant.

- [ ] My code follows the style guidelines of this project
- [ ] I have performed a self-review of my code
- [ ] I have commented hard-to-understand areas
- [ ] I have ideally added tests that prove my fix is effective or that
my feature works
- [ ] New and existing unit tests pass locally with my changes
- [ ] Any dependent changes have been merged

## Screenshots (if appropriate):

---------

Co-authored-by: chloedia <[email protected]>
  • Loading branch information
jacopo-chevallard and chloedia authored Oct 14, 2024
1 parent 90848eb commit 367242a
Show file tree
Hide file tree
Showing 6 changed files with 63 additions and 16 deletions.
33 changes: 33 additions & 0 deletions backend/api/quivr_api/modules/sync/utils/normalize.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import os
import re
import unicodedata

Expand All @@ -15,3 +16,35 @@ def remove_special_characters(input):
except Exception as e:
logger.error(f"Error removing special characters: {e}")
return input


def sanitize_filename(filename: str) -> str:
    """
    Return a filesystem-safe version of *filename*.

    Steps:
        1. Strip any character that is not alphanumeric, underscore,
           hyphen, dot, or space.
        2. Replace dots in the base name with underscores, so only the
           final extension keeps its dot.
        3. Fall back to "unnamed" when nothing usable remains.
        4. Truncate the base name so the full result fits in 255 chars.

    Args:
        filename (str): The original filename.

    Returns:
        str: The sanitized filename.
    """
    cleaned = re.sub(r"[^\w\-_\. ]", "", filename)

    base, extension = os.path.splitext(cleaned)

    # Collapse inner dots so only the real extension is dotted; an empty
    # base (everything was stripped) falls back to a placeholder name.
    base = base.replace(".", "_") or "unnamed"

    # Keep the overall name within the common 255-character filesystem limit.
    allowed = 255 - len(extension)
    return f"{base[:allowed]}{extension}"
3 changes: 3 additions & 0 deletions backend/api/quivr_api/modules/sync/utils/syncutils.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
ISyncService,
ISyncUserService,
)
from quivr_api.modules.sync.utils.normalize import sanitize_filename
from quivr_api.modules.sync.utils.sync import BaseSync
from quivr_api.modules.upload.service.upload_file import (
check_file_exists,
Expand Down Expand Up @@ -168,6 +169,8 @@ async def process_sync_file(
]:
raise ValueError(f"Incompatible file extension for {downloaded_file}")

storage_path = sanitize_filename(storage_path)

response = await upload_file_storage(
downloaded_file.file_data,
storage_path,
Expand Down
13 changes: 8 additions & 5 deletions backend/api/quivr_api/modules/upload/controller/upload_routes.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
from quivr_api.modules.notification.service.notification_service import (
NotificationService,
)
from quivr_api.modules.sync.utils.normalize import sanitize_filename
from quivr_api.modules.upload.service.upload_file import (
upload_file_storage,
)
Expand Down Expand Up @@ -85,12 +86,14 @@ async def upload_file(
brain_id=str(brain_id),
)
)
file_name = f"{str(uploadFile.filename).split('.')[0]}.{str(uploadFile.filename).split('.')[-1]}"

background_tasks.add_task(
maybe_send_telemetry, "upload_file", {"file_name": uploadFile.filename}
maybe_send_telemetry, "upload_file", {"file_name": file_name}
)

filename_with_brain_id = str(brain_id) + "/" + str(uploadFile.filename)
filename_with_brain_id = str(brain_id) + "/" + file_name
filename_with_brain_id = sanitize_filename(filename_with_brain_id)

buff_reader = io.BufferedReader(uploadFile.file) # type: ignore
try:
Expand All @@ -110,9 +113,9 @@ async def upload_file(

knowledge_to_add = CreateKnowledgeProperties(
brain_id=brain_id,
file_name=uploadFile.filename,
file_name=file_name,
extension=os.path.splitext(
uploadFile.filename # pyright: ignore reportPrivateUsage=none
file_name # pyright: ignore reportPrivateUsage=none
)[-1].lower(),
source=integration if integration else "local",
source_link=integration_link, # FIXME: Should return the s3 link @chloedia
Expand All @@ -127,7 +130,7 @@ async def upload_file(
"process_file_task",
kwargs={
"file_name": filename_with_brain_id,
"file_original_name": uploadFile.filename,
"file_original_name": file_name,
"brain_id": brain_id,
"notification_id": upload_notification.id,
"knowledge_id": knowledge.id,
Expand Down
26 changes: 16 additions & 10 deletions backend/core/MegaParse/megaparse/Converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -320,22 +320,28 @@ async def convert(
else:
raise ValueError(f"Method {self.method} not supported")

if not gpt4o_cleaner:
return LangChainDocument(
page_content=parsed_md,
metadata={"filename": file_path.name, "type": "pdf"},
)
else:
if gpt4o_cleaner:
md_processor = MarkdownProcessor(
parsed_md,
strict=True,
remove_pagination=True,
)
md_cleaned = md_processor.process(gpt4o_cleaner=gpt4o_cleaner)
return LangChainDocument(
page_content=md_cleaned,
metadata={"filename": file_path.name, "type": "pdf"},
)
parsed_md = md_cleaned

if (
len(parsed_md) < 5
and file_path.stat().st_size > 100
and self.strategy == "fast"
):
logger.info(f"Switching to auto strategy for {file_path.name}")
self.strategy = "auto"
return await self.convert(file_path, model, gpt4o_cleaner=gpt4o_cleaner)

return LangChainDocument(
page_content=parsed_md,
metadata={"filename": file_path.name, "type": "pdf"},
)

def save_md(self, md_content: str, file_path: Path | str) -> None:
with open(file_path, "w") as f:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,6 @@ def processor_metadata(self):
async def process_file_inner(self, file: QuivrFile) -> list[Document]:
mega_parse = MegaParse(file_path=file.path, config=self.megaparse_config) # type: ignore
document: Document = await mega_parse.aload()
print("\n\n document: ", document.page_content)
if len(document.page_content) > self.splitter_config.chunk_size:
docs = self.text_splitter.split_documents([document])
for doc in docs:
Expand Down
3 changes: 3 additions & 0 deletions backend/worker/quivr_worker/celery_monitor.py
Original file line number Diff line number Diff line change
Expand Up @@ -178,6 +178,9 @@ def is_being_executed(task_name: str) -> bool:
running currently.
"""
active_tasks = celery.control.inspect().active()
if not active_tasks:
return False

for worker, running_tasks in active_tasks.items():
for task in running_tasks:
if task["name"] == task_name: # type: ignore
Expand Down

0 comments on commit 367242a

Please sign in to comment.