-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
13 changed files
with
144 additions
and
8 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,89 @@ | ||
from typing import List, AsyncGenerator | ||
import pytextract | ||
import tempfile | ||
import os | ||
import pytextract | ||
|
||
from querent.processors.async_processor import AsyncProcessor | ||
from querent.ingestors.ingestor_factory import IngestorFactory | ||
from querent.ingestors.base_ingestor import BaseIngestor | ||
from querent.config.ingestor_config import IngestorBackend | ||
from querent.common.types.collected_bytes import CollectedBytes | ||
|
||
|
||
class DocIngestorFactory(IngestorFactory):
    """Factory that hands out ``DocIngestor`` instances for Word documents."""

    # Extensions accepted by this factory, lower-case and without a leading dot.
    SUPPORTED_EXTENSIONS = {"doc", "docx"}

    async def supports(self, file_extension: str) -> bool:
        """Return True when ``file_extension`` is supported (case-insensitive)."""
        normalized = file_extension.lower()
        return normalized in self.SUPPORTED_EXTENSIONS

    async def create(
        self, file_extension: str, processors: List[AsyncProcessor]
    ) -> BaseIngestor:
        """Build a ``DocIngestor`` wired with ``processors``.

        Returns ``None`` when ``file_extension`` is not one of
        ``SUPPORTED_EXTENSIONS``.
        """
        is_supported = await self.supports(file_extension)
        if is_supported:
            return DocIngestor(processors)
        return None
|
||
|
||
class DocIngestor(BaseIngestor):
    """Ingestor that converts collected ``.doc``/``.docx`` bytes into text."""

    def __init__(self, processors: List[AsyncProcessor]):
        """Register the DOC backend and keep the processors to run on the text."""
        super().__init__(IngestorBackend.DOC)
        self.processors = processors

    async def ingest(
        self, poll_function: AsyncGenerator[CollectedBytes, None]
    ) -> AsyncGenerator[str, None]:
        """Yield the processed text of every file delivered by ``poll_function``.

        Consecutive chunks that share the same ``file`` are concatenated and
        processed as one document once a chunk of a different file arrives, or
        once the stream ends. Chunks reporting ``is_error()`` are skipped.

        If an exception is raised while consuming the stream, an empty string
        is yielded and whatever was collected for the current file is still
        processed afterwards.
        """
        current_file = None
        # bytearray keeps accumulation linear; like ``bytes +=`` it raises
        # TypeError right away if a chunk carries non-bytes data.
        buffer = bytearray()
        try:
            async for chunk_bytes in poll_function:
                if chunk_bytes.is_error():
                    # TODO handle error
                    continue
                if current_file is None:
                    current_file = chunk_bytes.file
                elif current_file != chunk_bytes.file:
                    # we have a new file, process the old one
                    async for text in self.extract_and_process_doc(
                        CollectedBytes(file=current_file, data=bytes(buffer))
                    ):
                        yield text
                    buffer = bytearray()
                    current_file = chunk_bytes.file
                buffer += chunk_bytes.data
        except Exception:
            # TODO handle exception
            yield ""

        # The final flush is deliberately NOT in a ``finally`` clause: yielding
        # from ``finally`` while the generator is being closed raises
        # RuntimeError. Placed here it still runs after a handled exception.
        if current_file is None:
            # Nothing was collected, so there is no document to extract.
            return
        async for text in self.extract_and_process_doc(
            CollectedBytes(file=current_file, data=bytes(buffer))
        ):
            yield text

    async def extract_and_process_doc(
        self, collected_bytes: CollectedBytes
    ) -> AsyncGenerator[str, None]:
        """Extract the text of one document, run the processors, yield the result."""
        text = await self.extract_text_from_doc(collected_bytes)
        processed_text = await self.process_data(text)
        yield processed_text

    async def extract_text_from_doc(self, collected_bytes: CollectedBytes) -> str:
        """Write the document to a temporary file and extract its text.

        The temporary file carries the document's extension as suffix and is
        removed afterwards, whether or not writing or extraction succeeded.
        The extractor output is decoded as UTF-8.
        """
        suffix = "." + collected_bytes.extension
        temp_file = tempfile.NamedTemporaryFile(suffix=suffix, delete=False)
        temp_file_path = temp_file.name
        try:
            # Close the file before extraction so the data is flushed to disk.
            with temp_file:
                temp_file.write(collected_bytes.data)
            return pytextract.process(temp_file_path).decode("utf-8")
        finally:
            os.remove(temp_file_path)

    async def process_data(self, text: str) -> List[str]:
        """Pass ``text`` through each processor in order and return the result.

        With no processors configured, ``text`` is returned unchanged.
        NOTE(review): the return annotation says ``List[str]`` but the value is
        whatever the last processor returns (a ``str`` when there are none) --
        confirm the intended type.
        """
        processed_data = text
        for processor in self.processors:
            processed_data = await processor.process(processed_data)
        return processed_data
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -160,3 +160,4 @@ pydub | |
SpeechRecognition | ||
pytesseract | ||
pillow | ||
pytextract |
Binary file not shown.
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,41 @@ | ||
"""Test cases for audio ingestors""" | ||
from pathlib import Path | ||
import pytest | ||
import asyncio | ||
|
||
from querent.collectors.fs.fs_collector import FSCollectorFactory | ||
from querent.config.collector_config import FSCollectorConfig | ||
from querent.common.uri import Uri | ||
from querent.ingestors.ingestor_manager import IngestorFactoryManager | ||
|
||
|
||
@pytest.mark.asyncio
async def test_collect_and_ingest_audio():
    """Collect the files under ``tests/data/doc`` and ingest them as "doc".

    Expects exactly two non-empty ingested results.

    NOTE(review): the name says "audio" although this exercises the doc
    ingestor; it is kept so existing references to the test keep working.
    """
    collector_factory = FSCollectorFactory()
    uri = Uri("file://" + str(Path("./tests/data/doc/").resolve()))
    config = FSCollectorConfig(root_path=uri.path)
    collector = collector_factory.resolve(uri, config)

    ingestor_factory_manager = IngestorFactoryManager()
    ingestor_factory = await ingestor_factory_manager.get_factory("doc")
    ingestor = await ingestor_factory.create("doc", [])

    # Collect and ingest the documents
    ingested_call = ingestor.ingest(collector.poll())

    # Count only non-empty results; the ingestor may yield "" on failure.
    counter = 0
    async for ingested in ingested_call:
        assert ingested is not None
        if len(ingested) != 0:
            counter += 1

    assert counter == 2
|
||
|
||
if __name__ == "__main__":
    # Script entry point: run the async test directly, without pytest.
    asyncio.run(test_collect_and_ingest_audio())
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters