Skip to content

Commit

Permalink
update to the new dedoc data model
Browse files Browse the repository at this point in the history
  • Loading branch information
IlyaKozlov committed Jun 2, 2024
1 parent b92d8f3 commit 6c678da
Show file tree
Hide file tree
Showing 24 changed files with 1,488 additions and 543 deletions.
3 changes: 3 additions & 0 deletions .github/TODO
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
1 Перейти на поетри
2 Перейти на httpx
3 надо подумать что делать с разными версиями дедка
2 changes: 1 addition & 1 deletion docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ version: '2.4'

services:
dedoc:
image: "dedocproject/dedoc:version_2022_04_12"
image: "dedocproject/dedoc:v2.2.1"
ports:
- 1231:1231

Expand Down
12 changes: 8 additions & 4 deletions docker/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,13 @@ FROM python:3.8


RUN mkdir /podrostoc
ADD requirements.txt /podrostoc
ADD test_requirements.txt /podrostoc
RUN pip3 install -r /podrostoc/requirements.txt -r /podrostoc/test_requirements.txt
RUN pip3 install --no-cache-dir poetry

ADD pyproject.toml /podrostoc
RUN poetry config virtualenvs.create false
WORKDIR /podrostoc
RUN cd /podrostoc ; poetry install

ADD podrostoc /podrostoc/podrostoc
ADD tests /podrostoc/tests
ENV PYTHONPATH /podrostoc/:/podrostoc/tests
ENV PYTHONPATH /podrostoc/:/podrostoc/tests:/podrostoc/podrostoc/
8 changes: 3 additions & 5 deletions podrostoc/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,9 +21,7 @@ def version(self) -> str:
r = requests.get(url)
return r.content.decode().strip()

def parse_file(self,
file_path: str,
parameters: Dict[str, str]) -> ParsedDocument:
def parse_file(self, file_path: str, parameters: Dict[str, str]) -> ParsedDocument:
response = self._send_request(file_path=file_path, data=parameters)
parsed_document = self._handle_response(response=response)
return parsed_document
Expand All @@ -34,8 +32,8 @@ def _send_request(self, file_path: str, data: dict = None) -> Response:

file_name = os.path.basename(file_path)
url = f"http://{self.dedoc_host}:{self.dedoc_port}/upload"
with open(file_path, 'rb') as file:
files = {'file': (file_name, file)}
with open(file_path, "rb") as file:
files = {"file": (file_name, file)}
response = requests.post(url, files=files, data=data)
return response

Expand Down
7 changes: 7 additions & 0 deletions podrostoc/data_structures/annotation.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,13 @@


class Annotation(BaseModel):
"""
The piece of information about the text line: its appearance or links to another document object.
For example Annotation(1, 13, “italic”, “True”) says that text between 1st and 13th symbol was written in italic.
url: https://dedoc.readthedocs.io/en/latest/dedoc_api_usage/api_schema.html#dedoc.api.schema.Annotation
"""

start: int
end: int
name: str
Expand Down
19 changes: 19 additions & 0 deletions podrostoc/data_structures/cell_with_meta.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
from typing import List

from pydantic import BaseModel

from data_structures.line_with_meta import LineWithMeta


class CellWithMeta(BaseModel):
    """
    Holds the information about the cell: list of lines and cell properties (rowspan, colspan, invisible).
    url: https://dedoc.readthedocs.io/en/latest/dedoc_api_usage/api_schema.html#dedoc.api.schema.CellWithMeta
    """

    # TODO add descriptions
    # Text lines that make up the content of the cell.
    lines: List[LineWithMeta]
    # NOTE(review): presumably the number of table rows this cell spans - confirm against the dedoc schema.
    rowspan: int
    # NOTE(review): presumably the number of table columns this cell spans - confirm against the dedoc schema.
    colspan: int
    # NOTE(review): presumably marks a cell hidden behind a merged neighbour - confirm against the dedoc schema.
    invisible: bool
13 changes: 10 additions & 3 deletions podrostoc/data_structures/document_content.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,18 @@
from typing import List

from pydantic.main import BaseModel
from pydantic import BaseModel, Field

from podrostoc.data_structures.table import Table
from podrostoc.data_structures.tree_node import TreeNode


class DocumentContent(BaseModel):
tables: List[Table]
structure: TreeNode
"""Content of the document - structured text and tables.
url: https://dedoc.readthedocs.io/en/latest/dedoc_api_usage/api_schema.html#dedoc.api.schema.DocumentContent
"""

tables: List[Table] = Field(description="Tables from the document.")
structure: TreeNode = Field(
description="Text of the document, organized in a tree."
)
28 changes: 17 additions & 11 deletions podrostoc/data_structures/document_metadata.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,20 @@
from typing import Optional

from pydantic.main import BaseModel
from pydantic import BaseModel, Field


class DocumentMetadata(BaseModel):
uid: str
file_name: str
size: int
modified_time: int
created_time: int
access_time: int
file_type: Optional[str]
other_fields: Optional[dict]
"""
Document metadata like its name, size, author, etc.
url: https://dedoc.readthedocs.io/en/latest/dedoc_api_usage/api_schema.html#dedoc.api.schema.DocumentMetadata
"""

uid: str = Field(description="document uid")
file_name: str = Field(description="document file name")
temporary_file_name: str # TODO add the description
size: int = Field(description="document size in bytes")
modified_time: int = Field(
description="last modification time in seconds since epoch"
)
created_time: int = Field(description="creation time in seconds since epoch")
access_time: int = Field(description="last access time in seconds since epoch")
file_type: str # TODO add the description
15 changes: 15 additions & 0 deletions podrostoc/data_structures/line_metadata.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
from typing import Optional

from pydantic import BaseModel


class LineMetadata(BaseModel):
    """
    Holds information about document node/line metadata, such as page number or line type.
    url: https://dedoc.readthedocs.io/en/latest/dedoc_api_usage/api_schema.html#dedoc.api.schema.LineMetadata

    ``line_id`` is optional, so its ``None`` default is spelled out explicitly.
    Pydantic 1.x already gives a bare ``Optional[int]`` field that default;
    stating it keeps the model behaving the same under pydantic 2.x, where an
    ``Optional`` annotation without a default makes the field required.
    """

    # Type of the node/line (the "line type" mentioned in the docstring above).
    paragraph_type: str
    # NOTE(review): presumably the index of the page the line belongs to - confirm base (0 or 1) against dedoc.
    page_id: int
    # NOTE(review): presumably the index of the line inside the document - confirm against dedoc.
    line_id: Optional[int] = None
16 changes: 16 additions & 0 deletions podrostoc/data_structures/line_with_meta.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
from typing import List

from pydantic import BaseModel

from data_structures.annotation import Annotation


class LineWithMeta(BaseModel):
    """
    Textual line with text annotations.
    url: https://dedoc.readthedocs.io/en/latest/dedoc_api_usage/api_schema.html#dedoc.api.schema.LineWithMeta
    """

    # Raw text of the line.
    text: str
    # Annotations attached to the text of this line.
    annotations: List[Annotation]
10 changes: 0 additions & 10 deletions podrostoc/data_structures/paragraph_metadata.py

This file was deleted.

26 changes: 20 additions & 6 deletions podrostoc/data_structures/parsed_document.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,28 @@
from typing import List, Optional

from pydantic import BaseModel
from pydantic import BaseModel, Field

from podrostoc.data_structures.document_content import DocumentContent
from podrostoc.data_structures.document_metadata import DocumentMetadata


class ParsedDocument(BaseModel):
version: str
warnings: List[str]
metadata: DocumentMetadata
content: DocumentContent
attachments: Optional[List['ParsedDocument']] = None
"""
Response from the dedoc.
url: https://dedoc.readthedocs.io/en/latest/dedoc_api_usage/api_schema.html#dedoc.api.schema.ParsedDocument
"""

content: DocumentContent = Field(
description="document content, such as text and tables"
)
metadata: DocumentMetadata = Field(
description="document metadata such as creation date, modification date, etc."
)
version: str = Field(description="dedoc version")
warnings: List[str] = Field(
description="dedoc warnings, during the document handling"
)
attachments: Optional[List["ParsedDocument"]] = Field(
default=None, description="document attachments, such as notes in the pdf file"
)
11 changes: 10 additions & 1 deletion podrostoc/data_structures/table.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,18 @@

from pydantic.main import BaseModel

from data_structures.cell_with_meta import CellWithMeta
from podrostoc.data_structures.table_metadata import TableMetadata


class Table(BaseModel):
cells: List[List[str]]
"""
Holds information about tables in the document.
We assume that a table has rectangle form (has the same number of columns in each row).
Table representation is row-based i.e. external list contains list of rows.
url: https://dedoc.readthedocs.io/en/latest/dedoc_api_usage/api_schema.html#dedoc.api.schema.Table
"""

cells: List[List[CellWithMeta]]
metadata: TableMetadata
13 changes: 10 additions & 3 deletions podrostoc/data_structures/tree_node.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,20 @@

from pydantic.main import BaseModel

from data_structures.line_metadata import LineMetadata
from podrostoc.data_structures.annotation import Annotation
from podrostoc.data_structures.paragraph_metadata import ParagraphMetadata


class TreeNode(BaseModel):
"""
Helps to represent document as recursive tree structure.
It has list of children TreeNode nodes (empty list for a leaf node).
url: https://dedoc.readthedocs.io/en/latest/dedoc_api_usage/api_schema.html#dedoc.api.schema.TreeNode
"""

node_id: str
text: str
annotations: List[Annotation]
metadata: ParagraphMetadata
subparagraphs: List['TreeNode']
metadata: LineMetadata
subparagraphs: List["TreeNode"]
46 changes: 46 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
# The project metadata lives in [tool.poetry], which only the Poetry build
# backend understands; setuptools would build a package without that metadata.
[build-system]
requires = ["poetry-core"]
build-backend = "poetry.core.masonry.api"

[tool.poetry]
name = "podrostoc"
version = "0.0.1"
authors = [
"Ilya Kozlov <[email protected]>"
]
maintainers = [
"Ilya Kozlov <[email protected]>"
]
classifiers = [
"Intended Audience :: Developers",
"License :: OSI Approved :: Apache Software License",
"Programming Language :: Python :: 3.8",
"Programming Language :: Python :: 3.9",
"Programming Language :: Python :: 3 :: Only",
]

description = "Dedoc client."
keywords = ["document analysis", "logical structure extraction", "OCR", "deep learning", "computer vision"]
readme = "README.md"
license = "MIT"


[tool.poetry.dependencies]
python = "^3.8"
pydantic = "^1.9.1"
requests = "^2.25.0"


[tool.poetry.group.test.dependencies]
cowsay = "^4.0.0"
entrypoints = "0.3"
flake8 = "3.7.9"
flake8-annotations = "1.0.0"
flake8-tuple = "0.4.1"
mccabe = "0.6.1"
pycodestyle = "2.5.0"
pyflakes = "2.1.1"
six = "1.16.0"
flake8-builtins = "1.5.3"
flake8-black = "0.3.6"

2 changes: 0 additions & 2 deletions requirements.txt

This file was deleted.

2 changes: 0 additions & 2 deletions test_requirements.txt

This file was deleted.

34 changes: 25 additions & 9 deletions tests/api/test_api.py
Original file line number Diff line number Diff line change
@@ -1,41 +1,57 @@
import os
import unittest

from podrostoc.client import DedocClient
from data_structures.table import Table
from client import DedocClient
from podrostoc.exceptions.dedoc_exception import DedocException


class TestApi(unittest.TestCase):
host = os.environ.get('DEDOC_HOST', 'localhost')
host = os.environ.get("DEDOC_HOST", "localhost")
client = DedocClient(dedoc_host=host, dedoc_port=1231)

def test_bad_file(self) -> None:
file = os.path.join(os.path.dirname(__file__), "..", "data", "file.bin")
file = os.path.abspath(file)
with self.assertRaises(DedocException):
with self.assertRaises(DedocException) as exception_context:
_ = self.client.parse_file(file_path=file, parameters={})
self.assertEqual(exception_context.exception.status_code, 415)

def test_csv_file(self) -> None:
file_name = "books.csv"
file = os.path.join(os.path.dirname(__file__), "..", "data", file_name)
file = os.path.abspath(file)
result = self.client.parse_file(file_path=file, parameters={})
self.assertEqual(1, len(result.content.tables))
table: Table
[table] = result.content.tables
self.assertEqual(11, len(table.cells))
self.assertEqual(file_name, result.metadata.file_name)
self.assertListEqual("id,cat,name,price,inStock,author,series_t,sequence_i,genre_s".split(","), table.cells[0])
first_row = [
"\n".join(line.text for line in cell.lines) for cell in table.cells[0]
]
self.assertListEqual(
"id,cat,name,price,inStock,author,series_t,sequence_i,genre_s".split(","),
first_row,
)

def test_csv_semicolons_file(self) -> None:
file = os.path.join(os.path.dirname(__file__), "..", "data", "books_semicolons.csv")
file = os.path.join(
os.path.dirname(__file__), "..", "data", "books_semicolons.csv"
)
file = os.path.abspath(file)
result = self.client.parse_file(file_path=file, parameters=dict(delimiter=';'))
result = self.client.parse_file(file_path=file, parameters=dict(delimiter=";"))
self.assertEqual(1, len(result.content.tables))
[table] = result.content.tables
first_row = [
"\n".join(line.text for line in cell.lines) for cell in table.cells[0]
]
self.assertEqual(11, len(table.cells))
self.assertListEqual("id,cat,name,price,inStock,author,series_t,sequence_i,genre_s".split(","), table.cells[0])
self.assertListEqual(
"id,cat,name,price,inStock,author,series_t,sequence_i,genre_s".split(","),
first_row,
)

def test_version(self) -> None:
version = self.client.version
self.assertEqual(3, len(version.split('.')))
self.assertEqual(10, len(version))
self.assertEqual(3, len(version.split(".")))
Loading

0 comments on commit 6c678da

Please sign in to comment.