Skip to content

Commit

Permalink
Roman/expose dpi param (#966)
Browse files Browse the repository at this point in the history
* Bump inference version

* Pass through the dpi param if available

* Update CHANGELOG

* Check dpi param passed in via unit test

* Bump inference version

* Fix unit test around file info to work on mac as well
  • Loading branch information
rbiseck3 authored Jul 26, 2023
1 parent f7e46af commit b39e0d7
Show file tree
Hide file tree
Showing 6 changed files with 29 additions and 8 deletions.
3 changes: 2 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,13 @@
### Features

* Adds Outlook connector
* Adds support for passing the `pdf_image_dpi` parameter through to the inference library

### Fixes

* Fixes issue with email partitioning where From field was being assigned the To field value.

## 0.8.2-dev7
## 0.8.2

### Enhancements

Expand Down
2 changes: 1 addition & 1 deletion requirements/local-inference.in
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
-c constraints.in
-c base.txt
unstructured-inference==0.5.5
unstructured-inference==0.5.7
2 changes: 1 addition & 1 deletion requirements/local-inference.txt
Original file line number Diff line number Diff line change
Expand Up @@ -215,7 +215,7 @@ tzdata==2023.3
# via
# -c requirements/base.txt
# pandas
unstructured-inference==0.5.5
unstructured-inference==0.5.7
# via -r requirements/local-inference.in
urllib3==1.26.16
# via
Expand Down
2 changes: 1 addition & 1 deletion test_unstructured/file_utils/test_exploration.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ def test_get_directory_file_info(tmpdir):
assert isinstance(file_info, pd.DataFrame)
assert set(file_info["filename"].to_list()) == {"filename1.txt", "filename2.txt"}

means = file_info.groupby("filetype").mean()
means = file_info.groupby("filetype").mean(numeric_only=True)
assert means.columns.to_list() == ["filesize"]


Expand Down
14 changes: 14 additions & 0 deletions test_unstructured/partition/test_pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -384,6 +384,20 @@ def test_partition_pdf_with_copy_protection():
assert {element.metadata.page_number for element in elements} == {1, 2}


def test_partition_pdf_with_dpi():
    """Check that ``pdf_image_dpi`` is handed on to ``process_file_with_model``.

    ``layout.process_file_with_model`` is swapped for a ``MagicMock`` so the
    call made by ``pdf.partition_pdf`` under the ``hi_res`` strategy can be
    inspected without running the real layout model.
    """
    pdf_path = os.path.join("example-docs", "copy-protected.pdf")
    fake_process = mock.MagicMock()
    with mock.patch.object(layout, "process_file_with_model", fake_process):
        pdf.partition_pdf(filename=pdf_path, strategy="hi_res", pdf_image_dpi=100)

    # Exactly one call is expected, carrying the dpi value alongside the
    # other keyword arguments.
    expected_kwargs = {
        "is_image": False,
        "ocr_languages": "eng",
        "extract_tables": False,
        "model_name": None,
        "pdf_image_dpi": 100,
    }
    fake_process.assert_called_once_with(pdf_path, **expected_kwargs)


def test_partition_pdf_requiring_recursive_text_grab(filename="example-docs/reliance.pdf"):
elements = pdf.partition_pdf(filename=filename, strategy="fast")
assert len(elements) > 50
Expand Down
14 changes: 10 additions & 4 deletions unstructured/partition/pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -203,12 +203,18 @@ def _partition_pdf_or_image_local(

model_name = model_name if model_name else os.environ.get("UNSTRUCTURED_HI_RES_MODEL_NAME")
if file is None:
pdf_image_dpi = kwargs.pop("pdf_image_dpi", None)
process_file_with_model_kwargs = {
"is_image": is_image,
"ocr_languages": ocr_languages,
"extract_tables": infer_table_structure,
"model_name": model_name,
}
if pdf_image_dpi:
process_file_with_model_kwargs["pdf_image_dpi"] = pdf_image_dpi
layout = process_file_with_model(
filename,
is_image=is_image,
ocr_languages=ocr_languages,
extract_tables=infer_table_structure,
model_name=model_name,
**process_file_with_model_kwargs,
)
else:
layout = process_data_with_model(
Expand Down

0 comments on commit b39e0d7

Please sign in to comment.