fix: GH issue 1057 etree parser error (csv) (#1112)

Addresses #1057 for CSV. Related to PR #1077.

* Update `partition_csv` to always use `soupparser_fromstring` to parse the HTML text
Unstructured-IO · Aug 14, 2023 · 8026646
1 parent 612f9da
commit 8026646
Show file tree

Hide file tree

Showing 7 changed files with 12 additions and 14 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,7 +1,8 @@
-## 0.9.3-dev2
+## 0.9.3-dev3
 
 ### Enhancements
 
+* Update `partition_csv` to always use `soupparser_fromstring` to parse `html text`
 * Update `partition_tsv` to always use `soupparser_fromstring` to parse `html text`
 * Add `metadata.section` to capture epub table of contents data
 * Add `unique_element_ids` kwarg to partition functions. If `True`, will use a UUID

diff --git a/..._ingest/expected-structured-output/airtable-diff/app5YQxSfp220fWtm/tblBoUk54tWXGqYai.json b/..._ingest/expected-structured-output/airtable-diff/app5YQxSfp220fWtm/tblBoUk54tWXGqYai.json
diff --git a/..._ingest/expected-structured-output/airtable-diff/app5YQxSfp220fWtm/tblxdPc7L2meGIZLE.json b/..._ingest/expected-structured-output/airtable-diff/app5YQxSfp220fWtm/tblxdPc7L2meGIZLE.json
diff --git a/..._ingest/expected-structured-output/airtable-diff/appJ43QmP8I17zu88/tblbj2vBlL2dN2xqq.json b/..._ingest/expected-structured-output/airtable-diff/appJ43QmP8I17zu88/tblbj2vBlL2dN2xqq.json
diff --git a/..._ingest/expected-structured-output/airtable-diff/appJ43QmP8I17zu88/tblfu7DzEcCWNKwP4.json b/..._ingest/expected-structured-output/airtable-diff/appJ43QmP8I17zu88/tblfu7DzEcCWNKwP4.json
diff --git a/unstructured/__version__.py b/unstructured/__version__.py
@@ -1 +1 @@
-__version__ = "0.9.3-dev2"  # pragma: no cover
+__version__ = "0.9.3-dev3"  # pragma: no cover
diff --git a/unstructured/partition/csv.py b/unstructured/partition/csv.py
@@ -2,7 +2,6 @@
 from typing import IO, BinaryIO, List, Optional, Union, cast
 
 import pandas as pd
-from lxml.html import document_fromstring
 from lxml.html.soupparser import fromstring as soupparser_fromstring
 
 from unstructured.documents.elements import (
@@ -13,7 +12,6 @@
 )
 from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype
 from unstructured.partition.common import (
-    contains_emoji,
     exactly_one,
     get_last_modified_date,
     get_last_modified_date_from_file,
@@ -60,8 +58,7 @@ def partition_csv(
         table = pd.read_csv(f)
 
     html_text = table.to_html(index=False, header=False, na_rep="")
-    html_string_parser = soupparser_fromstring if contains_emoji(html_text) else document_fromstring
-    text = html_string_parser(html_text).text_content()
+    text = soupparser_fromstring(html_text).text_content()
 
     if include_metadata:
         metadata = ElementMetadata(