Skip to content

Commit

Permalink
enhancement: new extract function for detecting image URLs (#1212)
Browse files Browse the repository at this point in the history
- Adds new feature discussed in GitHub Issue #1117 and in slack
  • Loading branch information
cdpierse authored Aug 30, 2023
1 parent d33d8b5 commit de855bb
Show file tree
Hide file tree
Showing 4 changed files with 64 additions and 1 deletion.
3 changes: 2 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,8 @@

* Adds `chunk_by_title` to break a document into sections based on the presence of `Title`
elements.

* Adds new extraction function `extract_image_urls_from_html` to extract all image-related URLs from HTML text.

### Fixes

* Make cv2 dependency optional
Expand Down
50 changes: 50 additions & 0 deletions test_unstructured/cleaners/test_extract.py
Original file line number Diff line number Diff line change
Expand Up @@ -104,3 +104,53 @@ def test_extract_us_phone_number(text, expected):
)
def test_extract_ordered_bullets(text, expected):
assert extract.extract_ordered_bullets(text=text) == expected


@pytest.mark.parametrize(
    ("text", "expected"),
    [
        # A bare image URL is returned as-is.
        ("https://my-image.jpg", ["https://my-image.jpg"]),
        # Trailing prose after the URL is not part of the match.
        ("https://my-image.png with some text", ["https://my-image.png"]),
        # URLs with a multi-segment path are matched in full.
        (
            "https://my-image/with/some/path.png",
            ["https://my-image/with/some/path.png"],
        ),
        # Several URLs (http and https) are returned in order of appearance.
        (
            "some text https://my-image.jpg with another http://my-image.bmp",
            ["https://my-image.jpg", "http://my-image.bmp"],
        ),
        # A URL without an image extension yields nothing.
        ("http://not-an-image.com", []),
        # Text without any URL yields nothing.
        ("some text", []),
        # Upper-case extensions are matched and their case is preserved.
        (
            "some text https://my-image.JPG with another http://my-image.BMP",
            ["https://my-image.JPG", "http://my-image.BMP"],
        ),
        # Upper-case characters inside the path are accepted as well.
        (
            "http://my-path-with-CAPS/my-image.JPG",
            ["http://my-path-with-CAPS/my-image.JPG"],
        ),
        # Percent-encoded characters are kept in the extracted URL.
        ("http://my-path/my%20image.JPG", ["http://my-path/my%20image.JPG"]),
        # A `#fragment` after the extension is left out of the extracted URL.
        ("https://my-image.jpg#ref", ["https://my-image.jpg"]),
    ],
)
def test_extract_image_urls_from_html(text, expected):
    """Check that the extractor returns exactly the expected image URLs."""
    found = extract.extract_image_urls_from_html(text=text)
    assert found == expected
5 changes: 5 additions & 0 deletions unstructured/cleaners/extract.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from unstructured.nlp.patterns import (
EMAIL_ADDRESS_PATTERN,
EMAIL_DATETIMETZ_PATTERN,
IMAGE_URL_PATTERN,
IP_ADDRESS_NAME_PATTERN,
IP_ADDRESS_PATTERN_RE,
MAPI_ID_PATTERN,
Expand Down Expand Up @@ -136,3 +137,7 @@ def extract_ordered_bullets(text) -> tuple:
b = "".join(b)
c = "".join(c) if c else None
return a, b, c


def extract_image_urls_from_html(text: str) -> List[str]:
    """Return every substring of ``text`` that matches ``IMAGE_URL_PATTERN``.

    Despite the name, the input is scanned as plain text with a regular
    expression; no HTML parsing is performed.

    Args:
        text: The text (for example raw HTML) to search for image URLs.

    Returns:
        The matched URLs in order of appearance, or an empty list when
        nothing matches.
    """
    # The pattern has no capturing groups, so each match's full text is the URL.
    return [hit.group(0) for hit in re.finditer(IMAGE_URL_PATTERN, text)]
7 changes: 7 additions & 0 deletions unstructured/nlp/patterns.py
Original file line number Diff line number Diff line change
Expand Up @@ -138,3 +138,10 @@

# taken from https://stackoverflow.com/a/3845829/12406158
VALID_JSON_CHARACTERS = r"[,:{}\[\]0-9.\-+Eaeflnr-u \n\r\t]"

# Case-insensitive pattern for http(s) URLs that end in one of the image
# extensions jpg, jpeg, png, gif or bmp. It contains no capturing groups.
# NOTE(review): inside these raw strings, `\\(` and `\\)` in the character
# classes admit a literal backslash in addition to the parentheses -- confirm
# that this is intended.
# NOTE(review): nothing constrains what follows the extension, so any trailing
# text (e.g. `#ref` or further characters) is simply left out of the match.
IMAGE_URL_PATTERN = (
# Inline flag for case-insensitive matching, followed by the URL scheme.
r"(?i)https?://"
# Host part: one or more characters from the allowed set.
r"(?:[a-z0-9$_@.&+!*\\(\\),%-])+"
# Optional path: zero or more `/`-prefixed segments (segments may be empty).
r"(?:/[a-z0-9$_@.&+!*\\(\\),%-]*)*"
# Required file extension that marks the URL as an image.
r"\.(?:jpg|jpeg|png|gif|bmp)"
)

0 comments on commit de855bb

Please sign in to comment.