Refactor FL to remove tenacity #643

Closed
wants to merge 1 commit
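The diff below drops the two tenacity retry decorators in warn/scrapers/fl.py, routes HTTP fetches through the shared utils.get_url helper, and removes tenacity from Pipfile and setup.py while the retry package stays a dependency. If per-request retries on requests.HTTPError were still wanted at the scraper level, they could be layered back on with the retry package roughly as sketched below; this is an illustration under that assumption, not code from this commit:

```python
# Illustrative only: retries via the `retry` package (still a dependency after
# this change); not part of this commit, and the decorator parameters are assumptions.
import requests
from retry import retry


@retry(requests.HTTPError, tries=3, delay=2, backoff=2)
def fetch(url, headers=None):
    """GET a URL, raising (and therefore retrying) on an HTTP error status."""
    response = requests.get(url, headers=headers)
    response.raise_for_status()
    return response
```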
1 change: 0 additions & 1 deletion Pipfile
@@ -39,7 +39,6 @@ html5lib = "*"
requests = "*"
openpyxl = "*"
pdfplumber = "*"
-tenacity = "*"
click = "*"
xlrd = "*"
retry = "*"
81 changes: 44 additions & 37 deletions Pipfile.lock

Some generated files are not rendered by default.

1 change: 0 additions & 1 deletion setup.py
@@ -134,7 +134,6 @@ def run(self):
"requests",
"openpyxl",
"xlrd",
-"tenacity",
"retry",
],
license="Apache 2.0 license",
35 changes: 6 additions & 29 deletions warn/scrapers/fl.py
@@ -1,13 +1,9 @@
import datetime
import logging
import re
-from os.path import exists
from pathlib import Path

import pdfplumber
-import requests
-import tenacity
-import urllib3
from bs4 import BeautifulSoup

from .. import utils
@@ -54,7 +50,7 @@ def scrape(
"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36"
}
url = "https://floridajobs.org/office-directory/division-of-workforce-services/workforce-programs/reemployment-and-emergency-assistance-coordination-team-react/warn-notices"
-response = requests.get(url, headers=headers, verify=False)
+response = utils.get_url(url, headers=headers)
logger.debug(f"Request status is {response.status_code} for {url}")
soup = BeautifulSoup(response.text, "html.parser")
pageholder = soup.select("div.content")[0]
@@ -95,13 +91,7 @@ def scrape(

# scrapes each html page for the current year
# returns a list of the year's html pages
-# note: no max amount of retries (recursive scraping)
-@tenacity.retry(
-wait=tenacity.wait_exponential(),
-retry=tenacity.retry_if_exception_type(requests.HTTPError),
-)
def _scrape_html(cache, url, headers, page=1):
-urllib3.disable_warnings()  # sidestep SSL error
# extract year from URL
year = _extract_year(url)
html_cache_key = f"fl/{year}_page_{page}.html"
@@ -121,7 +111,7 @@ def _scrape_html(cache, url, headers, page=1):
raise FileNotFoundError
except FileNotFoundError:
# scrape & cache html
-response = requests.get(url, headers=headers, verify=False)
+response = utils.get_url(url, headers=headers)
logger.debug(f"Request status is {response.status_code} for {url}")
response.raise_for_status()
page_text = response.text
@@ -168,29 +158,16 @@ def _html_to_rows(page_text):


# download and scrape pdf
-@tenacity.retry(
-wait=tenacity.wait_exponential(),
-retry=tenacity.retry_if_exception_type(requests.HTTPError),
-)
def _scrape_pdf(cache, cache_dir, url, headers):
-# sidestep SSL error
-urllib3.disable_warnings()
# extract year from URL
year = _extract_year(url)
pdf_cache_key = f"fl/{year}.pdf"
-download = ""
+pdf_path = cache_dir / pdf_cache_key
# download pdf if not in the cache
-if not exists(pdf_cache_key):
-response = requests.get(url, headers=headers, verify=False)
-logger.debug(f"Request status is {response.status_code} for {url}")
-response.raise_for_status()
-# download & cache pdf
-download = response.content
-with open(f"{cache_dir}/{pdf_cache_key}", "wb") as f:
-f.write(download)
-logger.debug(f"Successfully scraped PDF from {url} to cache: {pdf_cache_key}")
+if not cache.exists(pdf_cache_key):
+cache.download(pdf_cache_key, url, headers=headers)
# scrape tables from PDF
-with pdfplumber.open(f"{cache_dir}/{pdf_cache_key}") as pdf:
+with pdfplumber.open(pdf_path) as pdf:
pages = pdf.pages
output_rows = []
for page_num, page in enumerate(pages):
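Read without the diff noise, the post-patch download path of _scrape_pdf reassembles into roughly the following (indentation is inferred from the nesting, _extract_year is defined elsewhere in the module, and the table-parsing loop continues beyond the visible hunk):

```python
# Reassembled from the hunk above; indentation and the trailing ellipsis are inferred.
import pdfplumber


def _scrape_pdf(cache, cache_dir, url, headers):
    # extract year from URL
    year = _extract_year(url)
    pdf_cache_key = f"fl/{year}.pdf"
    pdf_path = cache_dir / pdf_cache_key
    # download pdf if not in the cache
    if not cache.exists(pdf_cache_key):
        cache.download(pdf_cache_key, url, headers=headers)
    # scrape tables from PDF
    with pdfplumber.open(pdf_path) as pdf:
        pages = pdf.pages
        output_rows = []
        for page_num, page in enumerate(pages):
            ...  # per-page table extraction continues past the visible hunk
```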