# Additions to clean/utils.py introduced by commit 42a0d85 (#113):
# "updated utils.py to include post url and get_cookies functions".


def _kwargs_with_user_agent(kwargs, user_agent):
    """Return a copy of ``kwargs`` whose ``headers`` carry the given User-Agent.

    The caller's ``headers`` mapping is copied rather than modified in place,
    so a dict that is reused across calls is never altered behind the
    caller's back. A missing or ``None`` ``headers`` value is treated as empty.

    Args:
        kwargs (dict): keyword arguments destined for a requests call
        user_agent (str): value to set for the ``User-Agent`` header; it
            replaces any ``User-Agent`` entry already present in ``headers``

    Returns:
        dict: a new keyword-argument dict with a fresh ``headers`` dict
    """
    prepared = dict(kwargs)
    headers = dict(prepared.get("headers") or {})
    headers["User-Agent"] = user_agent
    prepared["headers"] = headers
    return prepared


def _assert_response_ok(response, url):
    """Raise ``AssertionError`` unless ``response.ok`` is true.

    An explicit raise is used instead of an ``assert`` statement because
    ``assert`` is removed when Python runs with ``-O``, which would let failed
    requests through unchecked. ``AssertionError`` is kept as the exception
    type so existing callers (and the retry decorator wrapping the public
    functions) see the same failure they saw before.

    Args:
        response: the response object returned by requests
        url (str): the requested url, used only for the error message

    Raises:
        AssertionError: if ``response.ok`` is false
    """
    if not response.ok:
        raise AssertionError(
            f"Request to {url} failed with status code {response.status_code}"
        )


@retry(tries=3, delay=15, backoff=2)
def post_url(
    url, user_agent="Big Local News (biglocalnews.org)", session=None, **kwargs
):
    """POST to the provided URL and return a response object.

    Args:
        url (str): the url to be requested
        user_agent (str): the user-agent header passed with the request (default: biglocalnews.org)
        session: a session object to use when making the request. optional
        **kwargs: any additional keyword arguments, passed through to the
            underlying ``post`` call (e.g. ``data``, ``json``, ``headers``)

    Returns:
        The response object for the request.

    Raises:
        AssertionError: if the response is not ok (``response.ok`` is false)
    """
    logger.debug("Requesting %s", url)

    # Set the headers on a copy so the caller's dict is left untouched
    request_kwargs = _kwargs_with_user_agent(kwargs, user_agent)

    # Go get it
    if session is not None:
        logger.debug("Requesting with session %s", session)
        response = session.post(url, **request_kwargs)
    else:
        response = requests.post(url, **request_kwargs)
    logger.debug("Response code: %s", response.status_code)

    # Verify that the response is ok (not only a literal 200)
    _assert_response_ok(response, url)

    # Return the response
    return response


@retry(tries=3, delay=15, backoff=2)
def get_cookies(url, user_agent="Big Local News (biglocalnews.org)", **kwargs):
    """GET the provided URL and return the cookies set by the response.

    Args:
        url (str): the url to be requested
        user_agent (str): the user-agent header passed with the request (default: biglocalnews.org)
        **kwargs: any additional keyword arguments, passed through to
            ``requests.get``

    Returns:
        dict: the response cookies, as produced by ``response.cookies.get_dict()``

    Raises:
        AssertionError: if the response is not ok (``response.ok`` is false)
    """
    logger.debug("Requesting %s", url)

    # Set the headers on a copy so the caller's dict is left untouched
    request_kwargs = _kwargs_with_user_agent(kwargs, user_agent)
    response = requests.get(url, **request_kwargs)

    # Verify that the response is ok (not only a literal 200)
    _assert_response_ok(response, url)

    # Return the cookies
    return response.cookies.get_dict()