Skip to content

Commit

Permalink
🍪 Added save_cookies function for handling browser cookies
Browse files Browse the repository at this point in the history
  • Loading branch information
asim-shrestha committed Sep 14, 2024
2 parents d734a79 + e0e83c0 commit a7a270e
Show file tree
Hide file tree
Showing 6 changed files with 87 additions and 7 deletions.
20 changes: 20 additions & 0 deletions harambe/cookies_handler.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
from harambe.types import Cookie
from datetime import datetime, timezone, timedelta


def fix_cookie(cookie: Cookie) -> Cookie:
    """
    Ensure a cookie stays valid for at least one more day.

    If the cookie has no ``expires`` entry (or it is ``None``), or its expiry is
    less than one day from now (already-expired and negative values included),
    ``expires`` is set to one day from the current UTC time. Otherwise the
    cookie is left untouched.

    The cookie is modified in place and the same object is returned.

    :param cookie: The cookie to modify.
    :return: The same cookie object, with ``expires`` possibly updated.
    """
    # Compare raw POSIX timestamps instead of converting ``expires`` to a
    # datetime: ``datetime.fromtimestamp`` raises for values outside the
    # platform's supported range (very large or, on some platforms, negative
    # timestamps), which would make a single odd cookie abort the whole save.
    minimum_expiry = (datetime.now(tz=timezone.utc) + timedelta(days=1)).timestamp()
    expires = cookie.get("expires")
    if expires is None or expires < minimum_expiry:
        cookie["expires"] = minimum_expiry
    return cookie
24 changes: 23 additions & 1 deletion harambe/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
ResourceRequestHandler,
ResourceType,
)
from harambe.cookies_handler import fix_cookie
from harambe.normalize_url import normalize_url
from harambe.observer import (
DownloadMeta,
Expand All @@ -51,6 +52,7 @@
ScrapeResult,
SetupType,
Stage,
Cookie,
)


Expand Down Expand Up @@ -96,6 +98,7 @@ def __init__(
else None
)
self._saved_data: set[ScrapeResult] = set()
self._saved_cookies: List[Cookie] = []

if not observer:
observer = [LoggingObserver()]
Expand Down Expand Up @@ -268,6 +271,26 @@ async def capture_pdf(
)
return res[0]

async def save_cookies(self, cookies: Optional[List[Cookie]] = None) -> None:
    """
    Save cookies on the SDK instance and notify observers.

    The given cookies are merged into the cookies saved so far. Each cookie is
    first passed through ``fix_cookie``. A cookie replaces a previously saved
    one only when name, domain and path all match, so cookies that merely
    share a name across different domains or paths are kept side by side.
    After merging, all observers receive the complete saved list through
    ``on_save_cookies``.

    :param cookies: Optional list of cookie dictionaries to save. If omitted,
        None or empty, the cookies are retrieved from the current page's
        browser context.
    """

    def identity(cookie: Cookie) -> tuple:
        # The name alone is not unique: the same name can exist for several
        # domains/paths. ``.get`` tolerates caller-supplied cookies that omit
        # domain or path.
        return (cookie["name"], cookie.get("domain"), cookie.get("path"))

    merged = {identity(saved): saved for saved in self._saved_cookies}
    if not cookies:
        cookies = await self.page.context.cookies()

    for cookie in cookies:
        cookie = fix_cookie(cookie)
        merged[identity(cookie)] = cookie

    self._saved_cookies = list(merged.values())
    await self._notify_observers("on_save_cookies", self._saved_cookies)

async def _notify_observers(
self,
method: ObservationTrigger,
Expand All @@ -287,7 +310,6 @@ async def _notify_observers(
duplicated = False
if check_duplication:
duplicated = getattr(self._deduper, method)(*args, **kwargs)

if not duplicated:
return await asyncio.gather(
*[getattr(o, method)(*args, **kwargs) for o in self._observers]
Expand Down
22 changes: 20 additions & 2 deletions harambe/observer.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,10 +11,10 @@
from urllib.parse import quote

from harambe.tracker import FileDataTracker
from harambe.types import URL, Context, Options, Stage
from harambe.types import URL, Context, Options, Stage, Cookie

ObservationTrigger = Literal[
"on_save_data", "on_queue_url", "on_download", "on_paginate"
"on_save_data", "on_queue_url", "on_download", "on_paginate", "on_save_cookies"
]


Expand Down Expand Up @@ -43,6 +43,10 @@ async def on_download(
async def on_paginate(self, next_url: str) -> None:
raise NotImplementedError()

@abstractmethod
async def on_save_cookies(self, cookies: List[Cookie]) -> None:
    """
    Handle a save-cookies event; concrete observers are expected to override this.

    :param cookies: The cookies being reported to the observer.
    :raises NotImplementedError: Always, when this base implementation is awaited.
    """
    raise NotImplementedError


class LoggingObserver(OutputObserver):
async def on_save_data(self, data: dict[str, Any]) -> None:
Expand All @@ -63,6 +67,9 @@ async def on_download(
async def on_paginate(self, next_url: str) -> None:
    """
    Ignore pagination events; this observer takes no action on them.

    :param next_url: URL of the next page (unused).
    """
    return None

async def on_save_cookies(self, cookies: List[Cookie]) -> None:
    """
    Print the saved cookies to standard output.

    :param cookies: The cookies that were saved.
    """
    message = f"Cookies saved : {cookies}"
    print(message)


class LocalStorageObserver(OutputObserver):
def __init__(self, domain: str, stage: Stage):
Expand All @@ -87,12 +94,16 @@ async def on_download(
async def on_paginate(self, next_url: str) -> None:
    """
    Ignore pagination events; this observer takes no action on them.

    :param next_url: URL of the next page (unused).
    """
    return None

async def on_save_cookies(self, cookies: List[Cookie]) -> None:
    """
    Hand the saved cookies to this observer's tracker.

    The cookies are wrapped in a single record under the ``"cookies"`` key and
    passed to ``self._tracker.save_data``.

    :param cookies: The cookies that were saved.
    """
    record = {"cookies": cookies}
    self._tracker.save_data(record)


class InMemoryObserver(OutputObserver):
def __init__(self) -> None:
    """Create the observer with empty in-memory stores."""
    # Records appended by on_save_data.
    self._data: List[dict[str, Any]] = []
    # (url, context, options) tuples exposed through the ``urls`` property.
    self._urls: List[Tuple[URL, Context, Options]] = []
    # (str, bytes) pairs exposed through the ``files`` property.
    self._files: List[Tuple[str, bytes]] = []
    # Cookies received through on_save_cookies.
    self._cookies: List[Cookie] = []

async def on_save_data(self, data: dict[str, Any]) -> None:
    """
    Keep a saved record in memory by appending it to the data list.

    :param data: The record to keep.
    """
    records = self._data
    records.append(data)
Expand All @@ -112,6 +123,9 @@ async def on_download(
async def on_paginate(self, next_url: str) -> None:
    """
    Ignore pagination events; this observer takes no action on them.

    :param next_url: URL of the next page (unused).
    """
    return None

async def on_save_cookies(self, cookies: List[Cookie]) -> None:
    """
    Record the latest set of saved cookies.

    ``save_cookies`` in harambe/core.py notifies observers with its complete
    merged cookie list on every call, so each notification is treated as the
    full current state. Appending it to what is already stored would
    accumulate duplicate (and stale) copies of earlier cookies.

    :param cookies: The complete list of cookies currently saved.
    """
    # Slice assignment replaces the contents in place, so references obtained
    # earlier through the ``cookies`` property keep pointing at live data.
    self._cookies[:] = cookies

@property
def data(self) -> List[dict[str, Any]]:
    """The records kept by this observer (the underlying list, not a copy)."""
    records = self._data
    return records
Expand All @@ -123,3 +137,7 @@ def urls(self) -> List[Tuple[URL, Context, Options]]:
@property
def files(self) -> List[Tuple[str, bytes]]:
    """The file entries kept by this observer (the underlying list, not a copy)."""
    entries = self._files
    return entries

@property
def cookies(self) -> List[Cookie]:
    """The cookies kept by this observer (the underlying list, not a copy)."""
    kept = self._cookies
    return kept
13 changes: 10 additions & 3 deletions harambe/pagination.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,9 @@
import hashlib
import json
from typing import Any, Optional, Iterable

from typing import Any, Optional, Iterable, List
from pydantic import BaseModel

from harambe.types import URL, Context, Options
from harambe.types import URL, Context, Options, Cookie


class PageInfo(BaseModel):
Expand All @@ -28,6 +27,14 @@ def on_save_data(self, data: dict[str, Any]) -> bool:

return self._add_data(data)

def on_save_cookies(self, cookies: List[Cookie]) -> bool:
    """
    Register cookies with the duplicate tracker and report whether they were seen before.

    The check is delegated entirely to ``self._add_data``.

    :param cookies: cookies to be saved
    :return: bool indicating if the cookies are duplicated, True if duplicated
    """
    return self._add_data(cookies)

def on_queue_url(
self, url: URL, _: Optional[Context], __: Optional[Options]
) -> bool:
Expand Down
13 changes: 13 additions & 0 deletions harambe/types.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,3 +84,16 @@ class HarnessOptions(TypedDict, total=False):
disable_go_to_url: bool
on_start: Optional[Callback]
on_end: Optional[Callback]


class Cookie(TypedDict):
    """
    Typed dictionary describing a browser cookie handled by the SDK.

    NOTE(review): the class is declared total, so every key is required for
    type checking, yet ``fix_cookie`` explicitly handles a cookie without
    ``expires`` — confirm whether some keys should be optional.
    """

    # Cookie name; used as the lookup key when saved cookies are merged.
    name: str
    value: str
    domain: str
    path: str
    # Expiry as a numeric timestamp; ``fix_cookie`` interprets it with
    # ``datetime.fromtimestamp`` (i.e. seconds since the epoch).
    expires: int | float
    # presumably the cookie size as reported by the browser — TODO confirm
    size: int
    httpOnly: bool
    secure: bool
    # presumably marks a session (non-persistent) cookie — TODO confirm
    session: bool
    sameSite: str
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "harambe-sdk"
version = "0.27.0"
version = "0.28.0"
description = "Data extraction SDK for Playwright 🐒🍌"
authors = ["awtkns <[email protected]>"]
readme = "README.md"
Expand Down

0 comments on commit a7a270e

Please sign in to comment.