Skip to content

Commit

Permalink
feat: add daily dumps of DB main tables as JSONL files
Browse files Browse the repository at this point in the history
  • Loading branch information
raphael0202 committed May 2, 2024
1 parent 232fea8 commit 0bdca3a
Show file tree
Hide file tree
Showing 3 changed files with 45 additions and 3 deletions.
2 changes: 2 additions & 0 deletions app/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,10 +43,12 @@ class Settings(BaseSettings):
sentry_dns: str | None = None
log_level: LoggingLevel = LoggingLevel.INFO
images_dir: Path = STATIC_DIR / "img"
data_dir: Path = STATIC_DIR / "data"
environment: Environment = Environment.org

model_config = SettingsConfigDict(env_file=".env", extra="ignore")


# Module-level singleton: settings are loaded once (from the environment /
# .env file) at import time and shared across the app.
settings = Settings()
# Make sure the static directories exist before anything tries to use them.
settings.images_dir.mkdir(parents=True, exist_ok=True)
settings.data_dir.mkdir(parents=True, exist_ok=True)
23 changes: 22 additions & 1 deletion app/scheduler.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,15 @@
import datetime
from pathlib import Path

from apscheduler.executors.pool import ThreadPoolExecutor
from apscheduler.jobstores.memory import MemoryJobStore
from apscheduler.schedulers.blocking import BlockingScheduler
from openfoodfacts import Flavor
from openfoodfacts.utils import get_logger

from app.config import settings
from app.db import session
from app.tasks import import_product_db
from app.tasks import dump_db, import_product_db

logger = get_logger(__name__)

Expand All @@ -19,11 +23,28 @@ def import_product_db_job() -> None:
import_product_db(db=db, flavor=flavor)


def dump_db_job() -> None:
    """Dump the database to the data directory.

    The JSONL dumps are first written to a private temporary directory,
    then each file is moved into ``settings.data_dir``, so consumers never
    read a partially written dump file.
    """
    # Local imports: only this job needs them.
    import shutil
    import tempfile

    # mkdtemp() yields a unique, unpredictable directory — unlike a
    # hand-built /tmp/dump-<timestamp> path, which can collide and is
    # vulnerable to symlink attacks on a shared /tmp.
    tmp_dir = Path(tempfile.mkdtemp(prefix="db-dump-"))
    try:
        with session() as db:
            dump_db(db, tmp_dir)

        for file_path in tmp_dir.iterdir():
            # shutil.move works across filesystems; Path.rename raises
            # OSError (EXDEV) when /tmp is on a different device than the
            # data directory (e.g. tmpfs vs. a mounted volume).
            shutil.move(str(file_path), settings.data_dir / file_path.name)
    finally:
        # Best-effort cleanup: empty on success, removes leftovers if the
        # dump or a move failed part-way.
        shutil.rmtree(tmp_dir, ignore_errors=True)


def run() -> None:
    """Configure the blocking scheduler and run it until interrupted.

    Registers the daily cron jobs (product DB import at 10:00, DB dump at
    23:00) and then blocks on the scheduler loop.
    """
    sched = BlockingScheduler()
    sched.add_executor(ThreadPoolExecutor(20))
    sched.add_jobstore(MemoryJobStore())
    # Both jobs share the same cron shape; only the callable and hour vary.
    daily_jobs = (
        (import_product_db_job, 10),
        (dump_db_job, 23),
    )
    for job_func, hour in daily_jobs:
        sched.add_job(
            job_func, "cron", max_instances=1, hour=hour, minute=0, jitter=60
        )
    sched.start()
23 changes: 21 additions & 2 deletions app/tasks.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import datetime
from pathlib import Path

import tqdm
from openfoodfacts import DatasetType, Flavor, ProductDataset
Expand All @@ -7,8 +8,8 @@
from sqlalchemy import or_, select, update
from sqlalchemy.orm import Session

from app import crud
from app.models import Price, Product, Proof
from app import crud, schemas
from app.models import Location, Price, Product, Proof
from app.schemas import LocationCreate, ProductCreate, UserCreate
from app.utils import (
OFF_FIELDS,
Expand Down Expand Up @@ -200,3 +201,21 @@ def create_price_location(db: Session, price: Price) -> None:
else:
# Increment the price count of the location
crud.increment_location_price_count(db, location=db_location)


def dump_db(db: Session, output_dir: Path) -> None:
    """Dump the main database tables as JSONL files.

    Writes one file per table (``prices.jsonl``, ``proofs.jsonl``,
    ``locations.jsonl``) into ``output_dir``; each line is the JSON
    serialization of one row through the table's *Full* Pydantic schema.

    :param db: an open SQLAlchemy session to read rows from
    :param output_dir: directory to write the files to (created if missing)
    """
    logger.info("Creating dumps of the database")
    output_dir.mkdir(parents=True, exist_ok=True)

    for table_name, model_cls, schema_cls in (
        ("prices", Price, schemas.PriceFull),
        ("proofs", Proof, schemas.ProofFull),
        ("locations", Location, schemas.LocationFull),
    ):
        logger.info("Dumping %s", table_name)
        output_path = output_dir / f"{table_name}.jsonl"
        # Explicit UTF-8: model_dump_json() does not ASCII-escape, so the
        # platform default encoding (e.g. on Windows) could fail on
        # non-ASCII content.
        with output_path.open("w", encoding="utf-8") as f:
            # .scalars() yields the ORM objects directly instead of
            # 1-tuples that must be unpacked.
            for item in tqdm.tqdm(
                db.execute(select(model_cls)).scalars(), desc=table_name
            ):
                # NOTE(review): item.__dict__ also carries SQLAlchemy's
                # _sa_instance_state entry; assumes the schemas ignore
                # extra fields — confirm their model_config.
                f.write(schema_cls(**item.__dict__).model_dump_json())
                f.write("\n")

0 comments on commit 0bdca3a

Please sign in to comment.