Skip to content

Commit

Permalink
feat: sync ALL products (obf, opff, opf) (#264)
Browse files Browse the repository at this point in the history
  • Loading branch information
raphodn authored Apr 3, 2024
1 parent 420a77c commit 9f876c4
Show file tree
Hide file tree
Showing 5 changed files with 37 additions and 33 deletions.
4 changes: 3 additions & 1 deletion app/scheduler.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from apscheduler.executors.pool import ThreadPoolExecutor
from apscheduler.jobstores.memory import MemoryJobStore
from apscheduler.schedulers.blocking import BlockingScheduler
from openfoodfacts import Flavor
from openfoodfacts.utils import get_logger

from app.db import session
Expand All @@ -11,7 +12,8 @@

def import_product_db_job() -> None:
db = session()
import_product_db(db=db)
for flavor in [Flavor.off, Flavor.obf, Flavor.opff, Flavor.opf]:
import_product_db(db=db, flavor=flavor)


def run() -> None:
Expand Down
24 changes: 16 additions & 8 deletions app/tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,18 +58,23 @@ def create_price_product(db: Session, price: Price) -> None:
crud.increment_product_price_count(db, product=db_product)


def import_product_db(db: Session, batch_size: int = 1000) -> None:
def import_product_db(
db: Session, flavor: Flavor = Flavor.off, batch_size: int = 1000
) -> None:
"""Import from DB JSONL dump to insert/update product table.
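:param db: the session to use
:param flavor: the Open Food Facts flavor (off, obf, opff, opf) whose
dump is imported, defaults to Flavor.off
:param batch_size: the number of products to insert/update in a single
transaction, defaults to 1000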
"""
logger.info("Launching import_product_db")
logger.info(f"Launching import_product_db ({flavor})")
existing_codes = set(db.execute(select(Product.code)).scalars())
logger.info("Number of existing codes: %d", len(existing_codes))
dataset = ProductDataset(
dataset_type=DatasetType.jsonl, force_download=True, download_newer=True
flavor=flavor,
dataset_type=DatasetType.jsonl,
force_download=True,
download_newer=True,
)

added_count = 0
Expand All @@ -83,8 +88,8 @@ def import_product_db(db: Session, batch_size: int = 1000) -> None:
)
seen_codes = set()
for product in tqdm.tqdm(dataset):
# Skip products without a code
if "code" not in product:
# Skip products without a code, or with a non-numeric code
if ("code" not in product) or (not product["code"].isdigit()):
continue
product_code = product["code"]

Expand All @@ -93,6 +98,9 @@ def import_product_db(db: Session, batch_size: int = 1000) -> None:
continue
seen_codes.add(product_code)

# Some products have no "lang" field (especially non-OFF products):
# fall back to "lc", then to "en"
product_lang = product.get("lang", product.get("lc", "en"))
# Store images & last_modified_t
product_images: JSONType = product.get("images", {})
product_last_modified_t = product.get("last_modified_t")

Expand Down Expand Up @@ -122,9 +130,9 @@ def import_product_db(db: Session, batch_size: int = 1000) -> None:
key: product[key] if (key in product) else None for key in OFF_FIELDS
}
product_dict["image_url"] = generate_openfoodfacts_main_image_url(
product_code, product_images, product["lang"]
product_code, product_images, product_lang, flavor=flavor
)
product_dict["source"] = Flavor.off
product_dict["source"] = flavor
product_dict["source_last_synced"] = datetime.datetime.now()
product_dict = normalize_product_fields(product_dict)

Expand All @@ -144,7 +152,7 @@ def import_product_db(db: Session, batch_size: int = 1000) -> None:
# or if it has no source (created in Open Prices before OFF)
.where(
or_(
Product.source == Flavor.off,
Product.source == flavor,
Product.source == None, # noqa: E711, E501
)
)
Expand Down
18 changes: 11 additions & 7 deletions app/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,13 +56,15 @@ def init_sentry(
]


def openfoodfacts_product_search(code: str) -> JSONType | None:
def openfoodfacts_product_search(
code: str, flavor: Flavor = Flavor.off
) -> JSONType | None:
client = API(
user_agent=get_user_agent(),
username=None,
password=None,
country=Country.world,
flavor=Flavor.off,
flavor=flavor,
version=APIVersion.v2,
environment=settings.environment,
)
Expand Down Expand Up @@ -96,7 +98,7 @@ def normalize_product_fields(product: JSONType) -> JSONType:


def generate_openfoodfacts_main_image_url(
code: str, images: JSONType, lang: str
code: str, images: JSONType, lang: str, flavor: Flavor = Flavor.off
) -> str | None:
"""Generate the URL of the main image of a product.
Expand All @@ -118,18 +120,20 @@ def generate_openfoodfacts_main_image_url(
image_rev = images[image_key]["rev"]
image_id = f"{image_key}.{image_rev}.400"
return generate_image_url(
code, image_id=image_id, flavor=Flavor.off, environment=settings.environment
code, image_id=image_id, flavor=flavor, environment=settings.environment
)

return None


def fetch_product_openfoodfacts_details(product: Product) -> JSONType | None:
def fetch_product_openfoodfacts_details(
product: Product, flavor: Flavor = Flavor.off
) -> JSONType | None:
product_dict = {}
try:
response = openfoodfacts_product_search(code=product.code)
response = openfoodfacts_product_search(code=product.code, flavor=flavor)
if response and response["status"]:
product_dict["source"] = Flavor.off
product_dict["source"] = flavor
for off_field in OFF_FIELDS:
if off_field in response["product"]:
product_dict[off_field] = response["product"][off_field]
Expand Down
22 changes: 6 additions & 16 deletions poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ alembic = "~1.12.1"
Babel = "~2.13.1"
fastapi = "~0.103.1"
jinja2 = "~3.1.3"
openfoodfacts = "~0.2.0"
openfoodfacts = "0.2.1"
psycopg2-binary = "~2.9.9"
pydantic-settings = "~2.0.3"
python-multipart = "~0.0.7"
Expand Down

0 comments on commit 9f876c4

Please sign in to comment.