Skip to content

Commit

Permalink
feat(GDPR): improve script to manage Picard (#430)
Browse files Browse the repository at this point in the history
  • Loading branch information
raphodn committed Sep 12, 2024
1 parent ed09039 commit 955cecc
Show file tree
Hide file tree
Showing 5 changed files with 178 additions and 55 deletions.
38 changes: 27 additions & 11 deletions scripts/gdpr/README.md
Original file line number Diff line number Diff line change
@@ -1,23 +1,26 @@
# GDPR request data
# Uploading GDPR price data

## Context

One of our data sources is GDPR request to supermarkets. See https://wiki.openfoodfacts.org/GDPR_request
One of our data sources is GDPR requests to supermarkets (via loyalty cards).

See https://wiki.openfoodfacts.org/GDPR_request

## List of supermarkets

|Supermarket|Data|Preprocessing|
|-----------|---|---|
|Auchan |1 single file||
|Supermarket|Data |Preprocessing|
|-----------|------------------|---|
|Auchan |1 single file ||
|Carrefour |1 file with 2 tabs|- merge files<br/>- skip discounts|
|E.Leclerc |2 files|- merge files|
|Intermarché|1 single file||
|E.Leclerc |2 files |- merge files|
|Intermarché|1 single file ||
|Picard |1 file with multiple tables|- create separate files<br>- merge files|

## Usage

### Step 1: get an API token
### Step 1: get your API token from Open Prices

https://prices.openfoodfacts.org/api/docs#/Auth/authentication_api_v1_auth_post
https://prices.openfoodfacts.org/api/docs#/auth/auth_create

### Step 2: upload a proof

Expand All @@ -42,7 +45,7 @@ Depending on the source, you'll need to provide the correct `LOCATION` key, and
Use the token returned in Step 1.

```
FILEPATH=../data/Carrefour/Carte_Carrefour_NAME_merged.csv SOURCE=CARREFOUR LOCATION="City Jaures Grenoble" LOCATION_OSM_ID=1697821864 LOCATION_OSM_TYPE=NODE PROOF_ID=1234 API_ENDPOINT=https://prices.openfoodfacts.net/api/v1 API_TOKEN=username_token-hash poetry run python data/gdpr/create_prices_from_gdpr_csv.py
FILEPATH=../data/Carrefour/Carte_Carrefour_NAME_merged.csv SOURCE=CARREFOUR LOCATION="City Jaures Grenoble" LOCATION_OSM_ID=1697821864 LOCATION_OSM_TYPE=NODE PROOF_ID=1234 API_ENDPOINT=https://prices.openfoodfacts.net/api/v1 API_TOKEN=username_token-hash poetry run python scripts/gdpr/create_prices_from_gdpr_csv.py
```

Last changes when you're ready:
Expand All @@ -55,12 +58,25 @@ Last changes when you're ready:

Script name: `merge_two_csv_files.py`

Goal: merge and enrich data from the second csv file into the first csv file.

#### E.Leclerc

E.Leclerc returns 2 different files, one containing a list of receipts (with dates & locations), and the other a list of products with their receipt id. So we need to first merge the 2 files into 1.
```
(TODO)
```

#### Carrefour

For Carrefour, the file contains 2 tabs, 1 called "Tickets" and the other called "Remise".
```
FILEPATH_1=Carte_Carrefour_NAME_liste_tickets_Tickets.csv FILEPATH_2=Carte_Carrefour_NAME_liste_tickets_Remises.csv PIVOT_FIELD_NAME="Numéro du ticket de caisse magasin,Code Barre du produit,Description du produit" poetry run python data/gdpr/merge_two_csv_files.py
FILEPATH_1=Carte_Carrefour_NAME_liste_tickets_Tickets.csv FILEPATH_2=Carte_Carrefour_NAME_liste_tickets_Remises.csv PIVOT_FIELD_NAME_LIST="Numéro du ticket de caisse magasin,Code Barre du produit,Description du produit" poetry run python scripts/gdpr/merge_two_csv_files.py
```

#### Picard

Picard returns 1 spreadsheet containing multiple tables. We first need to store the Product table & the Tickets table in 2 separate CSV files.
```
FILEPATH_1=Picard_Produits.csv FILEPATH_2=Picard_Tickets.csv PIVOT_FIELD_NAME_LIST="NUMERO DE TICKET" EXCLUDE_FIELD_NAME_LIST="PRIX TTC" poetry run python scripts/gdpr/merge_two_csv_files.py
```
67 changes: 41 additions & 26 deletions scripts/gdpr/create_prices_from_gdpr_csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,12 @@
import time

import requests
from utils import get_picard_product_from_subcode

OPEN_PRICES_CREATE_PRICE_ENDPOINT = f'{os.environ.get("API_ENDPOINT")}/prices'
OPEN_PRICES_TOKEN = os.environ.get("API_TOKEN")
GDPR_FIELD_MAPPING_FILEPATH = "data/gdpr/gdpr_field_mapping.csv"

GDPR_FIELD_MAPPING_FILEPATH = "scripts/gdpr/gdpr_field_mapping.csv"

DEFAULT_PRICE_CURRENCY = "EUR"
PRICE_FIELDS = [
Expand Down Expand Up @@ -44,10 +46,14 @@ def gdpr_source_field_cleanup_rules(gdpr_source, op_field, gdpr_field_value):
# remove any whitespace
gdpr_field_value = gdpr_field_value.strip()

# shop specific rules
if gdpr_source == "AUCHAN":
if op_field == "price":
# field-specific rules
if op_field in ["price", "quantity"]:
if gdpr_field_value:
gdpr_field_value = float(gdpr_field_value.replace(",", "."))

# shop-specific rules
if gdpr_source == "AUCHAN":
pass
elif gdpr_source == "CARREFOUR":
# input: |3178050000749|
# output: 3178050000749
Expand All @@ -62,15 +68,18 @@ def gdpr_source_field_cleanup_rules(gdpr_source, op_field, gdpr_field_value):
elif gdpr_source == "ELECLERC":
pass
elif gdpr_source == "INTERMARCHE":
if op_field in ["price", "quantity"]:
# divide price by quantity
gdpr_field_value = float(gdpr_field_value.replace(",", "."))
# input: 27/05/2021
# output: 2021-05-27
if op_field == "date":
gdpr_field_value = datetime.datetime.strptime(
gdpr_field_value, "%d/%m/%Y"
).strftime("%Y-%m-%d")
elif gdpr_source == "PICARD":
# Picard codes are a subset of the EAN codes
# They have a length of 5 (4 if missing leading 0)
if op_field == "product_code":
if len(gdpr_field_value) == 4:
gdpr_field_value = f"0{gdpr_field_value}"

return gdpr_field_value

Expand All @@ -79,15 +88,15 @@ def gdpr_source_price_cleanup_rules(gdpr_source, gdpr_op_price):
"""
Rules to cleanup the price object
"""
if gdpr_source == "AUCHAN":
pass
elif gdpr_source == "CARREFOUR":
pass
elif gdpr_source == "ELECLERC":
pass
elif gdpr_source == "INTERMARCHE":
# price must be divided by quantity
gdpr_op_price["price"] = gdpr_op_price["price"] / gdpr_op_price["quantity"]
# price must be divided by quantity
if "quantity" in gdpr_op_price:
if gdpr_op_price["quantity"]:
gdpr_op_price["price"] = gdpr_op_price["price"] / gdpr_op_price["quantity"]

# discount boolean flag
if "discount" in gdpr_op_price:
if gdpr_op_price["discount"]:
gdpr_op_price["price_is_discounted"] = True

return gdpr_op_price

Expand Down Expand Up @@ -135,6 +144,12 @@ def gdpr_source_filter_rules(op_price_list, gdpr_source=""):
passes_test = False
elif gdpr_source == "INTERMARCHE":
pass
elif gdpr_source == "PICARD":
full_product_code = get_picard_product_from_subcode(op_price)
if full_product_code:
op_price["product_code"] = full_product_code
else:
passes_test = False

if passes_test:
op_price_list_filtered.append(op_price)
Expand Down Expand Up @@ -219,7 +234,7 @@ def create_price(price):
if __name__ == "__main__":
"""
How-to run:
> FILEPATH= poetry run python data/gdpr/create_prices_from_gdpr_csv.py
> FILEPATH= poetry run python scripts/gdpr/create_prices_from_gdpr_csv.py
Required params: see REQUIRED_ENV_PARAMS
"""
# Step 1: read input file
Expand Down Expand Up @@ -256,21 +271,21 @@ def create_price(price):
)
print(len(open_prices_price_list))

# Step 4a: filter prices depending on specific source rules
print("===== Applying source filtering rules")
open_prices_price_list_filtered_1 = gdpr_source_filter_rules(
open_prices_price_list, gdpr_source=source
# Step 4a: filter prices depending on location
print("===== Applying location filtering rules")
open_prices_price_list_filtered_1 = gdpr_source_location_rules(
open_prices_price_list
)
print(len(open_prices_price_list_filtered_1))

# Step 4b: filter prices depending on location
print("===== Applying location filtering rules")
open_prices_price_list_filtered_2 = gdpr_source_location_rules(
open_prices_price_list_filtered_1
# Step 4b: filter prices depending on specific source rules
print("===== Applying source filtering rules")
open_prices_price_list_filtered_2 = gdpr_source_filter_rules(
open_prices_price_list_filtered_1, gdpr_source=source
)
print(len(open_prices_price_list_filtered_2))

print("===== Output example (extra fields will be ignored):")
print("===== Output example (extra fields will be ignored)")
print(open_prices_price_list_filtered_2[0])

# Step 5: send prices to backend via API
Expand Down
16 changes: 8 additions & 8 deletions scripts/gdpr/gdpr_field_mapping.csv
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
OPEN_PRICES_FIELD,AUCHAN_FIELD,AUCHAN_COMMENT,CARREFOUR_FIELD,CARREFOUR_COMMENT,ELECLERC_FIELD,ELECLERC_COMMENT,INTERMARCHE_FIELD,INTERMARCHE_COMMENT
product_code,CODE_PRODUIT,"raw products have a length of 4 or 12 or 13 but ending with lots of 0 (fruits, vegetables, meat, cheese) (ex: 4400, 200512000000, 2630329000000)",Code Barre du produit,prefixed and suffixed with |,ean,,COD_ARTC_EAN,duplicate column with EAN_GD
product_name,NOM_PRODUIT,,Description du produit,,article_libelle,,LB_ARTC,duplicate column with LB_COMM0
price,PRIX_UNITAIRE,,Prix unitaire TTC avec remise (€),,article_prix_unitaire,,CA TTC Produit,has commas instead of points
discount,,,"Remise sur le produit (€) (chaque remise d'un produit regroupe les promotions, les avantages de la carte PASS, les bons d'achats… appliqués lors du passage en caisse)",,,,,
quantity,,,Quantité,,,,Qte Vendues,
date,JOUR,format YYYY-MM-DD,Date de transaction,format DD/MM/YYYY,date_ticket,format YYYY-MM-DD,DT_TICK,format DD/MM/YYYY
location,CODE_POSTAL,,NOM DU MAGASIN,,code_postal,,LB_COMM,
OPEN_PRICES_FIELD,AUCHAN_FIELD,AUCHAN_COMMENT,CARREFOUR_FIELD,CARREFOUR_COMMENT,ELECLERC_FIELD,ELECLERC_COMMENT,INTERMARCHE_FIELD,INTERMARCHE_COMMENT,PICARD_FIELD,PICARD_COMMENT
product_code,CODE_PRODUIT,"raw products have a length of 4 or 12 or 13 but ending with lots of 0 (fruits, vegetables, meat, cheese) (ex: 4400, 200512000000, 2630329000000)",Code Barre du produit,prefixed and suffixed with |,ean,,COD_ARTC_EAN,duplicate column with EAN_GD,CODE PRODUIT,a 5-number code. need to do an extra API search to find the corresponding product
product_name,NOM_PRODUIT,,Description du produit,,article_libelle,,LB_ARTC,duplicate column with LB_COMM0,LIBELLE ARTICLE,
price,PRIX_UNITAIRE,,Prix unitaire TTC avec remise (€),,article_prix_unitaire,,CA TTC Produit,has commas instead of points,PRIX TTC,has commas instead of points
discount,,,"Remise sur le produit (€) (chaque remise d'un produit regroupe les promotions, les avantages de la carte PASS, les bons d'achats… appliqués lors du passage en caisse)",,,,,,IDENTIFIANT REMISE,a string ID to another table
quantity,,,Quantité,,,,Qte Vendues,,NOMBRE UNITES,
date,JOUR,format YYYY-MM-DD,Date de transaction,format DD/MM/YYYY,date_ticket,format YYYY-MM-DD,DT_TICK,format DD/MM/YYYY,DATE TICKET,
location,CODE_POSTAL,,NOM DU MAGASIN,,code_postal,,LB_COMM,,NOM DU MAGASIN,
44 changes: 34 additions & 10 deletions scripts/gdpr/merge_two_csv_files.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,20 +13,37 @@ def read_csv(filepath):
return data


def merge_data_of_two_lists(list_1, list_2, pivot_list=["ticket"]):
print(pivot_list)
def merge_data_of_two_lists(
list_1, list_2, pivot_field_name_list=["ticket"], exclude_field_name_list=[]
):
data_merged = list()

for row_1 in list_1:
row_2 = None
# find corresponding row in list_2
for row in list_2:
if all(row_1[pivot] == row[pivot] for pivot in pivot_list):
if all(
row_1[pivot_field_name] == row[pivot_field_name]
for pivot_field_name in pivot_field_name_list
):
row_2 = row
if not row_2:
row_2 = {
**{key: row_1[key] for key in list_2[0].keys() if key in pivot_list},
**{key: "" for key in list_2[0].keys() if key not in pivot_list},
**{
key: row_1[key]
for key in list_2[0].keys()
if key in pivot_field_name_list
},
**{
key: ""
for key in list_2[0].keys()
if key not in pivot_field_name_list
},
}
# cleanup row_2
for exclude_field_name in exclude_field_name_list:
row_2.pop(exclude_field_name, None)
# merge
data_merged.append({**row_1, **row_2})

return data_merged
Expand All @@ -44,12 +61,14 @@ def write_csv(data, filepath):
if __name__ == "__main__":
"""
How-to run:
> FILEPATH_1= FILEPATH_2= PIVOT_FIELD_NAME= poetry run python data/gdpr/merge_two_csv_files.py # noqa
> FILEPATH_1= FILEPATH_2= PIVOT_FIELD_NAME_LIST= EXCLUDE_FIELD_NAME_LIST= poetry run python scripts/gdpr/merge_two_csv_files.py # noqa
"""
filepath_1 = os.environ.get("FILEPATH_1")
filepath_2 = os.environ.get("FILEPATH_2")
pivot_field_name = os.environ.get("PIVOT_FIELD_NAME")
pivot_field_name_list = pivot_field_name.split(",")
pivot_field_name_str = os.environ.get("PIVOT_FIELD_NAME_LIST")
pivot_field_name_list = pivot_field_name_str.split(",")
exclude_field_name_str = os.environ.get("EXCLUDE_FIELD_NAME_LIST")
exclude_field_name_list = exclude_field_name_str.split(",")
output_filepath = filepath_1.split(".csv")[0] + "_merged.csv"

print(f"Step 1: reading {filepath_1}")
Expand All @@ -60,9 +79,14 @@ def write_csv(data, filepath):
data_2 = read_csv(filepath_2)
print(f"{len(data_2)} lines")

print(f"Step 3: merging the two lists with pivot(s): {pivot_field_name_list}")
print(
f"Step 3: merging the two lists with pivot(s): {pivot_field_name_list} (and excluding: {exclude_field_name_list})"
)
data_merged = merge_data_of_two_lists(
data_1, data_2, pivot_list=pivot_field_name_list
data_1,
data_2,
pivot_field_name_list=pivot_field_name_list,
exclude_field_name_list=exclude_field_name_list,
)
print(f"{len(data_merged)} lines")

Expand Down
68 changes: 68 additions & 0 deletions scripts/gdpr/utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
import requests

OFF_SEARCHLICIOUS_API_ENDPOINT = "https://search.openfoodfacts.org/search"
PICARD_GS1_PREFIX = "327016"


def get_picard_product_from_subcode(op_price_dict):
    """Resolve a partial Picard product code to its full barcode.

    The Picard GDPR export only contains a short internal product code
    (5 digits, a subset of the full EAN). Query the Search-a-licious API
    with progressively broader queries; when a single confident match is
    found it is taken directly, otherwise the user is prompted to pick a
    candidate (or type the full barcode, or type 0 to skip).

    :param op_price_dict: price dict with at least the "product_code",
        "product_name" and "price" keys.
    :return: the full product code as a string, or ``None`` when no
        product was found or the user skipped.
        NOTE: a single value is returned (not a tuple) because the caller
        in create_prices_from_gdpr_csv.py assigns and truth-tests the
        result directly; a tuple would always be truthy.
    """
    full_product_code = None

    print(
        "----- Input:",
        op_price_dict["product_code"],
        op_price_dict["product_name"],
        op_price_dict["price"],
    )
    # Queries ordered from most to least specific; fall through to the
    # next (broader) query when the current one returns no result.
    query_list = [
        f"code:{PICARD_GS1_PREFIX}?{op_price_dict['product_code']}? brands:picard",
        f"code:{PICARD_GS1_PREFIX}?{op_price_dict['product_code']}?",
        f"code:*{op_price_dict['product_code']}? brands:picard",
        f"code:*{op_price_dict['product_code']}?&page_size=50",
    ]
    for q_index, q_params in enumerate(query_list):
        response = requests.get(
            OFF_SEARCHLICIOUS_API_ENDPOINT,
            params={"q": q_params},
        )
        print(response.url)
        if response.status_code != 200:
            # transient API error: try the next query instead of aborting
            continue
        response_json = response.json()
        response_product_count = response_json["count"]
        print("Products found:", response_product_count)
        if not response_product_count:
            # no match for this query: try the next, broader one
            continue

        # confidence strong enough: take the single result and stop
        if (q_index < 2) and (response_product_count == 1):
            full_product_code = response_json["hits"][0]["code"]
            break

        # multiple results (or low-confidence query): prompt the user
        response_product_list = response_json["hits"]
        for index, response_product in enumerate(response_product_list):
            print(
                index + 1,
                ":",
                response_product.get("code"),
                response_product.get("product_name", ""),
                response_product.get("brands_tags", ""),
                response_product.get("stores", ""),
            )
        user_choice_str = input(
            "Which product ? Type 0 to skip. Or provide the correct code. "
        ).strip()
        if user_choice_str == "0":
            # explicit skip: must NOT be treated as an index (0 - 1 would
            # otherwise select the LAST product in the list)
            break
        if user_choice_str.isdigit() and len(user_choice_str) <= 2:
            user_choice_number = int(user_choice_str)
            if 1 <= user_choice_number <= len(response_product_list):
                full_product_code = response_product_list[
                    user_choice_number - 1
                ]["code"]
                print("Chosen product code:", full_product_code)
                break
            # out-of-range selection: ignore and try the next query
            continue
        if 3 < len(user_choice_str) <= 13:
            # the user typed the full barcode directly
            full_product_code = user_choice_str
            print("Chosen product code:", full_product_code)
            break

    if full_product_code is None:
        print("Product not found...")
    return full_product_code

0 comments on commit 955cecc

Please sign in to comment.