From d82d9338e96804d84bfa597089fdd9a45c5dc491 Mon Sep 17 00:00:00 2001
From: Colin Maudry <colin@maudry.com>
Date: Wed, 21 Jun 2023 17:04:09 +0200
Subject: [PATCH] variables, stream_config and documentation

---
 .template.env                   |  1 +
 CONTRIBUTING.md                 | 11 +++++++++++
 docker-compose.yml              |  1 +
 pipeline/dags/dags/settings.py  | 12 ++++++++++++
 pipeline/dags/import_sources.py |  2 ++
 5 files changed, 27 insertions(+)

diff --git a/.template.env b/.template.env
index b71aab53..48db4f77 100644
--- a/.template.env
+++ b/.template.env
@@ -41,6 +41,7 @@ AIRFLOW_CONN_S3_SOURCES=
 BAN_API_URL=https://api-adresse.data.gouv.fr
 CD35_FILE_URL=https://data.ille-et-vilaine.fr/dataset/8d5ec0f0-ebe1-442d-9d99-655b37d5ad07/resource/665776ae-fa25-46ab-9bfd-c4241866f03f/download/annuaire_sociale_fixe.csv
 CD72_FILE_URL=
+CD39_FILE_URL=
 DORA_API_URL=https://api.dora.fabrique.social.gouv.fr/api/v1/
 EMPLOIS_API_TOKEN=
 EMPLOIS_API_URL=https://emplois.inclusion.beta.gouv.fr/api/v1/structures/
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 8c2f65d4..e67c3669 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -70,6 +70,17 @@ code -n data-inclusion
 
 Each subdirectory (`./pipeline`, `./api`, etc.) has its own contributing guidelines on how to setup an environment for development.
 
+## Adding a public HTTP tabular source
+
+To extract, read and store the source data in S3:
+
+- add the source URL to the `.template.env` file as a `<SOURCE_ID>_FILE_URL` variable (e.g. `CD39_FILE_URL`)
+- expose that variable to Airflow in `docker-compose.yml` with the `AIRFLOW_VAR_` prefix (e.g. `AIRFLOW_VAR_CD39_FILE_URL: ${CD39_FILE_URL}`)
+- add the stream config in `pipeline/dags/dags/settings.py`
+- register the source id in `pipeline/dags/import_sources.py`, in both `EXTRACT_FN_BY_SOURCE_ID` (extract function) and `READ_FN_BY_SOURCE_ID` (read function)
+
+For transformations, create the relevant files in `dbt/sources`, `dbt/intermediate` and `dbt/marts`.
+
 ## Contribution
 
 Issues and PRs are welcome.
\ No newline at end of file
diff --git a/docker-compose.yml b/docker-compose.yml
index 3d596fea..44e475ca 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -25,6 +25,7 @@ x-airflow-common:
     AIRFLOW_VAR_DBT_PROJECT_DIR: /opt/airflow/dbt
     AIRFLOW_VAR_BAN_API_URL: ${BAN_API_URL}
     AIRFLOW_VAR_CD35_FILE_URL: ${CD35_FILE_URL}
+    AIRFLOW_VAR_CD39_FILE_URL: ${CD39_FILE_URL}
     AIRFLOW_VAR_CD72_FILE_URL: ${CD72_FILE_URL}
     AIRFLOW_VAR_DATAGOUV_API_KEY: ${DATAGOUV_API_KEY}
     AIRFLOW_VAR_DATAGOUV_API_URL: ${DATAGOUV_API_URL}
diff --git a/pipeline/dags/dags/settings.py b/pipeline/dags/dags/settings.py
index 29e76dab..8e3a34f1 100644
--- a/pipeline/dags/dags/settings.py
+++ b/pipeline/dags/dags/settings.py
@@ -80,6 +80,18 @@
             },
         ],
     },
+    {
+        "id": "cd39",
+        "schedule_interval": "@once",
+        "snapshot": True,
+        "streams": [
+            {
+                "id": "structures_services",
+                "filename": "Base_de_données_Jura-Tableau.csv",
+                "url": Variable.get("CD39_FILE_URL", None),
+            },
+        ],
+    },
     {
         "id": "cd72",
         "schedule_interval": "@once",
diff --git a/pipeline/dags/import_sources.py b/pipeline/dags/import_sources.py
index 9e32c03a..e2aeac6c 100644
--- a/pipeline/dags/import_sources.py
+++ b/pipeline/dags/import_sources.py
@@ -70,6 +70,7 @@ def _extract(
     EXTRACT_FN_BY_SOURCE_ID = {
         "annuaire-du-service-public": utils.extract_http_content,
         "cd35": utils.extract_http_content,
+        "cd39": utils.extract_http_content,
         "cd72": utils.extract_http_content,
         "dora": dora.extract,
         "emplois-de-linclusion": emplois_de_linclusion.extract,
@@ -129,6 +130,7 @@ def _load(
     READ_FN_BY_SOURCE_ID = {
         "annuaire-du-service-public": annuaire_du_service_public.read,
         "cd35": lambda path: utils.read_csv(path, sep=";"),
+        "cd39": lambda path: utils.read_csv(path, sep=","),
         "cd72": lambda path: utils.read_excel(path, sheet_name="Structures"),
         "dora": utils.read_json,
         "emplois-de-linclusion": utils.read_json,