From d82d9338e96804d84bfa597089fdd9a45c5dc491 Mon Sep 17 00:00:00 2001 From: Colin Maudry Date: Wed, 21 Jun 2023 17:04:09 +0200 Subject: [PATCH] variables, stream_config and documentation --- .template.env | 1 + CONTRIBUTING.md | 11 +++++++++++ docker-compose.yml | 1 + pipeline/dags/dags/settings.py | 12 ++++++++++++ pipeline/dags/import_sources.py | 2 ++ 5 files changed, 27 insertions(+) diff --git a/.template.env b/.template.env index b71aab53..48db4f77 100644 --- a/.template.env +++ b/.template.env @@ -41,6 +41,7 @@ AIRFLOW_CONN_S3_SOURCES= BAN_API_URL=https://api-adresse.data.gouv.fr CD35_FILE_URL=https://data.ille-et-vilaine.fr/dataset/8d5ec0f0-ebe1-442d-9d99-655b37d5ad07/resource/665776ae-fa25-46ab-9bfd-c4241866f03f/download/annuaire_sociale_fixe.csv CD72_FILE_URL= +CD39_FILE_URL= DORA_API_URL=https://api.dora.fabrique.social.gouv.fr/api/v1/ EMPLOIS_API_TOKEN= EMPLOIS_API_URL=https://emplois.inclusion.beta.gouv.fr/api/v1/structures/ diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 8c2f65d4..e67c3669 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -70,6 +70,17 @@ code -n data-inclusion Each subdirectory (`./pipeline`, `./api`, etc.) has its own contributing guidelines on how to setup an environment for development. +## Adding a public HTTP tabular source + +To extract, read and store the source data in S3: + +- add the URL in the `.template.env` file (use the format XXX_FILE_URL) +- add the environment variable in `docker-compose.yml` +- add the stream config in `pipeline/dags/dags/settings.py` +- add the source id + functions in `pipeline/dags/import_sources.py` (extract and read functions) + +For transformations, create the relevant files in `dbt/sources`, `dbt/intermediate` and `dbt/marts`. + ## Contribution Issues and PRs are welcome. \ No newline at end of file diff --git a/docker-compose.yml b/docker-compose.yml index 3d596fea..44e475ca 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -25,6 +25,7 @@ x-airflow-common: AIRFLOW_VAR_DBT_PROJECT_DIR: /opt/airflow/dbt AIRFLOW_VAR_BAN_API_URL: ${BAN_API_URL} AIRFLOW_VAR_CD35_FILE_URL: ${CD35_FILE_URL} + AIRFLOW_VAR_CD39_FILE_URL: ${CD39_FILE_URL} AIRFLOW_VAR_CD72_FILE_URL: ${CD72_FILE_URL} AIRFLOW_VAR_DATAGOUV_API_KEY: ${DATAGOUV_API_KEY} AIRFLOW_VAR_DATAGOUV_API_URL: ${DATAGOUV_API_URL} diff --git a/pipeline/dags/dags/settings.py b/pipeline/dags/dags/settings.py index 29e76dab..8e3a34f1 100644 --- a/pipeline/dags/dags/settings.py +++ b/pipeline/dags/dags/settings.py @@ -80,6 +80,18 @@ }, ], }, + { + "id": "cd39", + "schedule_interval": "@once", + "snapshot": True, + "streams": [ + { + "id": "structures_services", + "filename": "Base_de_données_Jura-Tableau.csv", + "url": Variable.get("CD39_FILE_URL", None), + }, + ], + }, { "id": "cd72", "schedule_interval": "@once", diff --git a/pipeline/dags/import_sources.py b/pipeline/dags/import_sources.py index 9e32c03a..e2aeac6c 100644 --- a/pipeline/dags/import_sources.py +++ b/pipeline/dags/import_sources.py @@ -70,6 +70,7 @@ def _extract( EXTRACT_FN_BY_SOURCE_ID = { "annuaire-du-service-public": utils.extract_http_content, "cd35": utils.extract_http_content, + "cd39": utils.extract_http_content, "cd72": utils.extract_http_content, "dora": dora.extract, "emplois-de-linclusion": emplois_de_linclusion.extract, @@ -129,6 +130,7 @@ def _load( READ_FN_BY_SOURCE_ID = { "annuaire-du-service-public": annuaire_du_service_public.read, "cd35": lambda path: utils.read_csv(path, sep=";"), + "cd39": lambda path: utils.read_csv(path, sep=","), "cd72": lambda path: utils.read_excel(path, sheet_name="Structures"), "dora": utils.read_json, "emplois-de-linclusion": utils.read_json,