From d6cf4cfd3f56c8a350c4696c987e3965f1670d22 Mon Sep 17 00:00:00 2001 From: Guido Riembauer Date: Mon, 27 Nov 2023 14:00:11 +0100 Subject: [PATCH 1/5] download_all,handle corrupt archives --- README.md | 9 ++-- config_example.yaml | 6 +-- pyproject.toml | 2 +- sadasadam/cli.py | 24 ++++----- sadasadam/download.py | 123 ++++++++++++++++++++++++++++++++---------- 5 files changed, 113 insertions(+), 51 deletions(-) diff --git a/README.md b/README.md index 7639b26..fb7f9bd 100644 --- a/README.md +++ b/README.md @@ -67,9 +67,7 @@ The steps above will automatically install the Python library [eodag](https://eo Before running SADASADAM, eodag needs to be configured (see [eodag documentation](https://eodag.readthedocs.io/en/stable/getting_started_guide/configure.html)). The eodag config file needs to be filled with credentials for satellite data providers. SADASADAM calls eodag to download only Sentinel-2 and Landsat-8/9 Level 1C data. Therefore, providing credentials to the `cop_dataspace` and `usgs` sections of the eodag config file -is recommended. In order to make the downloaded data accessible to FORCE, -**the download path `outputs_prefix` of the eodag config file needs to be defined in the SADASADAM config file parameter `download_dir` as well** -(see below). It is recommended to define `extract: False` in the eodag config file as SADASADAM automatically extracts the downloaded data according to the input requirements of FORCE. +is recommended. It is recommended to define `extract: False` in the eodag config file as SADASADAM automatically extracts the downloaded data according to the input requirements of FORCE. A priority of providers can be defined in the eodag config file. We noticed the unexpected behaviour that download of Sentinel-2 from `cop_dataspace` fails (error related to `peps` provider credentials), if both `cop_dataspace` and `usgs` have the same priority. 
@@ -85,7 +83,8 @@ SADASADAM can be executed with one single command, but internally, the script ca ##### Download of satellite data SADASADAM will try to download all Sentinel-2 and Landsat-8/9 Level 1C scenes that match the filter options passed in the SADASADAM config file. -It makes use of user credentials and download paths defined in the eodag config file (see section above). +It makes use of user credentials and download paths defined in the eodag config file (see section above). The download path, however, can also be overridden by +the `download_dir` parameter of the SADASADAM config file. All data are extracted; corrupt archives are removed and their download is retried. ##### FORCE processing @@ -133,7 +132,7 @@ cloud_cover: 75 # maximum percentage of cloud cover in scene ##### FORCE & postprocessing options ``` -download_dir: '/path/to/eodag/download_dir' # Path to the download directory defined in the eodag conf file. FORCE will use all valid satellite +download_dir: '/path/to/download_dir' # Path to the download directory. FORCE will use all valid satellite # scenes (extracted Landsat-8/9 and Sentinel-2 in .SAFE format) in this directory as input. temp_force_dir: '/path/to/temp_force_dir' # Path to a directory that can hold intermediate FORCE results. A new FORCE directory with a timestamp will be created here. wvdb_dir: '/path/to/wvdb_dir' # Path to store the water vapor database. This database is required for Landsat processing in FORCE. diff --git a/config_example.yaml b/config_example.yaml index 512399a..7c61145 100644 --- a/config_example.yaml +++ b/config_example.yaml @@ -31,10 +31,10 @@ east: 11.97 west: 10.44 # type=str, help='Start date of temporal extent in ISO format (YYYY-MM-DD)' -start: '2023-08-01' +start: '2023-08-05' # type=str, help='End date of temporal extent ISO format (YYYY-MM-DD)' -end: '2023-08-15' +end: '2023-08-10' # type=int, help='Max. cloud cover (1-100) of scenes to download.' 
cloud_cover: 100 @@ -42,7 +42,7 @@ cloud_cover: 100 # type=str, help='Path to folder where output is stored' output_dir: '/path/to/output/dir/' -# type=str, help='Path to download products. Needs to be the same as defined in the eodag config file' +# type=str, help='Path to download products.' download_dir: '/path/to/download/dir/' # type=str, help='Path to folder where FORCE processing is done' diff --git a/pyproject.toml b/pyproject.toml index 9407b6e..7b0e53e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -7,7 +7,7 @@ sadasadam = "sadasadam.cli:main" [project] name = "sadasadam" -version = "0.1.0" +version = "0.1.1" authors = [ { name="Guido Riembauer", email="riembauer@mundialis.de" }, { name="Momen Mawad", email="mawad@mundialis.de" }, diff --git a/sadasadam/cli.py b/sadasadam/cli.py index 69d5bd2..aa049a1 100644 --- a/sadasadam/cli.py +++ b/sadasadam/cli.py @@ -26,10 +26,7 @@ import yaml from sadasadam.force import ForceProcess -from sadasadam.download import ( - download_with_eodag, - extract_and_delete_tar_gz_files, -) +from sadasadam.download import download_and_extract def check_bool(variable): @@ -213,20 +210,17 @@ def main(): "lonmax": east, "latmax": north, } - # start the download process - for product_name in products: - download_with_eodag( - product_type=product_name, - geom=geom, - start_date=start, - end_date=end, - cloudcover=cloud_cover, - ) - + download_and_extract( + products=products, + geom=geom, + start_date=start, + end_date=end, + cloudcover=cloud_cover, + download_dir=download_dir, + ) # Start FORCE if download_only is False: - extract_and_delete_tar_gz_files(download_dir) print("Setting up FORCE processing...") # start FORCE process force_proc = ForceProcess( diff --git a/sadasadam/download.py b/sadasadam/download.py index 06c6814..630a72e 100644 --- a/sadasadam/download.py +++ b/sadasadam/download.py @@ -23,43 +23,61 @@ import os import shutil +import zipfile from eodag import EODataAccessGateway def download_with_eodag( - 
product_type, geom, start_date, end_date, cloudcover=100 + product_type, geom, start_date, end_date, download_dir, cloudcover=100 ): """Function to download satellite data using eodag library""" # initialize eodag dag = EODataAccessGateway() # search for products - - search_results, total_count = dag.search( - productType=product_type, - # accepts WKT polygons, shapely.geometry, ... - geom=geom, - start=start_date, - end=end_date, - # Set cloud cover - cloudCover=cloudcover, - raise_errors=True, - ) + # iterate over pages because search_all does not return anything + total_search_results = [] + items_per_page = 20 + search_kwargs = { + "items_per_page": items_per_page, + "productType": product_type, + "geom": geom, + "start": start_date, + "end": end_date, + "cloudCover": cloudcover, + "raise_errors": True, + } + search_results_tmp, total_count = dag.search(**search_kwargs) + # iterate over pages + pages = int(total_count // items_per_page + 1) + for i in range(1, pages + 1): + search_results, total_count = dag.search(page=i, **search_kwargs) + total_search_results.append(search_results) + total_search_results_flat = [ + sitem for item in total_search_results for sitem in item + ] print( - f"Found {total_count} matching scenes of type {product_type}, " + f"Found {len(total_search_results_flat)} matching scenes of type {product_type}, " "starting download..." ) - dag.download_all(search_results) + dag.download_all(total_search_results_flat, outputs_prefix=download_dir) def extract_and_delete_tar_gz_files(directory): """ - Function to extract .tar.gz files recursively from a directory - and delete them + Function to extract .tar.gz and .SAFE.zip files + recursively from a directory and delete them """ + corrupt_files = [] for file in os.listdir(directory): - if file.endswith((".SAFE.zip", ".tar.gz")): + if file.endswith((".SAFE.zip", ".tar.gz", ".SAFE")): file_path = os.path.join(directory, file) + warning_text = ( + "Warning: - " + f"Unable to extract: {file_path}. 
" + "Retrying Download..." + ) + landsat_extract_dir = None try: if file.endswith(".tar.gz"): landsat_extract_dir_name = file.split(".")[0] @@ -75,18 +93,69 @@ def extract_and_delete_tar_gz_files(directory): directory, landsat_extract_dir_name ) - # Extract the .tar.gz file to the created directory - shutil.unpack_archive( - file_path, extract_dir=landsat_extract_dir - ) + target_dir = landsat_extract_dir + unpack = True elif file.endswith(".SAFE.zip"): - shutil.unpack_archive(file_path, extract_dir=directory) - # Delete the .tar.gz file after extraction + zfile = zipfile.ZipFile(file_path) + zfile_test = zfile.testzip() + if zfile_test is not None: + print(warning_text) + corrupt_files.append(file_path) + unpack = False + else: + target_dir = directory + unpack = True + + elif file.endswith(".SAFE"): + # this should fail if the .SAFE is a corrupt + # downloaded file and not previously extracted + test = os.listdir(file_path) + unpack = False + + if unpack is True: + shutil.unpack_archive(file_path, extract_dir=target_dir) + # Delete file after extraction os.remove(file_path) except Exception as exception: - print( - f"Warning: {exception} - " - "Unable to extract or delete: {file_path}" - ) + print(warning_text) + corrupt_files.append(file_path) + os.remove(file_path) + if landsat_extract_dir: + shutil.rmtree(landsat_extract_dir) continue + + return corrupt_files + + +def download_and_extract( + products, + geom, + start_date, + end_date, + download_dir, + cloudcover=100, + max_tries=3, +): + """ + Function to download satellite data using eodag library, extract, + and retry download if files are corrupt + """ + run_download = True + count = 0 + while run_download is True: + for product_name in products: + download_with_eodag( + product_type=product_name, + geom=geom, + start_date=start_date, + end_date=end_date, + cloudcover=cloudcover, + download_dir=download_dir, + ) + corrupt_files = extract_and_delete_tar_gz_files(download_dir) + if len(corrupt_files) == 0: + 
run_download = False + count += 1 + if count == max_tries: + run_download = False From 8249eaf7cfe54112e6cbf1513a364e50e9e51a86 Mon Sep 17 00:00:00 2001 From: Guido Riembauer Date: Mon, 27 Nov 2023 14:11:24 +0100 Subject: [PATCH 2/5] linting --- sadasadam/download.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/sadasadam/download.py b/sadasadam/download.py index 630a72e..901bc21 100644 --- a/sadasadam/download.py +++ b/sadasadam/download.py @@ -57,8 +57,8 @@ def download_with_eodag( sitem for item in total_search_results for sitem in item ] print( - f"Found {len(total_search_results_flat)} matching scenes of type {product_type}, " - "starting download..." + f"Found {len(total_search_results_flat)} matching scenes " + "of type {product_type}, starting download..." ) dag.download_all(total_search_results_flat, outputs_prefix=download_dir) @@ -110,7 +110,7 @@ def extract_and_delete_tar_gz_files(directory): elif file.endswith(".SAFE"): # this should fail if the .SAFE is a corrupt # downloaded file and not previously extracted - test = os.listdir(file_path) + os.listdir(file_path) unpack = False if unpack is True: @@ -118,7 +118,7 @@ def extract_and_delete_tar_gz_files(directory): # Delete file after extraction os.remove(file_path) except Exception as exception: - print(warning_text) + print(f"{exception}: {warning_text}") corrupt_files.append(file_path) os.remove(file_path) if landsat_extract_dir: From f66ff4cb4f913f5f7b719408d2d437a57a99eb8d Mon Sep 17 00:00:00 2001 From: Guido Riembauer Date: Tue, 28 Nov 2023 09:33:41 +0100 Subject: [PATCH 3/5] use search_all; fix .SAFE handling --- config_example.yaml | 4 ++-- sadasadam/download.py | 33 +++++++++++++++++---------------- 2 files changed, 19 insertions(+), 18 deletions(-) diff --git a/config_example.yaml b/config_example.yaml index 7c61145..70308ba 100644 --- a/config_example.yaml +++ b/config_example.yaml @@ -31,10 +31,10 @@ east: 11.97 west: 10.44 # type=str, help='Start date of temporal 
extent in ISO format (YYYY-MM-DD)' -start: '2023-08-05' +start: '2023-08-01' # type=str, help='End date of temporal extent ISO format (YYYY-MM-DD)' -end: '2023-08-10' +end: '2023-08-15' # type=int, help='Max. cloud cover (1-100) of scenes to download.' cloud_cover: 100 diff --git a/sadasadam/download.py b/sadasadam/download.py index 901bc21..3e76262 100644 --- a/sadasadam/download.py +++ b/sadasadam/download.py @@ -35,8 +35,6 @@ def download_with_eodag( # initialize eodag dag = EODataAccessGateway() # search for products - # iterate over pages because search_all does not return anything - total_search_results = [] items_per_page = 20 search_kwargs = { "items_per_page": items_per_page, @@ -45,22 +43,15 @@ def download_with_eodag( "start": start_date, "end": end_date, "cloudCover": cloudcover, - "raise_errors": True, } - search_results_tmp, total_count = dag.search(**search_kwargs) + search_results = dag.search_all(**search_kwargs) + num_results = len(search_results) # iterate over pages - pages = int(total_count // items_per_page + 1) - for i in range(1, pages + 1): - search_results, total_count = dag.search(page=i, **search_kwargs) - total_search_results.append(search_results) - total_search_results_flat = [ - sitem for item in total_search_results for sitem in item - ] print( - f"Found {len(total_search_results_flat)} matching scenes " - "of type {product_type}, starting download..." + f"Found {num_results} matching scenes " + f"of type {product_type}, starting download..." ) - dag.download_all(total_search_results_flat, outputs_prefix=download_dir) + dag.download_all(search_results, outputs_prefix=download_dir) def extract_and_delete_tar_gz_files(directory): @@ -78,6 +69,7 @@ def extract_and_delete_tar_gz_files(directory): "Retrying Download..." 
) landsat_extract_dir = None + remove = True try: if file.endswith(".tar.gz"): landsat_extract_dir_name = file.split(".")[0] @@ -112,13 +104,16 @@ def extract_and_delete_tar_gz_files(directory): # downloaded file and not previously extracted os.listdir(file_path) unpack = False + remove = False if unpack is True: shutil.unpack_archive(file_path, extract_dir=target_dir) # Delete file after extraction - os.remove(file_path) + if remove is True: + os.remove(file_path) except Exception as exception: - print(f"{exception}: {warning_text}") + exception = None + print(warning_text) corrupt_files.append(file_path) os.remove(file_path) if landsat_extract_dir: @@ -159,3 +154,9 @@ def download_and_extract( count += 1 if count == max_tries: run_download = False + if len(corrupt_files) > 0: + print( + f"Scene/s {'; '.join(corrupt_files)} seem to be " + f"corrupt even after {max_tries} downloads. " + "Files are removed and processing continues without them" + ) From 3d4969c9b4ea2d5bb68387e44a0501c71cd62914 Mon Sep 17 00:00:00 2001 From: Guido Riembauer Date: Tue, 28 Nov 2023 09:37:10 +0100 Subject: [PATCH 4/5] linting --- sadasadam/download.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/sadasadam/download.py b/sadasadam/download.py index 3e76262..bfea698 100644 --- a/sadasadam/download.py +++ b/sadasadam/download.py @@ -112,8 +112,7 @@ def extract_and_delete_tar_gz_files(directory): if remove is True: os.remove(file_path) except Exception as exception: - exception = None - print(warning_text) + print(f"{warning_text}: {exception}") corrupt_files.append(file_path) os.remove(file_path) if landsat_extract_dir: From 71f4938b4196a9e7f61ee27a03822b69e499d638 Mon Sep 17 00:00:00 2001 From: Guido Riembauer Date: Tue, 28 Nov 2023 09:40:06 +0100 Subject: [PATCH 5/5] remove obsolete comment --- sadasadam/download.py | 1 - 1 file changed, 1 deletion(-) diff --git a/sadasadam/download.py b/sadasadam/download.py index bfea698..44e0cd8 100644 --- a/sadasadam/download.py 
+++ b/sadasadam/download.py @@ -46,7 +46,6 @@ def download_with_eodag( } search_results = dag.search_all(**search_kwargs) num_results = len(search_results) - # iterate over pages print( f"Found {num_results} matching scenes " f"of type {product_type}, starting download..."