Skip to content

Commit

Permalink
Merge pull request #10 from mundialis/corrupt_zip
Browse files Browse the repository at this point in the history
Corrupt Archive handling
  • Loading branch information
griembauer authored Nov 29, 2023
2 parents 888d3d2 + 71f4938 commit a9f11ad
Show file tree
Hide file tree
Showing 5 changed files with 111 additions and 50 deletions.
9 changes: 4 additions & 5 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -67,9 +67,7 @@ The steps above will automatically install the Python library [eodag](https://eo
Before running SADASADAM, eodag needs to be configured (see [eodag documentation](https://eodag.readthedocs.io/en/stable/getting_started_guide/configure.html)).
The eodag config file needs to be filled with credentials for satellite data providers. SADASADAM calls eodag to download only Sentinel-2
and Landsat-8/9 Level 1C data. Therefore, providing credentials to the `cop_dataspace` and `usgs` sections of the eodag config file
is recommended. In order to make the downloaded data accessible to FORCE,
**the download path `outputs_prefix` of the eodag config file needs to be defined in the SADASADAM config file parameter `download_dir` as well**
(see below). It is recommended to define `extract: False` in the eodag config file as SADASADAM automatically extracts the downloaded data according to the input requirements of FORCE.
is recommended. It is recommended to define `extract: False` in the eodag config file as SADASADAM automatically extracts the downloaded data according to the input requirements of FORCE.

A priority of providers can be defined in the eodag config file. We noticed that, unexpectedly, the download of Sentinel-2 data
from `cop_dataspace` fails (with an error related to `peps` provider credentials) if both `cop_dataspace` and `usgs` have the same priority.
Expand All @@ -85,7 +83,8 @@ SADASADAM can be executed with one single command, but internally, the script ca
##### Download of satellite data

SADASADAM will try to download all Sentinel-2 and Landsat-8/9 Level 1C scenes that match the filter options passed in the SADASADAM config file.
It makes use of user credentials and download paths defined in the eodag config file (see section above).
It makes use of user credentials and download paths defined in the eodag config file (see section above). The download path can, however, be overridden by
the `download_dir` parameter of the SADASADAM config file. All downloaded data are extracted; corrupt archives are removed and their download is retried.

##### FORCE processing

Expand Down Expand Up @@ -133,7 +132,7 @@ cloud_cover: 75 # maximum percentage of cloud cover in scene
##### FORCE & postprocessing options

```
download_dir: '/path/to/eodag/download_dir' # Path to the download directory defined in the eodag conf file. FORCE will use all valid satellite
download_dir: '/path/to/download_dir' # Path to the download directory. FORCE will use all valid satellite
# scenes (extracted Landsat-8/9 and Sentinel-2 in .SAFE format) in this directory as input.
temp_force_dir: '/path/to/temp_force_dir' # Path to a directory that can hold intermediate FORCE results. A new FORCE directory with a timestamp will be created here.
wvdb_dir: '/path/to/wvdb_dir' # Path to store the water vapor database. This database is required for Landsat processing in FORCE.
Expand Down
2 changes: 1 addition & 1 deletion config_example.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ cloud_cover: 100
# type=str, help='Path to folder where output is stored'
output_dir: '/path/to/output/dir/'

# type=str, help='Path to download products. Needs to be the same as defined in the eodag config file'
# type=str, help='Path to download products.'
download_dir: '/path/to/download/dir/'

# type=str, help='Path to folder where FORCE processing is done'
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ sadasadam = "sadasadam.cli:main"

[project]
name = "sadasadam"
version = "0.1.0"
version = "0.1.1"
authors = [
{ name="Guido Riembauer", email="[email protected]" },
{ name="Momen Mawad", email="[email protected]" },
Expand Down
24 changes: 9 additions & 15 deletions sadasadam/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,10 +26,7 @@
import yaml

from sadasadam.force import ForceProcess
from sadasadam.download import (
download_with_eodag,
extract_and_delete_tar_gz_files,
)
from sadasadam.download import download_and_extract


def check_bool(variable):
Expand Down Expand Up @@ -213,20 +210,17 @@ def main():
"lonmax": east,
"latmax": north,
}

# start the download process
for product_name in products:
download_with_eodag(
product_type=product_name,
geom=geom,
start_date=start,
end_date=end,
cloudcover=cloud_cover,
)

download_and_extract(
products=products,
geom=geom,
start_date=start,
end_date=end,
cloudcover=cloud_cover,
download_dir=download_dir,
)
# Start FORCE
if download_only is False:
extract_and_delete_tar_gz_files(download_dir)
print("Setting up FORCE processing...")
# start FORCE process
force_proc = ForceProcess(
Expand Down
124 changes: 96 additions & 28 deletions sadasadam/download.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,43 +23,52 @@

import os
import shutil
import zipfile

from eodag import EODataAccessGateway


def download_with_eodag(
product_type, geom, start_date, end_date, cloudcover=100
product_type, geom, start_date, end_date, download_dir, cloudcover=100
):
"""Function to download satellite data using eodag library"""
# initialize eodag
dag = EODataAccessGateway()
# search for products

search_results, total_count = dag.search(
productType=product_type,
# accepts WKT polygons, shapely.geometry, ...
geom=geom,
start=start_date,
end=end_date,
# Set cloud cover
cloudCover=cloudcover,
raise_errors=True,
)
items_per_page = 20
search_kwargs = {
"items_per_page": items_per_page,
"productType": product_type,
"geom": geom,
"start": start_date,
"end": end_date,
"cloudCover": cloudcover,
}
search_results = dag.search_all(**search_kwargs)
num_results = len(search_results)
print(
f"Found {total_count} matching scenes of type {product_type}, "
"starting download..."
f"Found {num_results} matching scenes "
f"of type {product_type}, starting download..."
)
dag.download_all(search_results)
dag.download_all(search_results, outputs_prefix=download_dir)


def extract_and_delete_tar_gz_files(directory):
"""
Function to extract .tar.gz files recursively from a directory
and delete them
Function to extract .tar.gz and .SAFE.zip files
recursively from a directory and delete them
"""
corrupt_files = []
for file in os.listdir(directory):
if file.endswith((".SAFE.zip", ".tar.gz")):
if file.endswith((".SAFE.zip", ".tar.gz", ".SAFE")):
file_path = os.path.join(directory, file)
warning_text = (
"Warning: - "
f"Unable to extract: {file_path}. "
"Retrying Download..."
)
landsat_extract_dir = None
remove = True
try:
if file.endswith(".tar.gz"):
landsat_extract_dir_name = file.split(".")[0]
Expand All @@ -75,18 +84,77 @@ def extract_and_delete_tar_gz_files(directory):
directory, landsat_extract_dir_name
)

# Extract the .tar.gz file to the created directory
shutil.unpack_archive(
file_path, extract_dir=landsat_extract_dir
)
target_dir = landsat_extract_dir
unpack = True

elif file.endswith(".SAFE.zip"):
shutil.unpack_archive(file_path, extract_dir=directory)
# Delete the .tar.gz file after extraction
os.remove(file_path)
zfile = zipfile.ZipFile(file_path)
zfile_test = zfile.testzip()
if zfile_test is not None:
print(warning_text)
corrupt_files.append(file_path)
unpack = False
else:
target_dir = directory
unpack = True

elif file.endswith(".SAFE"):
# this should fail if the .SAFE is a corrupt
# downloaded file and not previously extracted
os.listdir(file_path)
unpack = False
remove = False

if unpack is True:
shutil.unpack_archive(file_path, extract_dir=target_dir)
# Delete file after extraction
if remove is True:
os.remove(file_path)
except Exception as exception:
print(f"{warning_text}: {exception}")
corrupt_files.append(file_path)
os.remove(file_path)
if landsat_extract_dir:
shutil.rmtree(landsat_extract_dir)
continue

return corrupt_files


def download_and_extract(
products,
geom,
start_date,
end_date,
download_dir,
cloudcover=100,
max_tries=3,
):
"""
Function to download satellite data using eodag library, extract,
and retry download if files are corrupt
"""
run_download = True
count = 0
while run_download is True:
for product_name in products:
download_with_eodag(
product_type=product_name,
geom=geom,
start_date=start_date,
end_date=end_date,
cloudcover=cloudcover,
download_dir=download_dir,
)
corrupt_files = extract_and_delete_tar_gz_files(download_dir)
if len(corrupt_files) == 0:
run_download = False
count += 1
if count == max_tries:
run_download = False
if len(corrupt_files) > 0:
print(
f"Warning: {exception} - "
"Unable to extract or delete: {file_path}"
f"Scene/s {'; '.join(corrupt_files)} seem to be "
f"corrupt even after {max_tries} downloads. "
"Files are removed and processing continues without them"
)
continue

0 comments on commit a9f11ad

Please sign in to comment.