From 9ca2f13a083e9ca37cd368106864f32e33176499 Mon Sep 17 00:00:00 2001 From: Henning Bredel <881756+ridoo@users.noreply.github.com> Date: Tue, 17 Oct 2023 10:32:45 +0200 Subject: [PATCH] Enumerate unzipped files (#10842) * Keep all uploaded zip content accessible iterdir() is platform dependent, that is the order of the returned items may be different on different platforms. In cases where a zip file contains multiple base_file candidates it will be overridden by the last one found (which varies on different platforms). Also, different files with the same extension (file1.csv, file2.csv) will not be accessible from file_paths as they get overridden, too. The fix enumerates all files to make them accessible from file_paths. * Sorts files during unzip Ensures that unpacked content is sorted before getting handled * Resolve minor issues * Ensure index on extensions found multiple times --- geonode/storage/data_retriever.py | 31 +++++++++++++++++++------ geonode/storage/tests.py | 16 ++++++++++++- geonode/storage/tests/data/example.zip | Bin 140 -> 324 bytes 3 files changed, 39 insertions(+), 8 deletions(-) diff --git a/geonode/storage/data_retriever.py b/geonode/storage/data_retriever.py index 059a3122a74..bc23997a8a2 100644 --- a/geonode/storage/data_retriever.py +++ b/geonode/storage/data_retriever.py @@ -205,17 +205,34 @@ def _unzip(self, zip_name: str) -> Mapping: at the end the zip is deleted """ zip_file = self.file_paths["base_file"] - the_zip = zipfile.ZipFile(zip_file, allowZip64=True) - the_zip.extractall(self.temporary_folder) + with zipfile.ZipFile(zip_file, allowZip64=True) as the_zip: + the_zip.extractall(self.temporary_folder) + available_choices = get_allowed_extensions() not_main_files = ["xml", "sld", "zip", "kmz"] base_file_choices = [x for x in available_choices if x not in not_main_files] - for _file in Path(self.temporary_folder).iterdir(): - if any([_file.name.endswith(_ext) for _ext in base_file_choices]): - self.file_paths["base_file"] = Path(str(_file)) - elif not zipfile.is_zipfile(str(_file)): + sorted_files = sorted(Path(self.temporary_folder).iterdir()) + for _file in sorted_files: + if not zipfile.is_zipfile(str(_file)): + if any([_file.name.endswith(_ext) for _ext in base_file_choices]): + self.file_paths["base_file"] = Path(str(_file)) ext = _file.name.split(".")[-1] - self.file_paths[f"{ext}_file"] = Path(str(_file)) + if f"{ext}_file" in self.file_paths: + existing = self.file_paths[f"{ext}_file"] + self.file_paths[f"{ext}_file"] = [ + Path(str(_file)), + *(existing if isinstance(existing, list) else [existing]), + ] + else: + self.file_paths[f"{ext}_file"] = Path(str(_file)) + + tmp = self.file_paths.copy() + for key, value in self.file_paths.items(): + if isinstance(value, list): + for index, file_path in enumerate(value): + n = f"{key}_{index}" if index > 0 else key + tmp[n] = file_path + self.file_paths = tmp # remiving the zip file os.remove(zip_name) diff --git a/geonode/storage/tests.py b/geonode/storage/tests.py index 48d1f3ebfe2..9a96186ab3e 100644 --- a/geonode/storage/tests.py +++ b/geonode/storage/tests.py @@ -573,7 +573,21 @@ def test_zip_file_should_correctly_recognize_main_extension_with_csv(self): self.assertIsNotNone(storage_manager.data_retriever.temporary_folder) _files = storage_manager.get_retrieved_paths() - self.assertTrue("example.csv" in _files.get("base_file")) + # Selected base_file is not defined in case of multiple csv files + self.assertTrue(_files.get("base_file").endswith(".csv")) + + def test_zip_file_should_correctly_index_file_extensions(self): + # reinitiate the storage manager with the zip file + storage_manager = self.sut( + remote_files={"base_file": os.path.join(f"{self.project_root}", "tests/data/example.zip")} + ) + storage_manager.clone_remote_files() + + self.assertIsNotNone(storage_manager.data_retriever.temporary_folder) + _files = storage_manager.get_retrieved_paths() + self.assertIsNotNone(_files.get("csv_file")) + # extensions found more than once get indexed + self.assertIsNotNone(_files.get("csv_file_1")) @override_settings( SUPPORTED_DATASET_FILE_TYPES=[ diff --git a/geonode/storage/tests/data/example.zip b/geonode/storage/tests/data/example.zip index 099888b8ef30952a98553b0ffe978c41fdd87b58..bb54c689f548c71a5455a78973833b76f8feecb9 100644 GIT binary patch literal 324 zcmWIWW@Zs#W?neK-5V1YrgS22mj92IAC;#N2|MRK4WlveO>EK89P^Cc4F> zc3QSwb2!Ao5P(xJCs1#GNk(cBPMwTQ_RP5KRRQV-0fmMojUXDy60B=?{ko8PJxC2O+ HfjA5RA1F*t literal 140 zcmWIWW@Zs#U|`^22#J>neK-5V1Q1UYh`E6{wIVUMASYEXxw!1Khp&&}mU9e~++tEY rE!(a+9O7UI@MdHZVZf~&XaWNxhyVj5!vefn*+629Kxhf1ZNUlv5U3nd