From 434b956d12cba04ba132d63b5d583e511dfedda0 Mon Sep 17 00:00:00 2001
From: Daniel Levine <levineds@meta.com>
Date: Tue, 30 Jul 2024 15:46:04 -0700
Subject: [PATCH] [BE] Remove large files from fairchem and add references to
 new location as needed (#761)

* Remove large files from fairchem and add references to new location as needed

* ruff differs from isort specification...

* add fine-tuning supporting-info since it is over 2MB

* add unittest

* linting

* typo

* import

* Use better function name and re-use fairchem_root function

---------

Co-authored-by: Muhammed Shuaibi <45150244+mshuaibii@users.noreply.github.com>
---
 docs/core/datasets/oc20dense.md               |   2 +-
 docs/tutorials/NRR/NRR_example.md             |   2 +-
 src/fairchem/applications/AdsorbML/README.md  |   2 +-
 .../2023_neurips_challenge/challenge_eval.py  |   7 +
 .../core/scripts/download_large_files.py      |  76 ++++
 src/fairchem/data/oc/README.md                |   3 +-
 src/fairchem/data/oc/core/bulk.py             |   4 +
 src/fairchem/data/oc/databases/update.py      |  17 +-
 src/fairchem/data/odac/README.md              |   4 +-
 src/fairchem/data/odac/force_field/README.md  |   2 +-
 .../promising_mof_energies/energy.py          | 358 +++++++++---------
 .../promising_mof_features/readme             |   4 +-
 .../applications/cattsunami/tests/conftest.py |  15 +-
 tests/core/test_download_large_files.py       |  16 +
 14 files changed, 319 insertions(+), 193 deletions(-)
 create mode 100644 src/fairchem/core/scripts/download_large_files.py
 create mode 100644 tests/core/test_download_large_files.py

diff --git a/docs/core/datasets/oc20dense.md b/docs/core/datasets/oc20dense.md
index fb07a09ad..64639889c 100644
--- a/docs/core/datasets/oc20dense.md
+++ b/docs/core/datasets/oc20dense.md
@@ -11,7 +11,7 @@ The OC20Dense dataset is a validation dataset which was used to assess model per
 |ASE Trajectories    |29G    |112G   | [ee937e5290f8f720c914dc9a56e0281f](https://dl.fbaipublicfiles.com/opencatalystproject/data/adsorbml/oc20_dense_trajectories.tar.gz)   |
 
 The following files are also provided to be used for evaluation and general information:
-* `oc20dense_mapping.pkl` : Mapping of the LMDB `sid` to general metadata information -
+* `oc20dense_mapping.pkl` : Mapping of the LMDB `sid` to general metadata information (if this file is not present, run the command `python src/fairchem/core/scripts/download_large_files.py adsorbml` from the root of the fairchem repo to download it) -
   * `system_id`: Unique system identifier for an adsorbate, bulk, surface combination.
   * `config_id`: Unique configuration identifier, where `rand` and `heur` correspond to random and heuristic initial configurations, respectively.
   * `mpid`: Materials Project bulk identifier.
diff --git a/docs/tutorials/NRR/NRR_example.md b/docs/tutorials/NRR/NRR_example.md
index b69e078d1..cc5ab6d07 100644
--- a/docs/tutorials/NRR/NRR_example.md
+++ b/docs/tutorials/NRR/NRR_example.md
@@ -62,7 +62,7 @@ To do this, we will enumerate adsorbate-slab configurations and run ML relaxatio
 
 +++
 
-Be sure to set the path in `fairchem/data/oc/configs/paths.py` to point to the correct place or pass the paths as an argument. The database pickles can be found in `fairchem/data/oc/databases/pkls`. We will show one explicitly here as an example and then run all of them in an automated fashion for brevity.
+Be sure to set the path in `fairchem/data/oc/configs/paths.py` to point to the correct place or pass the paths as an argument. The database pickles can be found in `fairchem/data/oc/databases/pkls` (some pkl files are only downloaded by running the command `python src/fairchem/core/scripts/download_large_files.py oc` from the root of the fairchem repo). We will show one explicitly here as an example and then run all of them in an automated fashion for brevity.
 
 ```{code-cell} ipython3
 import fairchem.data.oc
diff --git a/src/fairchem/applications/AdsorbML/README.md b/src/fairchem/applications/AdsorbML/README.md
index ca5be5737..700c06b67 100644
--- a/src/fairchem/applications/AdsorbML/README.md
+++ b/src/fairchem/applications/AdsorbML/README.md
@@ -21,7 +21,7 @@ NOTE - ASE trajectories exclude systems that were not converged or had invalid c
 |ASE Trajectories    |29G    |112G   | [ee937e5290f8f720c914dc9a56e0281f](https://dl.fbaipublicfiles.com/opencatalystproject/data/adsorbml/oc20_dense_trajectories.tar.gz)   |
 
 The following files are also provided to be used for evaluation and general information:
-* `oc20dense_mapping.pkl` : Mapping of the LMDB `sid` to general metadata information -
+* `oc20dense_mapping.pkl` : Mapping of the LMDB `sid` to general metadata information (if this file is not present, run the command `python src/fairchem/core/scripts/download_large_files.py adsorbml` from the root of the fairchem repo to download it) -
   * `system_id`: Unique system identifier for an adsorbate, bulk, surface combination.
   * `config_id`: Unique configuration identifier, where `rand` and `heur` correspond to random and heuristic initial configurations, respectively.
   * `mpid`: Materials Project bulk identifier.
diff --git a/src/fairchem/applications/AdsorbML/adsorbml/2023_neurips_challenge/challenge_eval.py b/src/fairchem/applications/AdsorbML/adsorbml/2023_neurips_challenge/challenge_eval.py
index d7e801fe0..01c492bba 100644
--- a/src/fairchem/applications/AdsorbML/adsorbml/2023_neurips_challenge/challenge_eval.py
+++ b/src/fairchem/applications/AdsorbML/adsorbml/2023_neurips_challenge/challenge_eval.py
@@ -7,6 +7,8 @@
 
 import numpy as np
 
+from fairchem.core.scripts import download_large_files
+
 
 def is_successful(best_pred_energy, best_dft_energy, SUCCESS_THRESHOLD=0.1):
     """
@@ -161,6 +163,11 @@ def main():
 
     # targets and metadata are expected to be in
     # the same directory as this script
+    if (
+        not Path(__file__).with_name("oc20dense_val_targets.pkl").exists()
+        or not Path(__file__).with_name("ml_relaxed_dft_targets.pkl").exists()
+    ):
+        download_large_files.download_file_group("adsorbml")
     targets = pickle.load(
         open(Path(__file__).with_name("oc20dense_val_targets.pkl"), "rb")
     )
diff --git a/src/fairchem/core/scripts/download_large_files.py b/src/fairchem/core/scripts/download_large_files.py
new file mode 100644
index 000000000..f79fa2156
--- /dev/null
+++ b/src/fairchem/core/scripts/download_large_files.py
@@ -0,0 +1,101 @@
+"""Download large data files that are hosted outside of the fairchem git repo.
+
+Run ``python download_large_files.py [FILE_GROUP]``; omitting FILE_GROUP
+downloads every group.
+"""
+
+from __future__ import annotations
+
+import argparse
+from pathlib import Path
+from urllib.request import urlretrieve
+
+from fairchem.core.common.tutorial_utils import fairchem_root
+
+S3_ROOT = "https://dl.fbaipublicfiles.com/opencatalystproject/data/large_files/"
+
+# Destination of each file relative to the repository root, keyed by group name.
+# Only the basename is appended to S3_ROOT when building the download URL.
+FILE_GROUPS = {
+    "odac": [
+        Path("configs/odac/s2ef/scaling_factors/painn.pt"),
+        Path("src/fairchem/data/odac/force_field/data_w_oms.json"),
+        Path(
+            "src/fairchem/data/odac/promising_mof/promising_mof_features/JmolData.jar"
+        ),
+        Path(
+            "src/fairchem/data/odac/promising_mof/promising_mof_energies/adsorption_energy.txt"
+        ),
+        Path("src/fairchem/data/odac/supercell_info.csv"),
+    ],
+    "oc": [Path("src/fairchem/data/oc/databases/pkls/bulks.pkl")],
+    "adsorbml": [
+        Path(
+            "src/fairchem/applications/AdsorbML/adsorbml/2023_neurips_challenge/oc20dense_mapping.pkl"
+        ),
+        Path(
+            "src/fairchem/applications/AdsorbML/adsorbml/2023_neurips_challenge/ml_relaxed_dft_targets.pkl"
+        ),
+    ],
+    "cattsunami": [
+        Path("tests/applications/cattsunami/tests/autoframe_inputs_dissociation.pkl"),
+        Path("tests/applications/cattsunami/tests/autoframe_inputs_transfer.pkl"),
+    ],
+    "docs": [
+        Path("docs/tutorials/NRR/NRR_example_bulks.pkl"),
+        Path("docs/core/fine-tuning/supporting-information.json"),
+    ],
+}
+
+
+def parse_args():
+    """Parse the command line and return the resulting namespace.
+
+    The single positional argument ``file_group`` is optional and defaults to "ALL".
+    """
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "file_group",
+        type=str,
+        # A positional is mandatory unless nargs="?"; without it the default
+        # below could never take effect.
+        nargs="?",
+        help="Group of files to download",
+        default="ALL",
+        choices=["ALL", *list(FILE_GROUPS)],
+    )
+    return parser.parse_args()
+
+
+def download_file_group(file_group):
+    """Download the files of one group, skipping those already on disk.
+
+    Args:
+        file_group: a key of FILE_GROUPS, or "ALL" to fetch every group.
+
+    Raises:
+        ValueError: if ``file_group`` is neither "ALL" nor a FILE_GROUPS key.
+    """
+    if file_group in FILE_GROUPS:
+        files_to_download = FILE_GROUPS[file_group]
+    elif file_group == "ALL":
+        files_to_download = [item for group in FILE_GROUPS.values() for item in group]
+    else:
+        raise ValueError(
+            f'Requested file group {file_group} not recognized. Please select one of {["ALL", *list(FILE_GROUPS)]}'
+        )
+
+    # NOTE(review): presumably fairchem_root() is <repo>/src/fairchem, making
+    # parents[1] the repository root that FILE_GROUPS paths hang off -- confirm.
+    fc_root = fairchem_root().parents[1]
+    for file in files_to_download:
+        if not (fc_root / file).exists():
+            print(f"Downloading {file}...")
+            urlretrieve(S3_ROOT + file.name, fc_root / file)
+        else:
+            print(f"{file} already exists")
+
+
+if __name__ == "__main__":
+    args = parse_args()
+    download_file_group(args.file_group)
diff --git a/src/fairchem/data/oc/README.md b/src/fairchem/data/oc/README.md
index 20205d1d5..06aba8887 100644
--- a/src/fairchem/data/oc/README.md
+++ b/src/fairchem/data/oc/README.md
@@ -9,6 +9,7 @@ This repository hosts the adsorbate-catalyst input generation workflow used in t
 
 To install just run in your favorite environment with python >= 3.9
 * `pip install fairchem-data-oc`
+* `python src/fairchem/core/scripts/download_large_files.py oc`
 
 ## Workflow
 
@@ -155,7 +156,7 @@ python structure_generator.py \
 
 ### Bulks
 
-A database of bulk materials taken from existing databases (i.e. Materials Project) and relaxed with consistent RPBE settings may be found in `ocdata/databases/pkls/bulks.pkl`. To preview what bulks are available, view the corresponding mapping between indices and bulks (bulk id and composition): https://dl.fbaipublicfiles.com/opencatalystproject/data/input_generation/mapping_bulks_2021sep20.txt
+A database of bulk materials taken from existing databases (i.e. Materials Project) and relaxed with consistent RPBE settings may be found in `databases/pkls/bulks.pkl` (if not, run the command `python src/fairchem/core/scripts/download_large_files.py oc` from the root of the fairchem repo).  To preview what bulks are available, view the corresponding mapping between indices and bulks (bulk id and composition): https://dl.fbaipublicfiles.com/opencatalystproject/data/input_generation/mapping_bulks_2021sep20.txt
 
 ### Adsorbates
 
diff --git a/src/fairchem/data/oc/core/bulk.py b/src/fairchem/data/oc/core/bulk.py
index 9568ad362..6710b4388 100644
--- a/src/fairchem/data/oc/core/bulk.py
+++ b/src/fairchem/data/oc/core/bulk.py
@@ -9,6 +9,8 @@
 from fairchem.data.oc.core.slab import Slab
 from fairchem.data.oc.databases.pkls import BULK_PKL_PATH
 
+from fairchem.core.scripts import download_large_files
+
 if TYPE_CHECKING:
     import ase
 
@@ -51,6 +53,8 @@ def __init__(
             self.src_id = None
         else:
             if bulk_db is None:
+                if bulk_db_path == BULK_PKL_PATH and not os.path.exists(BULK_PKL_PATH):
+                    download_large_files.download_file_group("oc")
                 with open(bulk_db_path, "rb") as fp:
                     bulk_db = pickle.load(fp)
 
diff --git a/src/fairchem/data/oc/databases/update.py b/src/fairchem/data/oc/databases/update.py
index f9ca1f645..bab75709c 100644
--- a/src/fairchem/data/oc/databases/update.py
+++ b/src/fairchem/data/oc/databases/update.py
@@ -6,12 +6,15 @@
 from __future__ import annotations
 
 import pickle
+from pathlib import Path
 
 import ase.io
 from ase.atoms import Atoms
 from ase.calculators.singlepoint import SinglePointCalculator as SPC
 from tqdm import tqdm
 
+from fairchem.core.scripts import download_large_files
+
 
 # Monkey patch fix
 def pbc_patch(self):
@@ -29,7 +32,7 @@ def set_pbc_patch(self, pbc):
 
 def update_pkls():
     with open(
-        "ocdata/databases/pkls/adsorbates.pkl",
+        "oc/databases/pkls/adsorbates.pkl",
         "rb",
     ) as fp:
         data = pickle.load(fp)
@@ -38,13 +41,15 @@ def update_pkls():
         pbc = data[idx][0].cell._pbc
         data[idx][0]._pbc = pbc
     with open(
-        "ocdata/databases/pkls/adsorbates_new.pkl",
+        "oc/databases/pkls/adsorbates_new.pkl",
         "wb",
     ) as fp:
         pickle.dump(data, fp)
 
+    if not Path("oc/databases/pkls/bulks.pkl").exists():
+        download_large_files.download_file_group("oc")
     with open(
-        "ocdata/databases/pkls/bulks.pkl",
+        "oc/databases/pkls/bulks.pkl",
         "rb",
     ) as fp:
         data = pickle.load(fp)
@@ -64,7 +69,7 @@ def update_pkls():
 
         bulks.append((atoms, bulk_id))
     with open(
-        "ocdata/databases/pkls/bulks_new.pkl",
+        "oc/databases/pkls/bulks_new.pkl",
         "wb",
     ) as f:
         pickle.dump(bulks, f)
@@ -73,7 +78,7 @@ def update_pkls():
 def update_dbs():
     for db_name in ["adsorbates", "bulks"]:
         db = ase.io.read(
-            f"ocdata/databases/ase/{db_name}.db",
+            f"oc/databases/ase/{db_name}.db",
             ":",
         )
         new_data = []
@@ -90,7 +95,7 @@ def update_dbs():
             new_data.append(atoms)
 
         ase.io.write(
-            f"ocdata/databases/ase/{db_name}_new.db",
+            f"oc/databases/ase/{db_name}_new.db",
             new_data,
         )
 
diff --git a/src/fairchem/data/odac/README.md b/src/fairchem/data/odac/README.md
index d6529edd7..f46ababd0 100644
--- a/src/fairchem/data/odac/README.md
+++ b/src/fairchem/data/odac/README.md
@@ -4,9 +4,11 @@ To download the ODAC23 dataset, please see the links [here](https://fair-chem.gi
 
 Pre-trained ML models and configs are available [here](https://fair-chem.github.io/core/model_checkpoints.html#open-direct-air-capture-2023-odac23).
 
+Large ODAC files can be downloaded by running the command `python src/fairchem/core/scripts/download_large_files.py odac` from the root of the fairchem repo.
+
 This repository contains the list of [promising MOFs](https://github.com/FAIR-Chem/fairchem/tree/main/src/fairchem/data/odac/promising_mof) discovered in the ODAC23 paper, as well as details of the [classifical force field calculations](https://github.com/FAIR-Chem/fairchem/tree/main/src/fairchem/data/odac/force_field). 
 
-Information about supercells can be found in [supercell_info.csv](https://github.com/FAIR-Chem/fairchem/blob/main/src/fairchem/data/odac/supercell_info.csv) for each example.
+Information about supercells can be found in [supercell_info.csv](https://dl.fbaipublicfiles.com/opencatalystproject/data/large_files/supercell_info.csv) for each example (this file is downloaded to the local repo only when the above script is run).
 
 ## Citing
 
diff --git a/src/fairchem/data/odac/force_field/README.md b/src/fairchem/data/odac/force_field/README.md
index debe565bd..25714603f 100644
--- a/src/fairchem/data/odac/force_field/README.md
+++ b/src/fairchem/data/odac/force_field/README.md
@@ -2,7 +2,7 @@
 
 This folder contains data and scripts related to the classical FF analysis performed in this work.
 
-- The `data_w_oms.json` file contains all successful FF interaction energy calculations with both system information and DFT-computed interaction energies. Calculations were performed across the in-domain training, validation, and test sets.
+- The `data_w_oms.json` file contains all successful FF interaction energy calculations with both system information and DFT-computed interaction energies. Calculations were performed across the in-domain training, validation, and test sets. If this file is not present, run the command `python src/fairchem/core/scripts/download_large_files.py odac` from the root of the fairchem repo to download it.
 - The `data_w_ml.json` file contains the same information for systems with successful ML interaction energy predictions. Only systems in the in-domain test set are included here.
 - The `FF_analysis.py` script performs the error calculations discussed in the paper and generates the four panels of Figure 5. All of the data used in this analysis is contained in 'data_w_oms.json" for reproducibility.
 - The `FF_calcs` folder contains example calculations for classical FF interaction energy predictions.
diff --git a/src/fairchem/data/odac/promising_mof/promising_mof_energies/energy.py b/src/fairchem/data/odac/promising_mof/promising_mof_energies/energy.py
index 6a9d37924..547806cc0 100644
--- a/src/fairchem/data/odac/promising_mof/promising_mof_energies/energy.py
+++ b/src/fairchem/data/odac/promising_mof/promising_mof_energies/energy.py
@@ -1,8 +1,14 @@
 from __future__ import annotations
 
+import os
+
 import matplotlib.pyplot as plt
 import pandas as pd
 
+from fairchem.core.scripts import download_large_files
+
+if not os.path.exists("adsorption_energy.txt"):
+    download_large_files.download_file_group("odac")
 raw_ads_energy_data = pd.read_csv("adsorption_energy.txt", header=None, sep=" ")
 complete_data = pd.DataFrame(
     index=range(raw_ads_energy_data.shape[0]),
@@ -170,12 +176,12 @@
             current_lowest_energy
             < lowest_energy_data_co2.loc[index_this_case, "ads_energy_ev"]
         ):
-            lowest_energy_data_co2.loc[index_this_case, "ads_energy_ev"] = (
-                current_lowest_energy
-            )
-            lowest_energy_data_co2.loc[index_this_case, "configuration_index"] = (
-                current_configuration_index
-            )
+            lowest_energy_data_co2.loc[
+                index_this_case, "ads_energy_ev"
+            ] = current_lowest_energy
+            lowest_energy_data_co2.loc[
+                index_this_case, "configuration_index"
+            ] = current_configuration_index
             lowest_energy_data_co2.loc[index_this_case, "Name"] = current_name
 
 
@@ -212,12 +218,12 @@
             current_lowest_energy
             < lowest_energy_data_h2o.loc[index_this_case, "ads_energy_ev"]
         ):
-            lowest_energy_data_h2o.loc[index_this_case, "ads_energy_ev"] = (
-                current_lowest_energy
-            )
-            lowest_energy_data_h2o.loc[index_this_case, "configuration_index"] = (
-                current_configuration_index
-            )
+            lowest_energy_data_h2o.loc[
+                index_this_case, "ads_energy_ev"
+            ] = current_lowest_energy
+            lowest_energy_data_h2o.loc[
+                index_this_case, "configuration_index"
+            ] = current_configuration_index
             lowest_energy_data_h2o.loc[index_this_case, "Name"] = current_name
 
 lowest_energy_data_co_ads = pd.DataFrame(
@@ -254,12 +260,12 @@
             current_lowest_energy
             < lowest_energy_data_co_ads.loc[index_this_case, "ads_energy_ev"]
         ):
-            lowest_energy_data_co_ads.loc[index_this_case, "ads_energy_ev"] = (
-                current_lowest_energy
-            )
-            lowest_energy_data_co_ads.loc[index_this_case, "configuration_index"] = (
-                current_configuration_index
-            )
+            lowest_energy_data_co_ads.loc[
+                index_this_case, "ads_energy_ev"
+            ] = current_lowest_energy
+            lowest_energy_data_co_ads.loc[
+                index_this_case, "configuration_index"
+            ] = current_configuration_index
             lowest_energy_data_co_ads.loc[index_this_case, "Name"] = current_name
 
 
@@ -298,12 +304,12 @@
             current_lowest_energy
             < lowest_energy_data_co_ads_2.loc[index_this_case, "ads_energy_ev"]
         ):
-            lowest_energy_data_co_ads_2.loc[index_this_case, "ads_energy_ev"] = (
-                current_lowest_energy
-            )
-            lowest_energy_data_co_ads_2.loc[index_this_case, "configuration_index"] = (
-                current_configuration_index
-            )
+            lowest_energy_data_co_ads_2.loc[
+                index_this_case, "ads_energy_ev"
+            ] = current_lowest_energy
+            lowest_energy_data_co_ads_2.loc[
+                index_this_case, "configuration_index"
+            ] = current_configuration_index
             lowest_energy_data_co_ads_2.loc[index_this_case, "Name"] = current_name
 
 
@@ -439,9 +445,9 @@
             current_lowest_energy
             < lowest_energy_data_co2_defective.loc[index_this_case, "ads_energy_ev"]
         ):
-            lowest_energy_data_co2_defective.loc[index_this_case, "ads_energy_ev"] = (
-                current_lowest_energy
-            )
+            lowest_energy_data_co2_defective.loc[
+                index_this_case, "ads_energy_ev"
+            ] = current_lowest_energy
             lowest_energy_data_co2_defective.loc[
                 index_this_case, "configuration_index"
             ] = current_configuration_index
@@ -485,9 +491,9 @@
             current_lowest_energy
             < lowest_energy_data_h2o_defective.loc[index_this_case, "ads_energy_ev"]
         ):
-            lowest_energy_data_h2o_defective.loc[index_this_case, "ads_energy_ev"] = (
-                current_lowest_energy
-            )
+            lowest_energy_data_h2o_defective.loc[
+                index_this_case, "ads_energy_ev"
+            ] = current_lowest_energy
             lowest_energy_data_h2o_defective.loc[
                 index_this_case, "configuration_index"
             ] = current_configuration_index
@@ -542,9 +548,9 @@
             lowest_energy_data_co_ads_defective.loc[
                 index_this_case, "configuration_index"
             ] = current_configuration_index
-            lowest_energy_data_co_ads_defective.loc[index_this_case, "Name"] = (
-                current_name
-            )
+            lowest_energy_data_co_ads_defective.loc[
+                index_this_case, "Name"
+            ] = current_name
 
 lowest_energy_data_co_ads_2_defective = pd.DataFrame(
     columns=complete_data_merged_defective_co_ads_2.columns
@@ -600,9 +606,9 @@
             lowest_energy_data_co_ads_2_defective.loc[
                 index_this_case, "configuration_index"
             ] = current_configuration_index
-            lowest_energy_data_co_ads_2_defective.loc[index_this_case, "Name"] = (
-                current_name
-            )
+            lowest_energy_data_co_ads_2_defective.loc[
+                index_this_case, "Name"
+            ] = current_name
 
 
 adsorption_data_defective = pd.DataFrame(
@@ -646,136 +652,132 @@
 
     # adsorption_data_defective_defective.iloc[count,0]=mof_name
 
-    adsorption_data_defective.loc[count, "n_converged_CO2"] = (
-        complete_data_merged_defective[
-            (complete_data_merged_defective["MOF"] == mof_name)
-            & (complete_data_merged_defective["defect_conc"] == current_defect_conc)
-            & (complete_data_merged_defective["defect_index"] == current_defect_index)
-            & (complete_data_merged_defective["n_CO2"] == 1)
-            & (complete_data_merged_defective["n_H2O"] == 0)
-        ].shape[0]
-    )
-    adsorption_data_defective.loc[count, "n_converged_H2O"] = (
-        complete_data_merged_defective[
-            (complete_data_merged_defective["MOF"] == mof_name)
-            & (complete_data_merged_defective["defect_conc"] == current_defect_conc)
-            & (complete_data_merged_defective["defect_index"] == current_defect_index)
-            & (complete_data_merged_defective["n_CO2"] == 0)
-            & (complete_data_merged_defective["n_H2O"] == 1)
-        ].shape[0]
-    )
-    adsorption_data_defective.loc[count, "n_converged_co"] = (
-        complete_data_merged_defective[
-            (complete_data_merged_defective["MOF"] == mof_name)
-            & (complete_data_merged_defective["defect_conc"] == current_defect_conc)
-            & (complete_data_merged_defective["defect_index"] == current_defect_index)
-            & (complete_data_merged_defective["n_CO2"] == 1)
-            & (complete_data_merged_defective["n_H2O"] == 1)
-        ].shape[0]
-    )
-    adsorption_data_defective.loc[count, "n_converged_co_2"] = (
-        complete_data_merged_defective[
-            (complete_data_merged_defective["MOF"] == mof_name)
-            & (complete_data_merged_defective["defect_conc"] == current_defect_conc)
-            & (complete_data_merged_defective["defect_index"] == current_defect_index)
-            & (complete_data_merged_defective["n_CO2"] == 1)
-            & (complete_data_merged_defective["n_H2O"] == 2)
-        ].shape[0]
-    )
+    adsorption_data_defective.loc[
+        count, "n_converged_CO2"
+    ] = complete_data_merged_defective[
+        (complete_data_merged_defective["MOF"] == mof_name)
+        & (complete_data_merged_defective["defect_conc"] == current_defect_conc)
+        & (complete_data_merged_defective["defect_index"] == current_defect_index)
+        & (complete_data_merged_defective["n_CO2"] == 1)
+        & (complete_data_merged_defective["n_H2O"] == 0)
+    ].shape[
+        0
+    ]
+    adsorption_data_defective.loc[
+        count, "n_converged_H2O"
+    ] = complete_data_merged_defective[
+        (complete_data_merged_defective["MOF"] == mof_name)
+        & (complete_data_merged_defective["defect_conc"] == current_defect_conc)
+        & (complete_data_merged_defective["defect_index"] == current_defect_index)
+        & (complete_data_merged_defective["n_CO2"] == 0)
+        & (complete_data_merged_defective["n_H2O"] == 1)
+    ].shape[
+        0
+    ]
+    adsorption_data_defective.loc[
+        count, "n_converged_co"
+    ] = complete_data_merged_defective[
+        (complete_data_merged_defective["MOF"] == mof_name)
+        & (complete_data_merged_defective["defect_conc"] == current_defect_conc)
+        & (complete_data_merged_defective["defect_index"] == current_defect_index)
+        & (complete_data_merged_defective["n_CO2"] == 1)
+        & (complete_data_merged_defective["n_H2O"] == 1)
+    ].shape[
+        0
+    ]
+    adsorption_data_defective.loc[
+        count, "n_converged_co_2"
+    ] = complete_data_merged_defective[
+        (complete_data_merged_defective["MOF"] == mof_name)
+        & (complete_data_merged_defective["defect_conc"] == current_defect_conc)
+        & (complete_data_merged_defective["defect_index"] == current_defect_index)
+        & (complete_data_merged_defective["n_CO2"] == 1)
+        & (complete_data_merged_defective["n_H2O"] == 2)
+    ].shape[
+        0
+    ]
 
     if not lowest_energy_data_co2_defective[
         (lowest_energy_data_co2_defective["MOF"] == mof_name)
         & (lowest_energy_data_co2_defective["defect_conc"] == current_defect_conc)
         & (lowest_energy_data_co2_defective["defect_index"] == current_defect_index)
     ].empty:
-        adsorption_data_defective.loc[count, "ads_CO2"] = (
-            lowest_energy_data_co2_defective[
-                (lowest_energy_data_co2_defective["MOF"] == mof_name)
-                & (
-                    lowest_energy_data_co2_defective["defect_conc"]
-                    == current_defect_conc
-                )
-                & (
-                    lowest_energy_data_co2_defective["defect_index"]
-                    == current_defect_index
-                )
-            ].iloc[0, 6]
-        )
-        adsorption_data_defective.loc[count, "config_CO2"] = (
-            lowest_energy_data_co2_defective[
-                (lowest_energy_data_co2_defective["MOF"] == mof_name)
-                & (
-                    lowest_energy_data_co2_defective["defect_conc"]
-                    == current_defect_conc
-                )
-                & (
-                    lowest_energy_data_co2_defective["defect_index"]
-                    == current_defect_index
-                )
-            ].iloc[0, 5]
-        )
+        adsorption_data_defective.loc[
+            count, "ads_CO2"
+        ] = lowest_energy_data_co2_defective[
+            (lowest_energy_data_co2_defective["MOF"] == mof_name)
+            & (lowest_energy_data_co2_defective["defect_conc"] == current_defect_conc)
+            & (lowest_energy_data_co2_defective["defect_index"] == current_defect_index)
+        ].iloc[
+            0, 6
+        ]
+        adsorption_data_defective.loc[
+            count, "config_CO2"
+        ] = lowest_energy_data_co2_defective[
+            (lowest_energy_data_co2_defective["MOF"] == mof_name)
+            & (lowest_energy_data_co2_defective["defect_conc"] == current_defect_conc)
+            & (lowest_energy_data_co2_defective["defect_index"] == current_defect_index)
+        ].iloc[
+            0, 5
+        ]
     if not lowest_energy_data_h2o_defective[
         (lowest_energy_data_h2o_defective["MOF"] == mof_name)
         & (lowest_energy_data_h2o_defective["defect_conc"] == current_defect_conc)
         & (lowest_energy_data_h2o_defective["defect_index"] == current_defect_index)
     ].empty:
-        adsorption_data_defective.loc[count, "ads_H2O"] = (
-            lowest_energy_data_h2o_defective[
-                (lowest_energy_data_h2o_defective["MOF"] == mof_name)
-                & (
-                    lowest_energy_data_h2o_defective["defect_conc"]
-                    == current_defect_conc
-                )
-                & (
-                    lowest_energy_data_h2o_defective["defect_index"]
-                    == current_defect_index
-                )
-            ].iloc[0, 6]
-        )
-        adsorption_data_defective.loc[count, "config_H2O"] = (
-            lowest_energy_data_h2o_defective[
-                (lowest_energy_data_h2o_defective["MOF"] == mof_name)
-                & (
-                    lowest_energy_data_h2o_defective["defect_conc"]
-                    == current_defect_conc
-                )
-                & (
-                    lowest_energy_data_h2o_defective["defect_index"]
-                    == current_defect_index
-                )
-            ].iloc[0, 5]
-        )
+        adsorption_data_defective.loc[
+            count, "ads_H2O"
+        ] = lowest_energy_data_h2o_defective[
+            (lowest_energy_data_h2o_defective["MOF"] == mof_name)
+            & (lowest_energy_data_h2o_defective["defect_conc"] == current_defect_conc)
+            & (lowest_energy_data_h2o_defective["defect_index"] == current_defect_index)
+        ].iloc[
+            0, 6
+        ]
+        adsorption_data_defective.loc[
+            count, "config_H2O"
+        ] = lowest_energy_data_h2o_defective[
+            (lowest_energy_data_h2o_defective["MOF"] == mof_name)
+            & (lowest_energy_data_h2o_defective["defect_conc"] == current_defect_conc)
+            & (lowest_energy_data_h2o_defective["defect_index"] == current_defect_index)
+        ].iloc[
+            0, 5
+        ]
     if not lowest_energy_data_co_ads_defective[
         (lowest_energy_data_co_ads_defective["MOF"] == mof_name)
         & (lowest_energy_data_co_ads_defective["defect_conc"] == current_defect_conc)
         & (lowest_energy_data_co_ads_defective["defect_index"] == current_defect_index)
     ].empty:
-        adsorption_data_defective.loc[count, "ads_co"] = (
-            lowest_energy_data_co_ads_defective[
-                (lowest_energy_data_co_ads_defective["MOF"] == mof_name)
-                & (
-                    lowest_energy_data_co_ads_defective["defect_conc"]
-                    == current_defect_conc
-                )
-                & (
-                    lowest_energy_data_co_ads_defective["defect_index"]
-                    == current_defect_index
-                )
-            ].iloc[0, 6]
-        )
-        adsorption_data_defective.loc[count, "config_co"] = (
-            lowest_energy_data_co_ads_defective[
-                (lowest_energy_data_co_ads_defective["MOF"] == mof_name)
-                & (
-                    lowest_energy_data_co_ads_defective["defect_conc"]
-                    == current_defect_conc
-                )
-                & (
-                    lowest_energy_data_co_ads_defective["defect_index"]
-                    == current_defect_index
-                )
-            ].iloc[0, 5]
-        )
+        adsorption_data_defective.loc[
+            count, "ads_co"
+        ] = lowest_energy_data_co_ads_defective[
+            (lowest_energy_data_co_ads_defective["MOF"] == mof_name)
+            & (
+                lowest_energy_data_co_ads_defective["defect_conc"]
+                == current_defect_conc
+            )
+            & (
+                lowest_energy_data_co_ads_defective["defect_index"]
+                == current_defect_index
+            )
+        ].iloc[
+            0, 6
+        ]
+        adsorption_data_defective.loc[
+            count, "config_co"
+        ] = lowest_energy_data_co_ads_defective[
+            (lowest_energy_data_co_ads_defective["MOF"] == mof_name)
+            & (
+                lowest_energy_data_co_ads_defective["defect_conc"]
+                == current_defect_conc
+            )
+            & (
+                lowest_energy_data_co_ads_defective["defect_index"]
+                == current_defect_index
+            )
+        ].iloc[
+            0, 5
+        ]
     if not lowest_energy_data_co_ads_2_defective[
         (lowest_energy_data_co_ads_2_defective["MOF"] == mof_name)
         & (lowest_energy_data_co_ads_2_defective["defect_conc"] == current_defect_conc)
@@ -784,32 +786,36 @@
             == current_defect_index
         )
     ].empty:
-        adsorption_data_defective.loc[count, "ads_co_2"] = (
-            lowest_energy_data_co_ads_2_defective[
-                (lowest_energy_data_co_ads_2_defective["MOF"] == mof_name)
-                & (
-                    lowest_energy_data_co_ads_2_defective["defect_conc"]
-                    == current_defect_conc
-                )
-                & (
-                    lowest_energy_data_co_ads_2_defective["defect_index"]
-                    == current_defect_index
-                )
-            ].iloc[0, 6]
-        )
-        adsorption_data_defective.loc[count, "config_co_2"] = (
-            lowest_energy_data_co_ads_2_defective[
-                (lowest_energy_data_co_ads_2_defective["MOF"] == mof_name)
-                & (
-                    lowest_energy_data_co_ads_2_defective["defect_conc"]
-                    == current_defect_conc
-                )
-                & (
-                    lowest_energy_data_co_ads_2_defective["defect_index"]
-                    == current_defect_index
-                )
-            ].iloc[0, 5]
-        )
+        adsorption_data_defective.loc[
+            count, "ads_co_2"
+        ] = lowest_energy_data_co_ads_2_defective[
+            (lowest_energy_data_co_ads_2_defective["MOF"] == mof_name)
+            & (
+                lowest_energy_data_co_ads_2_defective["defect_conc"]
+                == current_defect_conc
+            )
+            & (
+                lowest_energy_data_co_ads_2_defective["defect_index"]
+                == current_defect_index
+            )
+        ].iloc[
+            0, 6
+        ]
+        adsorption_data_defective.loc[
+            count, "config_co_2"
+        ] = lowest_energy_data_co_ads_2_defective[
+            (lowest_energy_data_co_ads_2_defective["MOF"] == mof_name)
+            & (
+                lowest_energy_data_co_ads_2_defective["defect_conc"]
+                == current_defect_conc
+            )
+            & (
+                lowest_energy_data_co_ads_2_defective["defect_index"]
+                == current_defect_index
+            )
+        ].iloc[
+            0, 5
+        ]
 
 
 # read the mofs missing DDEC charges
diff --git a/src/fairchem/data/odac/promising_mof/promising_mof_features/readme b/src/fairchem/data/odac/promising_mof/promising_mof_features/readme
index afb41617a..4910e85ea 100644
--- a/src/fairchem/data/odac/promising_mof/promising_mof_features/readme
+++ b/src/fairchem/data/odac/promising_mof/promising_mof_features/readme
@@ -7,10 +7,10 @@ Three criterias have to be satisfied: 1. 2 rings are parallel; 2. the distance o
 2. metal-oxygen-metal bridges: [$(select {metal})]~[$(select oxygen)]~[$(select {metal})]
 3. uncoordinated nitrogen atoms: [$([#7X2r5])]
 
-We recommend using the jmolData.jar for high-throughput calculations. jmol.jar, which takes more time to run, is good for visualization and debug.
+We recommend using JmolData.jar for high-throughput calculations. jmol.jar, which takes more time to run, is better suited to visualization and debugging.
 Steps:
 1. Change the content of 'list_MOF.txt' to the paths of the MOFs 
-2. Use 'java  -jar JmolData.jar -on  -s features.txt' to run the script
+2. Use 'java  -jar JmolData.jar -on  -s features.txt' to run the script. If JmolData.jar is missing, run the command `python src/fairchem/core/scripts/download_large_files.py odac` from the root of the fairchem repo to download it.
 3. The output will be saved in the 'output.txt' in the same directory by default, and it can be modified at the last line of the code.
 	'output.txt' has 10 columns:
 		1. ID is the index in 'list_MOF.txt'.
diff --git a/tests/applications/cattsunami/tests/conftest.py b/tests/applications/cattsunami/tests/conftest.py
index 24222d9cf..9afdc0a96 100644
--- a/tests/applications/cattsunami/tests/conftest.py
+++ b/tests/applications/cattsunami/tests/conftest.py
@@ -1,6 +1,9 @@
-from pathlib import Path
+import os
 import pickle
+from pathlib import Path
+
 import pytest
+from fairchem.core.scripts import download_large_files
 
 
 @pytest.fixture(scope="class")
@@ -17,11 +20,17 @@ def desorption_inputs(request):
 
 @pytest.fixture(scope="class")
 def dissociation_inputs(request):
-    with open(Path(__file__).parent / "autoframe_inputs_dissociation.pkl", "rb") as fp:
+    pkl_path = Path(__file__).parent / "autoframe_inputs_dissociation.pkl"
+    if not pkl_path.exists():
+        download_large_files.download_file_group("cattsunami")
+    with open(pkl_path, "rb") as fp:
         request.cls.inputs = pickle.load(fp)
 
 
 @pytest.fixture(scope="class")
 def transfer_inputs(request):
-    with open(Path(__file__).parent / "autoframe_inputs_transfer.pkl", "rb") as fp:
+    pkl_path = Path(__file__).parent / "autoframe_inputs_transfer.pkl"
+    if not pkl_path.exists():
+        download_large_files.download_file_group("cattsunami")
+    with open(pkl_path, "rb") as fp:
         request.cls.inputs = pickle.load(fp)
diff --git a/tests/core/test_download_large_files.py b/tests/core/test_download_large_files.py
new file mode 100644
index 000000000..991f8ce34
--- /dev/null
+++ b/tests/core/test_download_large_files.py
@@ -0,0 +1,16 @@
+import os
+from unittest.mock import patch
+
+from fairchem.core.scripts import download_large_files as dl_large
+
+
+@patch.object(dl_large, "urlretrieve")
+def test_download_large_files(url_mock):
+    def urlretrieve_mock(x, y):
+        if not os.path.exists(os.path.dirname(y)):
+            raise ValueError(
+                f"The path to {y} does not exist. fairchem directory structure has changed,"
+            )
+
+    url_mock.side_effect = urlretrieve_mock
+    dl_large.download_file_group("ALL")