alternative (easier?) way to define datasets #11

Open · wants to merge 6 commits into master
2 changes: 1 addition & 1 deletion MDAnalysisData/__init__.py
@@ -8,7 +8,7 @@
__all__ = ['datasets']

from . import datasets

from .base import fetch, DATASET_NAMES



98 changes: 35 additions & 63 deletions MDAnalysisData/adk_equilibrium.py
@@ -5,88 +5,60 @@
https://figshare.com/articles/Molecular_dynamics_trajectory_for_benchmarking_MDAnalysis/5108170/1
"""

from os.path import dirname, exists, join
from os import makedirs, remove
import codecs

import logging

from .base import get_data_home
from .base import _fetch_remote
from .base import RemoteFileMetadata
from .base import Bunch
from .base import RemoteFileMetadata, Dataset, fetch

NAME = "adk_equilibrium"
DESCRIPTION = "adk_equilibrium.rst"
# The original data can be found at the figshare URL.
# The SHA256 checksum of the zip file changes with every download so we
# cannot check its checksum. Instead we download the individual files
# separately. The keys of this dict are also going to be the keys in the
# Bunch that is returned.
ARCHIVE = {
    'topology': RemoteFileMetadata(
        filename='adk4AKE.psf',
        url='https://ndownloader.figshare.com/files/8672230',
        checksum='1aa947d58fb41b6805dc1e7be4dbe65c6a8f4690f0bd7fc2ae03e7bd437085f4',
    ),
    'trajectory': RemoteFileMetadata(
        filename='1ake_007-nowater-core-dt240ps.dcd',
        url='https://ndownloader.figshare.com/files/8672074',
        checksum='598fcbcfcc425f6eafbe9997238320fcacc6a4613ecce061e1521732bab734bf',
    ),
}

logger = logging.getLogger(__name__)


def fetch_adk_equilibrium(data_home=None, download_if_missing=True):
"""Load the AdK 1us equilibrium trajectory (without water)
"""Load AdK 1us equilibrium trajectory (without water)

Parameters
----------
data_home : optional, default: None
Specify another download and cache folder for the datasets. By default
all MDAnalysisData data is stored in '~/MDAnalysis_data' subfolders.
This dataset is stored in ``<data_home>/adk_equilibrium``.
This dataset is stored in ``<data_home>/adk_transitions_DIMS``.
download_if_missing : optional, default=True
If ``False``, raise a :exc:`IOError` if the data is not locally available
instead of trying to download the data from the source site.

    Returns
    -------
    dataset : dict-like object with the following attributes:
Member Author: and another thing lost is the clear description of what the returned Bunch will have....

Member: I don't want to lose this...

    dataset.topology : filename
        Filename of the topology file
    dataset.trajectory : filename
        Filename of the trajectory file
    dataset.DESCR : string
        Description of the trajectory.


    See :ref:`adk-equilibrium-dataset` for description.
    dataset : dict-like with following attributes:
        topology : filename
            Filename of the topology file
        trajectory : filename
            Filename of the trajectory file
        DESCR : string
            Description of the trajectory.
    """
    name = NAME
    data_location = join(get_data_home(data_home=data_home),
                         name)
    if not exists(data_location):
        makedirs(data_location)

    records = Bunch()
    for file_type, meta in ARCHIVE.items():
        local_path = join(data_location, meta.filename)
        records[file_type] = local_path

        if not exists(local_path):
            if not download_if_missing:
                raise IOError("Data {0}={1} not found and `download_if_missing` is "
                              "False".format(file_type, local_path))
            logger.info("Downloading {0}: {1} -> {2}...".format(
                file_type, meta.url, local_path))
            archive_path = _fetch_remote(meta, dirname=data_location)

    module_path = dirname(__file__)
    with codecs.open(join(module_path, 'descr', DESCRIPTION),
                     encoding="utf-8") as dfile:
        records.DESCR = dfile.read()
    return fetch(AdK_Equilibrium.NAME, data_home=data_home,
                 download_if_missing=download_if_missing)

class AdK_Equilibrium(Dataset):
    __doc__ = fetch_adk_equilibrium.__doc__
    NAME = "adk_equilibrium"
    DESCRIPTION = "adk_equilibrium.rst"
Member Author: One downside I can see is we've lost the short description that the function to generate this had

Member: Don't want to lose the description and don't want to lose the docs...

Member Author: So currently it looks like:

>>> print(ds.__doc__)
AdK 1us equilibrium trajectory (without water)

    Attributes
    ----------
    topology : filename
         Filename of the topology file
    trajectory : filename
         Filename of the trajectory file
    DESCR : string
         Description of the trajectory.

So it's mostly still there


    # The original data can be found at the figshare URL.
    # The SHA256 checksum of the zip file changes with every download so we
    # cannot check its checksum. Instead we download the individual files
    # separately. The keys of this dict are also going to be the keys in the
    # Bunch that is returned.
    ARCHIVE = {
Member: If we normalize all of this then we might be able to just put all these data into JSON or YAML files.

Member Author: Sure, this is essentially py-son at this point

        'topology': RemoteFileMetadata(
            filename='adk4AKE.psf',
            url='https://ndownloader.figshare.com/files/8672230',
            checksum='1aa947d58fb41b6805dc1e7be4dbe65c6a8f4690f0bd7fc2ae03e7bd437085f4',
        ),
        'trajectory': RemoteFileMetadata(
            filename='1ake_007-nowater-core-dt240ps.dcd',
            url='https://ndownloader.figshare.com/files/8672074',
            checksum='598fcbcfcc425f6eafbe9997238320fcacc6a4613ecce061e1521732bab734bf',
        ),
    }

    return records
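
Following up on the JSON/YAML idea from the thread above, a minimal sketch of what loading this metadata from a file could look like (the adk_equilibrium.json filename and the load_archive helper are hypothetical, not part of this diff):

import json
from os.path import dirname, join

from .base import RemoteFileMetadata

def load_archive(path):
    # Read {'topology': {'filename': ..., 'url': ..., 'checksum': ...}, ...}
    # and rebuild the RemoteFileMetadata entry for each file type.
    with open(path, encoding="utf-8") as f:
        raw = json.load(f)
    return {file_type: RemoteFileMetadata(**meta)
            for file_type, meta in raw.items()}

# ARCHIVE = load_archive(join(dirname(__file__), 'adk_equilibrium.json'))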
60 changes: 59 additions & 1 deletion MDAnalysisData/base.py
@@ -31,9 +31,11 @@

import shutil
from collections import namedtuple
from os import environ, listdir, makedirs
from os import environ, listdir, makedirs, remove
from os.path import dirname, exists, expanduser, isdir, join, splitext
import hashlib
import codecs




@@ -94,6 +96,62 @@ def __setstate__(self, state):
RemoteFileMetadata = namedtuple('RemoteFileMetadata',
                                ['filename', 'url', 'checksum'])

DATASET_NAMES = {}

class _DatasetRegister(type):
    # Metaclass: any Dataset subclass that sets NAME registers itself
    # in DATASET_NAMES so that fetch() can look it up by name.
    def __new__(meta, name, bases, class_dict):
        cls = type.__new__(meta, name, bases, class_dict)
        if cls.NAME is not None:
            DATASET_NAMES[cls.NAME] = cls
        return cls


class Dataset(Bunch, metaclass=_DatasetRegister):
    NAME = None
    DESCRIPTION = None
    ARCHIVE = None

    def __init__(self, data_home=None, download_if_missing=True):
        data_location = join(get_data_home(data_home=data_home),
                             self.NAME)

        if not exists(data_location):
            makedirs(data_location)

        contents = {}
        for file_type, meta in self.ARCHIVE.items():
            local_path = join(data_location, meta.filename)
            contents[file_type] = local_path

            if not exists(local_path):
                if not download_if_missing:
                    raise IOError("Data {0}={1} not found and `download_if_missing` is "
                                  "False".format(file_type, local_path))
                logger.info("Downloading {0}: {1} -> {2}...".format(
                    file_type, meta.url, local_path))
                archive_path = _fetch_remote(meta, dirname=data_location)

        module_path = dirname(__file__)
        with codecs.open(join(module_path, 'descr', self.DESCRIPTION),
                         encoding="utf-8") as dfile:
            contents['DESCR'] = dfile.read()

        # finally, init the Bunch object
        super().__init__(**contents)

    def __repr__(self):
        return self.__doc__
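
With the register metaclass and Dataset base class in place, adding a dataset reduces to a declaration like the following sketch (all names here are illustrative, not part of this diff):

class MyNewDataset(Dataset):
    """My new dataset (short summary, shown by repr())."""
    NAME = "my_new_dataset"
    DESCRIPTION = "my_new_dataset.rst"
    ARCHIVE = {
        'topology': RemoteFileMetadata(
            filename='system.psf',
            url='https://example.com/files/12345',
            checksum='<sha256 of the downloaded file>',
        ),
    }

# _DatasetRegister has already added it to DATASET_NAMES,
# so fetch('my_new_dataset') now works with no further wiring.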


def fetch(dataset, data_home=None, download_if_missing=True):
Member Author: This allows MDADATA.fetch('adk_equilibrium'), rather than a separate function for each dataset
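
For example, the resulting usage would look roughly like this (attribute names come from the ARCHIVE keys):

from MDAnalysisData import fetch

adk = fetch('adk_equilibrium')
print(adk.topology)    # local path to adk4AKE.psf
print(adk.trajectory)  # local path to 1ake_007-nowater-core-dt240ps.dcd
print(adk.DESCR)       # contents of adk_equilibrium.rst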

Member: There's a reason why explicit functions: tab completion and introspection. (sklearn does it and it works really well – much better than having to know the name of the dataset)

I'd like to keep explicit functions – both for ease of use and for same "look and feel" as sklearn.datasets (as well as getting docs!)

We can have a generic mechanism and generate the fetch_* functions.
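
A minimal sketch of such a generic mechanism, building on the DATASET_NAMES registry from this diff (the _make_fetcher helper is hypothetical):

def _make_fetcher(name, cls):
    def fetcher(data_home=None, download_if_missing=True):
        return fetch(name, data_home=data_home,
                     download_if_missing=download_if_missing)
    fetcher.__name__ = 'fetch_{}'.format(name)
    fetcher.__doc__ = cls.__doc__  # keep per-dataset docs for help()
    return fetcher

# generate fetch_adk_equilibrium etc. as real module attributes,
# so tab completion and introspection keep working
for _name, _cls in DATASET_NAMES.items():
    globals()['fetch_{}'.format(_name)] = _make_fetcher(_name, _cls)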

Member: Yes, that's nice for this case. But did you look at some of the other accessors like fetch_adk_transitions_DIMS where we get a tar file and unpack? We might be able to reduce our requirements to these two types of cases.

Member Author: could put the compression/other info into the RemoteFileMetadata object
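
For instance (a hypothetical extension, not in this diff), an optional field with a None default would keep the existing three-field entries working unchanged (namedtuple defaults need Python 3.7+):

from collections import namedtuple

# 'compression' could name an unpacking scheme ('tar', 'zip', ...)
# or be None for plain files that are used as-is after download.
RemoteFileMetadata = namedtuple(
    'RemoteFileMetadata',
    ['filename', 'url', 'checksum', 'compression'],
    defaults=(None,))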

Member Author: Ok yeah the namespace is nice, we could implement the static functions as

def fetch_adk():
    return base.fetch('adk')

Member: RemoteFileMetadata is verbatim from sklearn. Might be useful to keep it that way and really keep it simple.

If anything, we should build data structures that contain RemoteFileMetadata instances that map remote and local. Have a look at the transitions dataset to see what else we have.

Finally, have a look at sklearn.datasets (and the outstanding docs) to see the variance. I think one reason for copy&paste code is that ultimately each dataset in the wild might have slightly different requirements. Still, that's not to say that we can't try to get a bit of order in ;-).

"""Grab a named dataset"""
try:
return DATASET_NAMES[dataset](data_home=data_home,
download_if_missing=True)
except KeyError:
raise KeyError("unknown dataset: {}".format(dataset))


def get_data_home(data_home=None):
"""Return the path of the MDAnalysisData data dir.
Expand Down
3 changes: 1 addition & 2 deletions MDAnalysisData/datasets.py
@@ -7,7 +7,7 @@


from .base import get_data_home, clear_data_home
from .adk_equilibrium import fetch_adk_equilibrium
from . adk_equilibrium import fetch_adk_equilibrium
from .adk_transitions import (fetch_adk_transitions_DIMS,
                              fetch_adk_transitions_FRODA)
from .ifabp_water import fetch_ifabp_water
@@ -16,7 +16,6 @@
__all__ = [
    'get_data_home',
    'clear_data_home',
    'fetch_adk_equilibrium',
    'fetch_adk_transitions_DIMS',
    'fetch_adk_transitions_FRODA',
    'fetch_ifabp_water',