Merge pull request #154 from brittonsmith/ahfnew

Ahfnew
ytree-project · Nov 17, 2022 · d2266f7 · d2266f7
2 parents 3c4a5d9 + d8f66e6
commit d2266f7
Show file tree

Hide file tree

Showing 9 changed files with 277 additions and 39 deletions.
diff --git a/.circleci/config.yml b/.circleci/config.yml
@@ -170,13 +170,13 @@ jobs:
 
       - restore_cache:
           name: "Restore test data cache."
-          key: test-data-abbey
+          key: test-data-abernathy
 
       - download-test-data
 
       - save_cache:
           name: "Save test data cache."
-          key: test-data-abbey
+          key: test-data-abernathy
           paths:
             - ~/ytree_test
 

diff --git a/doc/source/Loading.rst b/doc/source/Loading.rst
@@ -11,11 +11,15 @@ use the freely available :ref:`sample-data`.
 Amiga Halo Finder
 -----------------
 
-The `Amiga Halo Finder <http://popia.ft.uam.es/AHF/Download.html>`__ format
-stores data in a series of files, with one each per snapshot.  Parameters
-are stored in ".parameters" and ".log" files, halo information in
-".AHF_halos" files, and descendent/ancestor links are stored in ".AHF_mtree"
-files.  Make sure to keep all of these together.  To load, provide the name
+There are a couple data formats associated with the `Amiga Halo Finder
+<http://popia.ft.uam.es/AHF/>`__. Both formats save a series of files
+associated with each snapshot. Parameters are stored in ".parameters"
+and ".log" files and halo properties in ".AHF_halos" files. In the
+older format, descendent/ancestor links are stored in several
+".AHF_mtree" files, one per snapshot. In the newer format, all halo
+linking information is stored in a single file beginning with
+"MergerTree\_" and ending with "-CRMratio2". Make sure to keep all
+these files together in the same directory. To load, provide the name
 of the first ".parameter" file.
 
 .. code-block:: python
@@ -24,6 +28,14 @@ of the first ".parameter" file.
    >>> a = ytree.load("ahf_halos/snap_N64L16_000.parameter",
    ...                hubble_constant=0.7)
 
+Alternatively, the "MergerTree\_" file can also be provided for the
+newer format.
+
+.. code-block:: python
+
+   >>> import ytree
+   >>> a = ytree.load("AHF_100_tiny/MergerTree_GIZMO-NewMDCLUSTER_0047.txt-CRMratio2")
+
 .. note:: Four important notes about loading AHF data:
 
           1. The dimensionless Hubble parameter is not provided in AHF

diff --git a/tests/frontends/test_ahf.py b/tests/frontends/test_ahf.py
@@ -1,5 +1,6 @@
 from ytree.frontends.ahf import \
-    AHFArbor
+    AHFArbor, \
+    AHFNewArbor
 from ytree.utilities.testing import \
     ArborTest, \
     TempDirTest
@@ -9,3 +10,9 @@ class AHFArborTest(TempDirTest, ArborTest):
     test_filename = "ahf_halos/snap_N64L16_000.parameter"
     num_data_files = 136
     tree_skip = 100
+
+class AHFNewArborTest(TempDirTest, ArborTest):
+    arbor_type = AHFNewArbor
+    test_filename = "AHF_100_tiny/GIZMO-NewMDCLUSTER_0047.snap_128.parameter"
+    num_data_files = 5
+    tree_skip = 100
diff --git a/ytree/data_structures/arbor.py b/ytree/data_structures/arbor.py
@@ -1144,6 +1144,14 @@ def _plant_trees(self):
         if self.is_planted:
             return
 
+        # This is a somewhat hacky way of catching halos with links
+        # spanning more than one data set. That said, dict access
+        # is much faster than I thought and perhaps this whole
+        # routine needs to be refactored.
+        if self._has_uids:
+            all_dict = {}
+            missed_connections = []
+
         # this can be called once with the list, but fields are
         # not guaranteed to be returned in order.
         if self._has_uids:
@@ -1182,15 +1190,29 @@ def _plant_trees(self):
                     root = i == 0 or descid == -1
                     # The data says a descendent exists, but it's not there.
                     # This shouldn't happen, but it does sometimes.
+                    # This can also happen when a descendent is more than
+                    # one snapshot removed.
+                    mcollect = False
                     if not root and descid not in lastids:
                         root = True
+                        my_descid = descid
                         descid = data[desc_id_f][it] = -1
+                        if self._has_uids:
+                            mcollect = True
                     tree_node = TreeNode(my_uid, arbor=self, root=root)
                     tree_node._fi = it
                     tree_node.data_file = data_file
                     batch[it] = tree_node
+
+                    if self._has_uids:
+                        all_dict[my_uid] = tree_node
+
                     if root:
-                        trees.append(tree_node)
+                        if mcollect:
+                            tree_node._desc_uid = my_descid
+                            missed_connections.append(tree_node)
+                        else:
+                            trees.append(tree_node)
                     else:
                         ancs[descid].append(tree_node)
                     uid += 1
@@ -1218,6 +1240,18 @@ def _plant_trees(self):
             pbar.update(i+1)
         pbar.finish()
 
+        if self._has_uids:
+            for node in missed_connections:
+                my_desc_uid = node._desc_uid
+                my_desc = all_dict[my_desc_uid]
+                delattr(node, "_desc_uid")
+
+                node._descendent = my_desc
+                node.root = my_desc.root
+                if my_desc._ancestors is None:
+                    my_desc._ancestors = []
+                my_desc._ancestors.append(node)
+
         self._trees = np.array(trees)
         self._size = self._trees.size
 

diff --git a/ytree/frontends/ahf/__init__.py b/ytree/frontends/ahf/__init__.py
@@ -14,4 +14,5 @@
 #-----------------------------------------------------------------------------
 
 from ytree.frontends.ahf.arbor import \
-    AHFArbor
+    AHFArbor, \
+    AHFNewArbor
diff --git a/ytree/frontends/ahf/arbor.py b/ytree/frontends/ahf/arbor.py
@@ -13,27 +13,36 @@
 # The full license is in the file COPYING.txt, distributed with this software.
 #-----------------------------------------------------------------------------
 
+from collections import defaultdict
 import glob
 import os
 import re
 
 from ytree.data_structures.arbor import \
     CatalogArbor
 from ytree.frontends.ahf.fields import \
-    AHFFieldInfo
+    AHFFieldInfo, \
+    AHFNewFieldInfo
 from ytree.frontends.ahf.io import \
-    AHFDataFile
+    AHFDataFile, \
+    AHFNewDataFile
 from ytree.frontends.ahf.misc import \
     parse_AHF_file
 from unyt.unit_registry import \
     UnitRegistry
+from ytree.utilities.io import \
+    f_text_block
 
 class AHFArbor(CatalogArbor):
     """
     Arbor for Amiga Halo Finder data.
     """
 
-    _suffix = ".parameter"
+    _data_suffix = ".AHF_halos"
+    _mtree_suffix = ".AHF_mtree"
+    _par_suffix = ".parameter"
+    _crm_prefix = "MergerTree_"
+    _crm_suffix = ".txt-CRMratio2"
     _field_info_class = AHFFieldInfo
     _data_file_class = AHFDataFile
 
@@ -46,17 +55,42 @@ def __init__(self, filename, log_filename=None,
         self.omega_matter = omega_matter
         self.omega_lambda = omega_lambda
         self._box_size_user = box_size
+        self._file_pattern = re.compile(
+            rf"(^.+[^0-9a-zA-Z]+)(\d+).*{self._par_suffix}$")
         super().__init__(filename)
 
+    def _is_crm_file(self, filename):
+        return os.path.basename(filename).startswith(self._crm_prefix) and \
+          filename.endswith(self._crm_suffix)
+
+    def _get_crm_filename(self, filename):
+        # Searching for <keyword>.something.<suffix>
+        res = re.search(rf"([^\.]+)\.[^\.]+{self._par_suffix}$", filename)
+        if not res:
+            return None
+
+        filekey = res.groups()[0]
+        ddir = os.path.dirname(filekey)
+        bname = os.path.basename(filekey)
+        return os.path.join(ddir, f"{self._crm_prefix}{bname}{self._crm_suffix}")
+
     def _parse_parameter_file(self):
         df = AHFDataFile(self.filename, self)
 
         pars = {"simu.omega0": "omega_matter",
                 "simu.lambda0": "omega_lambda",
                 "simu.boxsize": "box_size"}
-        log_filename = self.log_filename \
-          if self.log_filename is not None else df.filekey + ".log"
-        if os.path.exists(log_filename):
+
+        if self.log_filename is None:
+            fns = glob.glob(df.filekey + "*.log")
+            if fns:
+                log_filename = fns[0]
+            else:
+                log_filename = None
+        else:
+            log_filename = self.log_filename
+
+        if log_filename is not None and os.path.exists(log_filename):
             vals = parse_AHF_file(log_filename, pars, sep=":")
             for attr in ["omega_matter",
                          "omega_lambda"]:
@@ -68,7 +102,7 @@ def _parse_parameter_file(self):
             self.box_size = self.quan(self._box_size_user, "Mpc/h")
 
         # fields from from the .AHF_halos files
-        f = open(f"{df.data_filekey}.AHF_halos")
+        f = open(f"{df.data_filekey}{self._data_suffix}")
         line = f.readline()
         f.close()
 
@@ -95,19 +129,17 @@ def _prefix(self):
             # Match a patten of any characters, followed by some sort of
             # separator (e.g., "." or "_"), then a number, and eventually
             # the suffix.
-            reg = re.search(rf"(^.+[^0-9a-zA-Z]+)\d+.+{self._suffix}$", self.filename)
+            reg = self._file_pattern.search(self.filename)
             self._fprefix = reg.groups()[0]
         return self._fprefix
 
     def _get_data_files(self):
         """
         Get all *.parameter files and sort them in reverse order.
         """
-        my_files = glob.glob(f"{self._prefix}*{self._suffix}")
+        my_files = glob.glob(f"{self._prefix}*{self._par_suffix}")
         # sort by catalog number
-        my_files.sort(
-            key=lambda x:
-            self._get_file_index(x))
+        my_files.sort(key=self._get_file_index)
         self.data_files = \
           [self._data_file_class(f, self) for f in my_files]
 
@@ -120,19 +152,99 @@ def _get_data_files(self):
         self.data_files.reverse()
 
     def _get_file_index(self, f):
-        reg = re.search(rf"{self._prefix}(\d+){self._suffix}$", f)
+        reg = self._file_pattern.search(f)
         if not reg:
             raise RuntimeError(
                 f"Could not locate index within file: {f}.")
-        return int(reg.groups()[0])
+        return int(reg.groups()[1])
 
     @classmethod
     def _is_valid(self, *args, **kwargs):
         """
-        File must end in .AHF_halos and have an associated
-        .parameter file.
+        File mush end in .parameter.
         """
         fn = args[0]
-        if not fn.endswith(self._suffix):
+        if not fn.endswith(self._par_suffix):
             return False
+
+        mtree_fn = self._get_crm_filename(self, fn)
+        if mtree_fn is not None and os.path.exists(mtree_fn):
+            return False
+
+        return True
+
+class AHFNewArbor(AHFArbor):
+    """
+    Arbor for a newer version of Amiga Halo Finder data.
+    """
+
+    _has_uids = True
+    _field_info_class = AHFNewFieldInfo
+    _data_file_class = AHFNewDataFile
+
+    def _set_paths(self, filename):
+        super()._set_paths(filename)
+        if self._is_crm_file(filename):
+            basename = os.path.basename(filename)
+            filekey = basename[len(self._crm_prefix):-len(self._crm_suffix)]
+            pfns = glob.glob(os.path.join(self.directory, filekey) +
+                             f"*{self._par_suffix}")
+            pfns.sort(key=self._get_file_index)
+            self.filename = pfns[-1]
+        self._crm_filename = self._get_crm_filename(self.filename)
+
+    def _plant_trees(self):
+        if self.is_planted:
+            return
+
+        self._compute_links()
+        super()._plant_trees()
+
+    def _compute_links(self):
+        """
+        Read the CRMratio2 file and hand out a dictionary of
+        uid: desc_uid for each data file.
+        """
+
+        links = defaultdict(dict)
+
+        f = open(self._crm_filename, mode="r")
+        for i in range(3):
+            f.readline()
+
+        for line, loc in f_text_block(f, pbar_string="Computing links"):
+            if line.startswith("END"):
+                break
+
+            online = line.split()
+            thing = online[0]
+            if len(online) == 2:
+                my_descid = int(thing)
+                continue
+
+            my_id = int(thing)
+            cid = int(thing[:-12])
+            links[cid][my_id] = my_descid
+        f.close()
+
+        for df in self.data_files:
+            df._links = links[df._catalog_index]
+
+    @classmethod
+    def _is_valid(self, *args, **kwargs):
+        """
+        Filename must end in .parameter or match the CRM naming
+        convention.
+        """
+        fn = args[0]
+        if self._is_crm_file(self, fn):
+            return True
+
+        if not fn.endswith(self._par_suffix):
+            return False
+
+        mtree_fn = self._get_crm_filename(self, fn)
+        if mtree_fn is None or not os.path.exists(mtree_fn):
+            return False
+
         return True