diff --git a/.circleci/config.yml b/.circleci/config.yml index 1555bb3d..f03e0c32 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -170,13 +170,13 @@ jobs: - restore_cache: name: "Restore test data cache." - key: test-data-abbey + key: test-data-abernathy - download-test-data - save_cache: name: "Save test data cache." - key: test-data-abbey + key: test-data-abernathy paths: - ~/ytree_test diff --git a/doc/source/Loading.rst b/doc/source/Loading.rst index 6d55ae8e..48b75a91 100644 --- a/doc/source/Loading.rst +++ b/doc/source/Loading.rst @@ -11,11 +11,15 @@ use the freely available :ref:`sample-data`. Amiga Halo Finder ----------------- -The `Amiga Halo Finder `__ format -stores data in a series of files, with one each per snapshot. Parameters -are stored in ".parameters" and ".log" files, halo information in -".AHF_halos" files, and descendent/ancestor links are stored in ".AHF_mtree" -files. Make sure to keep all of these together. To load, provide the name +There are a couple data formats associated with the `Amiga Halo Finder +`__. Both formats save a series of files +associated with each snapshot. Parameters are stored in ".parameters" +and ".log" files and halo properties in ".AHF_halos" files. In the +older format, descendent/ancestor links are stored in several +".AHF_mtree" files, one per snapshot. In the newer format, all halo +linking information is stored in a single file beginning with +"MergerTree\_" and ending with "-CRMratio2". Make sure to keep all +these files together in the same directory. To load, provide the name of the first ".parameter" file. .. code-block:: python @@ -24,6 +28,14 @@ of the first ".parameter" file. >>> a = ytree.load("ahf_halos/snap_N64L16_000.parameter", ... hubble_constant=0.7) +Alternatively, the "MergerTree\_" file can also be provided for the +newer format. + +.. code-block:: python + + >>> import ytree + >>> a = ytree.load("AHF_100_tiny/MergerTree_GIZMO-NewMDCLUSTER_0047.txt-CRMratio2") + .. note:: Four important notes about loading AHF data: 1. The dimensionless Hubble parameter is not provided in AHF diff --git a/tests/frontends/test_ahf.py b/tests/frontends/test_ahf.py index d7e91b48..b883e810 100644 --- a/tests/frontends/test_ahf.py +++ b/tests/frontends/test_ahf.py @@ -1,5 +1,6 @@ from ytree.frontends.ahf import \ - AHFArbor + AHFArbor, \ + AHFNewArbor from ytree.utilities.testing import \ ArborTest, \ TempDirTest @@ -9,3 +10,9 @@ class AHFArborTest(TempDirTest, ArborTest): test_filename = "ahf_halos/snap_N64L16_000.parameter" num_data_files = 136 tree_skip = 100 + +class AHFNewArborTest(TempDirTest, ArborTest): + arbor_type = AHFNewArbor + test_filename = "AHF_100_tiny/GIZMO-NewMDCLUSTER_0047.snap_128.parameter" + num_data_files = 5 + tree_skip = 100 diff --git a/ytree/data_structures/arbor.py b/ytree/data_structures/arbor.py index 0ad78882..1af167bc 100644 --- a/ytree/data_structures/arbor.py +++ b/ytree/data_structures/arbor.py @@ -1144,6 +1144,14 @@ def _plant_trees(self): if self.is_planted: return + # This is a somewhat hacky way of catching halos with links + # spanning more than one data set. That said, dict access + # is much faster than I thought and perhaps this whole + # routine needs to be refactored. + if self._has_uids: + all_dict = {} + missed_connections = [] + # this can be called once with the list, but fields are # not guaranteed to be returned in order. if self._has_uids: @@ -1182,15 +1190,29 @@ def _plant_trees(self): root = i == 0 or descid == -1 # The data says a descendent exists, but it's not there. # This shouldn't happen, but it does sometimes. + # This can also happen when a descendent is more than + # one snapshot removed. + mcollect = False if not root and descid not in lastids: root = True + my_descid = descid descid = data[desc_id_f][it] = -1 + if self._has_uids: + mcollect = True tree_node = TreeNode(my_uid, arbor=self, root=root) tree_node._fi = it tree_node.data_file = data_file batch[it] = tree_node + + if self._has_uids: + all_dict[my_uid] = tree_node + if root: - trees.append(tree_node) + if mcollect: + tree_node._desc_uid = my_descid + missed_connections.append(tree_node) + else: + trees.append(tree_node) else: ancs[descid].append(tree_node) uid += 1 @@ -1218,6 +1240,18 @@ def _plant_trees(self): pbar.update(i+1) pbar.finish() + if self._has_uids: + for node in missed_connections: + my_desc_uid = node._desc_uid + my_desc = all_dict[my_desc_uid] + delattr(node, "_desc_uid") + + node._descendent = my_desc + node.root = my_desc.root + if my_desc._ancestors is None: + my_desc._ancestors = [] + my_desc._ancestors.append(node) + self._trees = np.array(trees) self._size = self._trees.size diff --git a/ytree/frontends/ahf/__init__.py b/ytree/frontends/ahf/__init__.py index 2c44e96a..a27111db 100644 --- a/ytree/frontends/ahf/__init__.py +++ b/ytree/frontends/ahf/__init__.py @@ -14,4 +14,5 @@ #----------------------------------------------------------------------------- from ytree.frontends.ahf.arbor import \ - AHFArbor + AHFArbor, \ + AHFNewArbor diff --git a/ytree/frontends/ahf/arbor.py b/ytree/frontends/ahf/arbor.py index e1f3b30f..1af691ad 100644 --- a/ytree/frontends/ahf/arbor.py +++ b/ytree/frontends/ahf/arbor.py @@ -13,6 +13,7 @@ # The full license is in the file COPYING.txt, distributed with this software. #----------------------------------------------------------------------------- +from collections import defaultdict import glob import os import re @@ -20,20 +21,28 @@ from ytree.data_structures.arbor import \ CatalogArbor from ytree.frontends.ahf.fields import \ - AHFFieldInfo + AHFFieldInfo, \ + AHFNewFieldInfo from ytree.frontends.ahf.io import \ - AHFDataFile + AHFDataFile, \ + AHFNewDataFile from ytree.frontends.ahf.misc import \ parse_AHF_file from unyt.unit_registry import \ UnitRegistry +from ytree.utilities.io import \ + f_text_block class AHFArbor(CatalogArbor): """ Arbor for Amiga Halo Finder data. """ - _suffix = ".parameter" + _data_suffix = ".AHF_halos" + _mtree_suffix = ".AHF_mtree" + _par_suffix = ".parameter" + _crm_prefix = "MergerTree_" + _crm_suffix = ".txt-CRMratio2" _field_info_class = AHFFieldInfo _data_file_class = AHFDataFile @@ -46,17 +55,42 @@ def __init__(self, filename, log_filename=None, self.omega_matter = omega_matter self.omega_lambda = omega_lambda self._box_size_user = box_size + self._file_pattern = re.compile( + rf"(^.+[^0-9a-zA-Z]+)(\d+).*{self._par_suffix}$") super().__init__(filename) + def _is_crm_file(self, filename): + return os.path.basename(filename).startswith(self._crm_prefix) and \ + filename.endswith(self._crm_suffix) + + def _get_crm_filename(self, filename): + # Searching for .something. + res = re.search(rf"([^\.]+)\.[^\.]+{self._par_suffix}$", filename) + if not res: + return None + + filekey = res.groups()[0] + ddir = os.path.dirname(filekey) + bname = os.path.basename(filekey) + return os.path.join(ddir, f"{self._crm_prefix}{bname}{self._crm_suffix}") + def _parse_parameter_file(self): df = AHFDataFile(self.filename, self) pars = {"simu.omega0": "omega_matter", "simu.lambda0": "omega_lambda", "simu.boxsize": "box_size"} - log_filename = self.log_filename \ - if self.log_filename is not None else df.filekey + ".log" - if os.path.exists(log_filename): + + if self.log_filename is None: + fns = glob.glob(df.filekey + "*.log") + if fns: + log_filename = fns[0] + else: + log_filename = None + else: + log_filename = self.log_filename + + if log_filename is not None and os.path.exists(log_filename): vals = parse_AHF_file(log_filename, pars, sep=":") for attr in ["omega_matter", "omega_lambda"]: @@ -68,7 +102,7 @@ def _parse_parameter_file(self): self.box_size = self.quan(self._box_size_user, "Mpc/h") # fields from from the .AHF_halos files - f = open(f"{df.data_filekey}.AHF_halos") + f = open(f"{df.data_filekey}{self._data_suffix}") line = f.readline() f.close() @@ -95,7 +129,7 @@ def _prefix(self): # Match a patten of any characters, followed by some sort of # separator (e.g., "." or "_"), then a number, and eventually # the suffix. - reg = re.search(rf"(^.+[^0-9a-zA-Z]+)\d+.+{self._suffix}$", self.filename) + reg = self._file_pattern.search(self.filename) self._fprefix = reg.groups()[0] return self._fprefix @@ -103,11 +137,9 @@ def _get_data_files(self): """ Get all *.parameter files and sort them in reverse order. """ - my_files = glob.glob(f"{self._prefix}*{self._suffix}") + my_files = glob.glob(f"{self._prefix}*{self._par_suffix}") # sort by catalog number - my_files.sort( - key=lambda x: - self._get_file_index(x)) + my_files.sort(key=self._get_file_index) self.data_files = \ [self._data_file_class(f, self) for f in my_files] @@ -120,19 +152,99 @@ def _get_data_files(self): self.data_files.reverse() def _get_file_index(self, f): - reg = re.search(rf"{self._prefix}(\d+){self._suffix}$", f) + reg = self._file_pattern.search(f) if not reg: raise RuntimeError( f"Could not locate index within file: {f}.") - return int(reg.groups()[0]) + return int(reg.groups()[1]) @classmethod def _is_valid(self, *args, **kwargs): """ - File must end in .AHF_halos and have an associated - .parameter file. + File mush end in .parameter. """ fn = args[0] - if not fn.endswith(self._suffix): + if not fn.endswith(self._par_suffix): return False + + mtree_fn = self._get_crm_filename(self, fn) + if mtree_fn is not None and os.path.exists(mtree_fn): + return False + + return True + +class AHFNewArbor(AHFArbor): + """ + Arbor for a newer version of Amiga Halo Finder data. + """ + + _has_uids = True + _field_info_class = AHFNewFieldInfo + _data_file_class = AHFNewDataFile + + def _set_paths(self, filename): + super()._set_paths(filename) + if self._is_crm_file(filename): + basename = os.path.basename(filename) + filekey = basename[len(self._crm_prefix):-len(self._crm_suffix)] + pfns = glob.glob(os.path.join(self.directory, filekey) + + f"*{self._par_suffix}") + pfns.sort(key=self._get_file_index) + self.filename = pfns[-1] + self._crm_filename = self._get_crm_filename(self.filename) + + def _plant_trees(self): + if self.is_planted: + return + + self._compute_links() + super()._plant_trees() + + def _compute_links(self): + """ + Read the CRMratio2 file and hand out a dictionary of + uid: desc_uid for each data file. + """ + + links = defaultdict(dict) + + f = open(self._crm_filename, mode="r") + for i in range(3): + f.readline() + + for line, loc in f_text_block(f, pbar_string="Computing links"): + if line.startswith("END"): + break + + online = line.split() + thing = online[0] + if len(online) == 2: + my_descid = int(thing) + continue + + my_id = int(thing) + cid = int(thing[:-12]) + links[cid][my_id] = my_descid + f.close() + + for df in self.data_files: + df._links = links[df._catalog_index] + + @classmethod + def _is_valid(self, *args, **kwargs): + """ + Filename must end in .parameter or match the CRM naming + convention. + """ + fn = args[0] + if self._is_crm_file(self, fn): + return True + + if not fn.endswith(self._par_suffix): + return False + + mtree_fn = self._get_crm_filename(self, fn) + if mtree_fn is None or not os.path.exists(mtree_fn): + return False + return True diff --git a/ytree/frontends/ahf/fields.py b/ytree/frontends/ahf/fields.py index 89372f96..a60d8f2f 100644 --- a/ytree/frontends/ahf/fields.py +++ b/ytree/frontends/ahf/fields.py @@ -48,7 +48,7 @@ class AHFFieldInfo(FieldInfoContainer): ) alias_fields = ( - ("halo_id", "ID", None), + ("halo_id", "ID", ""), ("mass", "Mvir", m_unit), ("virial_mass", "Mvir", m_unit), ("position_x", "Xc", p_unit), @@ -59,7 +59,7 @@ class AHFFieldInfo(FieldInfoContainer): ("velocity_z", "VZc", v_unit), ("virial_radius", "Rvir", p_unit), ("velocity_dispersion", "sigV", v_unit), - ("spin_parameter", "lambda", None), + ("spin_parameter", "lambda", ""), ("kinetic_energy", "Ekin", "Msun/h * (km/s)**2"), ("potential_energy", "Epot", "Msun/h * (km/s)**2"), ) @@ -70,3 +70,22 @@ class AHFFieldInfo(FieldInfoContainer): ('uid', id_type), ('desc_uid', id_type) ) + +class AHFNewFieldInfo(AHFFieldInfo): + alias_fields = ( + ("uid", "ID", ""), + ("desc_uid", "desc_id", ""), + ("mass", "Mvir", m_unit), + ("virial_mass", "Mvir", m_unit), + ("position_x", "Xc", p_unit), + ("position_y", "Yc", p_unit), + ("position_z", "Zc", p_unit), + ("velocity_x", "VXc", v_unit), + ("velocity_y", "VYc", v_unit), + ("velocity_z", "VZc", v_unit), + ("virial_radius", "Rvir", p_unit), + ("velocity_dispersion", "sigV", v_unit), + ("spin_parameter", "lambda", ""), + ("kinetic_energy", "Ekin", "Msun/h * (km/s)**2"), + ("potential_energy", "Epot", "Msun/h * (km/s)**2"), + ) diff --git a/ytree/frontends/ahf/io.py b/ytree/frontends/ahf/io.py index fd0ce824..ba07e14d 100644 --- a/ytree/frontends/ahf/io.py +++ b/ytree/frontends/ahf/io.py @@ -25,13 +25,16 @@ CatalogDataFile from ytree.utilities.io import \ f_text_block +from ytree.utilities.misc import fround class AHFDataFile(CatalogDataFile): _redshift_precision = 3 def __init__(self, filename, arbor): + self.arbor = weakref.proxy(arbor) self.filename = filename - self.filekey = self.filename[:self.filename.rfind(".parameter")] + self._catalog_index = arbor._get_file_index(filename) + self.filekey = self.filename[:self.filename.rfind(self.arbor._par_suffix)] self._parse_header() rprec = self._redshift_precision @@ -40,21 +43,25 @@ def __init__(self, filename, arbor): if res: self.data_filekey = self.filekey[:res.end()] else: - minz = 10**-rprec - if abs(self.redshift) < minz: - self.redshift = 0 zfmt = f".0{rprec}f" - my_z = format(self.redshift, zfmt) - self.data_filekey = f"{self.filekey}.z{my_z}" - - self.halos_filename = self.data_filekey + ".AHF_halos" - self.mtree_filename = self.data_filekey + ".AHF_mtree" + for inc in [0, -10**-rprec]: + my_z = format(fround(self.redshift, decimals=rprec) + inc, zfmt) + fkey = f"{self.filekey}.z{my_z}" + if os.path.exists(fkey + self.arbor._data_suffix): + self.data_filekey = fkey + break + + if not hasattr(self, "data_filekey"): + raise FileNotFoundError( + f"Cannot find data file: {fkey + self.arbor._data_suffix}.") + + self.halos_filename = self.data_filekey + self.arbor._data_suffix + self.mtree_filename = self.data_filekey + self.arbor._mtree_suffix if not os.path.exists(self.mtree_filename): self.mtree_filename = None self.fh = None self._parse_data_header() self.offsets = None - self.arbor = weakref.proxy(arbor) def _parse_data_header(self): """ @@ -291,3 +298,26 @@ def _read_fields(self, fields, tree_nodes=None, dtypes=None): self._get_mtree_fields(tfields, dtypes, field_data) return field_data + +class AHFNewDataFile(AHFDataFile): + def _compute_links(self): + """ + Read the CRMratio2 file. + """ + + def _compute_links(self): + self.arbor._compute_links() + + def _get_mtree_fields(self, tfields, dtypes, field_data): + if not tfields: + return + + my_ids = field_data["ID"] + field_data["desc_id"] = descids = np.full( + len(my_ids), -1, dtype=dtypes["desc_id"]) + + if not self.links: + return + + for i in range(descids.size): + descids[i] = self.links.get(my_ids[i], -1) diff --git a/ytree/utilities/misc.py b/ytree/utilities/misc.py new file mode 100644 index 00000000..aec0560e --- /dev/null +++ b/ytree/utilities/misc.py @@ -0,0 +1,23 @@ +""" +miscellaneous utilities + + + +""" + +#----------------------------------------------------------------------------- +# Copyright (c) ytree development team. All rights reserved. +# +# Distributed under the terms of the Modified BSD License. +# +# The full license is in the file COPYING.txt, distributed with this software. +#----------------------------------------------------------------------------- + +import numpy as np + +def fround(val, decimals=0): + """ + Round 0.5 up to 1 and so on. + """ + fac = 10**decimals + return np.floor(val * fac + 0.5) / fac