Skip to content

Commit

Permalink
Document error classifications in the validator
Browse files Browse the repository at this point in the history
Went through the Python error codes to verify my understanding and improve the code comments before making figures for the manuscript/presentation.  This commit fixes issue #7
  • Loading branch information
bbucior committed Feb 9, 2019
1 parent 107a765 commit 2a4e29f
Show file tree
Hide file tree
Showing 2 changed files with 55 additions and 19 deletions.
53 changes: 39 additions & 14 deletions Python/check_mof_linkers.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,10 @@
sbu.cpp to decompose MOFs into fragments. Compare actual fragments from
ToBACCo or GA hMOF structures against their "recipe."
See comments in summarize, GAMOFs/TobaccoMOFs.expected_moffles,
MOFCompare._test_generated, and smiles_diff.py for documentation on the error
classes (and other warnings) reported in the json output from this script.
@author: Ben Bucior
"""

Expand Down Expand Up @@ -134,7 +138,8 @@ def compare_moffles(moffles1, moffles2, names=None):
matched_topology = True
if matched_topology:
continue
# Else, it's a mismatch, so report an error
# Else, it's a mismatch, so report an error as "err_<KEY TYPE>",
# e.g. "err_topology"
comparison[key] = False
comparison['match'] = False
comparison['errors'].append("err_" + key)
Expand All @@ -148,7 +153,9 @@ def compare_moffles(moffles1, moffles2, names=None):
return comparison

def summarize(results):
# Summarize the error classes for MOFFLES results
# Summarize the error classes for MOFFLES results.
# "error_types" from the header of the .json output are defined below
# in the main for loop.
summarized = {'mofs': results, 'errors': dict()}
error_types = {'err_topology': 0, 'err_smiles': 0, 'two': 0, 'three_plus': 0, 'success': 0, 'undefined': 0}
for match in results:
Expand All @@ -161,7 +168,10 @@ def summarize(results):
error_types['two'] += 1
elif len(match['errors']) > 2:
error_types['three_plus'] += 1
elif len(match['errors']) == 1: # Other class of known issue with MOFFLES generation and/or naming scheme
elif len(match['errors']) == 1:
# Other classes of known issue with MOFid generation and/or naming
# scheme are copied verbatim, e.g. "err_"* from the
# `compare_moffles` function directly before this one.
known_issue = match['errors'][0]
if known_issue not in error_types:
error_types[known_issue] = 0 # Initialize new class of errors
Expand Down Expand Up @@ -239,14 +249,17 @@ def _test_generated(self, cif_path, generated_moffles, start_time = None, genera
if moffles_from_name is None: # missing SBU info in the DB file
return None # Currently, skip reporting of structures with undefined nodes/linkers

# Add common classes of error

if (py2 and type(moffles_from_name) in [str, unicode]) or (not py2 and type(moffles_from_name) is str):
orig_moffles = moffles_from_name
moffles_from_name = dict()
moffles_from_name['default'] = orig_moffles
default = parse_moffles(moffles_from_name['default'])
linkers = default['smiles'].split('.')

# Define sources of error when the program exits with errors.
# Without these definitions, the validator would return a generic
# class of error, e.g. "err_topology", instead of actually indicating
# the root cause from program error or timeout.
moffles_from_name['err_timeout'] = assemble_moffles(linkers,
'TIMEOUT', default['cat'], mof_name=default['name'])
moffles_from_name['err_systre_error'] = assemble_moffles(linkers,
Expand Down Expand Up @@ -415,8 +428,9 @@ def expected_moffles(self, cif_path):
if smiles not in sbus:
sbus.append(smiles)
# Also generate the nitrogen-terminated versions for pillared paddlewheels
if (part in ["linker1", "linker2"] and codes['nodes']
in ["1", "2"] and topology == "pcu"):
if (part in ["linker1", "linker2"] and
codes['nodes'] in ["1", "2"] and topology == "pcu"
):
assert len(full_smiles) == 1
n_smi = self._carboxylate_to_nitrogen(full_smiles[0])
if n_smi not in n_components:
Expand All @@ -427,11 +441,14 @@ def expected_moffles(self, cif_path):
sbus.sort()

moffles_options = dict()
moffles_options['default'] = assemble_moffles(sbus, topology, cat,
mof_name=codes['name'])
moffles_options['default'] = assemble_moffles(
sbus, topology, cat, mof_name=codes['name'])

# Flagging common cases when the generated MOF does not match the idealized recipe,
# for example when the bottom-up algorithm is missing a connection.
if topology == 'fcu': # Zr nodes do not always form **fcu** topology, even when linker1==linker2
# TODO: Will we have to somehow add the benzoic acid agent to the **pcu** MOFs above?
# Note: if we change the handling of bound modulators, we will have to add
# benzoic acid to these **pcu** MOF nodes
moffles_options['Zr_mof_not_fcu'] = assemble_moffles(sbus,
'pcu', cat, mof_name=codes['name'])
if codes['nodes'] == '4': # Some Zr hMOFs have four linkers replaced by benzoic acid
Expand All @@ -447,13 +464,18 @@ def expected_moffles(self, cif_path):
moffles_options['V_incomplete_linker'] = assemble_moffles(
v_sbus, 'ERROR', cat, mof_name=codes['name'])
if len(n_components) == 2:
# Ambiguities arise when there is both a linker1 and linker2
# because of the combinations between N/carboxylate and L1/L2
# in pcu paddlewheel MOFs with pillars.
for i, smi in enumerate(n_components):
# Both L1 and L2, plus N-pillar i (unknown which one beforehand)
n_sbus = copy.deepcopy(sbus)
n_sbus.append(smi)
n_sbus.sort()
moffles_options['unk_pillar' + str(i+1)] = assemble_moffles(
n_sbus, 'pcu', cat, mof_name=codes['name'])

# Replacing linker i from a carboxylate to a pillar
n_sbus.remove(n_orig[i])
n_sbus.sort()
moffles_options['replaced_pillar' + str(i+1)] = assemble_moffles(
Expand Down Expand Up @@ -584,8 +606,8 @@ def expected_moffles(self, cif_path):
cat, mof_name=codes['name'])
# Known classes of issues go here
if topology == "tpt": # Systre analysis finds an **stp** net for ToBaCCo MOFs with the **tpt** template
moffles_options['stp_from_tpt'] = assemble_moffles(linkers,
"stp", cat, mof_name=codes['name'])
moffles_options['stp_from_tpt'] = assemble_moffles(
linkers, "stp", cat, mof_name=codes['name'])

if EXPORT_CODES:
moffles_options['_codes'] = codes
Expand Down Expand Up @@ -698,8 +720,11 @@ def _choose_parser(self, cif_name):
# Run a whole directory if specified as a single argument with an ending slash
inputs = glob.glob(args[0] + '*.[Cc][Ii][Ff]')
comparer = AutoCompare(True) # Do not use database of known MOFs
elif len(args) == 1 and (args[0].endswith('.txt') or
args[0].endswith('.smi') or args[0].endswith('.out')):
elif len(args) == 1 and (
args[0].endswith('.txt') or
args[0].endswith('.smi') or
args[0].endswith('.out')
):
input_type = "MOFid line"
with open(args[0], "r") as f:
inputs = f.readlines()
Expand Down
21 changes: 16 additions & 5 deletions Python/smiles_diff.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
"""
Run a diff between two SMILES codes
Report common classes of errors in the calculated MOFid.
Report common classes of errors in the calculated MOFid. Provides more
specific SMILES errors/differences than just "err_smiles". See DIFF_LEVELS
and single_smiles_diff for documentation on the reported error classifications.
@author: Ben Bucior
"""
Expand All @@ -11,8 +13,16 @@
from cpp_cheminformatics import ob_normalize, openbabel_replace, openbabel_formula, openbabel_GetSpacedFormula


# Define priority order for differences between SMILES.
# The lowest matching value indicates the closest match, so it has precedence.
# See the function `single_smiles_diff` to see how these errors are defined.
#
# The _extra suffix from `find_closest_match` indicates that a component has been
# replicated in the MOFid with a redundant copy containing the specified error.
# Examples: a heterocycle containing the expected SMILES and a copy with the wrong
# aromaticity specification; or a copy of a node in the MOF with extra bonds between metals.
DIFF_LEVELS = dict({
'same' : 0,
'same' : 0, # match
'formula' : 50,
'nonplanar_carboxylate': 5,
'phenyl_radicals': 6,
Expand Down Expand Up @@ -53,7 +63,7 @@ def find_closest_match(smiles, preferred_list, extra_list):

def multi_smiles_diff(smiles1, smiles2):
# What are the differences between two dot-separated SMILES codes, a reference and candidate?
# Just bond orders? The entire bond structure? Nothing similar?
# e.g. Just bond orders? The entire bond structure? Nothing similar?
if smiles1 == smiles2:
return []

Expand Down Expand Up @@ -124,6 +134,7 @@ def strip_extra(formula):

def radical_to_carb(smiles):
return openbabel_replace(smiles, '[#6:1][#6D3:2](~[O-0:3])[O:4]', '[#6:1][#6:2](=[O:3])[O-:4]')
# OB will not assign a double bond to nonplanar carboxylates (if the torsion is above a threshold)
if radical_to_carb(smiles1) == radical_to_carb(smiles2):
return "nonplanar_carboxylate"

Expand Down Expand Up @@ -155,9 +166,9 @@ def is_organic(smiles):
base2 = base2.upper()

if base1 != base2:
return molec_type + "_single_bonds"
return molec_type + "_single_bonds" # the underlying connectivity does not match
else:
return molec_type + "_bond_orders"
return molec_type + "_bond_orders" # the connectivity itself is okay, just not the BO

if __name__ == "__main__":
args = sys.argv[1:]
Expand Down

0 comments on commit 2a4e29f

Please sign in to comment.