Document error classifications in the validator

Went through the Python error codes to verify my understanding and improve the code comments before making figures for the manuscript/presentation. This commit fixes issue #7
snurr-group · Feb 9, 2019 · 2a4e29f · 2a4e29f
1 parent 107a765
commit 2a4e29f
Show file tree

Hide file tree

Showing 2 changed files with 55 additions and 19 deletions.
diff --git a/Python/check_mof_linkers.py b/Python/check_mof_linkers.py
@@ -5,6 +5,10 @@
 sbu.cpp to decompose MOFs into fragments.  Compare actual fragments from
 ToBACCo or GA hMOF structures against their "recipe."
 
+See comments in summarize, GAMOFs/TobaccoMOFs.expected_moffles,
+MOFCompare._test_generated, and smiles_diff.py for documentation on the error
+classes (and other warnings) reported in the json output from this script.
+
 @author: Ben Bucior
 """
 
@@ -134,7 +138,8 @@ def compare_moffles(moffles1, moffles2, names=None):
 					matched_topology = True
 			if matched_topology:
 				continue
-		# Else, it's a mismatch, so report an error
+		# Else, it's a mismatch, so report an error as "err_<KEY TYPE>",
+		# e.g. "err_topology"
 		comparison[key] = False
 		comparison['match'] = False
 		comparison['errors'].append("err_" + key)
@@ -148,7 +153,9 @@ def compare_moffles(moffles1, moffles2, names=None):
 	return comparison
 
 def summarize(results):
-	# Summarize the error classes for MOFFLES results
+	# Summarize the error classes for MOFFLES results.
+	# "error_types" from the header of the .json output are defined below
+	# in the main for loop.
 	summarized = {'mofs': results, 'errors': dict()}
 	error_types = {'err_topology': 0, 'err_smiles': 0, 'two': 0, 'three_plus': 0, 'success': 0, 'undefined': 0}
 	for match in results:
@@ -161,7 +168,10 @@ def summarize(results):
 			error_types['two'] += 1
 		elif len(match['errors']) > 2:
 			error_types['three_plus'] += 1
-		elif len(match['errors']) == 1:  # Other class of known issue with MOFFLES generation and/or naming scheme
+		elif len(match['errors']) == 1:
+			# Other classes of known issue with MOFid generation and/or naming
+			# scheme are copied verbatim, e.g. "err_"* from the
+			# `compare_moffles` function directly before this one.
 			known_issue = match['errors'][0]
 			if known_issue not in error_types:
 				error_types[known_issue] = 0  # Initialize new class of errors
@@ -239,14 +249,17 @@ def _test_generated(self, cif_path, generated_moffles, start_time = None, genera
 		if moffles_from_name is None:  # missing SBU info in the DB file
 			return None  # Currently, skip reporting of structures with undefined nodes/linkers
 
-		# Add common classes of error
-
 		if (py2 and type(moffles_from_name) in [str, unicode]) or (not py2 and type(moffles_from_name) is str):
 			orig_moffles = moffles_from_name
 			moffles_from_name = dict()
 			moffles_from_name['default'] = orig_moffles
 		default = parse_moffles(moffles_from_name['default'])
 		linkers = default['smiles'].split('.')
+
+		# Define sources of error when the program exits with errors.
+		# Without these definitions, the validator would return a generic
+		# class of error, e.g. "err_topology", instead of actually indicating
+		# the root cause from program error or timeout.
 		moffles_from_name['err_timeout'] = assemble_moffles(linkers,
 			'TIMEOUT', default['cat'], mof_name=default['name'])
 		moffles_from_name['err_systre_error'] = assemble_moffles(linkers,
@@ -415,8 +428,9 @@ def expected_moffles(self, cif_path):
 					if smiles not in sbus:
 						sbus.append(smiles)
 				# Also generate the nitrogen-terminated versions for pillared paddlewheels
-				if (part in ["linker1", "linker2"] and codes['nodes'] 
-					in ["1", "2"] and topology == "pcu"):
+				if (part in ["linker1", "linker2"] and
+					codes['nodes'] in ["1", "2"] and topology == "pcu"
+					):
 					assert len(full_smiles) == 1
 					n_smi = self._carboxylate_to_nitrogen(full_smiles[0])
 					if n_smi not in n_components:
@@ -427,11 +441,14 @@ def expected_moffles(self, cif_path):
 			sbus.sort()
 
 			moffles_options = dict()
-			moffles_options['default'] = assemble_moffles(sbus, topology, cat,
-			mof_name=codes['name'])
+			moffles_options['default'] = assemble_moffles(
+				sbus, topology, cat, mof_name=codes['name'])
 
+			# Flagging common cases when the generated MOF does not match the idealized recipe,
+			# for example when the bottom-up algorithm is missing a connection.
 			if topology == 'fcu':  # Zr nodes do not always form **fcu** topology, even when linker1==linker2
-				# TODO: Will we have to somehow add the benzoic acid agent to the **pcu** MOFs above?
+				# Note: if we change the handling of bound modulators, we will have to add
+				# benzoic acid to these **pcu** MOF nodes
 				moffles_options['Zr_mof_not_fcu'] = assemble_moffles(sbus,
 					'pcu', cat, mof_name=codes['name'])
 			if codes['nodes'] == '4':  # Some Zr hMOFs have four linkers replaced by benzoic acid
@@ -447,13 +464,18 @@ def expected_moffles(self, cif_path):
 				moffles_options['V_incomplete_linker'] = assemble_moffles(
 					v_sbus, 'ERROR', cat, mof_name=codes['name'])
 			if len(n_components) == 2:
+				# Ambiguities arise when there is both a linker1 and linker2
+				# because of the combinations between N/carboxylate and L1/L2
+				# in pcu paddlewheel MOFs with pillars.
 				for i, smi in enumerate(n_components):
+					# Both L1 and L2, plus N-pillar i (unknown which one beforehand)
 					n_sbus = copy.deepcopy(sbus)
 					n_sbus.append(smi)
 					n_sbus.sort()
 					moffles_options['unk_pillar' + str(i+1)] = assemble_moffles(
 						n_sbus, 'pcu', cat, mof_name=codes['name'])
 
+					# Replacing linker i from a carboxylate to a pillar
 					n_sbus.remove(n_orig[i])
 					n_sbus.sort()
 					moffles_options['replaced_pillar' + str(i+1)] = assemble_moffles(
@@ -584,8 +606,8 @@ def expected_moffles(self, cif_path):
 				cat, mof_name=codes['name'])
 			# Known classes of issues go here
 			if topology == "tpt":  # Systre analysis finds an **stp** net for ToBaCCo MOFs with the **tpt** template
-				moffles_options['stp_from_tpt'] = assemble_moffles(linkers,
-					"stp", cat, mof_name=codes['name'])
+				moffles_options['stp_from_tpt'] = assemble_moffles(
+					linkers, "stp", cat, mof_name=codes['name'])
 
 			if EXPORT_CODES:
 				moffles_options['_codes'] = codes
@@ -698,8 +720,11 @@ def _choose_parser(self, cif_name):
 		# Run a whole directory if specified as a single argument with an ending slash
 		inputs = glob.glob(args[0] + '*.[Cc][Ii][Ff]')
 		comparer = AutoCompare(True)  # Do not use database of known MOFs
-	elif len(args) == 1 and (args[0].endswith('.txt') or
-		args[0].endswith('.smi') or args[0].endswith('.out')):
+	elif len(args) == 1 and (
+		args[0].endswith('.txt') or
+		args[0].endswith('.smi') or
+		args[0].endswith('.out')
+		):
 		input_type = "MOFid line"
 		with open(args[0], "r") as f:
 			inputs = f.readlines()

diff --git a/Python/smiles_diff.py b/Python/smiles_diff.py
@@ -1,7 +1,9 @@
 """
 Run a diff between two SMILES codes
 
-Report common classes of errors in the calculated MOFid.
+Report common classes of errors in the calculated MOFid.  Provides more
+specific SMILES errors/differences than just "err_smiles".  See DIFF_LEVELS
+and single_smiles_diff for documentation on the reported error classifications.
 
 @author: Ben Bucior
 """
@@ -11,8 +13,16 @@
 from cpp_cheminformatics import ob_normalize, openbabel_replace, openbabel_formula, openbabel_GetSpacedFormula
 
 
+# Define priority order for differences between SMILES.
+# The lowest matching value indicates the closest match, so it has precedence.
+# See the function `single_smiles_diff` to see how these errors are defined.
+#
+# The _extra suffix from `find_closest_match` indicates that a component has been
+# replicated in the MOFid with a redundant copy containing the specified error.
+# Examples: a heterocycle containing the expected SMILES and a copy with the wrong
+# aromaticity specification; or a copy of a node in the MOF with extra bonds between metals.
 DIFF_LEVELS = dict({
-	'same' : 0,
+	'same' : 0,  # match
 	'formula' : 50,
 	'nonplanar_carboxylate': 5,
 	'phenyl_radicals': 6,
@@ -53,7 +63,7 @@ def find_closest_match(smiles, preferred_list, extra_list):
 
 def multi_smiles_diff(smiles1, smiles2):
 	# What are the differences between two dot-separated SMILES codes, a reference and candidate?
-	# Just bond orders?  The entire bond structure?  Nothing similar?
+	# e.g. Just bond orders?  The entire bond structure?  Nothing similar?
 	if smiles1 == smiles2:
 		return []
 
@@ -124,6 +134,7 @@ def strip_extra(formula):
 
 	def radical_to_carb(smiles):
 		return openbabel_replace(smiles, '[#6:1][#6D3:2](~[O-0:3])[O:4]', '[#6:1][#6:2](=[O:3])[O-:4]')
+	# OB will not assign a double bond to nonplanar carboxylates (if the torsion is above a threshold)
 	if radical_to_carb(smiles1) == radical_to_carb(smiles2):
 		return "nonplanar_carboxylate"
 
@@ -155,9 +166,9 @@ def is_organic(smiles):
 	base2 = base2.upper()
 
 	if base1 != base2:
-		return molec_type + "_single_bonds"
+		return molec_type + "_single_bonds"  # the underlying connectivity does not match
 	else:
-		return molec_type + "_bond_orders"
+		return molec_type + "_bond_orders"  # the connectivity itself is okay, just not the BO
 
 if __name__ == "__main__":
 	args = sys.argv[1:]