From 3658f8ccd6d82a5b6ac7ff11656ae20af0cbdc36 Mon Sep 17 00:00:00 2001
From: Ben Bucior <bbucior@u.northwestern.edu>
Date: Sun, 17 Feb 2019 09:26:47 -0600
Subject: [PATCH] Visualize errors in round-trip validation

Interpreting the results of check_mof_linkers.py to establish the percent accuracy (and common sources of mismatch) between a MOF's GA/ToBaCCo "recipe" and the MOFid of the resulting structure.  Mostly there are exact matches, though there are some instances of either unexpected structures and incompatibility with MOFid, particularly due to an upstream bug in #8.  This commit adapts some previous visualizations from earlier work, e.g. from Notebooks/20181211-github-refactoring/StartValidationTest/CheckToBaCCo/NewResults20190207/plot_tobacco_viz.R

I also identified an error class missing from the Python validation test, which corresponds to the case when sbu.cpp cannot find a MOF structure in the CIF.  This case will have a signature of `no_mof` in the catenation field.
---
 Analysis/README.md                |   4 +-
 Analysis/plot_round_trip_errors.R | 218 ++++++++++++++++++++++++++++++
 Python/check_mof_linkers.py       |   2 +
 Python/extract_moffles.py         |   4 +-
 4 files changed, 226 insertions(+), 2 deletions(-)
 create mode 100644 Analysis/plot_round_trip_errors.R

diff --git a/Analysis/README.md b/Analysis/README.md
index ab0f5e3c..06d44280 100644
--- a/Analysis/README.md
+++ b/Analysis/README.md
@@ -10,7 +10,9 @@ Analysis scripts to generate figures and stats for the manuscript
 	1. `Scripts/HPC/run_submit_all.sh` and wait for jobs to finish.
 	2. `Scripts/HPC/run_post_submit_all.sh` to collect results and run the validator.
 3. TODO: probably something with `Python/convert_smi_to_tables.py`
-4. TODO: scripts in this directory for analysis
+4. Open the `mofid.Rproj` file in RStudio, then run analysis scripts to generate validation figures.
+	1. `Analysis/plot_round_trip_errors.R` parses the validator .json output to find a percent success/match rate for the MOFid code vs. structure recipe, as well as figures showing the composition.
+5. TODO: other scripts in this directory for analysis
 
 
 ## Other comments
diff --git a/Analysis/plot_round_trip_errors.R b/Analysis/plot_round_trip_errors.R
new file mode 100644
index 00000000..da3576e5
--- /dev/null
+++ b/Analysis/plot_round_trip_errors.R
@@ -0,0 +1,218 @@
+# Visualize sources of error from round-tripping
+
+# Comparing MOFid's of GA/ToBaCCo MOFs against their recipes to check for
+# systematic sources of error in MOFid.
+# See README for usage and prerequisites.
+
+# In a previous talk about this project (and a notebook from 2017-08-31), I reported
+# the percent success rate for the validator according to the following classes:
+# - ToBaCCo MOFs: successes.  We get an additional 2.6% up to 78.2% if we accept the stp/tpt bug.
+# - GA MOFs: success, replaced_pillarX, or unk_pillarX.  That means the MOF is either an exact match or
+#   ambiguous results from primary vs. secondary linker for carboxylate vs. pyridine pillar.
+#
+# Thinking about these results for a new analysis, generate two types of figures:
+# 1. For the main text, draw a Sankey/alluvial diagram showing flow of BB's vs. error types
+#    See also https://cran.r-project.org/web/packages/ggalluvial/vignettes/ggalluvial.html
+# 2. For the SI, consider using heatmaps, and possibly a sum over different error types
+#    This might not be necessary -- TBD.
+#
+# If I need treemaps later, consider library(treemapify) and this layout:
+# https://support.office.com/en-us/article/create-a-treemap-chart-in-office-dfe86d28-a610-4ef5-9b30-362d5c624b68
+
+
+source('Analysis/R-Utilities/parse_json_results.R')
+
+library(ggplot2)
+library(stringr)
+library(cowplot)
+library(viridis)
+library(ggalluvial)
+library(purrr)
+
+
+### DATA IMPORT ###
+
+# Import data
+tobacco_errs <- import_err_df("Summary/tob_validation.json") %>% 
+  # rename the linker column removing the L_
+  mutate(linker = ifelse(str_detect(code.linker, "^L_"), str_sub(code.linker, 3), code.linker))
+ga_errs <- import_err_df("Summary/ga_validation.json")
+
+# Extract the ON from ToBaCCo MOFs
+extract_on <- function(list_of_nodes, pattern="_on_") {
+  matches <- str_detect(list_of_nodes, pattern)
+  if (sum(matches) == 0) {
+    return(NA_character_)
+  } else if (sum(matches) > 1) {
+    return("Multiple")
+  } else {
+    return(list_of_nodes[matches])
+  }
+}
+# See https://community.rstudio.com/t/dplyr-alternatives-to-rowwise/8071 on how to mutate by row
+df_extract_on <- Vectorize(extract_on)
+tobacco_errs <- tobacco_errs %>% 
+  mutate(nodes.on = df_extract_on(code.nodes)) %>% 
+  mutate(nodes.on = ifelse(is.na(nodes.on), "None", nodes.on))
+
+
+### INTERPRET ERRORS ###
+
+# Extract more detailed error information
+df_nested_c_length <- Vectorize(function(column) {length(column)})
+# TODO: consider lumping as part of a bigger error interpretation function
+tobacco_errs <- mutate(tobacco_errs, err_qty = df_nested_c_length(errors))
+
+# First, what are the unique error classes?
+err_classes_tob <- flatten_chr(tobacco_errs$errors) %>% unique
+err_classes_ga <- flatten_chr(ga_errs$errors) %>% unique
+# c(err_classes_ga, err_classes_tob) %>% unique
+
+mismatch_codes <- c("V_incomplete_linker",
+  "Zr_mof_as_hex", "Zr_mof_not_fcu", "Zr_mof_pcu_as_fcu",
+  "stp_from_tpt"
+)
+pillar_codes <- c("unk_pillar1", "unk_pillar2", "replaced_pillar1", "replaced_pillar2")
+
+summarize_errors <- function(error_char_vec) {
+  # Translate error codes from check_mof_linkers.py into their implications
+  
+  # First handle the easiest cases
+  if (length(error_char_vec) == 0) {
+    return("Success")
+  }
+  
+  # Filter out known issues in structure recipes
+  cleaned_errs <- error_char_vec[!(error_char_vec %in% pillar_codes)]
+  if (length(cleaned_errs) == 0) {
+    # Ambiguous pillars should not be considered a source of error
+    # return("Ambiguous pillars")
+    return("Success")
+  }
+  
+  cleaned_errs <- cleaned_errs[!(cleaned_errs %in% mismatch_codes)]
+  if (length(cleaned_errs) == 0) {
+    return("Definition mismatch")
+  }
+  
+  # Strip off the "_extra" suffix.  It's possibly too much granularity, and we don't need to differentiate
+  # between inconsistent bond assignment and incorrect bond assignment (or treat them separately)
+  cleaned_errs <- str_replace(cleaned_errs, "_extra", "") %>% unique
+  
+  # TODO STUFF
+  if (length(cleaned_errs) == 1) {
+    if (cleaned_errs %in% c("err_systre_error", "err_cpp_error")) {
+      return("Crash")
+    } else {
+      return(substr(cleaned_errs[1], 5, nchar(cleaned_errs[1])))  # strip the "err_"
+    }
+  } else if (length(cleaned_errs) > 2) {
+    return("Many errors")
+  }
+  
+  # Cases when length(error_char_vec) == 2:
+  return("Two errors")
+}
+df_summarize_errors <- Vectorize(summarize_errors)
+
+# TODO: make a new .df to translate summarize_errors to an overall error class
+# "structure_mismatch" label?
+# Or just have an else statement for Success, etc.
+
+tobacco_errs <- tobacco_errs %>% 
+  mutate(err_type=df_summarize_errors(errors)) %>% 
+  mutate(err_color=ifelse(err_type %in% mismatch_codes, "Mismatch", "Error"))
+ga_errs <- ga_errs %>% 
+  mutate(err_type=df_summarize_errors(errors)) %>% 
+  mutate(err_color=ifelse(err_type %in% mismatch_codes, "Mismatch", "Error"))
+
+
+### UNDERSTANDING ERROR FLOWS ###
+
+RUN_ERROR_FLOWS <- TRUE
+if (RUN_ERROR_FLOWS) {
+  # TODO: run `summary` as described below to understand error flows, if I go that route
+  # TO GET STARTED, take each error class and run a summary command on the relevant classes of error
+}  # endif(RUN_ERROR_FLOWS)
+
+
+### PLOTS FOR MAIN TEXT ###
+
+# TODO: implement figure for the paper
+
+# Initial overall structure for the diagram
+sankey_tobacco <- 
+  tobacco_errs %>%
+  filter(err_type != "Success") %>%
+  group_by(err_type, nodes.on, err_qty) %>%
+  summarize(err_n = n()) %>%
+  #ggplot(aes(y=1, axis1=linker, axis2=nodes.on, axis3=err_qty)) +
+  ggplot(aes(y=err_n, axis1=err_type, axis2=nodes.on, axis3=err_qty)) +
+  geom_alluvium(aes(fill=err_type)) +
+  geom_stratum(width=1/6, color="grey") +
+  guides(fill=FALSE) +
+  geom_label(stat = "stratum", label.strata = TRUE) +
+  scale_x_discrete(limits = c("Error class", "on", "Num errs"), expand = c(.05, .05))
+cowplot::save_plot("Analysis/Figures/sankey_tobacco.png", sankey_tobacco, base_aspect_ratio=2.0)
+
+sankey_ga <-
+  ga_errs %>%
+  filter(err_type != "Success") %>%  # Report % success as a number
+  group_by(err_type, code.nodes, code.linker1) %>%
+  summarize(err_n = n()) %>%
+  #ggplot(aes(y=1, axis1=linker, axis2=nodes.on, axis3=err_qty)) +
+  ggplot(aes(y=err_n, axis1=err_type, axis2=code.nodes, axis3=code.linker1)) +
+  geom_alluvium(aes(fill=err_type)) +
+  geom_stratum(width=1/6, color="grey") +
+  guides(fill=FALSE) +
+  geom_label(stat = "stratum", label.strata = TRUE) +
+  scale_x_discrete(limits = c("Error class", "Metal node", "L1"), expand = c(.05, .05))
+cowplot::save_plot("Analysis/Figures/sankey_ga.png", sankey_ga, base_aspect_ratio=2.0)
+
+# TODO: consider the networkD3::sankeyNetwork layout, instead
+# https://www.r-graph-gallery.com/323-sankey-diagram-with-the-networkd3-library/
+# Would need to revamp the figure by running a series of steps to convert the data frame, if we went that route
+# Maybe also river plot as an alternative: https://stackoverflow.com/questions/9968433/sankey-diagrams-in-r
+# That package seems a bit cleaner if you don't need interactivity
+# At any rate, I'll need to semi-manually calculate flows/my story instead of relying on ggplot.
+# The format for makeRiver is a dataframe with N1, N2, Value.
+
+# Idea: the purpose of the Sankey diagram will be error attribution, to complement the heatmaps
+# TODO: rephrase the "success" of the heatmaps for the GA MOFs - and notate properly wihtin the text
+# Also, maybe then it could be a 2-part figure for both GA/ToBaCCo.  A heatmap with overall sucess, then Sankey to drill down.
+
+
+### PLOTS FOR SI ###
+
+# TODO: consider breaking up the diagram into more specific classes of error?
+# Honestly, these plots might become irrelevant if the Sankey diagram has sufficient information.
+
+tobacco_heatmap <- 
+  tobacco_errs %>% 
+  group_by(linker, nodes.on) %>% 
+  summarize(num_successes = sum(match), num_total = n()) %>%
+  mutate(success_rate = num_successes / num_total) %>% 
+  mutate(Linker = factor(linker, levels=c("_", 1:50))) %>% 
+  ggplot(aes(Linker, nodes.on)) +
+  geom_tile(aes(fill=success_rate)) +
+  # scale_fill_viridis() +
+  guides(fill=FALSE) +
+  scale_x_discrete(breaks=c(1, seq(5, 47, 5)), drop=FALSE)
+cowplot::save_plot("Analysis/Figures/si_recipe_tobacco.png", tobacco_heatmap)
+
+# And repeat for catenated, non-functionalized ("parent") GA hMOFs
+ga_heatmap <- 
+  ga_errs %>% 
+  group_by(code.linker1, code.linker2, code.nodes) %>% 
+  summarize(num_successes = sum(match), num_total = n()) %>%
+  mutate(success_rate = num_successes / num_total) %>% 
+  mutate(Linker1 = factor(code.linker1, levels=0:39)) %>% 
+  mutate(Linker2 = factor(code.linker2, levels=0:39)) %>% 
+  ggplot(aes(Linker1, Linker2)) +
+  geom_tile(aes(fill=success_rate)) +
+  guides(fill=FALSE) +
+  facet_grid(code.nodes~.) +
+  scale_x_discrete(drop=FALSE) +
+  scale_y_discrete(drop=FALSE)
+cowplot::save_plot("Analysis/Figures/si_recipe_ga.png", ga_heatmap)
+
diff --git a/Python/check_mof_linkers.py b/Python/check_mof_linkers.py
index 40db7b6d..877a9dbc 100644
--- a/Python/check_mof_linkers.py
+++ b/Python/check_mof_linkers.py
@@ -266,6 +266,8 @@ def _test_generated(self, cif_path, generated_moffles, start_time = None, genera
 			'ERROR', default['cat'], mof_name=default['name'])
 		moffles_from_name['err_cpp_error'] = assemble_moffles(['*'], 'NA',
 			None, mof_name=default['name'])
+		moffles_from_name['err_no_mof'] = assemble_moffles(['*'], 'NA',
+			'no_mof', mof_name=default['name'])
 
 
 		# Run transformations on the generated MOFFLES from CIF or smi database, if applicable (e.g. GA hMOFs)
diff --git a/Python/extract_moffles.py b/Python/extract_moffles.py
index 860d7d81..71bf1b3a 100644
--- a/Python/extract_moffles.py
+++ b/Python/extract_moffles.py
@@ -136,7 +136,9 @@ def assemble_moffles(linkers, topology, cat = None, mof_name="NAME_GOES_HERE"):
 	moffles = ".".join(linkers) + " "
 	moffles = moffles + "f1" + "."
 	moffles = moffles + topology + "."
-	if cat is not None:
+	if cat == "no_mof":
+		moffles += cat + "."
+	elif cat is not None:
 		moffles = moffles + "cat" + cat + "."
 	if moffles.startswith(" "):  # Null linkers.  Make .smi compatible
 		moffles = "*" + moffles + "no_mof."