From 55b8d51697c30f2184a4fe0d7ccbbb2e9412e0d1 Mon Sep 17 00:00:00 2001
From: pauladkisson <paul.wesley.adkisson@gmail.com>
Date: Fri, 24 May 2024 12:57:54 -0700
Subject: [PATCH] added subjects to skip and metadata sheet corrections

---
 .../seiler_2024_convert_dataset.py            | 94 ++++++++++---------
 .../seiler_2024_convert_session.py            |  2 +-
 .../seiler_2024/seiler_2024_notes.md          | 39 +++++++-
 3 files changed, 86 insertions(+), 49 deletions(-)

diff --git a/src/lerner_lab_to_nwb/seiler_2024/seiler_2024_convert_dataset.py b/src/lerner_lab_to_nwb/seiler_2024/seiler_2024_convert_dataset.py
index e08e2cd..ebba637 100644
--- a/src/lerner_lab_to_nwb/seiler_2024/seiler_2024_convert_dataset.py
+++ b/src/lerner_lab_to_nwb/seiler_2024/seiler_2024_convert_dataset.py
@@ -37,6 +37,22 @@ def dataset_to_nwb(
     verbose : bool, optional
         Whether to print verbose output, by default True
     """
+    subjects_to_skip = {
+        "289.407",
+        "244.464",
+        "264.477",
+        "102.260",
+        "262.478",
+        "289.408",
+        "264.475",
+        "129.425",
+        "250.427",
+        "95.259",
+        "309.399",
+        "433.421",
+        "416.405",
+        "364.426",
+    }
     start_variable = "Start Date"
     data_dir_path = Path(data_dir_path)
     output_dir_path = Path(output_dir_path)
@@ -55,51 +71,36 @@ def dataset_to_nwb(
         verbose=verbose,
     )
     session_to_nwb_args_per_session = fp_session_to_nwb_args_per_session + opto_session_to_nwb_args_per_session
-    unique_subject_ids = set()
-    for session_to_nwb_kwargs in session_to_nwb_args_per_session:
-        subject_id = session_to_nwb_kwargs["subject_id"]
-        unique_subject_ids.add(subject_id)
-    metadata_path = Path(data_dir_path / "MouseDemographics.xlsx")
-    df = pd.read_excel(
-        metadata_path,
-        sheet_name="Mouse Demographics",
-        dtype={"Mouse ID": str},
-    )
-    df["DNL"] = df["Mouse ID"].str.contains("(DNL)", regex=False)
-    df["Mouse ID"] = df["Mouse ID"].str.replace("(DNL)", "")
-    df["Mouse ID"] = df["Mouse ID"].str.strip()
-    mouse_ids = set(df["Mouse ID"])
-    missing_subject_ids = unique_subject_ids - mouse_ids
-    for missing_subject_id in missing_subject_ids:
-        print(f"Missing metadata for {missing_subject_id}")
-
-    # futures = []
-    # with ProcessPoolExecutor(max_workers=max_workers) as executor:
-    #     for session_to_nwb_kwargs in session_to_nwb_args_per_session:
-    #         experiment_type = session_to_nwb_kwargs["experiment_type"]
-    #         experimental_group = session_to_nwb_kwargs["experimental_group"]
-    #         subject_id = session_to_nwb_kwargs["subject_id"]
-    #         start_datetime = session_to_nwb_kwargs["start_datetime"]
-    #         optogenetic_treatment = session_to_nwb_kwargs.get("optogenetic_treatment", None)
-    #         if experiment_type == "FP":
-    #             exception_file_path = (
-    #                 output_dir_path
-    #                 / f"ERROR_{experiment_type}_{experimental_group}_{subject_id}_{start_datetime.isoformat().replace(':', '-')}.txt"
-    #             )
-    #         elif experiment_type == "Opto":
-    #             exception_file_path = (
-    #                 output_dir_path
-    #                 / f"ERROR_{experiment_type}_{experimental_group}_{optogenetic_treatment}_{subject_id}_{start_datetime.isoformat().replace(':', '-')}.txt"
-    #             )
-    #         futures.append(
-    #             executor.submit(
-    #                 safe_session_to_nwb,
-    #                 session_to_nwb_kwargs=session_to_nwb_kwargs,
-    #                 exception_file_path=exception_file_path,
-    #             )
-    #         )
-    #     for _ in tqdm(as_completed(futures), total=len(futures)):
-    #         pass
+
+    futures = []
+    with ProcessPoolExecutor(max_workers=max_workers) as executor:
+        for session_to_nwb_kwargs in session_to_nwb_args_per_session:
+            experiment_type = session_to_nwb_kwargs["experiment_type"]
+            experimental_group = session_to_nwb_kwargs["experimental_group"]
+            subject_id = session_to_nwb_kwargs["subject_id"]
+            if subject_id in subjects_to_skip:
+                continue
+            start_datetime = session_to_nwb_kwargs["start_datetime"]
+            optogenetic_treatment = session_to_nwb_kwargs.get("optogenetic_treatment", None)
+            if experiment_type == "FP":
+                exception_file_path = (
+                    output_dir_path
+                    / f"ERROR_{experiment_type}_{experimental_group}_{subject_id}_{start_datetime.isoformat().replace(':', '-')}.txt"
+                )
+            elif experiment_type == "Opto":
+                exception_file_path = (
+                    output_dir_path
+                    / f"ERROR_{experiment_type}_{experimental_group}_{optogenetic_treatment}_{subject_id}_{start_datetime.isoformat().replace(':', '-')}.txt"
+                )
+            futures.append(
+                executor.submit(
+                    safe_session_to_nwb,
+                    session_to_nwb_kwargs=session_to_nwb_kwargs,
+                    exception_file_path=exception_file_path,
+                )
+            )
+        for _ in tqdm(as_completed(futures), total=len(futures)):
+            pass
 
 
 def safe_session_to_nwb(*, session_to_nwb_kwargs: dict, exception_file_path: Union[Path, str]):
@@ -208,6 +209,7 @@ def fp_to_nwb(
         "418": "418.404",
         "299": "299.405",
         "276": "276.405",
+        "262.259.478": "262.478",
     }
     raw_file_to_info = get_raw_info(behavior_path)
 
@@ -487,6 +489,7 @@ def opto_to_nwb(
         "300": "300.405",
         "299": "299.405",
         "276": "276.405",
+        "262.259.478": "262.478",
     }
     experiment_type = "Opto"
     experimental_group_to_optogenetic_treatments = {
@@ -674,6 +677,7 @@ def get_opto_subject_id(subject_path: Path):
         "300": "300.405",
         "299": "299.405",
         "276": "276.405",
+        "262.259.478": "262.478",
     }
 
     # fmt: off
diff --git a/src/lerner_lab_to_nwb/seiler_2024/seiler_2024_convert_session.py b/src/lerner_lab_to_nwb/seiler_2024/seiler_2024_convert_session.py
index 084c7de..ab32de5 100644
--- a/src/lerner_lab_to_nwb/seiler_2024/seiler_2024_convert_session.py
+++ b/src/lerner_lab_to_nwb/seiler_2024/seiler_2024_convert_session.py
@@ -134,7 +134,7 @@ def session_to_nwb(
         conversion_options.update(dict(Optogenetic={}))
 
     # Add Excel-based Metadata
-    metadata_path = data_dir_path / "MouseDemographics.xlsx"
+    metadata_path = data_dir_path / "MouseDemographicsCorrected.xlsx"
     source_data.update(
         dict(
             Metadata={
diff --git a/src/lerner_lab_to_nwb/seiler_2024/seiler_2024_notes.md b/src/lerner_lab_to_nwb/seiler_2024/seiler_2024_notes.md
index ea204cb..08ff247 100644
--- a/src/lerner_lab_to_nwb/seiler_2024/seiler_2024_notes.md
+++ b/src/lerner_lab_to_nwb/seiler_2024/seiler_2024_notes.md
@@ -153,9 +153,42 @@ for that 1 session split across the two folders?
 ### Questions
 - Some of the subject_ids are not present in the metadata excel file -- pls provide
 - Some animals are missing the "Hemisphere with DMS" field -- pls provide
-- Looks like some of the mouse ids in the rr20 section of the excel file are incorrect (all XX.259 instead of XX.257)
-=======
-    - Solution: Added missing MSNs; skipped RK_C_FR1_BOTH_1hr
+- Some of the mouse ids have typos (missing leading and trailing zeros), and some others appear to be incorrect (RR20 section).
+    So, I made the following corrections to the metadata excel sheet:
+    Mouse ID corrections:
+        79.402 --> 079.402
+        344.4 --> 344.400
+        432.42 --> 432.420
+        48.392 --> 048.392
+        98.259 --> 98.257
+        101.259 --> 101.260
+        97.259 --> 97.257
+        99.259 --> 99.257
+        100.259 --> 100.258
+        359.43 --> 359.430
+        28.392 --> 028.392
+        227.43 --> 227.430
+        262.478 --> 262.259
+        354.43 --> 354.430
+        430.42 --> 430.420
+        342.483 --> 342.400
+    After these corrections, the following mouse_ids are still missing from the excel sheet, so they are skipped during dataset conversion:
+    subjects_to_skip = {
+        "289.407",
+        "244.464",
+        "264.477",
+        "102.260",
+        "262.478",
+        "289.408",
+        "264.475",
+        "129.425",
+        "250.427",
+        "95.259",
+        "309.399",
+        "433.421",
+        "416.405",
+        "364.426",
+    }
 
 ### Active Questions
 - DMS-Excitatory has some csv files w/ only session-aggregated info (total right rewards but not right reward times) ex. ChR2/121_280.CSV -- do you have individual session info for these animals?