
training module #25

Open · wants to merge 17 commits into master
65 changes: 57 additions & 8 deletions README.md
@@ -1,7 +1,6 @@
# BEDMS

BEDMS (BED Metadata Standardizer) is a tool designed to standardize genomics and epigenomics metadata attributes according to user-selected schemas such as `ENCODE`, `FAIRTRACKS`, and `BEDBASE`. BEDMS ensures consistency and FAIRness of metadata across different platforms. Additionally, users have the option to train their own standardizer model using a custom schema (`CUSTOM`), allowing for the standardization of attributes based on their specific research requirements.

## Installation

@@ -16,22 +15,72 @@ pip install git+https://github.com/databio/bedms.git

## Usage

### Standardizing based on available schemas

To choose the schema to standardize against, refer to the [HuggingFace repository](https://huggingface.co/databio/attribute-standardizer-model6). Based on the schema design `.yaml` files, you can select which schema best represents your attributes. In the example below, we chose the `encode` schema.

```python
from bedms import AttrStandardizer

model = AttrStandardizer(
    repo_id="databio/attribute-standardizer-model6", model_name="encode"
)
results = model.standardize(pep="geo/gse228634:default")

assert results
```
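`standardize` returns a dictionary of suggested attribute names with confidence scores (the custom-schema example later in this README shows the exact shape). A minimal sketch for inspecting the suggestions, assuming that documented structure:

```python
# Print each attribute's suggestions, highest confidence first.
# Assumes the documented shape: {'attr_1': {'prediction_1': 0.70, 'prediction_2': 0.30}}
for attribute, suggestions in results.items():
    for predicted, confidence in sorted(
        suggestions.items(), key=lambda kv: kv[1], reverse=True
    ):
        print(f"{attribute} -> {predicted} ({confidence:.2f})")
```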

### Training custom schemas
Training your own custom schema with `BEDMS` is straightforward. You need two things to get started:
1. Training sets
2. `training_config.yaml` (a sketch of a possible config follows below)
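The exact format of `training_config.yaml` is not documented in this diff, so the sketch below is a hypothetical example only: every key, path, and value in it is an assumption rather than the confirmed `BEDMS` format. Writing it from Python keeps the example runnable:

```python
import yaml

# Hypothetical training configuration; all keys, paths, and values are
# illustrative assumptions, not the confirmed BEDMS config format.
config = {
    "dataset": {
        "values_file": "training_sets/values.csv",  # assumed path
        "headers_file": "training_sets/headers.csv",  # assumed path
    },
    "training": {
        "epochs": 20,  # assumed hyperparameter
        "batch_size": 32,  # assumed hyperparameter
    },
}

with open("training_config.yaml", "w") as fh:
    yaml.safe_dump(config, fh, sort_keys=False)
```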

To instantiate the `AttrStandardizerTrainer` class:

```python
from bedms.train import AttrStandardizerTrainer

trainer = AttrStandardizerTrainer("training_config.yaml")

```

To load the datasets and encode them:

```python
train_data, val_data, test_data, label_encoder, vectorizer = trainer.load_data()
```

To train the custom model:

```python
trainer.train()
```

To test the custom model:

```python
test_results_dict = trainer.test()
```
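The contents of `test_results_dict` are not spelled out in this diff; a generic way to inspect whatever metrics it holds (no specific keys are assumed):

```python
# Print every metric returned by trainer.test().
for metric_name, metric_value in test_results_dict.items():
    print(f"{metric_name}: {metric_value}")
```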

To generate visualizations such as Learning Curves, Confusion Matrices, and ROC Curve:

```python
acc_fig, loss_fig, conf_fig, roc_fig = trainer.plot_visualizations()
```

Here `acc_fig` is the accuracy curve figure object, `loss_fig` the loss curve figure object, `conf_fig` the confusion matrix figure object, and `roc_fig` the ROC curve figure object.
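Assuming these are standard `matplotlib` figure objects (an assumption; the diff does not name the plotting backend), they can be written to disk with `savefig`:

```python
# Persist the training visualizations; the filenames are illustrative.
acc_fig.savefig("accuracy_curve.png", dpi=300, bbox_inches="tight")
loss_fig.savefig("loss_curve.png", dpi=300, bbox_inches="tight")
conf_fig.savefig("confusion_matrix.png", dpi=300, bbox_inches="tight")
roc_fig.savefig("roc_curve.png", dpi=300, bbox_inches="tight")
```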


### Standardizing based on custom schema

To standardize based on a custom schema, your model must be hosted on HuggingFace, and the directory structure should follow the instructions in the [HuggingFace repository](https://huggingface.co/databio/attribute-standardizer-model6).

```python
from bedms import AttrStandardizer

model = AttrStandardizer(
    repo_id="name/of/your/hf/repo", model_name="model/name"
)
results = model.standardize(pep="geo/gse228634:default")

print(results)  # Dictionary of suggested predictions with their confidence: {'attr_1': {'prediction_1': 0.70, 'prediction_2': 0.30}}
```
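Since each attribute maps to candidate predictions with confidence scores, a natural follow-up is to keep only the top suggestion above a cutoff. A minimal sketch; `best_predictions` is our own helper, not part of `bedms`, and the 0.70 default mirrors the `CONFIDENCE_THRESHOLD` constant used elsewhere in the package:

```python
def best_predictions(results: dict, threshold: float = 0.70) -> dict:
    """Keep the highest-confidence suggestion per attribute, if it clears the threshold."""
    best = {}
    for attribute, suggestions in results.items():
        predicted, confidence = max(suggestions.items(), key=lambda kv: kv[1])
        if confidence >= threshold:
            best[attribute] = predicted
    return best

print(best_predictions(results))  # e.g. {'attr_1': 'prediction_1'}
```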
3 changes: 3 additions & 0 deletions bedms/__init__.py
@@ -3,3 +3,6 @@
"""

from .attr_standardizer import AttrStandardizer
from .train import AttrStandardizerTrainer

__all__ = ["AttrStandardizer", "AttrStandardizerTrainer"]
197 changes: 86 additions & 111 deletions bedms/attr_standardizer.py
@@ -3,44 +3,26 @@
"""

import logging
import glob
import os
import yaml
from typing import Dict, Tuple, Union, Optional
import pickle
import peppy
import torch
from torch import nn
import torch.nn.functional as torch_functional

from huggingface_hub import hf_hub_download
from .const import (
    AVAILABLE_SCHEMAS,
    CONFIDENCE_THRESHOLD,
    PROJECT_NAME,
    SENTENCE_TRANSFORMER_MODEL,
)
from .model import BoWSTModel
from .utils import data_encoding, data_preprocessing, fetch_from_pephub, get_any_pep


logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(PROJECT_NAME)
@@ -51,53 +33,55 @@ class AttrStandardizer:
    This is the AttrStandardizer class, which holds the models for attribute standardization.
    """

    def __init__(
        self,
        repo_id: str,
        model_name: str,
        custom_param: Optional[str] = None,
        confidence: int = CONFIDENCE_THRESHOLD,
    ) -> None:
        """
        Initializes the attribute standardizer with the user-provided schema and loads the model.

        :param str repo_id: HuggingFace repository ID.
        :param str model_name: Name of the schema model.
        :param str custom_param: User-provided config file with
            custom parameters, if they choose the "CUSTOM" schema.
        :param int confidence: Confidence threshold for the predictions.
        """
        self.repo_id = repo_id
        self.model_name = model_name
        self.conf_threshold = confidence
        self.custom_param = custom_param
        self.model, self.vectorizer, self.label_encoder = self._load_model()

    def _get_parameters(self) -> Tuple[int, int, int, int, int, float]:
        """
        Get the model parameters for the chosen schema.

        :return Tuple[int, int, int, int, int, float]: Tuple containing the model parameters.
        """
        config_filename = f"config_{self.model_name}.yaml"
        config_pth = hf_hub_download(
            repo_id=self.repo_id,
            filename=os.path.join(self.model_name, config_filename),
        )
        with open(config_pth, "r") as file:
            config = yaml.safe_load(file)

        input_size_bow = config["params"]["input_size_bow"]
        embedding_size = config["params"]["embedding_size"]
        hidden_size = config["params"]["hidden_size"]
        output_size = config["params"]["output_size"]
        dropout_prob = config["params"]["dropout_prob"]

        return (
            input_size_bow,
            embedding_size,
            embedding_size,
            hidden_size,
            output_size,
            dropout_prob,
        )
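    # NOTE (illustrative): _get_parameters assumes the per-model config file on
    # HuggingFace exposes the keys read above. The values below are examples
    # only, borrowed from the ENCODE constants this PR removes from const.py:
    #
    #   params:
    #     input_size_bow: 10459
    #     embedding_size: 384
    #     hidden_size: 32
    #     output_size: 18
    #     dropout_prob: 0.113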

@@ -108,63 +92,54 @@ def _load_model(self) -> Tuple[nn.Module, object, object]:
        :return object: The scikit-learn vectorizer for bag-of-words encoding.
        :return object: Label encoder object for the labels (y).
        """
        model_filename = f"model_{self.model_name}.pth"
        label_encoder_filename = f"label_encoder_{self.model_name}.pkl"
        vectorizer_filename = f"vectorizer_{self.model_name}.pkl"

        model_pth = hf_hub_download(
            repo_id=self.repo_id,
            filename=os.path.join(self.model_name, model_filename),
        )

        vc_path = hf_hub_download(
            repo_id=self.repo_id,
            filename=os.path.join(self.model_name, vectorizer_filename),
        )

        lb_path = hf_hub_download(
            repo_id=self.repo_id,
            filename=os.path.join(self.model_name, label_encoder_filename),
        )

        with open(vc_path, "rb") as f:
            vectorizer = pickle.load(f)

        with open(lb_path, "rb") as f:
            label_encoder = pickle.load(f)

        state_dict = torch.load(model_pth)

        (
            input_size_values,
            input_size_values_embeddings,
            input_size_headers,
            hidden_size,
            output_size,
            dropout_prob,
        ) = self._get_parameters()

        model = BoWSTModel(
            input_size_values,
            input_size_values_embeddings,
            input_size_headers,
            hidden_size,
            output_size,
            dropout_prob,
        )
        model.load_state_dict(state_dict)
        model.eval()

        return model, vectorizer, label_encoder

    def standardize(
        self, pep: Union[str, peppy.Project]
23 changes: 2 additions & 21 deletions bedms/const.py
@@ -2,29 +2,10 @@
This module contains constant values used in the 'bedms' package.
"""

PROJECT_NAME = "bedmess"
PROJECT_NAME = "bedms"

AVAILABLE_SCHEMAS = ["ENCODE", "FAIRTRACKS", "BEDBASE"]
AVAILABLE_SCHEMAS = ["ENCODE", "FAIRTRACKS", "BEDBASE", "CUSTOM"]
PEP_FILE_TYPES = ["yaml", "csv"]
REPO_ID = "databio/attribute-standardizer-model6"
MODEL_ENCODE = "model_encode.pth"
MODEL_FAIRTRACKS = "model_fairtracks.pth"
MODEL_BEDBASE = "model_bedbase.pth"
ENCODE_VECTORIZER_FILENAME = "vectorizer_encode.pkl"
FAIRTRACKS_VECTORIZER_FILENAME = "vectorizer_fairtracks.pkl"
BEDBASE_VECTORIZER_FILENAME = "vectorizer_bedbase.pkl"
ENCODE_LABEL_ENCODER_FILENAME = "label_encoder_encode.pkl"
FAIRTRACKS_LABEL_ENCODER_FILENAME = "label_encoder_fairtracks.pkl"
BEDBASE_LABEL_ENCODER_FILENAME = "label_encoder_bedbase.pkl"
SENTENCE_TRANSFORMER_MODEL = "all-MiniLM-L6-v2"
HIDDEN_SIZE = 32
DROPOUT_PROB = 0.113
CONFIDENCE_THRESHOLD = 0.70
EMBEDDING_SIZE = 384
INPUT_SIZE_BOW_ENCODE = 10459
INPUT_SIZE_BOW_FAIRTRACKS = 13617
OUTPUT_SIZE_FAIRTRACKS = 15
OUTPUT_SIZE_ENCODE = 18
NUM_CLUSTERS = 3
INPUT_SIZE_BOW_BEDBASE = 13708
OUTPUT_SIZE_BEDBASE = 12