Merge pull request #45 from puja-trivedi/create_docs_20241003

bkbit documentation
brain-bican · Oct 4, 2024 · c11011b · c11011b
2 parents 3b1408b + 2c4d144
commit c11011b
Show file tree

Hide file tree

Showing 6 changed files with 293 additions and 23 deletions.
diff --git a/bkbit/data_translators/genome_annotation_translator.py b/bkbit/data_translators/genome_annotation_translator.py
@@ -7,17 +7,18 @@
 4. Serialize the extracted information into JSON-LD format for further use.
 
 Classes:
-    Gff3: A class to handle the entire process of downloading, parsing, and processing GFF3 files.
+    Gff3: The Gff3 class is designed to handle the complete lifecycle of downloading, parsing, and processing GFF3 files from NCBI or Ensembl repositories. It extracts gene annotations and serializes the data into JSON-LD format.
 
 Functions:
-    cli: Command line interface function to execute the module as a script.
+    gff2jsonld: The gff2jsonld function is responsible for creating GeneAnnotation objects from a provided GFF3 file and serializing the extracted information into the JSON-LD format.
 
 Usage:
     The module can be run as a standalone script by executing it with appropriate arguments and options:
     
     ```
     python genome_annotation_translator.py <content_url> -a <assembly_accession> -s <assembly_strain> -l <log_level> -f
     ```
+    
     The script will download the GFF3 file from the specified URL, parse it, and serialize the extracted information into JSON-LD format.
 
 Example:
@@ -180,7 +181,7 @@ def __init__(
             self.taxon_scientific_name = load_json(TAXON_SCIENTIFIC_NAME_PATH)
             self.taxon_common_name = load_json(TAXON_COMMON_NAME_PATH)
         except FileNotFoundError as e:
-            self.logger.critical("NCBI Taxonomy not downloaded. Run 'bkbit download_ncbi_taxonomy' command first." )
+            self.logger.critical("NCBI Taxonomy not downloaded. Run 'bkbit download-ncbi-taxonomy' command first." )
             print(e)
             sys.exit(2)
 

diff --git a/bkbit/data_translators/library_generation_translator.py b/bkbit/data_translators/library_generation_translator.py
@@ -21,7 +21,7 @@
     The module can be run as a standalone script using the command-line interface with the appropriate arguments and options:
 
     ```
-    python specimen_portal.py <nhash_id> -d
+    python library_generation_translator.py <nhash_id> [-d]
     ```
 
     This script will parse the nhash ID and serialize the generated data into JSON-LD format, with the option to parse descendants or ancestors.
@@ -95,18 +95,15 @@ class SpecimenPortal:
         serialize_to_jsonld(exclude_none=True, exclude_unset=False):
             Serializes the generated objects into JSON-LD format for further use or storage.
 
-    Static Methods:
-        __check_valueset_membership(enum_type, nimp_value):
-            Checks if a given value belongs to a specified enum.
-    
-    Private Methods:
-        __parse_single_nashid(jwt_token, nhash_id, descendants, save_to_file=False):
+        parse_single_nashid(jwt_token, nhash_id, descendants, save_to_file=False):
             Parses a single nhash ID and optionally saves the result to a JSON-LD file.
 
-        __parse_multiple_nashids(jwt_token, file_path, descendants):
-        Parses multiple nhash IDs from a file and saves the results to JSON-LD files.
-        
+        parse_multiple_nashids(jwt_token, file_path, descendants):
+            Parses multiple nhash IDs from a file and saves the results to JSON-LD files.
 
+    Static Methods:
+        __check_valueset_membership(enum_type, nimp_value):
+            Checks if a given value belongs to a specified enum.
     """
     def __init__(self, jwt_token):
         self.jwt_token = jwt_token
@@ -360,7 +357,7 @@ def serialize_to_jsonld(
         return json.dumps(output_data, indent=2)
 
 
-def __parse_single_nashid(jwt_token, nhash_id, descendants, save_to_file=False):
+def parse_single_nashid(jwt_token, nhash_id, descendants, save_to_file=False):
     """
     Parse a single nashid using the SpecimenPortal class.
 
@@ -388,7 +385,7 @@ def __parse_single_nashid(jwt_token, nhash_id, descendants, save_to_file=False):
         print(sp_obj.serialize_to_jsonld())
 
 
-def __parse_multiple_nashids(jwt_token, file_path, descendants):
+def parse_multiple_nashids(jwt_token, file_path, descendants):
     """
     Parse multiple nashids from a file.
 
@@ -405,7 +402,7 @@ def __parse_multiple_nashids(jwt_token, file_path, descendants):
         nhashids = [line.strip() for line in file.readlines()]
     with Pool() as pool:
         results = pool.starmap(
-            __parse_single_nashid,
+            parse_single_nashid,
             [(jwt_token, nhash_id, descendants, True) for nhash_id in nhashids],
         )
     return results
@@ -438,9 +435,9 @@ def specimen2jsonld(nhash_id: str, descendants: bool):
     if not jwt_token or jwt_token == "":
         raise ValueError("JWT token is required")
     if os.path.isfile(nhash_id):
-        __parse_multiple_nashids(jwt_token, nhash_id, descendants)
+        parse_multiple_nashids(jwt_token, nhash_id, descendants)
     else:
-        __parse_single_nashid(jwt_token, nhash_id, descendants)
+        parse_single_nashid(jwt_token, nhash_id, descendants)
 
 
 if __name__ == "__main__":

diff --git a/docs/genome_annotation.rst b/docs/genome_annotation.rst
@@ -0,0 +1,77 @@
+.. _genome_annotation:
+
+Annotated Genome Data
+----------------------
+
+Overview
+.........
+
+Generate JSON-LD files for annotated genes from a given GFF3 file. Currently GFF3 files from ENSEMBL and NCBI are supported.
+
+Each JSON-LD file will contain:
+
+- GeneAnnotation objects
+- 1 GenomeAnnotation object
+- 1 GenomeAssembly object
+- 1 OrganismTaxon object
+- 1 Checksum object
+
+Command Line 
+.............
+
+``bkbit gff2jsonld``
+,,,,,,,,,,,,,,,,,,,,,
+
+    .. code-block:: bash
+
+        $ bkbit gff2jsonld [OPTIONS] GFF3_URL
+
+Options
+,,,,,,,,
+
+    ``-a, --assembly_accession``
+        ID assigned to the genomic assembly used in the GFF3 file.
+        **Note: Must be provided when using ENSEMBL GFF3 files**
+
+    ``-s, --assembly_strain``
+        Specific strain of the organism associated with the GFF3 file.
+
+    ``-l, --log_level``
+        Logging level.
+
+        Default:
+            WARNING
+        Options:
+            DEBUG | INFO | WARNING | ERROR | CRITICAL
+
+    ``-f, --log_to_file``
+        Log to a file instead of the console.
+
+        Default:
+            FALSE
+
+Arguments
+,,,,,,,,,,,
+
+    ``GFF3_URL``
+        URL to the GFF3 file.
+
+Examples 
+.........
+
+Example 1: NCBI GFF3 file
+,,,,,,,,,,,,,,,,,,,,,,,,,,
+
+.. code-block:: bash
+
+    # Run gff2jsonld command
+    $ bkbit gff2jsonld 'https://ftp.ncbi.nlm.nih.gov/genomes/all/annotation_releases/9823/106/GCF_000003025.6_Sscrofa11.1/GCF_000003025.6_Sscrofa11.1_genomic.gff.gz' > output.jsonld
+
+
+Example 2: ENSEMBL GFF3 file
+,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
+
+.. code-block:: bash
+
+    # Run gff2jsonld command
+    $ bkbit gff2jsonld -a 'GCF_003339765.1' 'https://ftp.ensembl.org/pub/release-104/gff3/macaca_mulatta/Macaca_mulatta.Mmul_10.104.gff3.gz' > output.jsonld
diff --git a/docs/index.rst b/docs/index.rst
@@ -8,17 +8,28 @@ Brain Knowledge Base Interaction Toolkit Documentation
 This package contains tools to use the BICAN Knowledgebase Data Models.
 
 .. toctree::
-   :maxdepth: 2
-   :caption: Contents:
+   :maxdepth: 1
+   :caption: GETTING STARTED
 
    install
-   bkbit-quickstart
-   contributing
+
+.. toctree::
+   :maxdepth: 1
+   :caption: DATA TRANSLATORS
+
+   specimen_file_manifest
+   specimen_metadata
+   genome_annotation
+
+.. toctree::
+   :maxdepth: 1
+   :caption: REFERENCE
+
    modules
 
 Indices and tables
 ==================
 
 * :ref:`genindex`
 * :ref:`modindex`
-* :ref:`search`
+.. * :ref:`search`
diff --git a/docs/specimen_file_manifest.rst b/docs/specimen_file_manifest.rst
@@ -0,0 +1,68 @@
+.. _specimen_file_manifest:
+
+Specimen File Manifest
+----------------------
+
+Overview
+.........
+
+Generates a JSON-LD file containing specimen file data using the BICAN Library Generation Schema. 
+
+Command Line
+.............
+
+``bkbit filemanifest2jsonld``
+,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
+
+.. code-block:: bash
+
+    $ bkbit filemanifest2jsonld [OPTIONS] FILE_MANIFEST_CSV
+
+**Options**
+
+    ``--list_library_aliquots``
+        A boolean flag that, when provided, generates a list of unique library aliquots contained in the given file manifest and saves it to a file called 'file_manifest_library_aliquots.txt'.
+        If this flag is not set (DEFAULT), then only the JSON-LD output will be generated.
+
+**Arguments**
+
+    ``FILE_MANIFEST_CSV``
+        Required argument. 
+        FILE_MANIFEST_CSV can be obtained from the Brain Knowledge Platform and **must** contain the following columns:
+
+            - Project ID	
+            - Specimen ID	
+            - File Name	
+            - Checksum	
+            - File Type	
+            - Archive	
+            - Archive URI
+
+Examples
+.........
+
+Example 1: Only generate JSON-LD output
+,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
+
+.. code-block:: bash
+
+    # Run filemanifest2jsonld command 
+    $ bkbit filemanifest2jsonld file_manifest.csv > output.jsonld
+
+Example 2: Generate JSON-LD output and list of library aliquots
+,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
+
+.. code-block:: bash
+
+    # Run filemanifest2jsonld command 
+    $ bkbit filemanifest2jsonld --list_library_aliquots file_manifest.csv > output.jsonld
+
+    # Generated output files 
+    $ ls .
+    output.jsonld
+    file_manifest_library_aliquots.txt
+
+    # Contents of file_manifest_library_aliquots.txt
+    $ cat file_manifest_library_aliquots.txt
+    LP-123
+    LP-345