Update README

WGLab · Sep 5, 2024 · b53ad18 · b53ad18
1 parent c4cc4b7
commit b53ad18
Show file tree

Hide file tree

Showing 3 changed files with 108 additions and 37 deletions.
diff --git a/README.md b/README.md
@@ -13,7 +13,9 @@ LongReadSum supports FASTA, FASTQ, BAM, FAST5, and sequencing_summary.txt file f
 - General usage for common filetypes:
   - [Common parameters](#common-parameters)
   - [WGS BAM](#wgs-bam)
+  - [BAM with base modifications/methylation](#bam-with-base-modifications)
   - [RRMS BAM](#rrms-bam)
+  - [PacBio unaligned BAM](#pacbio-unaligned-bam)
   - [RNA-Seq BAM (TIN values)](#rna-seq-bam)
   - [ONT POD5](#ont-pod5)
   - [ONT FAST5](#ont-fast5)
@@ -96,7 +98,13 @@ python longreadsum [arguments]
 ```
 
 # General Usage
-## Common parameters
+
+Specify the filetype followed by parameters:
+```
+longreadsum bam -i $INPUT_FILE -o $OUTPUT_DIRECTORY
+```
+
+# Common parameters
 
 To see all parameters for a filetype, run:
 
@@ -114,17 +122,91 @@ This section describes parameters common to all filetypes:
 | -o, --outputfolder | Output directory | output_longreadsum
 | -t, --threads | The number of threads used | 1
 | -Q, --outprefix | Output file prefix |  QC_
-| --fontsize | Font size for plots | 14
-| --markersize | Marker size for plots | 10
-
-## WGS BAM
-## RRMS BAM
-## RNA-Seq BAM
-## ONT POD5
-## ONT FAST5
-## Basecall summary
-## FASTQ
-## FASTA
+
+# WGS BAM
+
+This section describes general usage for BAM files from whole-genome sequencing
+(WGS) with alignments to a linear reference genome such as GRCh38:
+
+```
+longreadsum bam -i $INPUT_FILE -o $OUTPUT_DIRECTORY
+```
+Download an example HTML report [here]() (data is HG002 sequenced with ONT Kit V14
+Promethion R10.4.1 from https://labs.epi2me.io/askenazi-kit14-2022-12/)
+
+# BAM with base modifications
+
+This section describes parameters for BAM files with base modification tags (MM,
+ML).
+
+| Parameter	| Description | Default |
+| --- | --- | --- |
+| --mod | Run base modification analysis on the BAM file | False
+| --modprob | Base modification filtering threshold. Above/below this value, the base is considered modified/unmodified. | 0.8
+| --ref | The reference genome FASTA file to use for identifying CpG sites (optional)
+
+General usage:
+```
+longreadsum bam -i $INPUT_FILE -o $OUTPUT_DIRECTORY --ref $REF_GENOME --modprob 0.8
+```
+
+Download an example HTML report [here]() (data is HG002 sequenced with ONT
+MinION R9.4.1 from https://labs.epi2me.io/gm24385-5mc/)
+
+# RRMS BAM
+
+This section describes parameters for ONT RRMS BAM files and associated CSVs.
+
+| Parameter	| Description | Default |
+| --- | --- | --- |
+| -c, --csv | CSV file containing read IDs to extract from the BAM file*
+
+The CSV file should contain a `read_id` column with the read IDs in the BAM
+file, and a `decision` column with the accepted/rejected status of the read.
+Accepted reads will have `stop_receiving` in the `decision` column, while rejected
+reads will have `unblock`:
+
+```
+batch_time,read_number,channel,num_samples,read_id,sequence_length,decision
+1675186897.6034577,93,4,4011,f943c811-3f97-4971-8aed-bb9f36ffb8d1,361,unblock
+1675186897.7544408,80,68,4025,fab0c19d-8085-454c-bfb7-c375bbe237a1,462,unblock
+1675186897.7544408,93,127,4028,5285e0ba-86c0-4b5d-ba27-5783acad6105,438,unblock
+1675186897.7544408,103,156,4023,65d8befa-eec0-4496-bf2b-aa1a84e6dc5e,362,stop_receiving
+```
+
+General usage:
+```
+longreadsum rrms -i $INPUT_FILE -o $OUTPUT_DIRECTORY -c $RRMS_CSV
+```
+
+Download an example HTML report [here]() (data is HG002 RRMS using ONT
+R9.4.1)
+
+# RNA-Seq BAM
+
+This section describes parameters for generating TIN (transcript integrity
+number) scores from RNA-Seq BAM files.
+
+| Parameter	| Description | Default |
+| --- | --- | --- |
+| --genebed | Gene BED12 file required for calculating TIN scores
+| --sample-size | Sample size for TIN calculation | 100
+| --min-coverage | Minimum coverage for TIN calculation | 10
+
+General usage:
+```
+longreadsum bam -i $INPUT_FILE -o $OUTPUT_DIRECTORY --genebed $BED_FILE --min-coverage 10 --sample-size 100
+```
+
+Download an example HTML report [here]() (data is Adult GTEx v9 long-read RNA-seq data sequenced with ONT
+cDNA-PCR protocol from https://www.gtexportal.org/home/downloads/adult-gtex/long_read_data)
+
+# PacBio unaligned BAM
+# ONT POD5
+# ONT FAST5
+# Basecall summary
+# FASTQ
+# FASTA
 
 Specifying input files:
 

diff --git a/src/cli.py b/src/cli.py
@@ -51,7 +51,7 @@ def get_common_param(margs):
 
     if (margs.input == None or margs.input == "") and (margs.inputs == None or margs.inputs == "") and (
             margs.pattern == None or margs.pattern == ""):
-        parsing_error_msg += "No input file(s) are provided. \n"
+        parsing_error_msg += "No input file(s) were provided.\n"
     else:
         # Group parameters into an array
         param_dict["input_files"] = []
@@ -70,14 +70,14 @@ def get_common_param(margs):
         # param_dict["read_count"] = read_count
 
         if len(param_dict["input_files"]) == 0:
-            parsing_error_msg += "No input file(s) can be found. \n"
+            parsing_error_msg += "No input file(s) were provided.\n"
         else:
             for input_filepath in param_dict["input_files"]:
                 if not os.path.isfile(input_filepath):
                     parsing_error_msg += "Cannot find the input file: " + input_filepath + "\n"
 
     if (margs.outputfolder is None or margs.outputfolder == ""):
-        parsing_error_msg += "No output file is provided. \n"
+        parsing_error_msg += "No output directory was provided.\n"
     else:
         output_dir = margs.outputfolder
         param_dict["output_folder"] = output_dir
@@ -92,7 +92,7 @@ def get_common_param(margs):
 
     # Set up logging to file and stdout
     if margs.log is None or margs.log == "":
-        parsing_error_msg += "No log file is provided. \n"
+        parsing_error_msg += "No log file was provided.\n"
 
         # Set up logging to stdout
         logging.basicConfig(stream=sys.stdout,
@@ -113,10 +113,6 @@ def get_common_param(margs):
 
     param_dict["threads"] = margs.threads
 
-    # Plot style parameters
-    param_dict["fontsize"] = margs.fontsize
-    param_dict["markersize"] = margs.markersize
-
     # Reset the param_dict if there are parsing errors
     if parsing_error_msg != "":
         param_dict = {}
@@ -465,11 +461,6 @@ def set_file_parser_defaults(file_parser):
                         help="The number of threads used. Default: 1.")
     file_parser.add_argument("-Q", "--outprefix", type=str, default="QC_",
                         help="The prefix for output filenames. Default: `QC_`.")
-    file_parser.add_argument("--fontsize", type=int, default=14,
-                        help="Font size for plots. Default: 14")
-    file_parser.add_argument("--markersize", type=int, default=10,
-                        help="Marker size for plots. Default: 10")
-
 
 def pod5_module(margs):
     """POD5 file input module."""
@@ -647,20 +638,18 @@ def pod5_module(margs):
                                     formatter_class=RawTextHelpFormatter)
 set_file_parser_defaults(bam_parser)
 
-bam_parser.add_argument("--ref", type=str, default="",
-                        help="Reference genome file for the BAM file, used for base modification analysis. Default: None.")
-
-# Add argument for whether to run base modification analysis
 bam_parser.add_argument("--mod", action="store_true",
                         help="Run base modification analysis on the BAM file. Default: False.")
 
-# Add argument for base modification filtering threshold
-bam_parser.add_argument("--modprob", type=float, default=0.5,
-                        help="Base modification filtering threshold. Above/below this value, the base is considered modified/unmodified. Default: 0.5.")
-
 # Add argument for gene BED file required for RNA-seq transcript analysis (TIN, etc.)
 bam_parser.add_argument("--genebed", type=str, default="",
-                        help="Gene BED file required for RNA-seq analysis. Default: None.")
+                        help="Gene BED12 file required for calculating TIN scores from RNA-seq BAM files. Default: None.")
+
+bam_parser.add_argument("--modprob", type=float, default=0.8,
+                        help="Base modification filtering threshold. Above/below this value, the base is considered modified/unmodified. Default: 0.8.")
+
+bam_parser.add_argument("--ref", type=str, default="",
+                        help="The reference genome FASTA file to use for identifying CpG sites.")
 
 # Add TIN sample size argument
 bam_parser.add_argument("--sample-size", type=int, default=100,

diff --git a/src/plot_utils.py b/src/plot_utils.py
@@ -437,7 +437,7 @@ def plot(output_data, para_dict, file_type):
     plot_filepaths = getDefaultPlotFilenames()
 
     # Get the font size for plotly plots
-    font_size = para_dict["fontsize"]
+    font_size = 14
 
     # Create the summary table
     create_summary_table(output_data, plot_filepaths, file_type)
@@ -511,7 +511,7 @@ def plot_pod5(pod5_output, para_dict, bam_output=None):
     create_pod5_table(pod5_output, plot_filepaths)
 
     # Generate the signal plots
-    marker_size = para_dict["markersize"]
+    marker_size = 10
     read_count_max = para_dict["read_count"]
 
     read_count = len(pod5_output.keys())
@@ -634,7 +634,7 @@ def plot_signal(output_data, para_dict):
 
     # Get input parameters
     output_dir = para_dict["output_folder"]
-    marker_size = para_dict["markersize"]
+    marker_size = 10
     read_count_max = para_dict["read_count"]
 
     # Get read and base counts