diff --git a/README.md b/README.md index d80781f..58b36d4 100644 --- a/README.md +++ b/README.md @@ -13,7 +13,9 @@ LongReadSum supports FASTA, FASTQ, BAM, FAST5, and sequencing_summary.txt file f - General usage for common filetypes: - [Common parameters](#common-parameters) - [WGS BAM](#wgs-bam) + - [BAM with base modifications/methylation](#bam-with-base-modifications) - [RRMS BAM](#rrms-bam) + - [PacBio unaligned BAM](#pacbio-unaligned-bam) - [RNA-Seq BAM (TIN values)](#rna-seq-bam) - [ONT POD5](#ont-pod5) - [ONT FAST5](#ont-fast5) @@ -96,7 +98,13 @@ python longreadsum [arguments] ``` # General Usage -## Common parameters + +Specify the filetype followed by parameters: +``` +longreadsum bam -i $INPUT_FILE -o $OUTPUT_DIRECTORY +``` + +# Common parameters To see all parameters for a filetype, run: @@ -114,17 +122,91 @@ This section describes parameters common to all filetypes: | -o, --outputfolder | Output directory | output_longreadsum | -t, --threads | The number of threads used | 1 | -Q, --outprefix | Output file prefix | QC_ -| --fontsize | Font size for plots | 14 -| --markersize | Marker size for plots | 10 - -## WGS BAM -## RRMS BAM -## RNA-Seq BAM -## ONT POD5 -## ONT FAST5 -## Basecall summary -## FASTQ -## FASTA + +# WGS BAM + +This section describes general usage for BAM files from whole-genome sequencing +(WGS) with alignments to a linear reference genome such as GRCh38: + +``` +longreadsum bam -i $INPUT_FILE -o $OUTPUT_DIRECTORY +``` +Download an example HTML report [here]() (data is HG002 sequenced with ONT Kit V14 +Promethion R10.4.1 from https://labs.epi2me.io/askenazi-kit14-2022-12/) + +# BAM with base modifications + +This section describes parameters for BAM files with base modification tags (MM, +ML). + +| Parameter | Description | Default | +| --- | --- | --- | +| --mod | Run base modification analysis on the BAM file | False +| --modprob | Base modification filtering threshold. Above/below this value, the base is considered modified/unmodified. | 0.8 +| --ref | The reference genome FASTA file to use for identifying CpG sites (optional) + +General usage: +``` +longreadsum bam -i $INPUT_FILE -o $OUTPUT_DIRECTORY --ref $REF_GENOME --modprob 0.8 +``` + +Download an example HTML report [here]() (data is HG002 sequenced with ONT +MinION R9.4.1 from https://labs.epi2me.io/gm24385-5mc/) + +# RRMS BAM + +This section describes parameters for ONT RRMS BAM files and associated CSVs. + +| Parameter | Description | Default | +| --- | --- | --- | +| -c, --csv | CSV file containing read IDs to extract from the BAM file* + +The CSV file should contain a `read_id` column with the read IDs in the BAM +file, and a `decision` column with the accepted/rejected status of the read. +Accepted reads will have `stop_receiving` in the `decision` column, while rejected +reads will have `unblock`: + +``` +batch_time,read_number,channel,num_samples,read_id,sequence_length,decision +1675186897.6034577,93,4,4011,f943c811-3f97-4971-8aed-bb9f36ffb8d1,361,unblock +1675186897.7544408,80,68,4025,fab0c19d-8085-454c-bfb7-c375bbe237a1,462,unblock +1675186897.7544408,93,127,4028,5285e0ba-86c0-4b5d-ba27-5783acad6105,438,unblock +1675186897.7544408,103,156,4023,65d8befa-eec0-4496-bf2b-aa1a84e6dc5e,362,stop_receiving +``` + +General usage: +``` +longreadsum rrms -i $INPUT_FILE -o $OUTPUT_DIRECTORY -c $RRMS_CSV +``` + +Download an example HTML report [here]() (data is HG002 RRMS using ONT +R9.4.1) + +# RNA-Seq BAM + +This section describes parameters for generating TIN (transcript integrity +number) scores from RNA-Seq BAM files. + +| Parameter | Description | Default | +| --- | --- | --- | +| --genebed | Gene BED12 file required for calculating TIN scores +| --sample-size | Sample size for TIN calculation | 100 +| --min-coverage | Minimum coverage for TIN calculation | 10 + +General usage: +``` +longreadsum bam -i $INPUT_FILE -o $OUTPUT_DIRECTORY --genebed $BED_FILE --min-coverage 10 --sample-size 100 +``` + +Download an example HTML report [here]() (data is Adult GTEx v9 long-read RNA-seq data sequenced with ONT +cDNA-PCR protocol from https://www.gtexportal.org/home/downloads/adult-gtex/long_read_data) + +# PacBio unaligned BAM +# ONT POD5 +# ONT FAST5 +# Basecall summary +# FASTQ +# FASTA Specifying input files: diff --git a/src/cli.py b/src/cli.py index be1836d..41cd047 100644 --- a/src/cli.py +++ b/src/cli.py @@ -51,7 +51,7 @@ def get_common_param(margs): if (margs.input == None or margs.input == "") and (margs.inputs == None or margs.inputs == "") and ( margs.pattern == None or margs.pattern == ""): - parsing_error_msg += "No input file(s) are provided. \n" + parsing_error_msg += "No input file(s) were provided.\n" else: # Group parameters into an array param_dict["input_files"] = [] @@ -70,14 +70,14 @@ def get_common_param(margs): # param_dict["read_count"] = read_count if len(param_dict["input_files"]) == 0: - parsing_error_msg += "No input file(s) can be found. \n" + parsing_error_msg += "No input file(s) were provided.\n" else: for input_filepath in param_dict["input_files"]: if not os.path.isfile(input_filepath): parsing_error_msg += "Cannot find the input file: " + input_filepath + "\n" if (margs.outputfolder is None or margs.outputfolder == ""): - parsing_error_msg += "No output file is provided. \n" + parsing_error_msg += "No output directory was provided.\n" else: output_dir = margs.outputfolder param_dict["output_folder"] = output_dir @@ -92,7 +92,7 @@ def get_common_param(margs): # Set up logging to file and stdout if margs.log is None or margs.log == "": - parsing_error_msg += "No log file is provided. \n" + parsing_error_msg += "No log file was provided.\n" # Set up logging to stdout logging.basicConfig(stream=sys.stdout, @@ -113,10 +113,6 @@ def get_common_param(margs): param_dict["threads"] = margs.threads - # Plot style parameters - param_dict["fontsize"] = margs.fontsize - param_dict["markersize"] = margs.markersize - # Reset the param_dict if there are parsing errors if parsing_error_msg != "": param_dict = {} @@ -465,11 +461,6 @@ def set_file_parser_defaults(file_parser): help="The number of threads used. Default: 1.") file_parser.add_argument("-Q", "--outprefix", type=str, default="QC_", help="The prefix for output filenames. Default: `QC_`.") - file_parser.add_argument("--fontsize", type=int, default=14, - help="Font size for plots. Default: 14") - file_parser.add_argument("--markersize", type=int, default=10, - help="Marker size for plots. Default: 10") - def pod5_module(margs): """POD5 file input module.""" @@ -647,20 +638,18 @@ def pod5_module(margs): formatter_class=RawTextHelpFormatter) set_file_parser_defaults(bam_parser) -bam_parser.add_argument("--ref", type=str, default="", - help="Reference genome file for the BAM file, used for base modification analysis. Default: None.") - -# Add argument for whether to run base modification analysis bam_parser.add_argument("--mod", action="store_true", help="Run base modification analysis on the BAM file. Default: False.") -# Add argument for base modification filtering threshold -bam_parser.add_argument("--modprob", type=float, default=0.5, - help="Base modification filtering threshold. Above/below this value, the base is considered modified/unmodified. Default: 0.5.") - # Add argument for gene BED file required for RNA-seq transcript analysis (TIN, etc.) bam_parser.add_argument("--genebed", type=str, default="", - help="Gene BED file required for RNA-seq analysis. Default: None.") + help="Gene BED12 file required for calculating TIN scores from RNA-seq BAM files. Default: None.") + +bam_parser.add_argument("--modprob", type=float, default=0.8, + help="Base modification filtering threshold. Above/below this value, the base is considered modified/unmodified. Default: 0.8.") + +bam_parser.add_argument("--ref", type=str, default="", + help="The reference genome FASTA file to use for identifying CpG sites.") # Add TIN sample size argument bam_parser.add_argument("--sample-size", type=int, default=100, diff --git a/src/plot_utils.py b/src/plot_utils.py index 64c046a..6e16c02 100644 --- a/src/plot_utils.py +++ b/src/plot_utils.py @@ -437,7 +437,7 @@ def plot(output_data, para_dict, file_type): plot_filepaths = getDefaultPlotFilenames() # Get the font size for plotly plots - font_size = para_dict["fontsize"] + font_size = 14 # Create the summary table create_summary_table(output_data, plot_filepaths, file_type) @@ -511,7 +511,7 @@ def plot_pod5(pod5_output, para_dict, bam_output=None): create_pod5_table(pod5_output, plot_filepaths) # Generate the signal plots - marker_size = para_dict["markersize"] + marker_size = 10 read_count_max = para_dict["read_count"] read_count = len(pod5_output.keys()) @@ -634,7 +634,7 @@ def plot_signal(output_data, para_dict): # Get input parameters output_dir = para_dict["output_folder"] - marker_size = para_dict["markersize"] + marker_size = 10 read_count_max = para_dict["read_count"] # Get read and base counts