From 13b891b4f70e96d0c433cce035cbe7cf4c93afb2 Mon Sep 17 00:00:00 2001 From: Christopher Gulvik Date: Wed, 10 Feb 2016 11:03:30 -0500 Subject: [PATCH 001/212] Initial commit --- README.md | 1 + 1 file changed, 1 insertion(+) create mode 100644 README.md diff --git a/README.md b/README.md new file mode 100644 index 0000000..f6fcff3 --- /dev/null +++ b/README.md @@ -0,0 +1 @@ +# genomics_scripts \ No newline at end of file From 033d3912b4da66281650a36387b830d2f3b05e77 Mon Sep 17 00:00:00 2001 From: Christopher Gulvik Date: Wed, 10 Feb 2016 11:34:48 -0500 Subject: [PATCH 002/212] add contig filtering script; include brief description in readme --- README.md | 3 +- filter_contigs.py | 104 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 106 insertions(+), 1 deletion(-) create mode 100644 filter_contigs.py diff --git a/README.md b/README.md index f6fcff3..dfa4270 100644 --- a/README.md +++ b/README.md @@ -1 +1,2 @@ -# genomics_scripts \ No newline at end of file +# Misc Genomics Scripts +- **filter_contigs.py**: cleans up a _de novo_ assembly from SPAdes, Velvet, or IDBA (requires biopython). IDBA includes space-delimited data in their contig headers, and because SeqIO parses on whitespace, these will need to be removed or replaced (e.g., `sed -i 's/ /|/g' assembly.fna`). SPAdes and Velvet lack whitespace in their contig deflines, so those output files can be directly fed into this filtering script. diff --git a/filter_contigs.py b/filter_contigs.py new file mode 100644 index 0000000..8d059c0 --- /dev/null +++ b/filter_contigs.py @@ -0,0 +1,104 @@ +#!/usr/bin/env python + + +import argparse +import os +import re +import sys +from Bio import SeqIO +from Bio.SeqUtils import GC + + +def parseArgs(): + parser = argparse.ArgumentParser(description='filters contigs (or scaffolds) based on length, coverage, GC skew, and complexity', + epilog='NOTE: headers in IDBA contigs must have spaces removed or replaced with a non-whitespace character') + parser.add_argument('-i', '--infile', + required=True, help='input FastA file from SPAdes or Velvet') + parser.add_argument('-o', '--outfile', help='output FastA file [basename.filtered.infile_ext]') + parser.add_argument('-p', '--outpath', help='output path [cwd of input file]') + parser.add_argument('-c', '--cov', type=int, default=5, + help='minimum coverage (for SPAdes and Velvet) or minimum read count (for IDBA) [5]') + parser.add_argument('-g', '--gcskew', default=True, action='store_false', + help='switch to turn on saving >88 and <12%% GC contigs') + parser.add_argument('-m', '--complex', default=True, action='store_false', + help='switch to turn on saving low-complexity contigs') + parser.add_argument('-l', '--len', type=int, default=500, + help='minimum contig length (in bp) [500]') + return parser.parse_args() + +def filter_contig(record, min_len, min_cov, gc, complexity): + ''' removes short and low coverage contigs ''' + if len(record.seq) >= min_len: + cov_pattern = re.compile('cov_([0-9.]+)_') #SPAdes and Velvet + cov_match = cov_pattern.search(record.name) + if cov_match: + if float((cov_match.group(0)).lstrip('cov_')) >= min_cov: + accepted_record = gc_filter(record, gc, complexity) + if accepted_record: + return accepted_record + else: + read_counts = re.compile('read_count_([0-9]+)') #IDBA (requires discarding or replacing spaces in deflines due to Biopython SeqIO's parsing) + read_match = read_counts.search(record.name) + if read_match: + if int((read_match.group(0)).lstrip('read_count_')) >= min_cov: + accepted_record = gc_filter(record, gc, complexity) + if accepted_record: + return accepted_record + else: #keep all contigs if deflines renamed and lack depth data + accepted_record = gc_filter(record, gc, complexity) + if accepted_record: + return accepted_record + +def gc_filter(record, gc, complexity): + ''' removes contigs with highly skewed GC ''' + if gc: + gc_content = float(GC(record.seq)) + if 12 <= gc_content <= 88: + accepted_record = complexity_filter(record, complexity) + if accepted_record: + return accepted_record + else: + accepted_record = complexity_filter(record, complexity) + if accepted_record: + return accepted_record + +def complexity_filter(record, complexity): + ''' removes contigs with only 1 or 2 nucleotides represented ''' + if complexity: + i = 0 + for nucleotide in ['A', 'T', 'C', 'G']: + if nucleotide in record.seq: + i += 1 + if i > 2: + return record + else: + return record + +def main(): + args = parseArgs() + infile = args.infile + outfile = args.outfile + outpath = args.outpath + min_cov = args.cov + min_len = args.len + gc = args.gcskew + complexity = args.complex + + in_ext = os.path.splitext(os.path.basename(infile))[-1] + if outfile is None: + out_ext = '.filtered' + in_ext + base = os.path.splitext(os.path.basename(infile))[0] + outfile = base + out_ext + if outpath is None: + outpath = os.path.dirname(infile) + + with open(outfile, 'w') as filtered_fasta: + with open(infile, 'rU') as input_fasta: + for record in SeqIO.parse(input_fasta, 'fasta'): + class_name = type(record) #'Bio.SeqRecord.SeqRecord' + r = filter_contig(record, min_len, min_cov, gc, complexity) + if isinstance(r, class_name): + SeqIO.write(r, filtered_fasta, 'fasta') + +if __name__ == '__main__': + main() From cb884e26ab8d46e5a419aece82f0266010a413db Mon Sep 17 00:00:00 2001 From: Christopher Gulvik Date: Tue, 16 Feb 2016 10:08:36 -0500 Subject: [PATCH 003/212] fix spades cov filtering by removing trailing underscore --- filter_contigs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/filter_contigs.py b/filter_contigs.py index 8d059c0..444c36a 100644 --- a/filter_contigs.py +++ b/filter_contigs.py @@ -32,7 +32,7 @@ def filter_contig(record, min_len, min_cov, gc, complexity): cov_pattern = re.compile('cov_([0-9.]+)_') #SPAdes and Velvet cov_match = cov_pattern.search(record.name) if cov_match: - if float((cov_match.group(0)).lstrip('cov_')) >= min_cov: + if float((cov_match.group(0)).lstrip('cov_').rstrip('_')) >= min_cov: accepted_record = gc_filter(record, gc, complexity) if accepted_record: return accepted_record From 888fd433a0393af51a4023e4ca187024e837bc5a Mon Sep 17 00:00:00 2001 From: Christopher Gulvik Date: Wed, 17 Feb 2016 13:22:05 -0500 Subject: [PATCH 004/212] made executable --- filter_contigs.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) mode change 100644 => 100755 filter_contigs.py diff --git a/filter_contigs.py b/filter_contigs.py old mode 100644 new mode 100755 From 6ef860711831c9bb1cf78caca6b0939c000f5682 Mon Sep 17 00:00:00 2001 From: Christopher Gulvik Date: Wed, 17 Feb 2016 14:24:31 -0500 Subject: [PATCH 005/212] fixed bug where infile and inpath were both specified; also clarified arg for outfilename --- filter_contigs.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/filter_contigs.py b/filter_contigs.py index 444c36a..55b5c0e 100755 --- a/filter_contigs.py +++ b/filter_contigs.py @@ -14,7 +14,7 @@ def parseArgs(): epilog='NOTE: headers in IDBA contigs must have spaces removed or replaced with a non-whitespace character') parser.add_argument('-i', '--infile', required=True, help='input FastA file from SPAdes or Velvet') - parser.add_argument('-o', '--outfile', help='output FastA file [basename.filtered.infile_ext]') + parser.add_argument('-o', '--outfilename', help='output FastA filename [basename.filtered.infile_ext]') parser.add_argument('-p', '--outpath', help='output path [cwd of input file]') parser.add_argument('-c', '--cov', type=int, default=5, help='minimum coverage (for SPAdes and Velvet) or minimum read count (for IDBA) [5]') @@ -77,7 +77,7 @@ def complexity_filter(record, complexity): def main(): args = parseArgs() infile = args.infile - outfile = args.outfile + outfilename = args.outfilename outpath = args.outpath min_cov = args.cov min_len = args.len @@ -85,14 +85,16 @@ def main(): complexity = args.complex in_ext = os.path.splitext(os.path.basename(infile))[-1] - if outfile is None: + if outfilename is None: out_ext = '.filtered' + in_ext base = os.path.splitext(os.path.basename(infile))[0] - outfile = base + out_ext + outfilename = base + out_ext if outpath is None: outpath = os.path.dirname(infile) - with open(outfile, 'w') as filtered_fasta: + out = os.path.join(outpath, outfilename) + + with open(out, 'w') as filtered_fasta: with open(infile, 'rU') as input_fasta: for record in SeqIO.parse(input_fasta, 'fasta'): class_name = type(record) #'Bio.SeqRecord.SeqRecord' From 534df991d79b650aa58648e391e88233f224f42c Mon Sep 17 00:00:00 2001 From: Christopher Gulvik Date: Fri, 19 Feb 2016 11:47:13 -0500 Subject: [PATCH 006/212] QA reads bash scripts added --- QualAssessCleanSeqs.bash | 200 +++++++++++++++++++++++++++++++++++++++ QualAssessRawSeqs.bash | 184 +++++++++++++++++++++++++++++++++++ 2 files changed, 384 insertions(+) create mode 100755 QualAssessCleanSeqs.bash create mode 100755 QualAssessRawSeqs.bash diff --git a/QualAssessCleanSeqs.bash b/QualAssessCleanSeqs.bash new file mode 100755 index 0000000..2f99b0c --- /dev/null +++ b/QualAssessCleanSeqs.bash @@ -0,0 +1,200 @@ +#!/bin/bash +# Dependencies: awk, bc, cut, GNU sed, grep, gunzip, mkdir, readlink, paste, printf, wc + +usage() { + echo " + USAGE: $0 Input_R1.paired.fq Input_R2.paired.fq Input.single.fq Outdir + + The trimmed reads must be in Illumina 1.8+ FastQ + format, and filenames must be formatted: + .fastq.gz or .fastq, where + is the name of the sample. + " + } + +#Help and usage information +if [[ "$1" == "" || "$1" == "--help" || "$1" == "-h" ]]; then + usage + exit 1 +fi + +#Require 4 arguments +if [ $# -ne 4 ]; then + usage + exit 1 +fi + +#Make outdir if absent +if [ ! -d "$4" ]; then + mkdir -p "$4" + echo "created output directory path: $4" +fi + +#File input sequence information +# file regex from Illumina 1.8+ (gunzip extracted): +# ([0-9A-Za-z]{1,19})_S([0-9A-Za-z]{1,3})_L([0-9]{3})_R[12]_([0-9]{3}).fastq +SAMPLE=$(basename $1 | cut -d . -f 1 | cut -d _ -f 1,2) +SAMPLEL=$(basename $1) +SAMPLER=$(basename $2) +SAMPLES=$(basename $3) + + +###Filename handling +dIFS=$IFS #default IFS +IFS=$'' #change for newlines +#takes Trimmed unpaired R1 and R2 fq files as input +#tests file input extensions +if [[ $1 == *.gz && $2 == *.gz && $3 == *.gz ]]; then #all files are gunzipped + SAMPLEL=$(basename $SEQ1 .gz) + SAMPLER=$(basename $SEQ2 .gz) + SAMPLES=$(basename $SEQ2 .gz) + echo "extracting the gunzipped file $1..." + gunzip -c $1 > "$4"/"$SAMPLEL" + echo "$1 was extracted..." + SEQ1=$("$4"/"$SAMPLEL") + echo "extracting the gunzipped file $2..." + gunzip -c $2 > "$4"/"$SAMPLER" + echo "$2 was extracted..." + SEQ2=$("$4"/"$SAMPLER") + echo "extracting the gunzipped file $3..." + gunzip -c $3 > "$4"/"$SAMPLES" + echo "$3 was extracted..." + SEQ3=$("$4"/"$SAMPLES") +elif [[ $1 == *q && $2 == *q && $3 == *q ]]; then #all files are fq format + echo "all three files are in FastQ format..." + SEQ1="$1" + SEQ2="$2" + SEQ3="$3" +else #for now skip possibily of some but not all gunzipped + usage + exit 1 +fi +IFS=$dIFS #back to default IFS +wait + +echo '~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~' +echo '~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~' +echo " $SAMPLEL, $SAMPLER," +echo " and $SAMPLES sequences were read in..." +echo '~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~' + +#Count total nucleotides +TN1=$(awk 'BEGIN {SUM=0;} {if(NR%4==2) {SUM+=length($0);}} END {print SUM;}' $SEQ1) +TN2=$(awk 'BEGIN {SUM=0;} {if(NR%4==2) {SUM+=length($0);}} END {print SUM;}' $SEQ2) +TN3=$(awk 'BEGIN {SUM=0;} {if(NR%4==2) {SUM+=length($0);}} END {print SUM;}' $SEQ3) +BP_TOT="$(($TN1 + $TN2 + $TN3))" +wait +echo '~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~' +echo " counted the total nucleotides..." +echo " Total: $BP_TOT" +echo " R1: $TN1" +echo " R2: $TN2" +echo " single: $TN3" +echo '~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~' + +OPSYS=$(uname -s) + +###Count nucleotides >=Q20 and >=Q30 for Illumina 1.8+ +if [[ "$OPSYS" == Linux ]]; then #should be linux where sed default is GNU sed + Q301=$(sed -n '4~4'p $SEQ1 | grep -o '[\?@A-J]' | paste -s -d"\0" - | wc -m) + Q302=$(sed -n '4~4'p $SEQ2 | grep -o '[\?@A-J]' | paste -s -d"\0" - | wc -m) + Q303=$(sed -n '4~4'p $SEQ3 | grep -o '[\?@A-J]' | paste -s -d"\0" - | wc -m) + Q20to301=$(sed -n '4~4'p $SEQ1 | grep -o '[5-9:\;\<=\>]' | paste -s -d"\0" - | wc -m) + Q20to302=$(sed -n '4~4'p $SEQ2 | grep -o '[5-9:\;\<=\>]' | paste -s -d"\0" - | wc -m) + Q20to303=$(sed -n '4~4'p $SEQ3 | grep -o '[5-9:\;\<=\>]' | paste -s -d"\0" - | wc -m) + Q201=$(($Q20to301 + $Q301)) + Q202=$(($Q20to302 + $Q302)) + Q203=$(($Q20to303 + $Q303)) +elif [[ "$OPSYS" == Darwin ]]; then #use gsed instead of sed in Mac OS X here + Q301=$(gsed -n '4~4'p $SEQ1 | grep -o '[\?@A-J]' | paste -s -d"\0" - | wc -m) + Q302=$(gsed -n '4~4'p $SEQ2 | grep -o '[\?@A-J]' | paste -s -d"\0" - | wc -m) + Q303=$(gsed -n '4~4'p $SEQ3 | grep -o '[\?@A-J]' | paste -s -d"\0" - | wc -m) + Q20to301=$(gsed -n '4~4'p $SEQ1 | grep -o '[5-9:\;\<=\>]' | paste -s -d"\0" - | wc -m) + Q20to302=$(gsed -n '4~4'p $SEQ2 | grep -o '[5-9:\;\<=\>]' | paste -s -d"\0" - | wc -m) + Q20to303=$(gsed -n '4~4'p $SEQ3 | grep -o '[5-9:\;\<=\>]' | paste -s -d"\0" - | wc -m) + Q201=$(($Q20to301 + $Q301)) + Q202=$(($Q20to302 + $Q302)) + Q203=$(($Q20to303 + $Q303)) +else + echo 'ERROR: Your operating system does not appear to be supported.' >&2 + exit 1 +fi +wait +Q20_TOT="$(($Q201 + $Q202 + $Q203))" +Q30_TOT="$(($Q301 + $Q302 + $Q303))" +wait +echo '~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~' +echo ' counted the Q20+ and Q30+ nucleotides...' +echo " ${Q201} nucleotides are >=Q20 in $SAMPLEL" +echo " ${Q301} nucleotides are >=Q30 in $SAMPLEL" +echo " ${Q202} nucleotides are >=Q20 in $SAMPLER" +echo " ${Q302} nucleotides are >=Q30 in $SAMPLER" +echo " ${Q203} nucleotides are >=Q20 in $SAMPLES" +echo " ${Q303} nucleotides are >=Q30 in $SAMPLES" +echo '~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~' + +#Calculate percentages of >=Q20 and >=Q30 +PQ201="$(echo "scale=2;($Q201/$TN1)*100" | bc)" +PQ202="$(echo "scale=2;($Q202/$TN2)*100" | bc)" +PQ203="$(echo "scale=2;($Q203/$TN3)*100" | bc)" +PQ301="$(echo "scale=2;($Q301/$TN1)*100" | bc)" +PQ302="$(echo "scale=2;($Q302/$TN2)*100" | bc)" +PQ303="$(echo "scale=2;($Q303/$TN3)*100" | bc)" +wait +echo '~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~' +echo ' calculated the percentages of Q20 and Q30...' +echo " ${PQ201}% nucleotides are >=Q20 in $SAMPLEL" +echo " ${PQ301}% nucleotides are >=Q30 in $SAMPLEL" +echo " ${PQ202}% nucleotides are >=Q20 in $SAMPLER" +echo " ${PQ302}% nucleotides are >=Q30 in $SAMPLER" +echo " ${PQ203}% nucleotides are >=Q20 in $SAMPLES" +echo " ${PQ303}% nucleotides are >=Q30 in $SAMPLES" +echo '~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~' + +#Count number of reads +NUMLN_L=$(wc -l $SEQ1| awk '{print $1}') +NUMLN_R=$(wc -l $SEQ2 | awk '{print $1}') +NUMLN_single=$(wc -l $SEQ3 | awk '{print $1}') +wait +NUM_L="$(($NUMLN_L / 4))" +NUM_R="$(($NUMLN_R / 4))" +NUM_single="$(($NUMLN_single / 4))" +READ_TOT="$(($NUM_L + NUM_R + NUM_single))" +wait +echo '~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~' +echo ' counted the number of reads...' +echo " There are $NUM_L reads in $SAMPLEL" +echo " There are $NUM_R reads in $SAMPLER" +echo " There are $NUM_single reads in $SAMPLES" +echo '~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~' + +#Create results output file +printf '%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n' \ +Sample_Name Q20_Total_[bp] Q30_Total_[bp] Q20_R1_[bp] Q20_R2_[bp] Q20_single_[bp] Q20_R1_[%] Q20_R2_[%] Q20_single_[%] Q30_R1_[bp] Q30_R2_[bp] Q30_single_[bp] Q30_R1_[%] Q30_R2_[%] Q30_single_[%] Total_Sequenced_[bp] Total_Sequenced_[reads] NUMR1_[bp] NUMR2_[bp] NUMsingle_[bp] \ +"$SAMPLE" "$Q20_TOT" "$Q30_TOT" "$Q201" "$Q202" "$Q203" "$PQ201%" "$PQ202"% "$PQ203"% "$Q301" "$Q302" "$Q303" "$PQ301"% "$PQ302"% "$PQ303"% "$BP_TOT" "$READ_TOT" "$NUM_L" "$NUM_R" "$NUM_single" > "$4"/QualAssessTrimmedSeqs_"$SAMPLE"_results.tsv ; +wait +echo '~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~' +echo ' created results output file...' +echo '~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~' + +if [[ "$OPSYS" == Linux ]]; then + R1FILEPATH=$(readlink -m $1) + R2FILEPATH=$(readlink -m $2) + singleFILEPATH=$(readlink -m $3) +elif [[ "$OPSYS" == Darwin ]]; then + R1FILEPATH=$(echo `pwd`/`ls "$1"`) + R2FILEPATH=$(echo `pwd`/`ls "$2"`) + singleFILEPATH=$(echo `pwd`/`ls "$3"`) +fi + +#Create log file +printf "`date`\n$USER\n%s\n%s\n\n" \ +"$R1FILEPATH" "$R2FILEPATH" "$singleFILEPATH" \ +> "$4"/QualAssessTrimmedSeqs_"$SAMPLE"_results.log ; +wait +echo '~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~' +echo ' created log output file...' +echo '~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~' +echo " Quality assessment of $SAMPLE Trimmed sequences completed" +echo '~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~' +echo '~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~' diff --git a/QualAssessRawSeqs.bash b/QualAssessRawSeqs.bash new file mode 100755 index 0000000..ff52832 --- /dev/null +++ b/QualAssessRawSeqs.bash @@ -0,0 +1,184 @@ +#!/bin/bash +# Dependencies: awk, bc, cut, GNU sed, grep, gunzip, mkdir, readlink, paste, printf, wc + +usage() { + echo " + USAGE: $0 Input_R1.fastq Input_R2.fastq Outdir + + The raw reads must be in Illumina 1.8+ FastQ + format, and filenames must be formatted: + .fastq.gz or .fastq, where + is the name of the sample. + " + } + +#Help and usage information +if [[ "$1" == "" || "$1" == "--help" || "$1" == "-h" ]]; then + usage + exit 1 +fi + +#Require 3 arguments +if [ $# -ne 3 ]; then + usage + exit 1 +fi + +#Make outdir if absent +if [ ! -d "$3" ]; then + mkdir -p "$3" + echo "created output directory path: $3" +fi + +#File input sequence information +# file regex from Illumina 1.8+ (gunzip extracted): +# ([0-9A-Za-z]{1,19})_S([0-9A-Za-z]{1,3})_L([0-9]{3})_R[12]_([0-9]{3}).fastq +SAMPLE=$(basename $1 | cut -d . -f 1 | cut -d _ -f 1,2) +SAMPLEL=$(basename $1) +SAMPLER=$(basename $2) + +#Filename handling +dIFS=$IFS #default IFS +IFS=$'' #change for newlines +if [[ $1 == *.gz && $2 == *.gz ]]; then #both files are gunzipped + SAMPLEL=$(basename $SEQ1 .gz) + SAMPLER=$(basename $SEQ2 .gz) + echo "extracting the gunzipped file $1..." + gunzip -c $1 > "$3"/"$SAMPLEL" + echo "$1 was extracted..." + SEQ1=$("$3"/"$SAMPLEL") + echo "extracting the gunzipped file $2..." + gunzip -c $2 > "$3"/"$SAMPLER" + echo "$2 was extracted..." + SEQ2=$("$3"/"$SAMPLER") +elif [[ $1 == *q && $2 == *q ]]; then #both files are fastq format + echo "both are FastQ format..." + SEQ1=$(echo $1) + SEQ2=$(echo $2) +elif [[ $1 == *.gz && $2 == *q ]]; then + SAMPLEL=$(basename $SEQ1 .gz) + echo "extracting the gunzipped file $1..." + gunzip -c $1 > "$3"/"$SAMPLEL" + echo "$1 was extracted..." + SEQ1=$("$3"/"$SAMPLEL") + SEQ2="$2" #$2 is still in fastq format +elif [[ $1 == *q && $2 == *.gz ]]; then + SAMPLER=$(basename $SEQ2 .gz) + SEQ1="$1" #$1 is still in fastq format + echo "extracting the gunzipped file $2..." + gunzip -c $2 > "$3"/"$SAMPLER" + echo "$2 was extracted..." + SEQ2=$("$3"/"$SAMPLER") +else + usage + exit 1 +fi +IFS=$dIFS #back to default IFS +wait + +echo '~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~' +echo '~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~' +echo " $SAMPLEL and $SAMPLER sequences were read in..." +echo '~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~' + +#Count total nucleotides +TN1=$(awk 'BEGIN {SUM=0;} {if(NR%4==2) {SUM+=length($0);}} END {print SUM;}' $SEQ1) +TN2=$(awk 'BEGIN {SUM=0;} {if(NR%4==2) {SUM+=length($0);}} END {print SUM;}' $SEQ2) +BP_TOT="$(($TN1 + $TN2))" +wait +echo '~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~' +echo " counted the total nucleotides..." +echo " Total: $BP_TOT" +echo " R1: $TN1" +echo " R2: $TN2" +echo '~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~' + +OPSYS=$(uname -s) + +#Count nucleotides >=Q20 and >=Q30 for Illumina 1.8+ +if [[ "$OPSYS" == Linux ]]; then #should be linux where sed default is GNU sed + Q30L=$(sed -n '4~4'p $SEQ1 | grep -o '[\?@A-J]' | paste -s -d"\0" - | wc -m) + Q30R=$(sed -n '4~4'p $SEQ2 | grep -o '[\?@A-J]' | paste -s -d"\0" - | wc -m) + Q20to30L=$(sed -n '4~4'p $SEQ1 | grep -o '[5-9:\;\<=\>]' | paste -s -d"\0" - | wc -m) + Q20to30R=$(sed -n '4~4'p $SEQ2 | grep -o '[5-9:\;\<=\>]' | paste -s -d"\0" - | wc -m) + Q20L="$(($Q20to30L + $Q30L))" + Q20R="$(($Q20to30R + $Q30R))" +elif [[ "$OPSYS" == Darwin ]]; then #use gsed instead of sed in Mac OS X here + Q30L=$(gsed -n '4~4'p $SEQ1 | grep -o '[\?@A-J]' | paste -s -d"\0" - | wc -m) + Q30R=$(gsed -n '4~4'p $SEQ2 | grep -o '[\?@A-J]' | paste -s -d"\0" - | wc -m) + Q20to30L=$(gsed -n '4~4'p $SEQ1 | grep -o '[5-9:\;\<=\>]' | paste -s -d"\0" - | wc -m) + Q20to30R=$(gsed -n '4~4'p $SEQ2 | grep -o '[5-9:\;\<=\>]' | paste -s -d"\0" - | wc -m) + Q20L="$(($Q20to30L + $Q30L))" + Q20R="$(($Q20to30R + $Q30R))" +else + echo 'ERROR: Your operating system does not appear to be supported.' >&2 + exit 1 +fi +wait +Q20_TOT="$(($Q20L + $Q20R))" +Q30_TOT="$(($Q30L + $Q30R))" +wait +echo '~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~' +echo ' counted the Q20+ and Q30+ nucleotides...' +echo " ${Q20L} nucleotides are >=Q20 in $SAMPLEL" +echo " ${Q30L} nucleotides are >=Q30 in $SAMPLEL" +echo " ${Q20R} nucleotides are >=Q20 in $SAMPLER" +echo " ${Q30R} nucleotides are >=Q30 in $SAMPLER" +echo '~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~' + +#Calculate percentages of >=Q20 and >=Q30 +PQ20L="$(echo "scale=2;($Q20L/$TN1)*100" | bc)" +PQ20R="$(echo "scale=2;($Q20R/$TN2)*100" | bc)" +PQ30L="$(echo "scale=2;($Q30L/$TN1)*100" | bc)" +PQ30R="$(echo "scale=2;($Q30R/$TN2)*100" | bc)" +wait +echo '~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~' +echo ' calculated the percentages of Q20+ and Q30+...' +echo " ${PQ20L}% nucleotides are >=Q20 in $SAMPLEL" +echo " ${PQ30L}% nucleotides are >=Q30 in $SAMPLEL" +echo " ${PQ20R}% nucleotides are >=Q20 in $SAMPLER" +echo " ${PQ30R}% nucleotides are >=Q30 in $SAMPLER" +echo '~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~' + +#Count number of reads +NUMLN_L=$(wc -l $SEQ1| awk '{print $1}') +NUMLN_R=$(wc -l $SEQ2 | awk '{print $1}') +wait +NUM_L="$(($NUMLN_L / 4))" +NUM_R="$(($NUMLN_R / 4))" +READ_TOT="$(($NUM_L + NUM_R))" +wait +echo '~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~' +echo ' counted the number of reads...' +echo " There are $NUM_L reads in $SAMPLEL" +echo " There are $NUM_R reads in $SAMPLER" +echo '~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~' + +#Create results output file +printf '%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n' \ +Sample_Name Q20_Total_[bp] Q30_Total_[bp] Q20_R1_[bp] Q20_R2_[bp] Q20_R1_[%] Q20_R2_[%] Q30_R1_[bp] Q30_R2_[bp] Q30_R1_[%] Q30_R2_[%] Total_Sequenced_[bp] Total_Sequenced_[reads] NUMR1_[bp] NUMR2_[bp] \ +"$SAMPLE" "$Q20_TOT" "$Q30_TOT" "$Q20L" "$Q20R" "$PQ20L%" "$PQ20R%" "$Q30L" "$Q30R" "$PQ30L%" "$PQ30R%" "$BP_TOT" "$READ_TOT" "$NUM_L" "$NUM_R" > "$3"/QualAssessRawSeqs_"$SAMPLE"_results.tsv ; +wait +echo '~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~' +echo ' created results output file...' +echo '~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~' + +if [[ "$OPSYS" == Linux ]]; then + R1FILEPATH=$(readlink -m $1) + R2FILEPATH=$(readlink -m $2) +elif [[ "$OPSYS" == Darwin ]]; then + R1FILEPATH=$(echo `pwd`/`ls "$1"`) + R2FILEPATH=$(echo `pwd`/`ls "$2"`) +fi + +#Create log file +printf "`date`\n$USER\n%s\n%s\n\n" \ +"$R1FILEPATH" "$R2FILEPATH" \ +> "$3"/QualAssessRawSeqs_"$SAMPLE"_results.log ; +wait +echo '~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~' +echo ' created log output file...' +echo '~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~' +echo " Quality assessment of $SAMPLE raw sequences completed" +echo '~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~' +echo '~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~' From 85f6f8bdc7966f54900122ab029535d422ff023a Mon Sep 17 00:00:00 2001 From: Christopher Gulvik Date: Mon, 7 Mar 2016 09:56:43 -0500 Subject: [PATCH 007/212] new feature: rename contig headers --- filter_contigs.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/filter_contigs.py b/filter_contigs.py index 55b5c0e..d502624 100755 --- a/filter_contigs.py +++ b/filter_contigs.py @@ -6,6 +6,7 @@ import re import sys from Bio import SeqIO +from Bio import SeqRecord from Bio.SeqUtils import GC @@ -14,6 +15,7 @@ def parseArgs(): epilog='NOTE: headers in IDBA contigs must have spaces removed or replaced with a non-whitespace character') parser.add_argument('-i', '--infile', required=True, help='input FastA file from SPAdes or Velvet') + parser.add_argument('-b', '--baseheader', help='contig header prefix (with \'_#\' as suffix) [basename]') parser.add_argument('-o', '--outfilename', help='output FastA filename [basename.filtered.infile_ext]') parser.add_argument('-p', '--outpath', help='output path [cwd of input file]') parser.add_argument('-c', '--cov', type=int, default=5, @@ -78,6 +80,7 @@ def main(): args = parseArgs() infile = args.infile outfilename = args.outfilename + baseheader = args.baseheader outpath = args.outpath min_cov = args.cov min_len = args.len @@ -91,16 +94,21 @@ def main(): outfilename = base + out_ext if outpath is None: outpath = os.path.dirname(infile) + if baseheader is None: + baseheader = os.path.splitext(os.path.basename(infile))[0] out = os.path.join(outpath, outfilename) with open(out, 'w') as filtered_fasta: with open(infile, 'rU') as input_fasta: + i = 1 for record in SeqIO.parse(input_fasta, 'fasta'): class_name = type(record) #'Bio.SeqRecord.SeqRecord' r = filter_contig(record, min_len, min_cov, gc, complexity) if isinstance(r, class_name): - SeqIO.write(r, filtered_fasta, 'fasta') + renamed_rec = SeqRecord.SeqRecord(id='{}_{}'.format(baseheader, i), seq=r.seq, description='') + SeqIO.write(renamed_rec, filtered_fasta, 'fasta') + i += 1 if __name__ == '__main__': main() From cb8605081af56cc1840054ba27cbe1949f252f6c Mon Sep 17 00:00:00 2001 From: Christopher Gulvik Date: Tue, 8 Mar 2016 14:46:06 -0500 Subject: [PATCH 008/212] removed unnecessary vars and now cleanup automatically (discard gunzip output if GZ files given) --- QualAssessCleanSeqs.bash | 104 +++++++++++++++++++-------------------- QualAssessRawSeqs.bash | 97 ++++++++++++++++++------------------ 2 files changed, 98 insertions(+), 103 deletions(-) diff --git a/QualAssessCleanSeqs.bash b/QualAssessCleanSeqs.bash index 2f99b0c..f5af3ed 100755 --- a/QualAssessCleanSeqs.bash +++ b/QualAssessCleanSeqs.bash @@ -33,33 +33,25 @@ fi #File input sequence information # file regex from Illumina 1.8+ (gunzip extracted): # ([0-9A-Za-z]{1,19})_S([0-9A-Za-z]{1,3})_L([0-9]{3})_R[12]_([0-9]{3}).fastq -SAMPLE=$(basename $1 | cut -d . -f 1 | cut -d _ -f 1,2) -SAMPLEL=$(basename $1) -SAMPLER=$(basename $2) -SAMPLES=$(basename $3) +SAMPLE=$(basename $1 | cut -d . -f 1 | cut -d _ -f 1) - -###Filename handling -dIFS=$IFS #default IFS -IFS=$'' #change for newlines -#takes Trimmed unpaired R1 and R2 fq files as input -#tests file input extensions +#Filename handling if [[ $1 == *.gz && $2 == *.gz && $3 == *.gz ]]; then #all files are gunzipped - SAMPLEL=$(basename $SEQ1 .gz) - SAMPLER=$(basename $SEQ2 .gz) - SAMPLES=$(basename $SEQ2 .gz) + SAMPLEL=$(basename $1 .gz) + SAMPLER=$(basename $2 .gz) + SAMPLES=$(basename $3 .gz) echo "extracting the gunzipped file $1..." - gunzip -c $1 > "$4"/"$SAMPLEL" + gunzip -c "$1" > "$4"/"$SAMPLEL" echo "$1 was extracted..." - SEQ1=$("$4"/"$SAMPLEL") + SEQ1="$4"/"$SAMPLEL" echo "extracting the gunzipped file $2..." - gunzip -c $2 > "$4"/"$SAMPLER" + gunzip -c "$2" > "$4"/"$SAMPLER" echo "$2 was extracted..." - SEQ2=$("$4"/"$SAMPLER") + SEQ2="$4"/"$SAMPLER" echo "extracting the gunzipped file $3..." - gunzip -c $3 > "$4"/"$SAMPLES" + gunzip -c "$3" > "$4"/"$SAMPLES" echo "$3 was extracted..." - SEQ3=$("$4"/"$SAMPLES") + SEQ3="$4"/"$SAMPLES" elif [[ $1 == *q && $2 == *q && $3 == *q ]]; then #all files are fq format echo "all three files are in FastQ format..." SEQ1="$1" @@ -69,14 +61,13 @@ else #for now skip possibily of some but not all gunzipped usage exit 1 fi -IFS=$dIFS #back to default IFS wait -echo '~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~' -echo '~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~' -echo " $SAMPLEL, $SAMPLER," -echo " and $SAMPLES sequences were read in..." -echo '~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~' +echo '~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~' +echo '~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~' +echo " $SEQ1, $SEQ2," +echo " and $SEQ3 sequences were read in..." +echo '~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~' #Count total nucleotides TN1=$(awk 'BEGIN {SUM=0;} {if(NR%4==2) {SUM+=length($0);}} END {print SUM;}' $SEQ1) @@ -84,13 +75,13 @@ TN2=$(awk 'BEGIN {SUM=0;} {if(NR%4==2) {SUM+=length($0);}} END {print SUM;}' $SE TN3=$(awk 'BEGIN {SUM=0;} {if(NR%4==2) {SUM+=length($0);}} END {print SUM;}' $SEQ3) BP_TOT="$(($TN1 + $TN2 + $TN3))" wait -echo '~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~' +echo '~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~' echo " counted the total nucleotides..." echo " Total: $BP_TOT" echo " R1: $TN1" echo " R2: $TN2" echo " single: $TN3" -echo '~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~' +echo '~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~' OPSYS=$(uname -s) @@ -123,15 +114,15 @@ wait Q20_TOT="$(($Q201 + $Q202 + $Q203))" Q30_TOT="$(($Q301 + $Q302 + $Q303))" wait -echo '~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~' +echo '~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~' echo ' counted the Q20+ and Q30+ nucleotides...' -echo " ${Q201} nucleotides are >=Q20 in $SAMPLEL" -echo " ${Q301} nucleotides are >=Q30 in $SAMPLEL" -echo " ${Q202} nucleotides are >=Q20 in $SAMPLER" -echo " ${Q302} nucleotides are >=Q30 in $SAMPLER" -echo " ${Q203} nucleotides are >=Q20 in $SAMPLES" -echo " ${Q303} nucleotides are >=Q30 in $SAMPLES" -echo '~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~' +echo " ${Q201} nucleotides are >=Q20 in $SEQ1" +echo " ${Q301} nucleotides are >=Q30 in $SEQ1" +echo " ${Q202} nucleotides are >=Q20 in $SEQ2" +echo " ${Q302} nucleotides are >=Q30 in $SEQ2" +echo " ${Q203} nucleotides are >=Q20 in $SEQ3" +echo " ${Q303} nucleotides are >=Q30 in $SEQ3" +echo '~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~' #Calculate percentages of >=Q20 and >=Q30 PQ201="$(echo "scale=2;($Q201/$TN1)*100" | bc)" @@ -141,15 +132,15 @@ PQ301="$(echo "scale=2;($Q301/$TN1)*100" | bc)" PQ302="$(echo "scale=2;($Q302/$TN2)*100" | bc)" PQ303="$(echo "scale=2;($Q303/$TN3)*100" | bc)" wait -echo '~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~' +echo '~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~' echo ' calculated the percentages of Q20 and Q30...' -echo " ${PQ201}% nucleotides are >=Q20 in $SAMPLEL" -echo " ${PQ301}% nucleotides are >=Q30 in $SAMPLEL" -echo " ${PQ202}% nucleotides are >=Q20 in $SAMPLER" -echo " ${PQ302}% nucleotides are >=Q30 in $SAMPLER" -echo " ${PQ203}% nucleotides are >=Q20 in $SAMPLES" -echo " ${PQ303}% nucleotides are >=Q30 in $SAMPLES" -echo '~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~' +echo " ${PQ201}% nucleotides are >=Q20 in $SEQ1" +echo " ${PQ301}% nucleotides are >=Q30 in $SEQ1" +echo " ${PQ202}% nucleotides are >=Q20 in $SEQ2" +echo " ${PQ302}% nucleotides are >=Q30 in $SEQ2" +echo " ${PQ203}% nucleotides are >=Q20 in $SEQ3" +echo " ${PQ303}% nucleotides are >=Q30 in $SEQ3" +echo '~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~' #Count number of reads NUMLN_L=$(wc -l $SEQ1| awk '{print $1}') @@ -161,21 +152,22 @@ NUM_R="$(($NUMLN_R / 4))" NUM_single="$(($NUMLN_single / 4))" READ_TOT="$(($NUM_L + NUM_R + NUM_single))" wait -echo '~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~' +echo '~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~' echo ' counted the number of reads...' echo " There are $NUM_L reads in $SAMPLEL" echo " There are $NUM_R reads in $SAMPLER" echo " There are $NUM_single reads in $SAMPLES" -echo '~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~' +echo '~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~' #Create results output file printf '%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n' \ Sample_Name Q20_Total_[bp] Q30_Total_[bp] Q20_R1_[bp] Q20_R2_[bp] Q20_single_[bp] Q20_R1_[%] Q20_R2_[%] Q20_single_[%] Q30_R1_[bp] Q30_R2_[bp] Q30_single_[bp] Q30_R1_[%] Q30_R2_[%] Q30_single_[%] Total_Sequenced_[bp] Total_Sequenced_[reads] NUMR1_[bp] NUMR2_[bp] NUMsingle_[bp] \ -"$SAMPLE" "$Q20_TOT" "$Q30_TOT" "$Q201" "$Q202" "$Q203" "$PQ201%" "$PQ202"% "$PQ203"% "$Q301" "$Q302" "$Q303" "$PQ301"% "$PQ302"% "$PQ303"% "$BP_TOT" "$READ_TOT" "$NUM_L" "$NUM_R" "$NUM_single" > "$4"/QualAssessTrimmedSeqs_"$SAMPLE"_results.tsv ; +"$SAMPLE" "$Q20_TOT" "$Q30_TOT" "$Q201" "$Q202" "$Q203" "$PQ201%" "$PQ202%" "$PQ203%" "$Q301" "$Q302" "$Q303" "$PQ301%" "$PQ302%" "$PQ303%" "$BP_TOT" "$READ_TOT" "$NUM_L" "$NUM_R" "$NUM_single" +> "$4"/QualAssessTrimSeqs_"$SAMPLE"_results.tab ; wait -echo '~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~' +echo '~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~' echo ' created results output file...' -echo '~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~' +echo '~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~' if [[ "$OPSYS" == Linux ]]; then R1FILEPATH=$(readlink -m $1) @@ -187,14 +179,18 @@ elif [[ "$OPSYS" == Darwin ]]; then singleFILEPATH=$(echo `pwd`/`ls "$3"`) fi +[[ $1 == *.gz ]] && rm -f "$3"/"$SAMPLEL" +[[ $2 == *.gz ]] && rm -f "$3"/"$SAMPLER" +[[ $3 == *.gz ]] && rm -f "$3"/"$SAMPLES" + #Create log file printf "`date`\n$USER\n%s\n%s\n\n" \ "$R1FILEPATH" "$R2FILEPATH" "$singleFILEPATH" \ -> "$4"/QualAssessTrimmedSeqs_"$SAMPLE"_results.log ; +> "$4"/QualAssessTrimSeqs_"$SAMPLE"_results.log ; wait -echo '~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~' +echo '~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~' echo ' created log output file...' -echo '~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~' -echo " Quality assessment of $SAMPLE Trimmed sequences completed" -echo '~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~' -echo '~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~' +echo '~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~' +echo " Quality assessment of $SAMPLE trimmed sequences completed" +echo '~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~' +echo '~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~' diff --git a/QualAssessRawSeqs.bash b/QualAssessRawSeqs.bash index ff52832..1408181 100755 --- a/QualAssessRawSeqs.bash +++ b/QualAssessRawSeqs.bash @@ -33,65 +33,60 @@ fi #File input sequence information # file regex from Illumina 1.8+ (gunzip extracted): # ([0-9A-Za-z]{1,19})_S([0-9A-Za-z]{1,3})_L([0-9]{3})_R[12]_([0-9]{3}).fastq -SAMPLE=$(basename $1 | cut -d . -f 1 | cut -d _ -f 1,2) -SAMPLEL=$(basename $1) -SAMPLER=$(basename $2) +SAMPLE=$(basename $1 | cut -d . -f 1 | cut -d _ -f 1) #Filename handling -dIFS=$IFS #default IFS -IFS=$'' #change for newlines if [[ $1 == *.gz && $2 == *.gz ]]; then #both files are gunzipped - SAMPLEL=$(basename $SEQ1 .gz) - SAMPLER=$(basename $SEQ2 .gz) + SAMPLEL=$(basename $1 .gz) + SAMPLER=$(basename $2 .gz) echo "extracting the gunzipped file $1..." - gunzip -c $1 > "$3"/"$SAMPLEL" + gunzip -c "$1" > "$3"/"$SAMPLEL" echo "$1 was extracted..." - SEQ1=$("$3"/"$SAMPLEL") + SEQ1="$3"/"$SAMPLEL" echo "extracting the gunzipped file $2..." - gunzip -c $2 > "$3"/"$SAMPLER" + gunzip -c "$2" > "$3"/"$SAMPLER" echo "$2 was extracted..." - SEQ2=$("$3"/"$SAMPLER") + SEQ2="$3"/"$SAMPLER" elif [[ $1 == *q && $2 == *q ]]; then #both files are fastq format echo "both are FastQ format..." - SEQ1=$(echo $1) - SEQ2=$(echo $2) + SEQ1="$1" + SEQ2="$2" elif [[ $1 == *.gz && $2 == *q ]]; then - SAMPLEL=$(basename $SEQ1 .gz) + SAMPLEL=$(basename $1 .gz) echo "extracting the gunzipped file $1..." - gunzip -c $1 > "$3"/"$SAMPLEL" + gunzip -c "$1" > "$3"/"$SAMPLEL" echo "$1 was extracted..." - SEQ1=$("$3"/"$SAMPLEL") + SEQ1="$3"/"$SAMPLEL" SEQ2="$2" #$2 is still in fastq format elif [[ $1 == *q && $2 == *.gz ]]; then - SAMPLER=$(basename $SEQ2 .gz) + SAMPLER=$(basename $2 .gz) SEQ1="$1" #$1 is still in fastq format echo "extracting the gunzipped file $2..." - gunzip -c $2 > "$3"/"$SAMPLER" + gunzip -c "$2" > "$3"/"$SAMPLER" echo "$2 was extracted..." - SEQ2=$("$3"/"$SAMPLER") + SEQ2="$3"/"$SAMPLER" else usage exit 1 fi -IFS=$dIFS #back to default IFS wait -echo '~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~' -echo '~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~' -echo " $SAMPLEL and $SAMPLER sequences were read in..." -echo '~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~' +echo '~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~' +echo '~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~' +echo " $SEQ1 and $SEQ2 sequences were read in..." +echo '~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~' #Count total nucleotides TN1=$(awk 'BEGIN {SUM=0;} {if(NR%4==2) {SUM+=length($0);}} END {print SUM;}' $SEQ1) TN2=$(awk 'BEGIN {SUM=0;} {if(NR%4==2) {SUM+=length($0);}} END {print SUM;}' $SEQ2) BP_TOT="$(($TN1 + $TN2))" wait -echo '~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~' +echo '~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~' echo " counted the total nucleotides..." echo " Total: $BP_TOT" echo " R1: $TN1" echo " R2: $TN2" -echo '~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~' +echo '~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~' OPSYS=$(uname -s) @@ -118,13 +113,13 @@ wait Q20_TOT="$(($Q20L + $Q20R))" Q30_TOT="$(($Q30L + $Q30R))" wait -echo '~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~' +echo '~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~' echo ' counted the Q20+ and Q30+ nucleotides...' -echo " ${Q20L} nucleotides are >=Q20 in $SAMPLEL" -echo " ${Q30L} nucleotides are >=Q30 in $SAMPLEL" -echo " ${Q20R} nucleotides are >=Q20 in $SAMPLER" -echo " ${Q30R} nucleotides are >=Q30 in $SAMPLER" -echo '~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~' +echo " ${Q20L} nucleotides are >=Q20 in $SEQ1" +echo " ${Q30L} nucleotides are >=Q30 in $SEQ1" +echo " ${Q20R} nucleotides are >=Q20 in $SEQ2" +echo " ${Q30R} nucleotides are >=Q30 in $SEQ2" +echo '~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~' #Calculate percentages of >=Q20 and >=Q30 PQ20L="$(echo "scale=2;($Q20L/$TN1)*100" | bc)" @@ -132,13 +127,13 @@ PQ20R="$(echo "scale=2;($Q20R/$TN2)*100" | bc)" PQ30L="$(echo "scale=2;($Q30L/$TN1)*100" | bc)" PQ30R="$(echo "scale=2;($Q30R/$TN2)*100" | bc)" wait -echo '~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~' +echo '~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~' echo ' calculated the percentages of Q20+ and Q30+...' -echo " ${PQ20L}% nucleotides are >=Q20 in $SAMPLEL" -echo " ${PQ30L}% nucleotides are >=Q30 in $SAMPLEL" -echo " ${PQ20R}% nucleotides are >=Q20 in $SAMPLER" -echo " ${PQ30R}% nucleotides are >=Q30 in $SAMPLER" -echo '~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~' +echo " ${PQ20L}% nucleotides are >=Q20 in $SEQ1" +echo " ${PQ30L}% nucleotides are >=Q30 in $SEQ1" +echo " ${PQ20R}% nucleotides are >=Q20 in $SEQ2" +echo " ${PQ30R}% nucleotides are >=Q30 in $SEQ2" +echo '~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~' #Count number of reads NUMLN_L=$(wc -l $SEQ1| awk '{print $1}') @@ -148,20 +143,21 @@ NUM_L="$(($NUMLN_L / 4))" NUM_R="$(($NUMLN_R / 4))" READ_TOT="$(($NUM_L + NUM_R))" wait -echo '~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~' +echo '~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~' echo ' counted the number of reads...' -echo " There are $NUM_L reads in $SAMPLEL" -echo " There are $NUM_R reads in $SAMPLER" -echo '~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~' +echo " There are $NUM_L reads in $SEQ1" +echo " There are $NUM_R reads in $SEQ2" +echo '~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~' #Create results output file printf '%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n' \ Sample_Name Q20_Total_[bp] Q30_Total_[bp] Q20_R1_[bp] Q20_R2_[bp] Q20_R1_[%] Q20_R2_[%] Q30_R1_[bp] Q30_R2_[bp] Q30_R1_[%] Q30_R2_[%] Total_Sequenced_[bp] Total_Sequenced_[reads] NUMR1_[bp] NUMR2_[bp] \ -"$SAMPLE" "$Q20_TOT" "$Q30_TOT" "$Q20L" "$Q20R" "$PQ20L%" "$PQ20R%" "$Q30L" "$Q30R" "$PQ30L%" "$PQ30R%" "$BP_TOT" "$READ_TOT" "$NUM_L" "$NUM_R" > "$3"/QualAssessRawSeqs_"$SAMPLE"_results.tsv ; +"$SAMPLE" "$Q20_TOT" "$Q30_TOT" "$Q20L" "$Q20R" "$PQ20L%" "$PQ20R%" "$Q30L" "$Q30R" "$PQ30L%" "$PQ30R%" "$BP_TOT" "$READ_TOT" "$NUM_L" "$NUM_R" \ +> "$3"/QualAssessRawSeqs_"$SAMPLE"_results.tab ; wait -echo '~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~' +echo '~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~' echo ' created results output file...' -echo '~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~' +echo '~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~' if [[ "$OPSYS" == Linux ]]; then R1FILEPATH=$(readlink -m $1) @@ -171,14 +167,17 @@ elif [[ "$OPSYS" == Darwin ]]; then R2FILEPATH=$(echo `pwd`/`ls "$2"`) fi +[[ $1 == *.gz ]] && rm -f "$3"/"$SAMPLEL" +[[ $2 == *.gz ]] && rm -f "$3"/"$SAMPLER" + #Create log file printf "`date`\n$USER\n%s\n%s\n\n" \ "$R1FILEPATH" "$R2FILEPATH" \ > "$3"/QualAssessRawSeqs_"$SAMPLE"_results.log ; wait -echo '~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~' +echo '~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~' echo ' created log output file...' -echo '~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~' +echo '~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~' echo " Quality assessment of $SAMPLE raw sequences completed" -echo '~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~' -echo '~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~' +echo '~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~' +echo '~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~' From a048f989240d136149ce1f74348401df4adea8fc Mon Sep 17 00:00:00 2001 From: Christopher Gulvik Date: Thu, 10 Mar 2016 17:04:26 -0500 Subject: [PATCH 009/212] add example batch usage --- README.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/README.md b/README.md index dfa4270..9ff0269 100644 --- a/README.md +++ b/README.md @@ -1,2 +1,5 @@ # Misc Genomics Scripts - **filter_contigs.py**: cleans up a _de novo_ assembly from SPAdes, Velvet, or IDBA (requires biopython). IDBA includes space-delimited data in their contig headers, and because SeqIO parses on whitespace, these will need to be removed or replaced (e.g., `sed -i 's/ /|/g' assembly.fna`). SPAdes and Velvet lack whitespace in their contig deflines, so those output files can be directly fed into this filtering script. + + **Example batch usage**: + If there are many SPAdes assembly output directories beginning with 3009 that need filtering, first cd into the parent dir containing all of the 3009\* dirs and execute: `for F in 3009*/contigs.fasta; do B=$(dirname $F | awk -F \/ '{print $1}'); filter_contigs.py -i $F -g -m -c 5 -l 500 -o "$B".fna -p ~/SPAdes_Assems/filtered_contigs; done` This took me 1 min for 340 assemblies. From cfb2761dd733eeb00a84e4bc6c52522e8037ec72 Mon Sep 17 00:00:00 2001 From: Christopher Gulvik Date: Fri, 18 Mar 2016 17:25:18 -0400 Subject: [PATCH 010/212] duplicate finder bash script --- find_dupes.bash | 128 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 128 insertions(+) create mode 100755 find_dupes.bash diff --git a/find_dupes.bash b/find_dupes.bash new file mode 100755 index 0000000..d852595 --- /dev/null +++ b/find_dupes.bash @@ -0,0 +1,128 @@ +#!/bin/bash + +set -e +function usage() { + echo " + Given a FastA file, identifies repetitive regions, and outputs a BED file + + usage: `basename $0` -r reference.fasta -o maskedRegions.bed [-l ] [-i ] [-t ] + + -r [default: ./reference.fasta] FastA formatted input file + -o [default: ./maskedRegions.bed] BED formatted output file + -i [default: 99] Minimum percent identity required between two + repeat regions + -l [default: 249] Minimum repeat length to find (in bp) + -t [default: ./tmp/] Temporary directory where intermediate files + are written to and removed + + Paths can be absolute or relative. + " + } + +# Defaults +PERC_ID=99 +LEN=249 +OUTFILE="$PWD/maskedRegions.bed" +REF="$PWD/reference.fasta" +TMP="$PWD/tmp" + +nopts=$# +for ((i=1 ; i <= nopts ; i++)); do + case "$1" in + -i | --percid) + PERC_ID="$2" + echo " min percent identity required: $2" + shift 2 + ;; + -l | --len) + LEN="$2" + echo " min length: $2" + shift 2 + ;; + -o | --out) + OUTFILE="$2" + echo " output file: $2" + shift 2 + ;; + -r | --ref) + REF="$2" + echo " reference genome: $2" + shift 2 + ;; + -t | --temp) + TMP="$2" + echo " temporary directory: $2" + shift 2 + ;; + -h | --help) + usage + exit 1 + ;; + \?) + echo " ERROR: $2 is not a valid argument" + usage + exit 1 + ;; + esac +done + +# Input file requirements and dependency check +[[ ! -e "$REF" || ! -s "$REF" ]] && { echo "ERROR: $REF cannot be read"; exit 1; } +command -v nucmer >/dev/null 2>&1 || { echo 'ERROR: nucmer not found' >&2; exit 1; } +command -v show-coords >/dev/null 2>&1 || { echo 'ERROR: show-coords not found' >&2; exit 1; } + +# Create tmp dir (if absent) +if [ ! -d "$TMP" ]; then + mkdir -p "$TMP" +else + echo "WARNING: Temporary directory $TMP already exists." + echo 'Files present in the specified tmp dir might disrupt analysis.' +fi + +# Primary system commands +cd "$TMP" #cannot specific outpath in nucmer, so change into tmp dir +nucmer --nosimplify --maxmatch -p DUPES "$REF" "$REF" 2> /dev/null +show-coords -I "$PERC_ID" -L "$LEN" -r -l -o -T DUPES.delta > filtered.coords + +# Parse nucmer output into BED format +[[ -a DUPES.bed ]] && rm DUPES.bed +while IFS=$'\t' read -r -a line; do + if [ ${line[0]} -gt ${line[1]} ]; then + l0=${line[0]} + l1=${line[1]} + line[0]=$l1 + line[1]=$l0 + fi + if [ ${line[2]} -gt ${line[3]} ]; then + l2=${line[2]} + l3=${line[3]} + line[2]=$l3 + line[3]=$l2 + fi + echo -e "${line[9]}\t${line[0]}\t${line[1]}" >> DUPES.bed + echo -e "${line[9]}\t${line[2]}\t${line[3]}" >> DUPES.bed +done < <(grep '^[1-9]' filtered.coords) + +sort -t$'\t' -k2 -n DUPES.bed > DUPES_sorted.bed +awk '{if(++dup[$0]==1) {print $0}}' DUPES_sorted.bed > DUPES_sorted_deduped.bed + +# Remove full-length chromosome matches; handles >1 contig +while read -r -a deflines; do + CHROM_ID=${deflines[1]} + END_POS=${deflines[3]} + # awk -v chrom="${CHROM_ID}" -v end="${END_POS}" '!/chrom\t1\tend/' DUPES_sorted_deduped.bed > tmp.bed + # mv tmp.bed DUPES_sorted_deduped.bed + TAB=$'\t' + sed -i "/${CHROM_ID}${TAB}1${TAB}${END_POS}/d" DUPES_sorted_deduped.bed +done < <(grep '^>' DUPES.delta) + +# Merge common overlapping regions +bedtools merge -i DUPES_sorted_deduped.bed > maskedRegions.bed + +REGIONS=$(wc -l maskedRegions.bed | awk '{print $1}') +SITES=$(awk -F'\t' 'BEGIN{SUM=0}; {SUM+=$3-$2}; END{print SUM}' maskedRegions.bed) +echo "found $REGIONS regions and $SITES total sites to mask" + +# Cleanup +mv -v maskedRegions.bed "$OUTFILE" +rm -r "$TMP" From e7d481b8cc289912d8917b4baff8cf0069a9fb75 Mon Sep 17 00:00:00 2001 From: Christopher Gulvik Date: Wed, 20 Apr 2016 11:44:04 -0400 Subject: [PATCH 011/212] splits up nucleotide seqs and optionally removes gaps --- split_multifasta.py | 42 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) create mode 100755 split_multifasta.py diff --git a/split_multifasta.py b/split_multifasta.py new file mode 100755 index 0000000..1080814 --- /dev/null +++ b/split_multifasta.py @@ -0,0 +1,42 @@ +#!/usr/bin/env python + +import argparse +import os +import sys +from Bio import SeqIO +from Bio.Alphabet import generic_dna +from Bio.Seq import Seq +from Bio.SeqRecord import SeqRecord + +def parseArgs(): + parser = argparse.ArgumentParser( + description='splits multi-FastA file into individual FastA files') + parser.add_argument('-i', '--infile', + required=True, help='input multi-FastA file') + parser.add_argument('-o', '--outdir', + help='output directory path [cwd of input file]') + parser.add_argument('-g', '--nogaps', action='store_true', default=False, + help='toggle on removal of \'~\' and \'-\' gaps [off]') + return parser.parse_args() + +def main(): + args = parseArgs() + infile = args.infile + outdir = args.outdir + + if outdir is None: + outdir = os.path.dirname(infile) + + mfasta = SeqIO.parse(infile, 'fasta') + + for rec in mfasta: + seq = str(rec.seq).upper() + if args.nogaps: + seq = seq.replace('-', '').replace('~', '') + name = (rec.id).split()[0] + new_rec = SeqRecord(Seq(seq, generic_dna), id=name, description='') + SeqIO.write(new_rec, '{}.fasta'.format(name), 'fasta') + + +if __name__ == '__main__': + main() From 55e54169e7b3cb2d0650281ce12540e8e67dbbac Mon Sep 17 00:00:00 2001 From: Christopher Gulvik Date: Mon, 2 May 2016 11:56:08 -0400 Subject: [PATCH 012/212] quickly calc %mapped metric for entire dir of BAMs --- calc_percent_reads_mapped.bash | 61 ++++++++++++++++++++++++++++++++++ 1 file changed, 61 insertions(+) create mode 100755 calc_percent_reads_mapped.bash diff --git a/calc_percent_reads_mapped.bash b/calc_percent_reads_mapped.bash new file mode 100755 index 0000000..ea0c14d --- /dev/null +++ b/calc_percent_reads_mapped.bash @@ -0,0 +1,61 @@ +#!/bin/bash + +function usage { + echo " + Usage: `basename $0` -b /InputPath/bam-dir [-o ./stats.mapping.tab] + " + } + +nopts=$# +for ((i=1 ; i <= nopts ; i++)); do + case $1 in + -b | --bams) # dir of BAM files + BAMDIR=$2 + echo " calculating mapping statistics on BAMs in: $2" + shift 2 + ;; + -o | --out) # output file (optionally with filepath) + OUT=$2 + echo " output file saved as: $2" + shift 2 + ;; + -h | --help) + usage + exit 1 + ;; + esac +done + +# depend checks +[[ -z $BAMDIR ]] && { usage; exit 1; } +command -v bamtools >/dev/null 2>&1 || { echo 'ERROR: bamtools not found' >&2; exit 1; } + +# default output +[[ -z $OUT ]] && OUT='stats.mapping.tab' + +# Create output path and output summary file +DIR=$(dirname "$OUT") +if [ ! -d "$DIR" ]; then + mkdir -p "$DIR" + echo ' Created output dir...' +else + echo "WARNING: Output dir $DIR already exists. Competing files will be overwritten." +fi +echo -e "Sample\tReads_Mapped[%]" > "$OUT" # overwrite to avoid appending data to different projects (if script exec >1 with same outfile) + +for bam in "$BAMDIR"/*.bam; do + b=$(basename "$bam") + bamtools stats -in "$bam" > "$DIR"/tmpstatsfull."$b".tmp + + grep 'Mapped reads' tmpstatsfull."$b".tmp | awk 'BEGIN {FS="\t"}; {print $2}' | sed 's/[()]//g' > "$DIR"/tmpstatsperc."$b".tmp + rm "$DIR"/tmpstatsfull."$b".tmp + + if [ -s tmpstatsperc."$b".tmp ]; then # percent was extracted + p=$(cat "$DIR"/tmpstatsperc."$b".tmp) + echo -e "$b\t$p" >> "$OUT" + else # no percent avail + echo -e "$b\tError" >> "$OUT" + fi + rm "$DIR"/tmpstatsperc."$b".tmp +done + From 04206a41fe1083616d25281d9182d4617abc8525 Mon Sep 17 00:00:00 2001 From: Christopher Gulvik Date: Mon, 2 May 2016 12:04:22 -0400 Subject: [PATCH 013/212] calc core SNPs per ref genome size, as percentage --- calc_percent_core_genome_SNPs.bash | 41 ++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) create mode 100755 calc_percent_core_genome_SNPs.bash diff --git a/calc_percent_core_genome_SNPs.bash b/calc_percent_core_genome_SNPs.bash new file mode 100755 index 0000000..be28652 --- /dev/null +++ b/calc_percent_core_genome_SNPs.bash @@ -0,0 +1,41 @@ +#!/bin/bash + + +function usage { + echo " + Usage: `basename $0` -c coreSNPs.fasta -r reference.fasta + " + } + +nopts=$# +for ((i=1 ; i <= nopts ; i++)); do + case $1 in + -c | --core) # FastA format + CORESNPS=$2 + echo " core SNP file: $2" + shift 2 + ;; + -r | --ref) # FastA format + REF=$2 + echo " ref genome file: $2" + shift 2 + ;; + -h | --help) + usage + exit 1 + ;; + esac +done + +[[ -z $CORESNPS || -z $REF ]] && { usage; exit 1; } + +REF_GENOME=$(grep -v '^>' $REF | tr -d [:space:] | wc -c) +echo " found $REF_GENOME sites in the reference genome" + +TOT_SITES=$(grep -v '^>' $CORESNPS | tr -d [:space:] | wc -c) +TOT_SMPL=$(grep -c '^>' $CORESNPS) +SITES_PER_SMPL=$((TOT_SITES / TOT_SMPL)) +echo " found $SITES_PER_SMPL sites in each sample in $CORESNPS" + +CORE_PER_REF=$(echo "scale=4;($SITES_PER_SMPL/$REF_GENOME)*100" | bc) +echo -e "\n ${CORE_PER_REF}% of the reference genome was identified as core sites\n" From 00e2e2b3e12709ae9f14cd78932e26bbac7f2619 Mon Sep 17 00:00:00 2001 From: Christopher Gulvik Date: Wed, 4 May 2016 18:35:46 -0400 Subject: [PATCH 014/212] old example of just read cleaning and assembly --- SolexaQA_and_SPAdes.sh | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) create mode 100755 SolexaQA_and_SPAdes.sh diff --git a/SolexaQA_and_SPAdes.sh b/SolexaQA_and_SPAdes.sh new file mode 100755 index 0000000..1984592 --- /dev/null +++ b/SolexaQA_and_SPAdes.sh @@ -0,0 +1,31 @@ +#!/bin/sh + + +[[ "$1" == "" || "$1" == "--help" || "$1" == "-h" ]] && {echo 'Proj dir required as input'; exit 1;} + +# Setup Proj where RawFQs already exist +mkdir -p "$1"/{TrimFQs,Asm} +cd "$1"/RawFQs + +# Qual Trim +for i in *_L001_R1_001.fastq; do + b=$(basename $i _R1_001.fastq); + SolexaQA++ dynamictrim $b_L001_R1_001.fastq $b_L001_R2_001.fastq -d ../TrimFQs --sanger -h 20; + SolexaQA++ lengthsort ../TrimFQs/$b_L001_R1_001.fastq.trimmed ../TrimFQs/$b_L001_R2_001.fastq.trimmed -d ../TrimFQs -l 50; +done + +# Merge PDF summaries of trimming +cd ../TrimFQs +gs -dBATCH -dNOPAUSE -sDEVICE=pdfwrite -dPDFSETTINGS=/prepress -q -sOutputFile=Summary_Q20_Trim.pdf *.fastq_trimmed.segments_hist.pdf +gs -dBATCH -dNOPAUSE -sDEVICE=pdfwrite -dPDFSETTINGS=/prepress -q -sOutputFile=Summary_Q20_Trim.Sort.pdf *.fastq.trimmed.summary.txt.pdf +rm *.fastq_trimmed.segments_hist.pdf *.fastq.trimmed.summary.txt.pdf + +# Rename file extensions because my QA script && SPAdes mandate FQ or FASTQ extension +rename "s/\.fastq\.trimmed\.paired/\.trimmed\.paired\.fq/" *.fastq.trimmed.paired +rename "s/\.fastq\.trimmed\.single/\.trimmed\.single\.fq/" *.fastq.trimmed.single + +# Assemble +for i in *.trimmed.single.fq; do + b=$(basename $i _L001_R1_001.trimmed.single.fq); + spades.py -k 41,79,85,97 --careful --phred-offset 33 --pe1-1 "$b"_L001_R1_001.trimmed.paired.fq --pe1-2 "$b"_L001_R2_001.trimmed.paired.fq --pe1-s "$b"_L001_R1_001.trimmed.single.fq -o ../Asm/$b -t 30; +done From 57e190b1d06923e09576b1d3e06d8918662be82d Mon Sep 17 00:00:00 2001 From: Chris Gulvik Date: Wed, 4 May 2016 22:06:16 -0400 Subject: [PATCH 015/212] quick script for nucl freqs --- calc_ATCG_content.bash | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) create mode 100644 calc_ATCG_content.bash diff --git a/calc_ATCG_content.bash b/calc_ATCG_content.bash new file mode 100644 index 0000000..7cb907f --- /dev/null +++ b/calc_ATCG_content.bash @@ -0,0 +1,32 @@ +#!/bin/bash + +function usage { + echo " + Usage: `basename $0` /InputPath/input.fasta + " + } + +# check for infile +[[ "$1" == "" || "$1" == "--help" || "$1" == "-h" ]] && { usage; exit 1;} +if test -f "$1" -a -r "$1"; then + GENOME=$1 # input FastA file +else + usage; exit 1; +fi + +# Calculate full ATCG content +grep -v '^>' "$GENOME" | tr -d [:space:] > /tmp/sequence.txt +TOT_NUM_A=$(grep -o 'A' /tmp/sequence.txt | wc -l) +TOT_NUM_T=$(grep -o 'T' /tmp/sequence.txt | wc -l) +TOT_NUM_C=$(grep -o 'C' /tmp/sequence.txt | wc -l) +TOT_NUM_G=$(grep -o 'G' /tmp/sequence.txt | wc -l) +TOT_NUM_N=$(grep -o 'N' /tmp/sequence.txt | wc -l) +TOT_NUM_GAP=$(grep -o '-' /tmp/sequence.txt | wc -l) +rm /tmp/sequence.txt + +echo "A: $TOT_NUM_A +T: $TOT_NUM_T +C: $TOT_NUM_C +G: $TOT_NUM_G +N: $TOT_NUM_N +gaps: $TOT_NUM_GAP" From 348a51b8eede7a8d8881851fbaaa3b18752e2f08 Mon Sep 17 00:00:00 2001 From: Chris Gulvik Date: Wed, 4 May 2016 22:08:57 -0400 Subject: [PATCH 016/212] just calc nucl freqs, gaps, and Ns --- calc_ATCG_content.bash | 0 1 file changed, 0 insertions(+), 0 deletions(-) mode change 100644 => 100755 calc_ATCG_content.bash diff --git a/calc_ATCG_content.bash b/calc_ATCG_content.bash old mode 100644 new mode 100755 From 6096d320eac07b0d668d2b9f8e5e18472f2e6b90 Mon Sep 17 00:00:00 2001 From: Christopher Gulvik Date: Mon, 23 May 2016 12:39:49 -0400 Subject: [PATCH 017/212] save disk space before backups by removing intermediate SPAdes files and dirs --- prune_SPAdes_assembly_dirs.bash | 14 ++++++++++++++ 1 file changed, 14 insertions(+) create mode 100755 prune_SPAdes_assembly_dirs.bash diff --git a/prune_SPAdes_assembly_dirs.bash b/prune_SPAdes_assembly_dirs.bash new file mode 100755 index 0000000..00260b3 --- /dev/null +++ b/prune_SPAdes_assembly_dirs.bash @@ -0,0 +1,14 @@ +#!/bin/bash + +# recursively searches within input dir to remove most dirs and files SPAdes creates +# but maintains essential log files, contigs, and scaffolds FastA files + +# usage: bash ~/scripts/prune_SPAdes_assembly_dirs.bash + +mapfile -t asm_dirs < <(find -L "$1" -type d -regextype posix-extended -regex '\/.*\/K[1-9][0-9]{1,2}' -print | xargs dirname | uniq) + +for directory in "${asm_dirs[@]}"; do + rm -v "$directory"/{assembly_graph.fastg,before_rr.fasta,contigs.paths,dataset.info,input_dataset.yaml,scaffolds.paths}; +done + +find -L "$1" -type d -regextype posix-extended -regex '\/.*\/(K[1-9][0-9]{1,2}|misc|tmp)' -print | xargs rm -rv From e8c9a1cf88cfa83c60af7bedee1df787eacd3cc3 Mon Sep 17 00:00:00 2001 From: Christopher Gulvik Date: Mon, 23 May 2016 17:43:16 -0400 Subject: [PATCH 018/212] depend on bedtools too --- find_dupes.bash | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/find_dupes.bash b/find_dupes.bash index d852595..59909a8 100755 --- a/find_dupes.bash +++ b/find_dupes.bash @@ -1,7 +1,7 @@ #!/bin/bash set -e -function usage() { +function usage { echo " Given a FastA file, identifies repetitive regions, and outputs a BED file @@ -70,6 +70,7 @@ done [[ ! -e "$REF" || ! -s "$REF" ]] && { echo "ERROR: $REF cannot be read"; exit 1; } command -v nucmer >/dev/null 2>&1 || { echo 'ERROR: nucmer not found' >&2; exit 1; } command -v show-coords >/dev/null 2>&1 || { echo 'ERROR: show-coords not found' >&2; exit 1; } +command -v bedtools >/dev/null 2>&1 || { echo 'ERROR: bedtools not found' >&2; exit 1; } # Create tmp dir (if absent) if [ ! -d "$TMP" ]; then From 9898a6cbb316a1c5ad40c8e86596b374bd4b2167 Mon Sep 17 00:00:00 2001 From: Christopher Gulvik Date: Mon, 23 May 2016 17:45:39 -0400 Subject: [PATCH 019/212] update examples for dupe-finder and cleaning up env from BLAST and SPAdes files --- README.md | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/README.md b/README.md index 9ff0269..3d3bf73 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,15 @@ # Misc Genomics Scripts +- **calc_ATCG_content.bash**: a unix way to quickly get A,T,C,G,N,- content from a FastA file; handles linewraps and multiple records + - **filter_contigs.py**: cleans up a _de novo_ assembly from SPAdes, Velvet, or IDBA (requires biopython). IDBA includes space-delimited data in their contig headers, and because SeqIO parses on whitespace, these will need to be removed or replaced (e.g., `sed -i 's/ /|/g' assembly.fna`). SPAdes and Velvet lack whitespace in their contig deflines, so those output files can be directly fed into this filtering script. **Example batch usage**: If there are many SPAdes assembly output directories beginning with 3009 that need filtering, first cd into the parent dir containing all of the 3009\* dirs and execute: `for F in 3009*/contigs.fasta; do B=$(dirname $F | awk -F \/ '{print $1}'); filter_contigs.py -i $F -g -m -c 5 -l 500 -o "$B".fna -p ~/SPAdes_Assems/filtered_contigs; done` This took me 1 min for 340 assemblies. + +- **find_dupes.bash**: given a FastA file, identifies repetitive regions, and outputs a BED file; flexible opts for defining repetitive sites; depends on BEDTools and MUMmer (nucmer) + +- ** + +#### Cleaning up disk space +- BLAST searching requires index files that can be easily and quickly re-generated, so remove all leftover binary files within the Desktop dir: `find ~/Desktop -type f -regextype posix-extended -regex '.*\.(nih|nin|nsq|psi|psq)' -print | xargs rm -v` +- SPAdes keeps a lot of intermediate files, so delete these but keep essential log and FastA files to repeat the assembly if necessary: `bash ~/genomics_scripts/prune_SPAdes_assembly_dirs.bash ~/Desktop` From a3be48c35e5af472627446c938fcc6c710cbb60a Mon Sep 17 00:00:00 2001 From: Christopher Gulvik Date: Mon, 23 May 2016 17:47:12 -0400 Subject: [PATCH 020/212] stylistic fmt change --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 3d3bf73..a466f5c 100644 --- a/README.md +++ b/README.md @@ -8,8 +8,8 @@ - **find_dupes.bash**: given a FastA file, identifies repetitive regions, and outputs a BED file; flexible opts for defining repetitive sites; depends on BEDTools and MUMmer (nucmer) -- ** +*** -#### Cleaning up disk space +# Cleaning up disk space - BLAST searching requires index files that can be easily and quickly re-generated, so remove all leftover binary files within the Desktop dir: `find ~/Desktop -type f -regextype posix-extended -regex '.*\.(nih|nin|nsq|psi|psq)' -print | xargs rm -v` - SPAdes keeps a lot of intermediate files, so delete these but keep essential log and FastA files to repeat the assembly if necessary: `bash ~/genomics_scripts/prune_SPAdes_assembly_dirs.bash ~/Desktop` From d653bd2ca759a786bd2ffd36b0d817124c640a4a Mon Sep 17 00:00:00 2001 From: Christopher Gulvik Date: Wed, 25 May 2016 14:51:31 -0400 Subject: [PATCH 021/212] expand blast files --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index a466f5c..f7c6347 100644 --- a/README.md +++ b/README.md @@ -11,5 +11,5 @@ *** # Cleaning up disk space -- BLAST searching requires index files that can be easily and quickly re-generated, so remove all leftover binary files within the Desktop dir: `find ~/Desktop -type f -regextype posix-extended -regex '.*\.(nih|nin|nsq|psi|psq)' -print | xargs rm -v` +- BLAST searching requires index files that can be easily and quickly re-generated, so remove all leftover binary files within the Desktop dir: `find ~/Desktop -type f -regextype posix-extended -regex '.*\.(nhr|nih|nin|nog|nsd|nsi|nsq|psi|psq)' -print | xargs rm -v` - SPAdes keeps a lot of intermediate files, so delete these but keep essential log and FastA files to repeat the assembly if necessary: `bash ~/genomics_scripts/prune_SPAdes_assembly_dirs.bash ~/Desktop` From 9e1975dc41edf77694a82d2f1496f7fc9a08d753 Mon Sep 17 00:00:00 2001 From: Chris Gulvik Date: Thu, 9 Jun 2016 18:13:07 -0400 Subject: [PATCH 022/212] download FastQ files from NCBI from BioSample Accessions --- README.md | 3 +++ biosample2FastQ.py | 28 ++++++++++++++++++++++++++++ 2 files changed, 31 insertions(+) create mode 100755 biosample2FastQ.py diff --git a/README.md b/README.md index f7c6347..02ebab4 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,6 @@ # Misc Genomics Scripts +- **biosample2FastQ.py**: download FastQ read files from NCBI for a given biosample (or SRR) accession + - **calc_ATCG_content.bash**: a unix way to quickly get A,T,C,G,N,- content from a FastA file; handles linewraps and multiple records - **filter_contigs.py**: cleans up a _de novo_ assembly from SPAdes, Velvet, or IDBA (requires biopython). IDBA includes space-delimited data in their contig headers, and because SeqIO parses on whitespace, these will need to be removed or replaced (e.g., `sed -i 's/ /|/g' assembly.fna`). SPAdes and Velvet lack whitespace in their contig deflines, so those output files can be directly fed into this filtering script. @@ -8,6 +10,7 @@ - **find_dupes.bash**: given a FastA file, identifies repetitive regions, and outputs a BED file; flexible opts for defining repetitive sites; depends on BEDTools and MUMmer (nucmer) + *** # Cleaning up disk space diff --git a/biosample2FastQ.py b/biosample2FastQ.py new file mode 100755 index 0000000..809640e --- /dev/null +++ b/biosample2FastQ.py @@ -0,0 +1,28 @@ +#!/usr/bin/env python + +import os +import sys + +# Downloads raw FastQ reads given a BioSample accession (SAMN########) +# or SRA Run sccession (SRR#######) +# Depends on: +# entrez direct scripts +# a SRA toolkit binary + +os.system('esearch -db sra -query {} | efetch -format docsum | ' + 'xtract -element Runs > /tmp/run_info'.format(sys.argv[1])) + +download = [] +with open('/tmp/run_info', 'r') as run_info: + for l in run_info: + if 'SRR' in l: + SRR = l.lstrip(' Date: Sat, 11 Jun 2016 00:34:30 -0400 Subject: [PATCH 023/212] add version opt --- QualAssessCleanSeqs.bash | 3 +++ QualAssessRawSeqs.bash | 3 +++ 2 files changed, 6 insertions(+) diff --git a/QualAssessCleanSeqs.bash b/QualAssessCleanSeqs.bash index f5af3ed..3dd309f 100755 --- a/QualAssessCleanSeqs.bash +++ b/QualAssessCleanSeqs.bash @@ -16,6 +16,9 @@ usage() { if [[ "$1" == "" || "$1" == "--help" || "$1" == "-h" ]]; then usage exit 1 +elif [[ "$1" == "-v" || "$1" == "--version" ]]; then + echo 'v1.1' + exit 0 fi #Require 4 arguments diff --git a/QualAssessRawSeqs.bash b/QualAssessRawSeqs.bash index 1408181..425a9e1 100755 --- a/QualAssessRawSeqs.bash +++ b/QualAssessRawSeqs.bash @@ -16,6 +16,9 @@ usage() { if [[ "$1" == "" || "$1" == "--help" || "$1" == "-h" ]]; then usage exit 1 +elif [[ "$1" == "-v" || "$1" == "--version" ]]; then + echo '1.1' + exit 0 fi #Require 3 arguments From f0f4b139ac3053365119708cf9a114514df825c9 Mon Sep 17 00:00:00 2001 From: Chris Gulvik Date: Fri, 17 Jun 2016 15:29:08 -0400 Subject: [PATCH 024/212] genome to protein MWs --- gbk2proteome-molec-weights.py | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) create mode 100755 gbk2proteome-molec-weights.py diff --git a/gbk2proteome-molec-weights.py b/gbk2proteome-molec-weights.py new file mode 100755 index 0000000..05f8b3c --- /dev/null +++ b/gbk2proteome-molec-weights.py @@ -0,0 +1,24 @@ +#!/usr/bin/env python + +# Given a genbank file, prints to stdout a tab-delimited summary +# of the proteome including the molecular weight in Daltons +# Usage: python script.py + +from Bio import SeqIO +from Bio import SeqUtils +from decimal import Decimal, ROUND_DOWN, ROUND_UP +import os +import sys + +infile = SeqIO.parse(sys.argv[1], 'genbank') +filename = os.path.splitext(os.path.splitext(os.path.basename(sys.argv[1]))[0])[0] + +for rec in infile: + for s in rec.features: + if (s.type == 'CDS' and 'gene') in s.qualifiers: + gene = s.qualifiers['gene'][0] + locus = s.qualifiers['locus_tag'][0] + product = s.qualifiers['product'][0] + protein_mw = Decimal(SeqUtils.molecular_weight(s.qualifiers['translation'][0], + seq_type='protein')).quantize(Decimal('.001'), rounding=ROUND_UP) + print '{}\t{}\t{}\t{}'.format(gene,locus,product,protein_mw) From d186d5263b588a9d5a13b9a52fe385d7307b554c Mon Sep 17 00:00:00 2001 From: Chris Gulvik Date: Fri, 17 Jun 2016 16:43:26 -0400 Subject: [PATCH 025/212] when ambiguous amino acid found, estimate MW and indicate with ~ as prefix to MW --- gbk2proteome-molec-weights.py | 29 +++++++++++++++++++++-------- 1 file changed, 21 insertions(+), 8 deletions(-) diff --git a/gbk2proteome-molec-weights.py b/gbk2proteome-molec-weights.py index 05f8b3c..62edd17 100755 --- a/gbk2proteome-molec-weights.py +++ b/gbk2proteome-molec-weights.py @@ -1,24 +1,37 @@ #!/usr/bin/env python # Given a genbank file, prints to stdout a tab-delimited summary -# of the proteome including the molecular weight in Daltons +# of the proteome including the molecular weight in Daltons, where +# '~' indicates the MW is estimated due to ambiguous amino acid(s) + # Usage: python script.py from Bio import SeqIO from Bio import SeqUtils -from decimal import Decimal, ROUND_DOWN, ROUND_UP +from decimal import Decimal, ROUND_UP import os +import string import sys -infile = SeqIO.parse(sys.argv[1], 'genbank') +def calc_molec_weight(protein_seq): + return Decimal(SeqUtils.molecular_weight(protein_seq, seq_type='protein') + ).quantize(Decimal('.001'), rounding=ROUND_UP) + +infile = SeqIO.parse(sys.argv[1], 'genbank') filename = os.path.splitext(os.path.splitext(os.path.basename(sys.argv[1]))[0])[0] for rec in infile: for s in rec.features: if (s.type == 'CDS' and 'gene') in s.qualifiers: - gene = s.qualifiers['gene'][0] - locus = s.qualifiers['locus_tag'][0] - product = s.qualifiers['product'][0] - protein_mw = Decimal(SeqUtils.molecular_weight(s.qualifiers['translation'][0], - seq_type='protein')).quantize(Decimal('.001'), rounding=ROUND_UP) + gene = s.qualifiers['gene'][0] + locus = s.qualifiers['locus_tag'][0] + product = s.qualifiers['product'][0] + protein = s.qualifiers['translation'][0] + if 'X' in protein: + num_X = protein.count('X') + estimated_mw = Decimal(num_X) * Decimal('128.16') #UNK=(C5)(H6)(N1)(O3)=128.16 + peptide_mw = calc_molec_weight(protein.replace('X', '')) + protein_mw = '~' + str(peptide_mw + estimated_mw) + else: + protein_mw = calc_molec_weight(protein) print '{}\t{}\t{}\t{}'.format(gene,locus,product,protein_mw) From c3917f67eb6256ca29457a26c0183001611bd556 Mon Sep 17 00:00:00 2001 From: Chris Gulvik Date: Tue, 21 Jun 2016 19:01:08 -0400 Subject: [PATCH 026/212] get a single GenBank file given accession(s) --- genbankacc2gbk.py | 68 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 68 insertions(+) create mode 100755 genbankacc2gbk.py diff --git a/genbankacc2gbk.py b/genbankacc2gbk.py new file mode 100755 index 0000000..41693af --- /dev/null +++ b/genbankacc2gbk.py @@ -0,0 +1,68 @@ +#!/usr/bin/env python + + +import os +import re +import shutil +import sys +from Bio import Entrez + +def multi_gbk(acc_list, gbk): + # time.sleep(3) #be nice to NCBI + # fetched = Entrez.efetch(db='nucleotide', id=acc_list, rettype='gbwithparts', retmode='text').read() + # for record in fetched.split('//\n'): + # if wgs_acc_list[0] not in record: + # sys.exit('ERROR: improperly retrieved accession {}'.format(wgs_acc_list[0])) + # del wgs_acc_list[0] + # with open(gbk, 'w') as of: + # of.write(fetched) + for acc in acc_list: + url = 'http://www.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nucleotide&id={}&rettype=gbwithparts&retmode=gb'.format(acc) + os.system('wget --no-glob -O {} {}'.format(acc + '.gbk.tmp', url)) + if acc not in open(acc + '.gbk.tmp', 'r').read(): + sys.exit('ERROR: unable to find {} in retrieved gbk.tmp file'.format(acc)) + with open(gbk, 'w') as outfile: + for f in [s + '.gbk.tmp' for s in acc_list]: + with open(f, 'r') as contig: + shutil.copyfileobj(contig, outfile) + os.remove(f) + if len(fetched) < 7500: + sys.exit('ERROR: suspiciously short GenBank record') + +def get_gbk(accession, gbk): + fetched = Entrez.efetch(db='nucleotide', id=accession, rettype='gbwithparts', retmode='text').read() + if accession not in fetched: + sys.exit('ERROR: unable to retrieve accession number {}'.format(accession)) + if 'WGS ' in fetched: #entry is the master record for a whole genome shotgun sequencing project and contains no sequence data + for line in fetched.split('\n'): + if line.startswith('WGS '): + wgs_accs = line.lstrip('WGS ').split('-') + wgs_acc_pref_first = re.sub('\d', '', wgs_accs[0]) + wgs_acc_pref_last = re.sub('\d', '', wgs_accs[1]) + if wgs_acc_pref_first != wgs_acc_pref_last: + sys.exit('ERROR: unable to parse WGS info') + wgs_acc_suff_first = int(re.sub('\D', '', wgs_accs[0])) + wgs_acc_suff_last = int(re.sub('\D', '', wgs_accs[1])) + acc_suffs = range(wgs_acc_suff_first, wgs_acc_suff_last, 1) + wgs_acc_list = [wgs_acc_pref_first + str(s) for s in acc_suffs] + print '\t{} accession is an incomplete chromosome.\n\tRetrieving {} individual contigs...'.format(accession, str(len(wgs_acc_list))) + multi_gbk(wgs_acc_list, gbk) + elif len(fetched) < 7500: + sys.exit('{}\nERROR: suspiciously short record for {}'.format(fetched, accession)) + else: + with open(gbk, 'w') as of: + of.write(fetched) + print '\tfound {}'.format(accession) + +def main(): + Entrez.email = 'yourname@univ.edu' + if len(sys.argv) == 2: + get_gbk(sys.argv[1], '{}.gbk'.format(sys.argv[1])) + elif len(sys.argv) > 2: + accs = [acc for acc in sys.argv[1:]] + multi_gbk(wgs_acc_list, '{}.gbk'.format(','.join(accs))) + else: + sys.exit('Usage: genbankacc2gbk.py [additional GenBank accessions...]') + +if __name__ == '__main__': + main() \ No newline at end of file From 7e8b71a1f05864ffcb634d96c7d6d3bcdfa5dfa3 Mon Sep 17 00:00:00 2001 From: Chris Gulvik Date: Thu, 23 Jun 2016 09:48:08 -0400 Subject: [PATCH 027/212] extract a protein seq from genbank given locus_tag --- locus_tag2faa.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) create mode 100755 locus_tag2faa.py diff --git a/locus_tag2faa.py b/locus_tag2faa.py new file mode 100755 index 0000000..adb12b4 --- /dev/null +++ b/locus_tag2faa.py @@ -0,0 +1,17 @@ +#!/usr/bin/env python + + +# Usage: python script.py + +from Bio import SeqIO +import sys + +infile = SeqIO.parse(sys.argv[1], 'genbank') +want = str(sys.argv[2]) + +for record in infile: + for s in record.features: + if 'locus_tag' and 'translation' and 'protein_id' and 'product' in s.qualifiers: + locus_record = s.qualifiers['locus_tag'][0] + if locus_record == want: + print '>' + s.qualifiers['locus_tag'][0] + '|' + s.qualifiers['protein_id'][0] + '|' + s.qualifiers['product'][0] + '\n' + s.qualifiers['translation'][0] From b3ac39a16513c9874c7ad1f9ea52452f0926eb8c Mon Sep 17 00:00:00 2001 From: Chris Gulvik Date: Thu, 23 Jun 2016 13:07:03 -0400 Subject: [PATCH 028/212] now adds \"CONTIG\" line when \>1 record in GBK output file --- genbankacc2gbk.py | 43 ++++++++++++++++++++----------------------- 1 file changed, 20 insertions(+), 23 deletions(-) diff --git a/genbankacc2gbk.py b/genbankacc2gbk.py index 41693af..ee41307 100755 --- a/genbankacc2gbk.py +++ b/genbankacc2gbk.py @@ -3,31 +3,26 @@ import os import re -import shutil import sys +import time from Bio import Entrez def multi_gbk(acc_list, gbk): - # time.sleep(3) #be nice to NCBI - # fetched = Entrez.efetch(db='nucleotide', id=acc_list, rettype='gbwithparts', retmode='text').read() - # for record in fetched.split('//\n'): - # if wgs_acc_list[0] not in record: - # sys.exit('ERROR: improperly retrieved accession {}'.format(wgs_acc_list[0])) - # del wgs_acc_list[0] - # with open(gbk, 'w') as of: - # of.write(fetched) - for acc in acc_list: - url = 'http://www.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nucleotide&id={}&rettype=gbwithparts&retmode=gb'.format(acc) - os.system('wget --no-glob -O {} {}'.format(acc + '.gbk.tmp', url)) - if acc not in open(acc + '.gbk.tmp', 'r').read(): - sys.exit('ERROR: unable to find {} in retrieved gbk.tmp file'.format(acc)) - with open(gbk, 'w') as outfile: - for f in [s + '.gbk.tmp' for s in acc_list]: - with open(f, 'r') as contig: - shutil.copyfileobj(contig, outfile) - os.remove(f) + fetched = Entrez.efetch(db='nucleotide', id=acc_list, rettype='gbwithparts', retmode='text').read().rstrip('\n') + print 'found {} GenBank records'.format(str(len(fetched.split('//\n\n')))) if len(fetched) < 7500: sys.exit('ERROR: suspiciously short GenBank record') + contigs = [] + for record in fetched.split('\n\n'): + if acc_list[0] not in record: + sys.exit('ERROR: improperly retrieved accession {}'.format(acc_list[0])) + del acc_list[0] + contig = record.rstrip('\n').split('\n') + i = contig.index('ORIGIN ') + contig.insert(i, 'CONTIG') #adding this line to multigenbank style enables biopython to parse in seqio as gb + contigs.extend(contig) + with open(gbk, 'w') as of: + of.write('\n'.join(s for s in contigs)) def get_gbk(accession, gbk): fetched = Entrez.efetch(db='nucleotide', id=accession, rettype='gbwithparts', retmode='text').read() @@ -44,8 +39,10 @@ def get_gbk(accession, gbk): wgs_acc_suff_first = int(re.sub('\D', '', wgs_accs[0])) wgs_acc_suff_last = int(re.sub('\D', '', wgs_accs[1])) acc_suffs = range(wgs_acc_suff_first, wgs_acc_suff_last, 1) - wgs_acc_list = [wgs_acc_pref_first + str(s) for s in acc_suffs] + wgs_acc_list = [wgs_acc_pref_first + str('{num:08d}'.format(num=s)) for s in acc_suffs] print '\t{} accession is an incomplete chromosome.\n\tRetrieving {} individual contigs...'.format(accession, str(len(wgs_acc_list))) + time.sleep(3) #be nice to NCBI before doing a second efetch request + print wgs_acc_list multi_gbk(wgs_acc_list, gbk) elif len(fetched) < 7500: sys.exit('{}\nERROR: suspiciously short record for {}'.format(fetched, accession)) @@ -55,14 +52,14 @@ def get_gbk(accession, gbk): print '\tfound {}'.format(accession) def main(): + if len(sys.argv) == 1 or sys.argv[1] == '-h' or '--help': + sys.exit('\tUsage: genbankacc2gbk.py [additional GenBank accessions...]\n\n\tif more than one accession is provided all will be merged into a single output file\n') Entrez.email = 'yourname@univ.edu' if len(sys.argv) == 2: get_gbk(sys.argv[1], '{}.gbk'.format(sys.argv[1])) elif len(sys.argv) > 2: accs = [acc for acc in sys.argv[1:]] - multi_gbk(wgs_acc_list, '{}.gbk'.format(','.join(accs))) - else: - sys.exit('Usage: genbankacc2gbk.py [additional GenBank accessions...]') + multi_gbk(accs, '{}.gbk'.format(','.join(accs))) if __name__ == '__main__': main() \ No newline at end of file From d6e39df5deb6bfd998e717c0e02f31452a8203e6 Mon Sep 17 00:00:00 2001 From: Chris Gulvik Date: Thu, 23 Jun 2016 17:09:56 -0400 Subject: [PATCH 029/212] fix help msg --- genbankacc2gbk.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/genbankacc2gbk.py b/genbankacc2gbk.py index ee41307..cffe59d 100755 --- a/genbankacc2gbk.py +++ b/genbankacc2gbk.py @@ -52,7 +52,7 @@ def get_gbk(accession, gbk): print '\tfound {}'.format(accession) def main(): - if len(sys.argv) == 1 or sys.argv[1] == '-h' or '--help': + if len(sys.argv) == 1 or sys.argv[1] == '-h' or sys.argv[1] == '--help': sys.exit('\tUsage: genbankacc2gbk.py [additional GenBank accessions...]\n\n\tif more than one accession is provided all will be merged into a single output file\n') Entrez.email = 'yourname@univ.edu' if len(sys.argv) == 2: @@ -62,4 +62,4 @@ def main(): multi_gbk(accs, '{}.gbk'.format(','.join(accs))) if __name__ == '__main__': - main() \ No newline at end of file + main() From 51fd7ab5393c32be08fb384cf3a04d5621645880 Mon Sep 17 00:00:00 2001 From: Chris Gulvik Date: Thu, 23 Jun 2016 17:12:10 -0400 Subject: [PATCH 030/212] silence WGS contig list printing --- genbankacc2gbk.py | 1 - 1 file changed, 1 deletion(-) diff --git a/genbankacc2gbk.py b/genbankacc2gbk.py index cffe59d..9d8cf5f 100755 --- a/genbankacc2gbk.py +++ b/genbankacc2gbk.py @@ -42,7 +42,6 @@ def get_gbk(accession, gbk): wgs_acc_list = [wgs_acc_pref_first + str('{num:08d}'.format(num=s)) for s in acc_suffs] print '\t{} accession is an incomplete chromosome.\n\tRetrieving {} individual contigs...'.format(accession, str(len(wgs_acc_list))) time.sleep(3) #be nice to NCBI before doing a second efetch request - print wgs_acc_list multi_gbk(wgs_acc_list, gbk) elif len(fetched) < 7500: sys.exit('{}\nERROR: suspiciously short record for {}'.format(fetched, accession)) From e1a54357b33bc0b3066cd18553cd238def1debd4 Mon Sep 17 00:00:00 2001 From: Chris Gulvik Date: Thu, 23 Jun 2016 17:17:52 -0400 Subject: [PATCH 031/212] cleaner printing to stdout --- genbankacc2gbk.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/genbankacc2gbk.py b/genbankacc2gbk.py index 9d8cf5f..6b4071e 100755 --- a/genbankacc2gbk.py +++ b/genbankacc2gbk.py @@ -9,7 +9,7 @@ def multi_gbk(acc_list, gbk): fetched = Entrez.efetch(db='nucleotide', id=acc_list, rettype='gbwithparts', retmode='text').read().rstrip('\n') - print 'found {} GenBank records'.format(str(len(fetched.split('//\n\n')))) + print '\tfound {} GenBank records'.format(str(len(fetched.split('//\n\n')))) if len(fetched) < 7500: sys.exit('ERROR: suspiciously short GenBank record') contigs = [] From 8ae1f80df390acd46c859c5312046e65337a8378 Mon Sep 17 00:00:00 2001 From: Chris Gulvik Date: Thu, 23 Jun 2016 18:58:57 -0400 Subject: [PATCH 032/212] fix +1 error and capture the last contig for WGS requests --- genbankacc2gbk.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/genbankacc2gbk.py b/genbankacc2gbk.py index 6b4071e..581f1b6 100755 --- a/genbankacc2gbk.py +++ b/genbankacc2gbk.py @@ -38,7 +38,7 @@ def get_gbk(accession, gbk): sys.exit('ERROR: unable to parse WGS info') wgs_acc_suff_first = int(re.sub('\D', '', wgs_accs[0])) wgs_acc_suff_last = int(re.sub('\D', '', wgs_accs[1])) - acc_suffs = range(wgs_acc_suff_first, wgs_acc_suff_last, 1) + acc_suffs = range(wgs_acc_suff_first, wgs_acc_suff_last + 1, 1) wgs_acc_list = [wgs_acc_pref_first + str('{num:08d}'.format(num=s)) for s in acc_suffs] print '\t{} accession is an incomplete chromosome.\n\tRetrieving {} individual contigs...'.format(accession, str(len(wgs_acc_list))) time.sleep(3) #be nice to NCBI before doing a second efetch request From 06cc0ef490905cc1d009c39b52b57898c5d3bf03 Mon Sep 17 00:00:00 2001 From: Chris Gulvik Date: Tue, 28 Jun 2016 00:00:22 -0400 Subject: [PATCH 033/212] extracts gene and prot seqs from gbk given HMM models --- HMM.search.summarize.py | 186 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 186 insertions(+) create mode 100755 HMM.search.summarize.py diff --git a/HMM.search.summarize.py b/HMM.search.summarize.py new file mode 100755 index 0000000..c8ee07b --- /dev/null +++ b/HMM.search.summarize.py @@ -0,0 +1,186 @@ +#!/usr/bin/env python + + +import os +import string +import sys +from argparse import ArgumentParser +from decimal import Decimal, ROUND_HALF_UP +from subprocess import check_output, Popen +from time import strftime +from Bio import SeqIO +from Bio.Alphabet import generic_dna +from Bio.Seq import Seq +from Bio.SeqRecord import SeqRecord + +def parseArgs(): + parser = ArgumentParser(description='Extracts proteins from a FastA or GenBank proteome') + parser.add_argument('-i', '--infile', required=True, help='input proteome FastA file <.faa> or GenBank file <.gb||.gbk> with translations') + parser.add_argument('-m', '--hmm', required=True, help='input HMM database file') + parser.add_argument('-b', '--base', required=False, default=None, help='prefix string to append to output files [`basename infile .`]') + parser.add_argument('-o', '--outpath', required=False, default=None, help='output directory [./base--_