Skip to content

Latest commit

 

History

History
282 lines (204 loc) · 11.8 KB

hpc_functionalannotation.md

File metadata and controls

282 lines (204 loc) · 11.8 KB

Protein coding gene annotation using PROKKA

##PROKKA cannot accept long fasta headers as generated by SPAdes. So remove everything after "_length" and add a serial no. to each fasta header-

#saving the header info for each file-
#grep "^>" ${IN_DIR}/230-${num}-250above-scaffolds.fasta > ${IN_DIR}/${num}-headerinfo #original header file
#sed 's/>//g' 230-03-headerinfo > 230-03-headerinfo2  #to remove the ">"

##cleaning for prokka run-
#removing everything after "_length" and add serial number to the header-
cat ${IN_DIR}/230-${num}_scaffolds.fasta | sed 's/_length.*//' | awk '/>/{print $0"number"(++i)}!/>/' > ${IN_DIR}/230-${num}_newname_scaffolds.fasta
#awk command adds a serial number to the header
#--------------------------------------------------------------------------------------------------------------------------------------------------

cd ${CONTIGS_DIR}
prokka --outdir ${OUT_DIR}/230-${num}_250above_prokka --addgenes --addmrna --metagenome 230-${num}-250above-newheader-scaffolds.fasta
#prokka needs to be run in the scaffolds folder. Prokka v.1.14.5 installed.

CAZY annotation

a)map the protein sequences to CAZY database

#!/bin/bash
#SBATCH --job-name=hummrCAZY250above
#SBATCH --partition=compute
#SBATCH --time=7-0
#SBATCH --mem=80G
#SBATCH --ntasks=1
#SBATCH [email protected]
#SBATCH --mail-type=BEGIN,FAIL,END

#SBATCH --output=hmmr250_nt_%A-%a.out
#SBATCH -o out_hmmr250.%j
#SBATCH -e err_hmmr250.%j

##SBATCH --array 1-100
##num=$(printf "%02d" $SLURM_ARRAY_TASK_ID)

#load modules-
module load hmmer/3.1b2

cd /work/BourguignonU/jigyasa/microcerotermes/functional_annotation/all_prokka_outputs

#files=(*.faa)
#file1=${files[${SLURM_ARRAY_TASK_ID}]} #it reads each index at a time
#file2=${file1/-prokka.faa/}
#file3=${file2/.txt/}


DB_DIR="/work/student/jigyasa-arora/BourguignonU_data/main_ID230+ID231_2/ID230/joinedfiles/K21-kate_assembly/prodigal/500above/cazy_output/hummer_cazy"


#for metagenomes use e-5 (dbCAN db recommendation, Ye et al, 2012)
#non-stringent hmmscan-
hmmscan -E 1.0e-5 -o 229-01-HMMoutputfile --tblout 229-01-HMM-persequence-output --domtblout 229-01-HMM-perdomain-output ${DB_DIR}/dbCAN-fam-HMMs.txt 229-01-prokka.faa

#NOTE- Even though a non-stringent evlaue cutoff used in the initial analysis, the final version has evalue <1e-30 (see below).

b) get the hmmscan output file to a text delimited file

sh hmmscan-parser.sh 229-01-HMM-perdomain-output > dbcan-229-01-HMM-perdomain-output

c) use a more stringent evalue cutoff for final CAZYme annotation. Get best hit per protein sequence

awk -F"," '$7<1e-30 {print $2"\t"$3"\t"$4"\t"$7"\t"$12}' dbcan-229-01-HMM-perdomain-output > evalue30-229-01-HMM-perdomain-output

export LC_ALL=C LC_LANG=C; sort -k2,2 -k5,5gr -k4,4g evalue30-cazy-output-229-01.txt > sorted-evalue30-cazy-output-229-01.txt #sort by proteinID, bitscore, evalue

for next in $(cut -f2 sorted-evalue30-cazy-output-229-01.txt | sort | uniq -u); do grep -w -m 1 "$next" sorted-evalue30-cazy-output-229-01.txt; done > top-sorted-evalue30-cazy-229-01.txt
# cut and sort proteinID column, grep fullwords (-w) and stop after first match (-m 1)

cat top-sorted-evalue30-* > allsamples-evalue30-cazymes.txt

d) check if each protein is annotated only once, if not then get the best match out-

awk -F"\t" '{print $6"_"$2}' allsamples-evalue30-cazymes.txt | sed 's/"//g' > colnames.txt #get the proteinIDs per samples

sort colnames.txt | uniq -u > 2-colnames.txt #sort and get uniq IDs out
wc -l *colnames*
[1] 168640 2-colnames.txt
[2] 168640 colnames.txt

KEGG annotation-Using default parameters.

#!/bin/bash
#SBATCH --job-name=bedtoolsmap
#SBATCH --partition=largemem
#SBATCH --time=7-0
#SBATCH --mem=120G
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=2
#SBATCH [email protected]
#SBATCH --mail-type=BEGIN,FAIL,END

#SBATCH --output=bedtoolsmap_nt_%A-%a.out
#SBATCH -o out_bedtoolsmap.%j
#SBATCH -e err_bedtoolsmap.%j

#SBATCH --array 0-225
#num=$(printf "%02d" $SLURM_ARRAY_TASK_ID)

files=(*.faa)
file1=${files[${SLURM_ARRAY_TASK_ID}]} #it reads each index at a time
#file2=${file1/-prokka.faa/}
#file3=${file2/.txt/}

module load hmmer/3.1b2

/flash/BourguignonU/DB/kofamscan/db/bin/kofam_scan/exec_annotation -o ${OUT_DIR}/${file2}-kofamdetail ${IN_DIR}/${file1} -p /flash/BourguignonU/DB/kofamscan/db/profiles -k /flash/BourguignonU/DB/kofamscan/db/ko_list --cpu 10 -f detail-tsv --tmp-dir ${OUT_DIR}/${file2}-tmp
#-f mapper -one can be converted to heatmap
#-f detail -for tpm analysis

a)To extract individual KEGG ID per proteinID- Based on KO profile threshold and E.value<=1e-30 (A). If the KEGGids we are examining are present then extract them in the output file. Otherwise take the KEGGid with the lowest evalue (B).

## save as "kofam_extract_thresholdonly.R"
#USAGE- Rscript [kofam] [output_file]
args <- commandArgs(TRUE)
kofam <- args[1]
output_file1 <- args[2]
output_file2 <- args[3]

library(dplyr)
kofam<-read.csv(file=kofam,sep="\t",header=TRUE)

kolist<-read.csv("/flash/BourguignonU/DB/kofamscan/db/ko_list",header=TRUE,sep="\t")
kolist<-kolist%>%select(knum,threshold)
colnames(kolist)<-c("knum","kthreshold")

#join the two files-
kofam_klist<-merge(kofam,kolist,by.x=c("KO"),by.y=c("knum"),all.x=TRUE)
kofam_klist$X.<-NULL
kofam_klist<-unique(kofam_klist)

#filter the KOs by their thresholds-
kofam_klist<-kofam_klist%>%mutate(keepkos=ifelse(as.numeric(thrshld)>=as.numeric(kthreshold),"aboveandequal","less"))
kofam_klist_selected<-kofam_klist%>%filter(keepkos=="aboveandequal")

kofam_klist_selected$E.value<-as.numeric(as.character(kofam_klist_selected$E.value))

write.csv(kofam_klist_selected,file=output_file1) #...(A)
#-------------------------------------------------------------------------------------------
#get all koids per geneid in a single columns-
library(data.table)
setDT(kofam_klist_selected)
kofamall<-as.data.table(kofam_klist_selected)[, toString(KO), by = (gene.name)]

#and get the kegg with the smallest evalue-
setDT(kofam_klist_selected)
kofamevalue<-setDT(kofam_klist_selected)[ , .SD[which.min(E.value)], by = gene.name]

#make sure that genes of interest are present in the dataset ifnot then make sure that other KEGGids with min. e-value are showed-
library(tidyr)
library(stringr)

kofamall_evalue<-merge(kofamall,kofamevalue,by="gene.name") #join  the two dataframes

#get all the genes from the metabolic pathways-
#<on multiple keggids>
genes<-c("K01938","K01491","K13990","K00297","K00122","K00198","K14138","K00194","K00197","K15023","K00625","K00925","K00394","K00395","K00958","K11180","K11181","K02586","K02591","K02588","K02587","K02592","K02585","K00370","K00371","K00374","K02567","K02568","K00362","K00363","K03385","K15876","K01428","K01429","K01430","K00265","K00266","K01915","K03320","K11959","K11960","K11961","K11962","K11963","K00926","K00611","K01940","K01755","K00399","K00401","K00402")

#(B) str_extract based on genes of interest-
kofamall_evalue <- kofamall_evalue %>% mutate(kofamselected=(str_extract(V1, str_c(genes, collapse = "|"))))
#keep the extracted genes OR KEGGids with min evalue-
kofamall_evalue<-kofamall_evalue%>%mutate(finalkofam=ifelse(is.na(kofamselected),as.character(KO),kofamselected))

#select the columns of interest-
kofamall_evalue2<-kofamall_evalue%>%select(gene.name,finalkofam)
kofamall_evalue2$samples<-gsub("_.*$","",kofamall_evalue$gene.name)
kofamall_evalue2$protein1<-unlist(lapply(strsplit(as.character(kofamall_evalue2$gene.name),split="_"),"[",2))
kofamall_evalue2$protein2<-unlist(lapply(strsplit(as.character(kofamall_evalue2$gene.name),split="_"),"[",3))
kofamall_evalue2$proteins<-paste(kofamall_evalue2$protein1,kofamall_evalue2$protein2,sep="_")
colnames(kofamall_evalue2)[which(names(kofamall_evalue2) == "finalkofam")] <- "annotation"
colnames(kofamall_evalue2)[which(names(kofamall_evalue2) == "gene.name")] <- "fullproteinnames"

kofamall_evalue2<-kofamall_evalue2%>%select(samples,proteins,annotation,fullproteinnames)
write.csv(kofamall_evalue2,file=output_file2)
#----------------------------------------------------------------------------------------------
#to run the Rscript-
files=(filename-*-kofamdetail)
file1=${files[${SLURM_ARRAY_TASK_ID}]} #it reads each index at a time

module load R/3.6.1

OUT_DIR="/flash/BourguignonU/Jigs/tpm_2021/kofam"
Rscript ${OUT_DIR}/kofam_extract_thresholdonly.R ${IN_DIR}/${file1} ${OUT_DIR}/selected-${file1}

#join all the files together-
cat selected* > final-all-metagenomes-multiplekofamids-annotation.txt

Pfam annotation for hydrogenases catalytic subunit classification. Proteins were annotated with PfamA database, and those annotated as PF00374 (NiFe) or PF02906 (Fe-Fe) were further classified via https://services.birc.au.dk/hyddb/ Hydrogenases database.

#!/bin/bash
#SBATCH --job-name=hmm
#SBATCH --partition=compute
#SBATCH --time=7-0
#SBATCH --mem=100G
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=1
#SBATCH [email protected]
#SBATCH --mail-type=BEGIN,FAIL,END

#SBATCH --output=hmm_nt_%A-%a.out
#SBATCH -o out_hmm.%j
#SBATCH -e err_hmm.%j

#SBATCH --array 0-224
##num=$(printf "%02d" $SLURM_ARRAY_TASK_ID)

#module load-
module load hmmer/3.1b2
module load  singularity/3.0.3

cd /work/BourguignonU/jigyasa/microcerotermes/functional_annotation/funcitonal_outputs/faa_files

files=(filtered-*faa) #protein files
##echo "list: " ${files[${SLURM_ARRAY_TASK_ID}]} # this generates a list of $sf files
file1=${files[${SLURM_ARRAY_TASK_ID}]} #it reads each index at a time
file2=${file1/filtered-/}
#file3=${file2/.txt/}

##USING FILTERED FILES-
#to remove asterisk containing fasta headers-
#grep -B1 "*" 230-59-150above-fraggene.fasta.faa > headers_with_asterisk.txt #-B1 : print a line before "*"
#grep "^>"  headers_with_asterisk.txt |sed 's/>//g' > headers_with_asterisk2.txt #to get the header info
#extract out the fasta headers-
#python3 remove_asterisk.py ${IN_DIR}/230-59-150above-fraggene.fasta.faa ${IN_DIR}/headers_with_asterisk2.txt > ${IN_DIR}/230-59-150above-fraggene.filtered.faa
#or
#filterbyname.sh in=229-${num}-prokka.faa out=remaining/filtered-229-${num}-prokka.faa include=f names=2-lines-with-astrisk-229-${num}-prokka.faa.txt substring=t ignorejunk


DB_DIR="/work/student/jigyasa-arora/BourguignonU_data/main_ID230+ID231_2/ID230/230run_nomismatch_joinedfiles/protein/pfam_hmm"

singularity exec /work/student/jigyasa-arora/pfa.sif pfam_scan.pl -fasta ${file1} -dir ${DB_DIR}/ -outfile pfam-output-${file2} -e_seq 1e-4 -cpu 10

#e-value cut-off used is 1e-4

a) Pfam annotated proteins were further filtered by a stringent evalue cutoff.

awk -F"\t" '$13 <1e-30 {print $0}' pfam-output-229-01.txt > evalue30-pfam-output-229-01.txt

b) Extract NiFe and FeFe catalytic subunits out-

grep "PF00374" evalue30-pfam-output-229-01.txt < nife-evalue30-pfam-output-229-01.txt
grep "PF02906" evalue30-pfam-output-229-01.txt < fefe-evalue30-pfam-output-229-01.txt

c) Generate a final pfam annotated file for all samples-

cat evalue_30_pfam-output-* > allsamples-evalue30-pfamoutput.txt
awk '{print $1","$6","$16}' allsamples-evalue30-pfamoutput.txt > 3columns-allsamples-evalue30-pfamoutput.txt #extract proteinID, pfamID, sampleID out

#in R-
pfam<-read.csv("pfam<-read.csv("3columns-allsamples-evalue30-pfamoutput.txt",header=FALSE)

library(data.table)
setDT(pfam)
pfam2<-as.data.table(pfam)[, toString(V2), by = list(V1,V3)] #convert pfam annotation column toString using proteinID and sampleID columns as groups.
write.csv(pfam2,file="3columns-allsamples-multiplepfamids-evalue30-pfamoutput.txt")

##NOTE- for pfam annotation, one annotation per proteinID was not taken into consideration as only hydrogenases were examined from pfam annotation. Hydrogenase annotation via HydDB requires examination of protein sequences, removing the problem of double annotation of proteinID. Those protein sequences annotated as Hydrogenases were kept for statistical analysis in R.