-
Notifications
You must be signed in to change notification settings - Fork 0
/
genome_analyses_scripts.sh
262 lines (185 loc) · 7.27 KB
/
genome_analyses_scripts.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
#pipeline to analyse different assemblies
###############################
# GENOME ASSEMBLY USING ABYSS #
###############################
#!/bin/bash
#np for number of threads
#k for kmer size
#using multiple libraries for one sample
abyss-pe np=8 k=31 name=path/to/output/prefix lib='lib1 lib2 lib3 lib4 lib5' lib1='path/to/lib1/file.sra' lib2='path/to/lib2/file.sra' lib3='path/to/lib3/file.sra' lib4='path/to/lib4/file.sra' lib5='path/to/lib5/file.sra' &> path/to/verbose.log
#OR
#using one library
abyss-pe np=8 k=31 name=path/to/output/prefix in='path/to/reads_1.fastq.gz path/to/reads_2.fastq.gz' &> path/to/verbose.log
################################
# GENOME ASSEMBLY USING SPADES #
################################
#NOTE: SPAdes is very processing and memory heavy -> use one of the bigger linux boxes
#!/bin/bash
#-t for number of threads
#-m for amount of maximum memory in Mb
spades.py --pe1-1 path/to/lib1/reads_1.fastq.gz --pe1-2 path/to/lib1/reads_2.fastq.gz --pe2-1 path/to/lib2/reads_1.fastq.gz --pe2-2 path/to/lib2/reads_2.fastq.gz -o path/to/output -t 14 -m 200
#############################
# GENOME ASSEMBLY USING SGA #
#############################
#! /bin/bash -x
#this example has two libraries for one sample
cd path/to/reads
IN1=lib1/reads_1.fastq.gz
IN2=lib1/reads_2.fastq.gz
IN3=lib2/reads_1.fastq.gz
IN4=lib2/reads_2.fastq.gz
# Parameters
SGA_BIN=sga
BWA_BIN=bwa
SAMTOOLS_BIN=samtools
BAM2DE_BIN=sga-bam2de.pl
ASTAT_BIN=sga-astat.py
DISTANCE_EST=DistanceEst
# Overlap parameter used for the final assembly. This is the only argument
# to the script
OL=75
# The number of threads to use
CPU=8
# To save memory, we index $D reads at a time then merge the indices together
D=4000000
# Correction k-mer value
CK=31
# The minimum k-mer coverage for the filter step. Each 27-mer
# in the reads must be seen at least this many times
COV_FILTER=2
# Overlap parameter used for FM-merge. This value must be no greater than the minimum
# overlap value you wish to try for the assembly step.
MOL=55
# Parameter for the small repeat resolution algorithm
R=10
# The number of pairs required to link two contigs into a scaffold
MIN_PAIRS=5
# The minimum length of contigs to include in a scaffold
MIN_LENGTH=200
# Distance estimate tolerance when resolving scaffold sequences
SCAFFOLD_TOLERANCE=1
# Turn off collapsing bubbles around indels
MAX_GAP_DIFF=0
#
# Dependency checks
#
# Check the required programs are installed and executable
prog_list="$SGA_BIN $BWA_BIN $SAMTOOLS_BIN $BAM2DE_BIN $DISTANCE_EST $ASTAT_BIN"
for prog in $prog_list; do
hash $prog 2>/dev/null || { echo "Error $prog not found. Please place $prog on your PATH or update the *_BIN variables in this script"; exit 1; }
done
# Check the files are found
file_list="$IN1 $IN2 $IN3 $IN4"
for input in $file_list; do
if [ ! -f $input ]; then
echo "Error input file $input not found"; exit 1;
fi
done
# First, preprocess the data to remove ambiguous basecalls
$SGA_BIN preprocess --pe-mode 1 -o output.fastq $IN1 $IN2 $IN3 $IN4
#
# Error correction
#
# Build the index that will be used for error correction
# As the error corrector does not require the reverse BWT, suppress
# construction of the reversed index
$SGA_BIN index -a ropebwt -t $CPU --no-reverse output.fastq
# Perform error correction with a 41-mer.
# The k-mer cutoff parameter is learned automatically
$SGA_BIN correct -k $CK --discard --learn -t $CPU -o reads.ec.k$CK.fastq output.fastq
#
# Contig assembly
#
# Index the corrected data.
$SGA_BIN index -a ropebwt -t $CPU reads.ec.k$CK.fastq
# Remove exact-match duplicates and reads with low-frequency k-mers
$SGA_BIN filter -x $COV_FILTER -t $CPU --homopolymer-check --low-complexity-check reads.ec.k$CK.fastq
# Merge simple, unbranched chains of vertices
$SGA_BIN fm-merge -m $MOL -t $CPU -o merged.k$CK.fa reads.ec.k$CK.filter.pass.fa
# Build an index of the merged sequences
$SGA_BIN index -d 1000000 -t $CPU merged.k$CK.fa
# Remove any substrings that were generated from the merge process
$SGA_BIN rmdup -t $CPU merged.k$CK.fa
# Compute the structure of the string graph
$SGA_BIN overlap -m $MOL -t $CPU merged.k$CK.rmdup.fa
# Perform the contig assembly without bubble popping
$SGA_BIN assemble -m $OL -g $MAX_GAP_DIFF -r $R -o assemble.m$OL merged.k$CK.rmdup.asqg.gz
#
# Scaffolding/Paired end resolution
#
CTGS=assemble.m$OL-contigs.fa
GRAPH=assemble.m$OL-graph.asqg.gz
# Realign reads to the contigs
sga-align --name output.pe $CTGS $IN1 $IN2 $IN3 $IN4
# Make contig-contig distance estimates
sga-bam2de.pl -n $MIN_PAIRS --prefix libPE output.pe.bam #received error -> maybe output.pe
# Make contig copy number estimates
sga-astat.py -m $MIN_LENGTH output.pe.refsort.bam > libPE.astat
$SGA_BIN scaffold -m $MIN_LENGTH --pe libPE.de -a libPE.astat -o scaffolds.n$MIN_PAIRS.scaf $CTGS
$SGA_BIN scaffold2fasta -m $MIN_LENGTH -a $GRAPH -o scaffolds.n$MIN_PAIRS.fa -d $SCAFFOLD_TOLERANCE --use-overlap --write-unplaced scaffolds.n$MIN_PAIRS.scaf
#####################################
# GENOME ASSEMBLY USING soapdenovo2 #
#####################################
#NOTE for soapdenovo2 analyses you will need a config file that tells the assembler what to do
###########################
# CONFIG FILE soapdenovo2 #
###########################
#this is an example config file with two libraries for one sample
#maximal read length
max_rd_len=100
[LIB]
#average insert size
avg_ins=500
#if sequence needs to be reversed
reverse_seq=0
#in which part(s) the reads are used
asm_flags=3
#use only first 100 bps of each read
rd_len_cutoff=100
#in which order the reads are used while scaffolding
rank=2
# cutoff of pair number for a reliable connection (at least 3 for short insert size)
pair_num_cutoff=3
#minimum aligned length to contigs for a reliable read location (at least 32 for short insert size)
map_len=32
#a pair of fastq file, read 1 file should always be followed by read 2 file
q1=path/to/reads_1.fastq.gz
q2=path/to/reads_2.fastq.gz
[LIB] #second library
avg_ins=200
reverse_seq=0
asm_flags=3
rd_len_cutoff=100
rank=1
pair_num_cutoff=3
map_len=32
q1=path/to/reads_1.fastq.gz
q2=path/to/reads_2.fastq.gz
##########################
# ACTUAL RUN soapdenovo2 #
##########################
#!/bin/bash
#-K is kmer size
#-p is number of threads
#-R resolve repears by reads
cd path/to/read_files
SOAPdenovo-63mer all -s path/to/file.config -K 31 -R -p 14 -o path/to/output/dir 1>path/to/file.ass.log 2>path/to/file.ass.err
#-t is number of threads
#-l is maximum read length
GapCloser -b path/to/file.config -a path/to/soapdenovo-output.scafSeq -o path/to/output -t 14 -l 100 1>path/to/file.ass.log 2>path/to/file.ass.err
######################################
# GENOME RE-ASSEMBLY USING redundans #
######################################
#!/bin/bash
cd /home/redundans/src/redundans
./redundans.py -v -i path/to/reads*.fastq.gz -f path/to/scaffolds -o path/to/output -t 8 --log path/to/log
####################################
# GENOME ANNOTATION USING AUGUSTUS #
####################################
#!/bin/bash
augustus --species=coprinus path/to/filtered/scaffolds_fasta > path/to/filtered/name.gff
######################################
# GENE STATISTICS USING GENOME-TOOLS #
######################################
#!/bin/bash
gt gff3 -sort -tidy path/to/augustus.gff | gt stat -genelengthdistri -o path/to/output-genelengthdist.txt