-
Notifications
You must be signed in to change notification settings - Fork 0
/
get-sequences-by-coordinates.sh
executable file
·79 lines (68 loc) · 2.54 KB
/
get-sequences-by-coordinates.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
#!/bin/bash
usage() {
echo ""
echo "get-sequences-by-coordinates.sh -t FEATURE_TABLE -f FASTA -o OUT_DIR"
echo " The FASTA file should contain all of the contigs referenced in the feature table."
echo " The feature table should be formatted like a KBase domain annotation output file,"
echo " with each line containing the following fields in order:"
echo ""
echo " Contig_name Feature_name Start_position End_position Feature_strand(+/-)"
echo ""
echo " Any additional fields following these in a line will be ignored. The feature table"
echo " should not contain a header and should not have any empty lines. If contig names"
echo " or feature names are not unique in the feature table, you may lose data."
echo " Requires biopython to run."
echo ""
}
table=""
fasta=""
output=""
while getopts ":ht:f:o:" opt; do
case $opt in
h) usage
exit 1
;;
t) table="${OPTARG}"
;;
f) fasta="${OPTARG}"
;;
o) output="${OPTARG}"
;;
\?) echo "Invalid option: -${OPTARG}. Use the -h option for more information.">&2
exit 1
;;
:) echo "Option -$OPTARG requires an argument. Use the -h option for more information.">&2
exit 1
;;
esac
done
if [[ ! -f "$table" ]]; then
echo "File does not exist: $table"
exit 1
fi
if [[ ! -f "$fasta" ]]; then
echo "File does not exist: $fasta"
exit 1
fi
if [[ ! -d "$output" ]]; then
mkdir "$output"
fi
# Get real directory where script file is located,
# even if the script was called via a symlink
SOURCE=${BASH_SOURCE[0]}
while [ -L "$SOURCE" ]; do # resolve $SOURCE until the file is no longer a symlink
DIR=$( cd -P "$( dirname "$SOURCE" )" >/dev/null 2>&1 && pwd )
SOURCE=$(readlink "$SOURCE")
[[ $SOURCE != /* ]] && SOURCE=$DIR/$SOURCE # if $SOURCE was a relative symlink, we need to resolve it relative to the path where the symlink file was located
done
DIR=$( cd -P "$( dirname "$SOURCE" )" >/dev/null 2>&1 && pwd )
temp=$(mktemp)
# Format is Contig, Feature, Start, Stop, Strand, other stuff...
# Each domain annotation has its own entry, but we only care about the overall features.
# Calling uniq on the first five columns will collapse everything to one line per feature.
cut -f 1,2,3,4,5 "$table" | uniq > $temp
while read contig name start stop strand; do
$DIR/get-sequence-by-coordinates.py -o "$output" "$fasta" "$contig" "$start" "$stop" "$strand" "$name"
done < $temp
# Clean up
rm -f $temp