-
Notifications
You must be signed in to change notification settings - Fork 0
/
beagleMissFilt.R
229 lines (158 loc) · 7.19 KB
/
beagleMissFilt.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
#!/usr/bin/env Rscript
# Author: Oliver Stuart
# Date: 26/03/2020
# Project: Locust extinction
#This script takes a beagle file from the locust extinction project and filters out loci missing among spretus individuals.
#It can be modified to examine other individuals, but for this project we're only looking at the samples from M. spretus.
#The inputs are a beagle file and a double which specifies the maximum site-specific missingness for filtering.
#The beagle file and the list used to make it MUST be in the same directory. The naming convention is:
#names.for.file.beagle.gz
#names.for.file.list
##################################
######## Initial Checks ##########
##################################
#Are all required packages installed?
#mypkg: character vector of package names.
#Returns an unnamed logical vector, one element per name, TRUE when the package is found in the
#current library paths. system.file() returns "" for a package that is not installed; it is used
#instead of installed.packages(), which scans every library and is documented as unsuitable for
#checking whether a named package is installed.
is.installed <- function(mypkg) {
  vapply(mypkg,
         function(pkg) nzchar(system.file(package = pkg)),
         logical(1),
         USE.NAMES = FALSE)
}
#Packages this script cannot run without
packages <- c("optparse")
#Abort with an error that names what is missing, instead of printing to stdout and then
#stopping with an empty message
missing.packages <- packages[!is.installed(packages)]
if (length(missing.packages) > 0) {
  stop("Something is missing, check that all required packages (",
       paste(packages, collapse = ", "),
       ") are installed. Not found: ",
       paste(missing.packages, collapse = ", "),
       call. = FALSE)
}
suppressMessages(library("optparse"))
#Bring in the inputs
#--miss is parsed as a double so that fractional percentages (e.g. 12.5) are accepted;
#whole-number thresholds behave as before.
option_list <- list(
  make_option(c("-b", "--beagle"), type = "character", default = NULL,
              help = "beagle file to be modified", metavar = "character"),
  make_option(c("-m", "--miss"), type = "double", default = NULL,
              help = "a percentage value for missingness filtering", metavar = "double")
)
opt_parser <- OptionParser(option_list = option_list)
opt <- parse_args(opt_parser)
#Do the inputs actually exist?
#|| is the scalar, short-circuiting operator intended for if() conditions
if (is.null(opt$beagle) || is.null(opt$miss)) {
  print_help(opt_parser)
  stop("Provide a beagle file and a missingness threshold.", call. = FALSE)
}
#The threshold is divided by 100 further down, so it has to be a usable percentage.
#NOTE(review): a value that cannot be parsed as a number is presumed to arrive as NA - confirm
#against the optparse version in use.
if (is.na(opt$miss) || opt$miss < 0 || opt$miss > 100) {
  stop("The missingness threshold must be a percentage between 0 and 100.", call. = FALSE)
}
#Fail early with a clear message rather than inside read.delim()
if (!file.exists(opt$beagle)) {
  stop("Beagle file not found: ", opt$beagle, call. = FALSE)
}
##################################
######### Preparing data #########
##################################
#Enforce the naming convention before doing anything else: without the .beagle.gz suffix the
#substitutions below would return the input path unchanged, and the filtered output would then
#be written over the input file
if (!grepl("\\.beagle\\.gz$", opt$beagle)) {
  stop("The beagle file name must end in .beagle.gz: ", opt$beagle, call. = FALSE)
}
#Read in beagle file
#Every column is read as character, so no numeric conversion is applied to the values
input.beagle <- read.delim(opt$beagle,
                           check.names = FALSE,
                           header = TRUE,
                           colClasses = "character")
#Output path: same name with .beagle.gz replaced by .filt.beagle
output.beagle <- sub("\\.beagle\\.gz$", ".filt.beagle", opt$beagle)
#Get associated list
input.list <- sub("\\.beagle\\.gz$", ".list", opt$beagle)
if (!file.exists(input.list)) {
  stop("Sample list not found: ", input.list, call. = FALSE)
}
#Prepare sample name data.frame for selecting spretus data in beagle file
sample.names <- readLines(input.list)
#Keep only the file name of each path; basename() works whatever the depth of each path,
#whereas indexing every path at the depth of the first one does not
sample.names <- basename(sample.names)
#Drop the .o6.bam / .o6.sub.bam suffix (dots escaped so they only match literal dots)
sample.names <- sub("\\.o6(\\.sub)?\\.bam$", "", sample.names)
n.listed <- length(sample.names)
species.map <- read.table("~/Desktop/grasshoppers/data/speciesmaps/speciesmap.txt",
                          header = TRUE,
                          stringsAsFactors = FALSE)
#ord records each sample's position in the list file so that order can be restored after merge()
sample.names <- merge(data.frame(id = sample.names,
                                 ord = seq_len(n.listed),
                                 stringsAsFactors = FALSE),
                      species.map,
                      by = "id")
sample.names <- sample.names[order(sample.names$ord), ]
#merge() silently drops samples that have no entry in the species map (and duplicates samples
#listed more than once in it); either way the rows no longer correspond one-to-one to the list
if (nrow(sample.names) != n.listed) {
  warning("The species map does not match the sample list one-to-one (",
          n.listed, " samples listed, ", nrow(sample.names), " rows after merging).",
          call. = FALSE)
}
##################################
### Define missingness function ##
##################################
#Per-individual missingness.
#beagle: a data.frame with three leading columns followed by three columns per individual.
#Returns a data.frame with one row per individual:
#  id   - the name of the first of the individual's three columns
#  miss - the proportion of sites at which all three of its values equal 0.333333
#With no individual columns a zero-row data.frame is returned. An NA among the compared values
#propagates to that individual's miss. Stops if the column count is not 3 + 3 * n.
indMissGet <- function(beagle){
  #make indices of the number of individuals and the number of sites
  n_ind <- (ncol(beagle) - 3) / 3
  n_site <- nrow(beagle)
  if (n_ind < 0 || n_ind != floor(n_ind)) {
    stop("beagle must have 3 leading columns followed by 3 columns per individual.",
         call. = FALSE)
  }
  #site x individual matrix, TRUE where all three values of the individual equal 0.333333
  #honey that's a missing data point
  is_missing <- matrix(TRUE, nrow = n_site, ncol = n_ind)
  for (i in seq_len(n_ind)) {
    for (k in 1:3) {
      #whole-column comparison; with character columns the number is compared as "0.333333"
      is_missing[, i] <- is_missing[, i] & (beagle[[3 * i + k]] == 0.333333)
    }
  }
  #column means of the TRUE/FALSE matrix are the per-individual proportions of missing sites
  data.frame(id = colnames(beagle)[3 * seq_len(n_ind) + 1],
             miss = colMeans(is_missing),
             stringsAsFactors = FALSE)
}
#Per-site missingness.
#beagle: a data.frame with three leading columns, one of them named marker, followed by three
#        columns per individual.
#Returns a data.frame with one row per site:
#  site - the marker column as character
#  miss - the proportion of individuals whose three values at that site all equal 0.333333
#With zero sites a zero-row data.frame is returned. An NA among the compared values propagates
#to that site's miss. Stops if there is no marker column, if the column count is not 3 + 3 * n,
#or if there are no individuals (the proportion would be undefined).
siteMissGet <- function(beagle){
  #make indices of the number of individuals and the number of sites
  n_ind <- (ncol(beagle) - 3) / 3
  n_site <- nrow(beagle)
  if (n_ind < 1 || n_ind != floor(n_ind)) {
    stop("beagle must have 3 leading columns followed by 3 columns for each of at least one individual.",
         call. = FALSE)
  }
  if (!("marker" %in% colnames(beagle))) {
    stop("beagle must have a column named marker.", call. = FALSE)
  }
  #site x individual matrix, TRUE where all three values of the individual equal 0.333333
  #honey that's a missing data point
  is_missing <- matrix(TRUE, nrow = n_site, ncol = n_ind)
  for (j in seq_len(n_ind)) {
    for (k in 1:3) {
      #whole-column comparison; with character columns the number is compared as "0.333333"
      is_missing[, j] <- is_missing[, j] & (beagle[[3 * j + k]] == 0.333333)
    }
  }
  #row means of the TRUE/FALSE matrix are the per-site proportions of missing individuals
  #[["marker"]] is used so that no partial name matching can pick another column
  data.frame(site = as.character(beagle[["marker"]]),
             miss = rowMeans(is_missing),
             stringsAsFactors = FALSE)
}
##################################
###### Beagle manipulation #######
##################################
#Get the beagle column names of the spretus samples
#The script names the k-th sample of the list file Ind(k-1). The position in the list is held in
#ord; the row number within sample.names is not used because it stops matching the list position
#as soon as merge() has dropped a sample that is absent from the species map.
spretus.ord <- sample.names$ord[grepl("spretus", sample.names$species)]
if (length(spretus.ord) == 0) {
  stop("No spretus samples were found in the sample list.", call. = FALSE)
}
inds <- paste0("Ind", spretus.ord - 1)
#Subset the beagle file based on these names
spretus.cols <- colnames(input.beagle) %in% inds
#Each selected sample must contribute exactly three columns, otherwise list and beagle disagree
if (sum(spretus.cols) != 3 * length(inds)) {
  stop("Expected ", 3 * length(inds), " spretus columns in the beagle file but found ",
       sum(spretus.cols), "; check that the list matches the beagle file.",
       call. = FALSE)
}
spretus <- cbind(input.beagle[, 1:3], input.beagle[, spretus.cols, drop = FALSE])
#Calculate the site-specific missingness for the spretus samples in the beagle file
spr.site.miss <- siteMissGet(spretus)
#Select the sites below the missingness threshold (given as a percentage on the command line)
good.loci <- spr.site.miss[spr.site.miss$miss < (opt$miss / 100), ]
#Keep only the sites whose spretus missingness is below the threshold
beagle.filt <- input.beagle[input.beagle$marker %in% good.loci$site, ]
##################################
######### Write outputs ##########
##################################
#Write the beagle file
write.table(beagle.filt,
            output.beagle,
            col.names = TRUE,
            row.names = FALSE,
            quote = FALSE,
            sep = "\t")
#Write the report next to the filtered beagle file
#The entries are collected first and written with writeLines(), which ends every entry with a
#newline so that each one sits on its own line
report.file <- paste0(output.beagle, ".report")
report.lines <- c(
  "input file properties",
  paste0("name = ", opt$beagle),
  #three columns per individual plus three leading columns
  paste0("n_inds = ", (ncol(input.beagle) / 3) - 1),
  paste0("n_loci = ", nrow(input.beagle)),
  paste0("n_loci after filtering = ", nrow(beagle.filt)),
  paste0("mean site missingness in spretus before filtering = ", mean(spr.site.miss$miss)),
  #good.loci holds only the retained sites, so this is the value after filtering
  paste0("mean site missingness in spretus after filtering = ", mean(good.loci$miss))
)
writeLines(report.lines, report.file)