-
Notifications
You must be signed in to change notification settings - Fork 0
/
vcf2struc.R
158 lines (111 loc) · 5.12 KB
/
vcf2struc.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
#!/usr/bin/env R
#This script takes a .vcf file of ONLY SNPs, ideally from FreeBayes, and gives you a structure file formatted for infocalc.
#The format is something like this:
#loc1 loc2 loc3 loc4
#ind pop.num pop.name region country loc1.geno loc2.geno loc3.geno loc4.geno
##################################
######## Initial Checks ##########
##################################
#Read in the file name
inputs <- commandArgs(trailingOnly=T)
#Is there an input?
if(length(inputs) != 2){
print("This script needs exactly two inputs; a vcf file and a table indicating the population of each sample. Whichever other file you specify as input will be interpreted as the table, regardless of its extension.")
stop()
}
#Is there a .vcf file being used as input?
if(identical(grep(".vcf",inputs),integer(0)) == T){
print("One of the inputs needs to be a vcf file; check the file extension of your vcf.")
stop()
}
#Are all required packages installed?
is.installed <- function(mypkg) is.element(mypkg, installed.packages()[,1])
packages <- c("vcfR")
if(sum(is.installed(packages)) < length(packages)){
print("Something is missing, check that the vcfR is installed.")
stop()
} else {
print("Loading packages.")
}
suppressMessages(library("vcfR"))
##################################
######## Data Preparation ########
##################################
vcf.file <- inputs[grep(".vcf",inputs)] ; print(paste0("Input .vcf file = ",vcf.file))
table.file <- inputs[inputs != vcf.file] ; print(paste0("Input table file = ",table.file))
#Read in .vcf as character lines
vcf <- readLines(vcf.file)
#Remove header, selecting only the rows beginning from the header, removing meta-information
vcf <- vcf[grep("CHROM",vcf):length(vcf)]
#Split apart columns of the data table by the tab separator
vcf <- strsplit(vcf,"\t")
#Unlist the split data starting from the second list element, make a dataframe
geno <- data.frame(matrix(unlist(vcf[2:length(vcf)]),nrow=length(vcf)-1, byrow=T),
stringsAsFactors=FALSE)
#Make headers for the dataframe
colnames(geno) <- unlist(vcf[1])
#Remove extraneous columns to match what vcfR gives you
geno <- geno[,c(1,2,9:ncol(geno))]
#Find sites with full information, i.e. those that have more than just GT in the FORMAT field
complete <- geno[grep("DP",geno[,3]),]
#Pare down genotype fields to only the genotype value
for (i in 1:nrow(complete)){
#The first three characters in the string should be e.g. "0/0"
complete[i,4:ncol(complete)] <- substr(start=1,
stop=3,
complete[i,4:ncol(complete)])
}
#Replace missing data with NA
complete <- gsub(".:.","-9|-9",as.matrix(complete))
#Replace all / with | for consistency with the remaining genotype values
complete <- gsub("/","|",complete)
#Replace rows with full information in original genotype object with new, pared down rows
geno[grep("DP",geno[,3]),] <- complete
#Select only genotypes
geno <- geno[,4:ncol(geno)]
#Replace NAs with characters appropriate for structure
geno[is.na(geno)==T|geno=="."] <- "-9|-9"
geno[is.na(geno)==T|geno==".|."] <- "-9|-9"
geno[is.na(geno)==T|geno=="./."] <- "-9|-9"
##################################
### Writing the structure file ###
##################################
#Make the output file connection
output.file <- paste0(vcf.file,".struc")
#Get the number of samples
n <- length(vcf[[1]])-9
#The 9 refers to the first nine fields of a standard .vcf file
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT
#The first line is a list of all loci
loci <- unlist(vcf)[seq(from=1,
to=length(unlist(vcf))-length(vcf[[1]])+1,
by=length(unlist(vcf[1])))][-1]
cat(paste(loci,collapse=" "),sep="\n",file=output.file)
#Now, time to make the prepending information
samples <- colnames(geno)
species <- read.table(table.file,header=T)
#Check that sample names match up
if(sum(samples %in% species$id) != length(samples)){
print("There are some samples in the vcf file unaccounted for in your table file; check your inputs and run again. Also check that the column header of the species map table is 'id'.")
unlink(output.file)
stop()
}
#Add a new index column to the species table
index <- seq(from=1,to=length(unique(species$species)),by=1)
index <- data.frame(index=index,species=unique(species$species))
species <- merge(species,index,by="species")
#Write the prepend
species$prepend <- paste(species$id,species$index,1)
samples <- data.frame(id=samples,order=seq(from=1,to=length(samples)))
#Order the table by the order of the samples in the vcf file
final <- merge(species,samples,by="id")
final <- final[order(final$order),]$prepend
#Writing in the loci, one individual at a time, adding the prepend (which is now correctly ordered) to the beginning
for(i in 1:ncol(geno)){
genos <- unlist(strsplit(geno[,i],"|",fixed=T))
geno1 <- c(final[i],genos[seq(from=1,to=length(genos)-1,by=2)])
geno2 <- c(final[i],genos[seq(from=2,to=length(genos),by=2)])
cat(paste(geno1,collapse=" "),sep="\n",file=output.file,append=T)
cat(paste(geno2,collapse=" "),sep="\n",file=output.file,append=T)
}
#And we're done!