forked from fischhoff/bacteria
-
Notifications
You must be signed in to change notification settings - Fork 0
/
bacteria.Rmd
100 lines (74 loc) · 3.98 KB
/
bacteria.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
---
title: "bacteria"
author: "Ilya"
date: "12/7/2018"
output: github_document
---
```{r setup, include=FALSE}
# Chunk default for the rest of the document: echo each chunk's source code
# in the knitted output unless a chunk overrides it.
knitr::opts_chunk$set(echo = TRUE)
```
#### Install packages
```{r packages, echo=FALSE}
# Attach package `x`, installing it (with its dependencies) first when it is
# not available.
#
# x: name of the package, as a single character string.
# Returns, invisibly, the value of library(): the attached package names.
pkgTest <- function(x) {
  # library(character.only = TRUE) needs exactly one name; fail early and
  # clearly instead of reaching install.packages() with a bad argument.
  stopifnot(is.character(x), length(x) == 1L)
  # requireNamespace() looks up only this package. Scanning the whole library
  # with installed.packages() is slow, and its help page advises against using
  # it to test whether a named package is installed.
  if (!requireNamespace(x, quietly = TRUE)) {
    install.packages(x, dependencies = TRUE)
  }
  library(x, character.only = TRUE)
}
# Packages used by this document; each one is handed to pkgTest() in turn.
neededPackages <- c(
  "data.table",
  "dplyr",
  "reshape2",
  "corrplot",
  "RColorBrewer"
)
for (package in neededPackages) {
  pkgTest(package)
}
```
#### Read in data from Brbic et al. (ProTraits: <http://protraits.irb.hr/data.html>)
```{r}
# Load the ProTraits table; the code below uses its Organism_name, Phenotype
# and Integrated_score_+ columns.
PT <- fread("ProTraits_precisionScores.txt", blank.lines.skip = TRUE)
# PT[1,]
# Quick look at the column names and the distinct phenotypes.
names(PT)
unique(PT$Phenotype)
# PT_wide = reshape(PT, idvar = "Organism_name", timevar = "Phenotype", direction = "wide")
# Organism and phenotype become factors; the score is coerced to numeric
# (anything that cannot be parsed as a number becomes NA).
PT$Organism_name <- factor(PT$Organism_name)
PT$Phenotype <- factor(PT$Phenotype)
PT$`Integrated_score_+` <- as.numeric(PT$`Integrated_score_+`)
# Cast to an organism-by-phenotype matrix of scores; repeated
# organism/phenotype pairs are summed.
# NOTE(review): with fun.aggregate = sum and no explicit `fill`, pairs that are
# absent from the table are presumably filled with sum(numeric(0)) = 0 rather
# than NA -- confirm that is intended given the pairwise deletion in cor().
dfmatrix <- acast(
  PT,
  Organism_name ~ Phenotype,
  value.var = "Integrated_score_+",
  fun.aggregate = sum,
  margins = FALSE
)
# Correlations between the phenotype columns, computed pairwise over the
# organisms for which both phenotypes are non-missing.
corrmatrix <- cor(dfmatrix, use = "pairwise")
#col1 <- colorRampPalette(brewer.pal(9,"BrBG"))
#https://cran.r-project.org/web/packages/corrplot/vignettes/corrplot-intro.html
# Upper-triangle colour map; the colour legend and text labels are turned off.
corrplot(
  corrmatrix,
  method = "color",
  type = "upper",
  cl.pos = "n",
  tl.pos = "n"
)
```
#### Read in data from Barberan et al. 2016 (IJSEM: <https://figshare.com/articles/International_Journal_of_Systematic_and_Evolutionary_Microbiology_IJSEM_phenotypic_database/4272392>)
```{r ijsem}
path <- "4272392/"

# Collapse "low-high" range strings to the mean of their parts.
#
# x: vector of values; each element is converted to character, split on "-",
#    and the pieces are converted to numeric and averaged ("6-8" -> 7,
#    "7" -> 7).
# Returns an unnamed numeric vector with one value per element of x. An
# element with a non-numeric piece becomes NA. Because "-" is the separator,
# a value with a leading minus sign also becomes NA.
mean_of_range <- function(x) {
  pieces <- strsplit(as.character(x), split = "-", fixed = TRUE)
  vapply(
    pieces,
    # Only the expected "NAs introduced by coercion" warning is silenced.
    function(p) mean(suppressWarnings(as.numeric(p))),
    numeric(1),
    USE.NAMES = FALSE
  )
}

#read table
# stringsAsFactors = TRUE is set explicitly because the levels() recoding
# below needs factor columns. Since R 4.0.0 read.delim() returns character
# columns by default; levels() of a character column is NULL, so nothing
# would be recoded.
ijsem <- read.delim(
  paste0(path, "IJSEM_pheno_db_v1.0.txt"),
  sep = "\t", header = TRUE, check.names = FALSE, fill = TRUE,
  stringsAsFactors = TRUE,
  na.strings = c("NA", "", "Not indicated", " Not indicated", "not indicated", "Not Indicated", "n/a", "N/A", "Na", "Not given", "not given", "Not given for yeasts", "not indicated, available in the online version", "Not indicated for yeasts", "Not Stated", "Not described for yeasts", "Not determined", "Not determined for yeasts")
)

#simplify column names
colnames(ijsem) <- c(
  "Habitat", "Year", "DOI", "rRNA16S", "GC", "Oxygen",
  "Length", "Width", "Motility", "Spore", "MetabAssays", "Genus",
  "Species", "Strain", "pH_optimum", "pH_range", "Temp_optimum",
  "Temp_range", "Salt_optimum", "Salt_range", "Pigment", "Shape",
  "Aggregation", "FirstPage", "CultureCollection", "CarbonSubstrate",
  "Genome", "Gram", "Subhabitat", "Biolog"
)

#clean Habitat column
levels(ijsem$Habitat)[
  levels(ijsem$Habitat) == "freshwater (river, lake, pond)"
] <- "freshwater"
# NOTE(review): the level name below has no closing parenthesis; presumably
# it mirrors the raw data -- confirm against the file.
levels(ijsem$Habitat)[
  levels(ijsem$Habitat) == "freshwater sediment (river, lake, pond"
] <- "freshwater sediment"

#clean Oxygen column
# NOTE(review): the "anerobe" spelling in the replacement label is kept as is,
# since later code may match on it -- confirm before correcting.
levels(ijsem$Oxygen)[levels(ijsem$Oxygen) == "aerobic"] <- "obligate aerobe"
levels(ijsem$Oxygen)[levels(ijsem$Oxygen) == "anerobic"] <- "obligate anerobe"
levels(ijsem$Oxygen)[levels(ijsem$Oxygen) == "microerophile"] <- "microaerophile"

#clean pH_optimum column
#this step splits the range values and takes the mean value
#values that are not numeric are transformed to NAs
ijsem$pH_optimum <- mean_of_range(ijsem$pH_optimum)
#remove pH values <0 and >10
# which() skips NA comparisons, so values that are already NA stay untouched.
ijsem$pH_optimum[which(ijsem$pH_optimum < 0 | ijsem$pH_optimum > 10)] <- NA

#clean Temp_optimum column (same range-to-mean conversion)
ijsem$Temp_optimum <- mean_of_range(ijsem$Temp_optimum)

#clean Salt_optimum column (same range-to-mean conversion)
ijsem$Salt_optimum <- mean_of_range(ijsem$Salt_optimum)

#there are some formatting issues that should be solved
summary(ijsem)
```