-
Notifications
You must be signed in to change notification settings - Fork 3
/
sars_ani_validation.R
199 lines (152 loc) · 8.15 KB
/
sars_ani_validation.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
# SARS-ANI: A Global Open Access Dataset of Reported SARS-CoV-2 Events in Animals
# Script to perform technical validation
######################## Prepare the R environment: install tools (packages) ########################
# Install only the packages that are not yet available, so that re-running the
# script does not reinstall everything (and works offline once set up).
required_pkgs <- c("plyr", "dplyr", "stringr", "taxize", "rentrez")
missing_pkgs <- required_pkgs[
  !vapply(required_pkgs, requireNamespace, logical(1), quietly = TRUE)
]
if (length(missing_pkgs) > 0) {
  install.packages(missing_pkgs)
}
# plyr must be attached BEFORE dplyr: both export mutate(), and the one attached
# last wins. plyr::mutate() silently drops unnamed arguments such as
# across(...), which would turn the whitespace trimming below into a no-op.
library(plyr)
library(dplyr)
library(stringr)
library(taxize)
library(rentrez)
########################### Set working directory to source file location ###########################
# rstudioapi only works inside RStudio with a saved document; in any other
# situation keep the current working directory instead of stopping with an error.
if (requireNamespace("rstudioapi", quietly = TRUE) && rstudioapi::isAvailable()) {
  script_path <- rstudioapi::getActiveDocumentContext()$path
  if (nzchar(script_path)) {
    setwd(dirname(script_path))
  } else {
    message("Script is not saved yet; keeping working directory: ", getwd())
  }
} else {
  message("RStudio not available; keeping working directory: ", getwd())
}
############################ Codes for Quality Control & Data Cleaning ##############################
# Import the dataset
sars_df <- read.csv("sars_ani_data.csv", encoding = "UTF-8")
########## GENERAL DATA CHECK ##########
# Look at the first lines of the dataset
head(sars_df)
# Print column names
colnames(sars_df)
# Check that the data in each column uses consistent coding and check potential typos
# Columns whose distinct values are reviewed by eye, in review order
cols_to_check <- c(
  "primary_source", "secondary_source",
  "host_com_orig", "host_sci_orig", "host_com_res", "host_sci_res",
  "host_colloq", "host_sci_spec_res", "family",
  "epidemiological_unit", "sex",
  "country_name", "country_iso3", "subnational_administration", "city",
  "location_detail",
  "date_confirmed", "date_reported", "date_published",
  "test", "test_2", "test_3",
  "sampling_type", "sampling_type_2", "sampling_type_3",
  "negative_test", "negative_sampling_type",
  "reason_for_testing", "symptoms", "outcome", "living_conditions",
  "source_of_infection", "variant", "control_measures",
  "related_to_other_entries",
  "number_cases", "number_susceptible", "number_tested", "age"
)
# `sars_df$name` returns NULL (or a partially matched column) when `name` is
# absent, which hides renamed or misspelled columns. Report them explicitly.
missing_cols <- setdiff(cols_to_check, colnames(sars_df))
if (length(missing_cols) > 0) {
  warning(
    "Expected column(s) not found in the dataset: ",
    paste(missing_cols, collapse = ", "),
    call. = FALSE
  )
}
# Print the distinct values of each column, labelled with the column name
for (col in intersect(cols_to_check, colnames(sars_df))) {
  cat("\n", col, ":\n", sep = "")
  print(unique(sars_df[[col]]))
}
########## CHECK DATES ##########
# Create a column "Date" where Date = confirmed when confirmed is given, Date = reported
# if date confirmed is missing, and Date = published if the two others are missing.
# Leading and trailing whitespace is removed FIRST, so that padded values such as
# " NS" or " " are recognised as missing by the comparisons below.
# dplyr::mutate() is called explicitly: plyr also exports mutate(), and its version
# silently ignores unnamed arguments such as across(...).
missing_codes <- c("NS", NA, "")
sars_df <- sars_df %>%
  dplyr::mutate(across(where(is.character), str_trim)) %>%
  dplyr::mutate(
    Date = if_else(date_confirmed %in% missing_codes, date_reported, date_confirmed),
    Date = if_else(Date %in% missing_codes, date_published, Date)
  )
# check that we have a date for each event ("NS", NA and empty strings all count as missing)
sum(sars_df$Date %in% "NS")
sum(is.na(sars_df$Date))
sum(sars_df$Date %in% "")
# the results should be zero --> all records have a date assigned with the rule confirmed > reported > published
########## TAXONOMIC VALIDATION ##########
# Set the API key for the NCBI for a single R session
## How to get an NCBI API key: https://ncbiinsights.ncbi.nlm.nih.gov/2017/11/02/new-api-keys-for-the-e-utilities/
## How to use the API key: https://cran.r-project.org/web/packages/rentrez/vignettes/rentrez_tutorial.html
entrez_key <- "XXXXXX" # enter your own API key
# Only register the key once the placeholder has been replaced; otherwise a key
# already present in the environment (e.g. from .Renviron) would be overwritten
# by the meaningless placeholder value.
if (!identical(entrez_key, "XXXXXX")) {
  set_entrez_key(entrez_key)
} else if (!nzchar(Sys.getenv("ENTREZ_KEY"))) {
  warning(
    "No NCBI API key configured: replace the \"XXXXXX\" placeholder or set ENTREZ_KEY.",
    call. = FALSE
  )
}
# Confirm it is available
Sys.getenv("ENTREZ_KEY")
# Resolve the scientific names reported in the sources (catches misspellings)
# Uses the Global Names Resolver (GNR) service provided by the Encyclopedia of Life http://resolver.globalnames.org/.
# The preferred data source is set to the National Center for Biotechnology Information (NCBI) https://www.ncbi.nlm.nih.gov/
### "NS" is not a Latin name: drop it from the distinct names before querying
list_sci <- unique(sars_df$host_sci_orig)
list_sci <- list_sci[!(list_sci %in% "NS")]
gnr_res_sci <- gnr_resolve(sci = list_sci, preferred_data_sources = "4")
# View(gnr_res_sci)
# Keep only the matches with a score of at least 0.98
resolved_sci <- filter(gnr_res_sci, score >= 0.98)
# View(resolved_sci)
# Identify unique combinations of host names (as extracted from the source and as resolved against the NCBI backbone)
# Only the four host-name columns are kept, one row per distinct combination.
unique_combi <- sars_df %>%
distinct(host_com_orig, host_sci_orig, host_com_res, host_sci_res)
# Build host_res_1 in three steps:
#  1. left join the GNR-resolved scientific names (resolved_sci) on the original scientific name
#  2. drop the GNR columns that are not used afterwards
#  3. retrieve scientific names from the English common names (host_com_orig), stored in validate_sci_from_com_orig
# Using the National Center for Biotechnology Information (NCBI) https://www.ncbi.nlm.nih.gov/
# NOTE(review): a host_sci_orig with several matches in resolved_sci would yield several rows after the join - confirm this is intended.
host_res_1 <- unique_combi %>%
left_join(resolved_sci, by = c("host_sci_orig" = "submitted_name")) %>% # add list of scientific names checked for misspelling using gnr_resolve
select (!c(user_supplied_name, data_source_title, score)) %>% # removed unused column
mutate (validate_sci_from_com_orig = comm2sci(com = host_com_orig, db = "ncbi") ) # retrieve scientific names from the common names provided in the source
# NOTE(review): the two notes below appear to record which option to pick when the lookup asks to choose between several taxa - confirm.
# choose Neogale vison for "mink"
# choose western gorilla for gorilla
#View(host_res_1)
# Identify NCBI-resolved scientific names based on information provided in the source (common and scientific name)
# ncbi_sci_res takes the name retrieved from the common name (validate_sci_from_com_orig);
# when that lookup is empty it falls back to matched_name, the GNR-resolved scientific name.
# NOTE(review): the comparison with the string "character(0)" presumably relies on
# validate_sci_from_com_orig being a list whose empty results are character(0) - confirm against comm2sci() output.
host_res_2 <- host_res_1 %>%
mutate(ncbi_sci_res = ifelse(validate_sci_from_com_orig == "character(0)", matched_name, validate_sci_from_com_orig))
#View(host_res_2)
# Retrieve English common names from the resolved scientific names (use ncbi_sci_res to retrieve ncbi_com_res)
# Using the National Center for Biotechnology Information (NCBI) https://www.ncbi.nlm.nih.gov/
host_res_3 <- host_res_2 %>%
mutate (ncbi_com_res = sci2comm(sci = ncbi_sci_res, db = "ncbi") )
#View(host_res_3)
# Validate host_sci_res with ncbi_sci_res (compare and find differences)
# Validate host_com_res with ncbi_com_res
# Create Boolean vectors to check difference
host_res_4 <- host_res_3 %>%
  mutate(
    sars_ani_sci_valid = ncbi_sci_res == host_sci_res,
    sars_ani_com_valid = ncbi_com_res == host_com_res
  ) %>%
  # Identify names to check manually: keep every row where a check is not TRUE.
  # A comparison against a missing (unresolved) name gives NA, and
  # filter(x == FALSE) would silently drop those rows although they are
  # precisely the ones that need a manual look.
  filter(!(sars_ani_sci_valid %in% TRUE) | !(sars_ani_com_valid %in% TRUE))
# View() needs an interactive session; print the table otherwise
if (interactive()) {
  View(host_res_4)
} else {
  print(host_res_4)
}
# When common and scientific names are not resolved at this point, we will resolve them manually in the NCBI Taxonomy checker (https://www.ncbi.nlm.nih.gov/taxonomy/).
# Look up the higher taxonomic rank (family) of every resolved scientific name
# Using the National Center for Biotechnology Information (NCBI) https://www.ncbi.nlm.nih.gov/
# Flatten the resolved names into a plain, unnamed vector ...
ncbi_res_vec <- unlist(host_res_3$ncbi_sci_res, use.names = FALSE)
# ... and discard the entries that could not be resolved (NA)
ncbi_res_vec <- ncbi_res_vec[!is.na(ncbi_res_vec)]
# The surrounding parentheses display the result right after the assignment
(families_res <- tax_name(sci = ncbi_res_vec, get = "family", db = "ncbi"))
# For a taxon we are unsure about, fetch its taxonomic serial numbers (TSN) from ITIS
# and look up the accepted name for each of them
## Worked example: the dog
mynames <- c(
  "Canis familiaris",
  "Canis lupus familiaris",
  "Canis lupus"
)
tsn <- get_tsn(mynames)
plyr::ldply(.data = tsn, .fun = itis_acceptname)
########## SEARCH FOR DUPLICATE EVENTS ##########
# Identification of duplicate events (i.e., unique event reported multiple times).
# Reports are flagged as duplicates when all of the following are identical: the geolocation
# information (country, subnational administration, city, specific location), species (common
# and Latin names), sex, age, symptoms, date of confirmation, report or publication (summarized
# as "Date"), number of cases, number of deaths, number of susceptible, tests conducted,
# outcome, and relationship to another event ("related_ID").
# Whitespace is trimmed BEFORE the missing-date test and the grouping, so that values differing
# only by padding are treated as equal. dplyr::mutate() is called explicitly because plyr also
# exports mutate(), and its version silently ignores unnamed arguments such as across(...).
duplicate <- sars_df %>%
  dplyr::mutate(across(where(is.character), str_trim)) %>%
  dplyr::mutate(
    Date = if_else(date_confirmed %in% c("NS", NA, ""), date_reported, date_confirmed),
    Date = if_else(Date %in% c("NS", NA, ""), date_published, Date)
  ) %>%
  group_by(
    country_name, subnational_administration, city, location_detail,
    host_sci_res, host_com_res, sex, age, symptoms,
    number_cases, number_deaths, number_susceptible,
    test, test_2, test_3, outcome, Date, related_ID
  ) %>%
  tally() %>%
  # tally() leaves the result grouped; drop the grouping so later steps see a plain table
  ungroup() %>%
  # keep only the combinations that occur more than once
  filter(n > 1)
duplicate
# Write the table to a file to investigate duplicates
write.csv(duplicate, file = "duplicate.csv", row.names = FALSE)