# article_pdf_download.R
# ===============================
# PACKAGES
# ===============================
# library("devtools")
# install_github("massimoaria/bibliometrix") # Needs to be installed from GitHub not CRAN
library(tools) # for file manipulations
library(tidyverse) # for general purpose data manip
library(bibliometrix) # For reading and analyzing ISI stuff
library(crminer) # for getting texts from DOI
library(XML) # for parsing elsevier xml responses
library(rvest) # for parsing html responses
library(jsonlite) # for parsing json objects
# ===============================
# CONSTANTS
# ===============================
data_dir <- './data' # path relative to the repository root; change as needed
isi_dir <- file.path(data_dir, 'isi_searches')
# Create the main output directory (no warning if it already exists)
output_dir <- file.path(data_dir, 'pdf_output')
dir.create(output_dir, showWarnings = FALSE)
# Create the PDF subdirectory
pdf_output_dir <- file.path(output_dir, 'pdfs')
dir.create(pdf_output_dir, showWarnings = FALSE)
# Create the non-PDF files subdirectory
nopdf_output_dir <- file.path(output_dir, 'non-pdfs')
dir.create(nopdf_output_dir, showWarnings = FALSE)
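# Resulting layout (relative to the repository root):
#   data/pdf_output/           summary.csv is written here at the end of the script
#   data/pdf_output/pdfs/      successfully downloaded PDFs
#   data/pdf_output/non-pdfs/  downloads that turned out to be HTML or other non-PDF files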
# ===============================
# FUNCTIONS
# ===============================
# Elsevier links require a special web-scraping process, so they get their own function
#' Download a PDF file from Elsevier's servers via web scraping
#'
#' @param elsevier_xml_link A character; Elsevier link of type "xml" as returned by crminer's crm_links() function
#' @param filepath A character; path to the desired output location for the downloaded PDF
#'
#' @return Nothing. The PDF is downloaded to disk
#' @export
#'
#' @examples
#' elsevier_pdf_download('https://api.elsevier.com/content/article/PII:0167198795004585?httpAccept=text/xml','/Users/timothy/Documents/soilc-text_mapping/data')
elsevier_pdf_download <- function(elsevier_xml_link, filepath) {
  # getting the Science Direct link from the Elsevier XML contents
  scidir_html_link <- elsevier_xml_link %>%
    read_xml() %>%
    xml_find_all('//@href') %>%
    xml_text()
  # getting the route to the initial pdf download link on Science Direct's domain
  json_obj <- scidir_html_link[2] %>% # we need the second link in the `scidir_html_link` object
    read_html() %>%          # get the html response from the Science Direct link retrieved from the Elsevier XML response
    html_nodes('body') %>%   # start from the body section
    html_nodes('div') %>%    # get the div contents within that body section
    html_nodes('script') %>% # get the script contents within that div
    html_text() %>%          # extract the text (a JSON string)
    fromJSON()               # convert the JSON string to an R object
  # constructing the full URL: linkToPdf is a route relative to the sciencedirect.com domain,
  # obtained by navigating the R representation of the JSON string
  url_of_interest <- paste0('https://www.sciencedirect.com', json_obj$article$pdfDownload$linkToPdf)
  # getting the temporary link
  read_html(url_of_interest) %>%
    html_node('head') %>%                        # start in the head section of the html
    html_nodes('meta[http-equiv="Refresh"]') %>% # from there, seek a meta tag with attribute http-equiv="Refresh"
    html_attr('content') %>%                     # within that tag, grab the string containing the PDF direct-access url
    substr(7, 100000000L) %>%                    # the first 6 chars are not part of the url we want, so we slice them off
    download.file(filepath)                      # download the pdf to the provided filepath
}
#' This function checks if a file is binary
#'
#' @param filepath A character; path to target file
#' @param max An integer; max number of characters in file to be checked (default 1000)
#'
#' @return A boolean; TRUE if file is binary and FALSE otherwise
#' @export
#'
#' @examples
#' is_binary('~/Desktop/example.pdf')
is_binary <- function(filepath, max = 1000) {
  f <- file(filepath, "rb", raw = TRUE)
  b <- readBin(f, "int", max, size = 1, signed = FALSE)
  close(f)
  return(max(b) > 128)
}
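# Quick sanity check (not run; the filename below is purely illustrative): a genuine PDF
# should return TRUE, while an HTML error page saved under a .pdf name should return FALSE.
# is_binary(file.path(pdf_output_dir, 'Example Author 2020 Ex. J..pdf'))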
# ===============================
# MAIN
# ===============================
## STEP 1: READ DOIs/METADATA FROM BIB FILES
# Combine all the bib files. The user only needs to add new bib files to the isi_searches directory
filepath_list <- as.list(file.path(isi_dir, dir(isi_dir)))
file_list <- convert2df(do.call(readFiles, filepath_list))
# removing duplicate records
# NOTE: turns out that this still lets some duplicate files through (see: 'STEP 4: POST-PROCESSING')
# Pretty time consuming step
soil.health <- duplicatedMatching(file_list, Field="TI")
## STEP 2: ORGANIZE LINKS
message('===============================\nORGANIZING LINKS\n===============================')
# Select attributes of interest and clean the author field
my_df <- data.frame(Name = paste(gsub(";.*$", "", soil.health$AU), soil.health$PY, soil.health$JI),
                    DOI = soil.health$DI, stringsAsFactors = FALSE)
my_df <- my_df[!is.na(my_df$DOI),] # Of 8693 observations, 6406 have valid DOIs
# collecting links
# Note: This step takes ~20min to run
my_df$links <- sapply(my_df$DOI, crm_links) # getting links for each DOI
# Remove references for which no DOI URL was found
my_df <- my_df[lapply(my_df$links, length) > 0,] # 5759 of 6406 DOIs returned links via crm_links()
# Elsevier links require a separate download process, so we flag them here
for (i in 1:length(my_df$links)) {
  # checking for the string 'elsevier' in the first link associated with this DOI
  my_df$elsevier[i] <- grepl('elsevier', my_df$links[[i]][[1]])
}
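# Optional quick check of how many DOIs resolve to Elsevier links (not run):
# table(my_df$elsevier)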
# selecting a single link for each DOI (up to this point, each DOI has had a list of links associated with it)
for (i in 1:dim(my_df)[1]) {
  if (my_df$elsevier[i]) { # if it's from Elsevier, we want the xml link
    link <- my_df$links[[i]]$xml$xml
  } else if ('pdf' %in% names(my_df$links[[i]])) { # otherwise, we prefer the 'pdf' link type
    link <- my_df$links[[i]]$pdf$pdf
  } else if ('unspecified' %in% names(my_df$links[[i]])) { # our last preference is the 'unspecified' link type
    link <- my_df$links[[i]]$unspecified$unspecified
  } else {
    # we don't handle 'html' or 'plain' links because they almost never provide a pdf download;
    # likewise, we only take xml links from Elsevier because that is the only type we handle for them
    link <- NA
  }
  my_df$download_link[i] <- as.character(link)
}
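# For reference, the selection above assumes each element of my_df$links is a nested list
# roughly of the following form (the URLs are purely illustrative):
# list(pdf         = list(pdf         = 'https://publisher.example/article.pdf'),
#      unspecified = list(unspecified = 'https://publisher.example/fulltext'),
#      xml         = list(xml         = 'https://api.elsevier.com/content/article/...?httpAccept=text/xml'))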
## STEP 3: DOWNLOAD PDFS FROM LINKS
message('===============================\nDOWNLOADING PDFS FROM LINKS\n===============================')
# Here, elsevier_pdf_download() or download.file() is called for each row of the dataframe
# created in the preceding 'organize links' section of the script
for (i in 1:nrow(my_df)) {
  url <- my_df$download_link[i]
  my_df$path[i] <- paste0(file.path(pdf_output_dir, my_df$Name[i]), '.pdf')
  if (my_df$elsevier[i]) {
    my_df$downloaded[i] <- tryCatch(elsevier_pdf_download(url, my_df$path[i]),
                                    error = function(cond) {
                                      message(cond)
                                      return(0)
                                    },
                                    finally = message(paste("\nProcessed URL:", url)))
  } else {
    # DOWNLOADING OTHER LINKS
    my_df$downloaded[i] <- tryCatch(download.file(url, my_df$path[i]),
                                    error = function(cond) {
                                      message(cond)
                                      return(0)
                                    },
                                    finally = message(paste("\nProcessed URL:", url)))
  }
  message('[', i, '/', dim(my_df)[1], ']')
}
message('===============================\nPDFS DOWNLOADED\n===============================')
## STEP 4: POST-PROCESSING
# distinguish real pdf files from other files (mainly html webpages)
for (i in 1:dim(my_df)[1]) {
  if (file.exists(my_df$path[i])) {
    my_df$downloaded[i] <- TRUE
    my_df$is_pdf[i] <- is_binary(my_df$path[i])
  } else {
    my_df$downloaded[i] <- FALSE
    my_df$is_pdf[i] <- FALSE
  }
}
# Make sure the downloaded and is_pdf flags are stored as logicals
my_df$downloaded <- as.logical(my_df$downloaded)
my_df$is_pdf <- as.logical(my_df$is_pdf)
# Extract some statistics
download_success <- sum(my_df$downloaded) # out of 5759 acquired links, 4604 produced downloaded files
unique_files <- length(unique(my_df$path[my_df$downloaded])) # out of 4604 downloaded files, 4539 are unique
unique_pdfs <- length(unique(my_df$path[my_df$downloaded & my_df$is_pdf])) # out of 4539 unique downloaded files, 4057 are binary files (PDFs)
message(sprintf("Over the %i acquired links, %i PDFs were succesfully downloaded", nrow(my_df), unique_pdfs))
# Extract info on the downloaded files that were not PDFs
non_pdf_paths <- unique(my_df$path[my_df$downloaded & !my_df$is_pdf]) # for investigative purposes, these are the paths of the non-PDF files (482) that were downloaded
## Move the non-pdf files to a specific directory
# Create the destination list
html_paths <- file.path(nopdf_output_dir,
                        paste0(basename(file_path_sans_ext(non_pdf_paths)),
                               ".html"))
# Move the files
file.rename(from = non_pdf_paths, to = html_paths)
## Fix the double dot before the file extension
# (Name ends with the abbreviated journal title, which often ends in a period,
#  so appending '.pdf' can produce filenames ending in '..pdf')
pdf_files <- dir(pdf_output_dir, full.names = TRUE)
pdf_fixed <- gsub("\\.\\.pdf", "\\.pdf", pdf_files)
file.rename(from = pdf_files, to = pdf_fixed)
# output information regarding the download process to csv
summary_path <- file.path(output_dir, 'summary.csv')
write.csv(select(my_df, -links), file = summary_path, row.names = FALSE)
message('\nDetails of the PDF retrieval process have been stored in ', summary_path, '\n')
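# To inspect the results in a later session, a minimal sketch (not run; summary_df is just an illustrative name):
# summary_df <- read.csv(summary_path, stringsAsFactors = FALSE)
# table(downloaded = summary_df$downloaded, is_pdf = summary_df$is_pdf)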