Skip to content

Commit

Permalink
performance: call to split was taking around 60s on full timetable file
Browse files Browse the repository at this point in the history
- longer than the subsequent multi-threaded section.
Replacing with a transpose of the dataframe reduces this to 3s or so.
The transpose causes the named dataframe to be coerced into a unnamed vector of int, so we have to index into the vector by number, and convert the date coerced into int back into a date.
  • Loading branch information
oweno-tfwm committed Aug 9, 2023
1 parent 8c0b5eb commit 04aa2f3
Showing 1 changed file with 29 additions and 21 deletions.
50 changes: 29 additions & 21 deletions R/atoc_export.R
Original file line number Diff line number Diff line change
Expand Up @@ -253,7 +253,15 @@ splitDates <- function(cal) {



DATE_EPOC <- as.Date("01/01/1970", format = "%d/%m/%Y")
WEEKDAY_NAME_VECTOR <- c("monday", "tuesday", "wednesday", "thursday", "friday", "saturday", "sunday")
CHECKROWS_NAME_VECTOR <- c(WEEKDAY_NAME_VECTOR, "duration", "start_date", "end_date")

DURATION_INDEX <- match("duration", CHECKROWS_NAME_VECTOR)
START_DATE_INDEX <- match("start_date", CHECKROWS_NAME_VECTOR)
END_DATE_INDEX <- match("end_date", CHECKROWS_NAME_VECTOR)
MONDAY_INDEX <- match("monday", CHECKROWS_NAME_VECTOR)
SUNDAY_INDEX <- match("sunday", CHECKROWS_NAME_VECTOR)

# TODO: Does not work within functions, rejig to work in package.
#
Expand All @@ -268,24 +276,22 @@ splitDates <- function(cal) {
#' @noRd
#'
checkrows <- function(tmp) {
# tmp = res.calendar[i,]
#tmp = res[i,]
# message(paste0("done ",i))
if (tmp$duration < 7) {

if (tmp[DURATION_INDEX] < 7) {

days.valid <- weekdays(seq.POSIXt(
from = as.POSIXct.Date(tmp$start_date),
to = as.POSIXct.Date(tmp$end_date),
from = as.POSIXct.Date( as.Date(tmp[START_DATE_INDEX], DATE_EPOC) ),
to = as.POSIXct.Date( as.Date(tmp[END_DATE_INDEX], DATE_EPOC) ),
by = "DSTday"
))
days.valid <- tolower(days.valid)
days.match <- tmp[, c(
"monday", "tuesday", "wednesday", "thursday",
"friday", "saturday", "sunday"
)]
days.match <- sapply(days.match, function(x) {
x == 1
})
days.match <- days.match[days.match]
days.match <- names(days.match)

#get a vector of names of days of week that the timetable is valid on
days.match <- tmp[MONDAY_INDEX:SUNDAY_INDEX]
days.match <- WEEKDAY_NAME_VECTOR[ 1==days.match ]

if (any(days.valid %in% days.match)) {
return(TRUE)
} else {
Expand Down Expand Up @@ -396,10 +402,7 @@ makeCalendar <- function(schedule, ncores = 1) {
})
days <- matrix(unlist(days), ncol = 7, byrow = TRUE)
days <- as.data.frame(days)
names(days) <- c(
"monday", "tuesday", "wednesday", "thursday",
"friday", "saturday", "sunday"
)
names(days) <- WEEKDAY_NAME_VECTOR

res.calendar <- cbind(res.calendar, days)
res.calendar$Days <- NULL
Expand All @@ -408,22 +411,27 @@ makeCalendar <- function(schedule, ncores = 1) {
Sys.time(),
" Removing trips that only occur on days of the week that are non-operational"
))
res.calendar.split <- split(res.calendar, seq(1, nrow(res.calendar)))

#res.calendar.split <- split(res.calendar, seq(1, nrow(res.calendar)))
#performance - doing this split on 500k rows takes 60s - longer than the parallel execution below and consumes 3gb memory.

res.calendar.days <- res.calendar[,CHECKROWS_NAME_VECTOR]
res.calendar.days <- data.table::transpose(res.calendar.days)
#transpose on the same size runs in around 3s, but causes named dataframe with mixed datatypes to be coerced to unnamed vector of integer.


if (ncores > 1) {
cl <- parallel::makeCluster(ncores)
parallel::clusterEvalQ(cl, {
loadNamespace("UK2GTFS")
})
keep <- pbapply::pbsapply(res.calendar.split,
checkrows,
keep <- pbapply::pbsapply(res.calendar.days, checkrows,
cl = cl
)
parallel::stopCluster(cl)
rm(cl)
} else {
keep <- pbapply::pbsapply(res.calendar.split, checkrows)
keep <- pbapply::pbsapply(res.calendar.days, checkrows)
}

res.calendar <- res.calendar[keep, ]
Expand Down

0 comments on commit 04aa2f3

Please sign in to comment.