performance: call to split was taking around 60s on full timetable file

- longer than the subsequent multi-threaded section. Replacing with a transpose of the dataframe reduces this to 3s or so. The transpose causes the named dataframe to be coerced into a unnamed vector of int, so we have to index into the vector by number, and convert the date coerced into int back into a date.
itsleeds · Aug 9, 2023 · 04aa2f3 · 04aa2f3
1 parent 8c0b5eb
commit 04aa2f3
Showing 1 changed file with 29 additions and 21 deletions.
diff --git a/R/atoc_export.R b/R/atoc_export.R
@@ -253,7 +253,15 @@ splitDates <- function(cal) {
 
 
 
+DATE_EPOC <- as.Date("01/01/1970", format = "%d/%m/%Y")
+WEEKDAY_NAME_VECTOR <- c("monday", "tuesday", "wednesday", "thursday", "friday", "saturday", "sunday")
+CHECKROWS_NAME_VECTOR <- c(WEEKDAY_NAME_VECTOR, "duration", "start_date", "end_date")
 
+DURATION_INDEX <- match("duration", CHECKROWS_NAME_VECTOR)
+START_DATE_INDEX <- match("start_date", CHECKROWS_NAME_VECTOR)
+END_DATE_INDEX <- match("end_date", CHECKROWS_NAME_VECTOR)
+MONDAY_INDEX <- match("monday", CHECKROWS_NAME_VECTOR)
+SUNDAY_INDEX <- match("sunday", CHECKROWS_NAME_VECTOR)
 
 # TODO: Does not work within functions, rejig to work in package.
 #
@@ -268,24 +276,22 @@ splitDates <- function(cal) {
 #' @noRd
 #'
 checkrows <- function(tmp) {
-  # tmp = res.calendar[i,]
+  #tmp = res[i,]
   # message(paste0("done ",i))
-  if (tmp$duration < 7) {
+
+  if (tmp[DURATION_INDEX] < 7) {
+
     days.valid <- weekdays(seq.POSIXt(
-      from = as.POSIXct.Date(tmp$start_date),
-      to = as.POSIXct.Date(tmp$end_date),
+      from = as.POSIXct.Date( as.Date(tmp[START_DATE_INDEX], DATE_EPOC) ),
+      to = as.POSIXct.Date( as.Date(tmp[END_DATE_INDEX], DATE_EPOC) ),
       by = "DSTday"
     ))
     days.valid <- tolower(days.valid)
-    days.match <- tmp[, c(
-      "monday", "tuesday", "wednesday", "thursday",
-      "friday", "saturday", "sunday"
-    )]
-    days.match <- sapply(days.match, function(x) {
-      x == 1
-    })
-    days.match <- days.match[days.match]
-    days.match <- names(days.match)
+
+    #get a vector of names of days of week that the timetable is valid on
+    days.match <- tmp[MONDAY_INDEX:SUNDAY_INDEX]
+    days.match <- WEEKDAY_NAME_VECTOR[ 1==days.match ]
+
     if (any(days.valid %in% days.match)) {
       return(TRUE)
     } else {
@@ -396,10 +402,7 @@ makeCalendar <- function(schedule, ncores = 1) {
   })
   days <- matrix(unlist(days), ncol = 7, byrow = TRUE)
   days <- as.data.frame(days)
-  names(days) <- c(
-    "monday", "tuesday", "wednesday", "thursday",
-    "friday", "saturday", "sunday"
-  )
+  names(days) <- WEEKDAY_NAME_VECTOR
 
   res.calendar <- cbind(res.calendar, days)
   res.calendar$Days <- NULL
@@ -408,22 +411,27 @@ makeCalendar <- function(schedule, ncores = 1) {
     Sys.time(),
     " Removing trips that only occur on days of the week that are non-operational"
   ))
-  res.calendar.split <- split(res.calendar, seq(1, nrow(res.calendar)))
+
+  #res.calendar.split <- split(res.calendar, seq(1, nrow(res.calendar)))
+  #performance - doing this split on 500k rows takes 60s - longer than the parallel execution below and consumes 3gb memory.
+
+  res.calendar.days <- res.calendar[,CHECKROWS_NAME_VECTOR]
+  res.calendar.days <- data.table::transpose(res.calendar.days)
+  #transpose on the same size runs in around 3s, but causes named dataframe with mixed datatypes to be coerced to unnamed vector of integer.
 
 
   if (ncores > 1) {
     cl <- parallel::makeCluster(ncores)
     parallel::clusterEvalQ(cl, {
       loadNamespace("UK2GTFS")
     })
-    keep <- pbapply::pbsapply(res.calendar.split,
-      checkrows,
+    keep <- pbapply::pbsapply(res.calendar.days, checkrows,
       cl = cl
     )
     parallel::stopCluster(cl)
     rm(cl)
   } else {
-    keep <- pbapply::pbsapply(res.calendar.split, checkrows)
+    keep <- pbapply::pbsapply(res.calendar.days, checkrows)
   }
 
   res.calendar <- res.calendar[keep, ]