Skip to content
This repository has been archived by the owner on Jun 30, 2023. It is now read-only.

Commit

Permalink
Merge pull request #228 from jds485/227-txt-attrs-segs
Browse files Browse the repository at this point in the history
add txt files with attribute and reach names
  • Loading branch information
jds485 authored Mar 29, 2023
2 parents e205f5f + a368c92 commit 8e292d3
Show file tree
Hide file tree
Showing 4 changed files with 403 additions and 131 deletions.
115 changes: 115 additions & 0 deletions 4_predict.R
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ source("4_predict/src/select_features.R")
source("4_predict/src/train_models.R")
source("4_predict/src/plot_diagnostics.R")
source("1_fetch/src/generate_credentials.R")
source("4_predict/src/write_attrs_reaches.R")

#Predict phase
p4_targets_list <- list(
Expand Down Expand Up @@ -569,5 +570,119 @@ p4_targets_list <- list(
test_data = p4_train_RF_dynamic_spatial$best_fit$splits[[1]]$data[-p4_train_RF_dynamic_spatial$best_fit$splits[[1]]$in_id,],
target_name = 'mean_value'),
deployment = 'main'
),


#Save attribute names for each of the 3 feature sets (dynamic, min static dynamic, static dynamic)
tar_target(p4_dynamic_attrs_txt,
write_attrs_files(p4_dynamic_attrs$input_data$testing,
drop_cols = c("PRMS_segid", "Date", "mean_value", "data_type"),
filepath = "4_predict/out/dynamic_attrs.txt"),
deployment = 'main',
format = 'file',
repository = 'local'
),
tar_target(p4_min_static_dynamic_attrs_txt,
write_attrs_files(p4_selected_min_static_dynamic_attrs$input_data$testing,
drop_cols = c("PRMS_segid", "Date", "mean_value", "data_type"),
filepath = "4_predict/out/min_static_dynamic_attrs.txt"),
deployment = 'main',
format = 'file',
repository = 'local'
),
tar_target(p4_static_dynamic_attrs_txt,
write_attrs_files(p4_selected_static_dynamic_attrs$input_data$testing,
drop_cols = c("PRMS_segid", "Date", "mean_value", "data_type"),
filepath = "4_predict/out/static_dynamic_attrs.txt"),
deployment = 'main',
format = 'file',
repository = 'local'
),
#Save the unique attributes as a csv (the static dynamic set contains all unique attributes)
tar_target(p4_all_model_attrs_csv,
{
statdyn <- read_csv(p4_static_dynamic_attrs_txt, col_names = FALSE,
show_col_types = FALSE)

write_csv(as.data.frame(statdyn$X1),
file = "4_predict/out/all_model_attrs.csv",
col_names = FALSE)
"4_predict/out/all_model_attrs.csv"
},
deployment = 'main',
format = 'file',
repository = 'local'
),

#Save reaches in the spatial validation (cv) and test sets
tar_target(p4_spatial_cv_reaches_txt,
write_reaches_cv(train_dataset = p4_dynamic_attrs_spatial$input_data$training %>%
select(PRMS_segid, data_type),
cv_folds = cv_folds,
out_dir = "4_predict/out"),
deployment = 'main',
format = 'file',
repository = 'local'
),
tar_target(p4_spatial_test_reaches_txt,
write_reaches(p4_dynamic_attrs_spatial$input_data$testing$PRMS_segid,
filepath = "4_predict/out/spatial_test_reaches.txt"),
deployment = 'main',
format = 'file',
repository = 'local'
),
#Save temporal train start/end and test start/end dates
tar_target(p4_train_test_dates_txt,
write_dates(filter_rows_date(p4_dynamic_attrs_temporal,
'1984-09-30')$input_data,
cv_folds = cv_folds,
out_dir = "4_predict/out"),
deployment = 'main',
format = 'file',
repository = 'local'
),

#write file with predictions and observations from each model
tar_target(p4_min_static_dynamic_temporal_test_pred_obs_csv,
write_pred_obs(p4_pred_RF_min_static_dynamic_temporal_test$pred,
out_dir = "4_predict/out/temporal/pred_obs/RF_min_static_dynamic/"),
deployment = 'main',
format = 'file',
repository = 'local'
),
tar_target(p4_static_dynamic_temporal_test_pred_obs_csv,
write_pred_obs(p4_pred_RF_static_dynamic_temporal_test$pred,
out_dir = "4_predict/out/temporal/pred_obs/RF_static_dynamic/"),
deployment = 'main',
format = 'file',
repository = 'local'
),
tar_target(p4_dynamic_temporal_test_pred_obs_csv,
write_pred_obs(p4_pred_RF_dynamic_temporal_test$pred,
out_dir = "4_predict/out/temporal/pred_obs/RF_dynamic/"),
deployment = 'main',
format = 'file',
repository = 'local'
),
tar_target(p4_min_static_dynamic_spatial_test_pred_obs_csv,
write_pred_obs(p4_pred_RF_min_static_dynamic_spatial_test$pred,
out_dir = "4_predict/out/spatial/pred_obs/RF_min_static_dynamic/"),
deployment = 'main',
format = 'file',
repository = 'local'
),
tar_target(p4_static_dynamic_spatial_test_pred_obs_csv,
write_pred_obs(p4_pred_RF_static_dynamic_spatial_test$pred,
out_dir = "4_predict/out/spatial/pred_obs/RF_static_dynamic/"),
deployment = 'main',
format = 'file',
repository = 'local'
),
tar_target(p4_dynamic_spatial_test_pred_obs_csv,
write_pred_obs(p4_pred_RF_dynamic_spatial_test$pred,
out_dir = "4_predict/out/spatial/pred_obs/RF_dynamic/"),
deployment = 'main',
format = 'file',
repository = 'local'
)
)
137 changes: 137 additions & 0 deletions 4_predict/src/write_attrs_reaches.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,137 @@
write_attrs_files <- function(dataset, drop_cols, filepath){
  #' @description writes the sorted column names of a dataset to a text file,
  #' one name per line and without a header row
  #'
  #' @param dataset the dataset whose column names will be written
  #' @param drop_cols character vector of columns to leave out of the file.
  #' Passed to select(-all_of(drop_cols))
  #' @param filepath relative filepath of the txt file to write
  #'
  #' @return filepath of the txt file holding the remaining column names

  #remove the unwanted columns before reading off the names
  kept_cols <- select(dataset, -all_of(drop_cols))
  attr_names <- sort(colnames(kept_cols))

  #write_csv needs a data frame, so wrap the names in a 1-column frame
  attrs_df <- as.data.frame(attr_names)
  write_csv(x = attrs_df, file = filepath, col_names = FALSE)

  filepath
}

write_reaches <- function(reach_ids, filepath){
  #' @description writes the sorted, de-duplicated reach IDs to a text file,
  #' one ID per line and without a header row
  #'
  #' @param reach_ids vector of reach IDs (duplicates allowed)
  #' @param filepath relative filepath of the txt file to write
  #'
  #' @return filepath of the txt file holding the reach IDs

  #drop repeated IDs first, then order them
  distinct_ids <- unique(reach_ids)
  ordered_ids <- sort(distinct_ids)

  #write_csv needs a data frame, so wrap the IDs in a 1-column frame
  ids_df <- as.data.frame(ordered_ids)
  write_csv(x = ids_df, file = filepath, col_names = FALSE)

  filepath
}

write_reaches_cv <- function(train_dataset, cv_folds, out_dir){
  #' @description assigns the rows of the training dataset to cv_folds groups
  #' and writes the unique, sorted PRMS_segid values of each group to its own
  #' text file (spatial_val_reaches<fold>.txt).
  #'
  #' @param train_dataset tibble with columns for PRMS_segid and data_type.
  #' @param cv_folds number of cross validation folds in training. One
  #' validation file is written per fold.
  #' @param out_dir directory to save txt files
  #'
  #' @return vector of filepaths to the txt files, one per fold

  #one output file per fold
  fold_ids <- seq(1, cv_folds, 1)
  filepaths <- file.path(out_dir, paste0('spatial_val_reaches', fold_ids, '.txt'))

  #group == 0 marks rows that have not been assigned to a fold yet
  train_dataset$group <- 0

  #total row count never changes inside the loop
  n_rows_total <- nrow(train_dataset)

  for(fold in fold_ids){
    #rows still waiting for a fold assignment
    unassigned <- which(train_dataset$group == 0)

    if(fold != cv_folds){
      remaining <- train_dataset[unassigned,]
      #each fold should hold 1/cv_folds of the full dataset, so the proportion
      #is rescaled to the size of the not-yet-assigned subset
      prop_of_remaining <- n_rows_total/cv_folds/nrow(remaining)
      #NOTE(review): make_spatial_split_CVtraining is defined elsewhere; it is
      #used here as returning row indices within the supplied subset - confirm
      picked <- make_spatial_split_CVtraining(attrs_df = remaining,
                                              train_prop = prop_of_remaining)
    }else{
      #the last fold takes every row that is left over
      picked <- seq(1, length(unassigned), 1)
    }

    #label the chosen rows with this fold number
    train_dataset$group[unassigned][picked] <- fold

    #write the reaches belonging to this fold
    fold_reaches <- sort(unique(train_dataset$PRMS_segid[train_dataset$group == fold]))
    write_csv(x = as.data.frame(fold_reaches),
              file = filepaths[fold], col_names = FALSE)
  }

  filepaths
}

write_dates <- function(dataset, cv_folds = NULL, out_dir){
  #' @description generates a text file for each of the training and testing
  #' start and end dates for use in Python. When cv_folds is supplied, the
  #' training period is additionally cut into cv_folds equal-length pieces and
  #' the interior boundaries are written as val_start<k+1>.txt / val_end<k>.txt
  #' for k = 1, ..., cv_folds - 1.
  #'
  #' @param dataset list of training and testing datasets, each with a Date column
  #' @param cv_folds number of cross validation folds in training. This is used to
  #' write validation files. NULL (default) or 1 writes no validation files.
  #' @param out_dir directory to save txt files
  #'
  #' @return vector of paths to txt files containing the start and end dates.
  #' Order: train_start, train_end, test_start, test_end, then all val_start
  #' files, then all val_end files.

  base_files <- c('train_start.txt', 'train_end.txt', 'test_start.txt', 'test_end.txt')
  n_base <- length(base_files)

  #number of interior fold boundaries; seq_len() keeps cv_folds = 1 from
  #producing the descending sequence that seq(1, 0, 1) would error on
  n_val <- if (is.null(cv_folds)) 0 else max(cv_folds - 1, 0)
  val_nums <- seq_len(n_val)

  #only build validation names when there are boundaries to write;
  #paste0() on a zero-length vector would otherwise yield 'val_start.txt'
  val_files <- character(0)
  if (n_val > 0){
    val_files <- c(paste0('val_start', val_nums + 1, '.txt'),
                   paste0('val_end', val_nums, '.txt'))
  }
  filenames <- file.path(out_dir, c(base_files, val_files))

  train_start <- min(dataset$training$Date)
  train_end <- max(dataset$training$Date)
  test_start <- min(dataset$testing$Date)
  test_end <- max(dataset$testing$Date)

  #write_csv needs a data frame, so each date is wrapped in a 1x1 frame
  write_csv(as.data.frame(train_start), file = filenames[1], col_names = FALSE)
  write_csv(as.data.frame(train_end), file = filenames[2], col_names = FALSE)
  write_csv(as.data.frame(test_start), file = filenames[3], col_names = FALSE)
  write_csv(as.data.frame(test_end), file = filenames[4], col_names = FALSE)

  #determine validation start and end dates and write to file
  for(i in val_nums){
    #start of fold i+1 is i/cv_folds of the way through the training period;
    #fold i ends the day before
    val_start <- train_start + (train_end - train_start)/cv_folds*i
    val_end <- val_start - 1
    #val_start files occupy the n_val slots after the base files and the
    #val_end files the n_val slots after those, so the val_end offset must
    #depend on n_val (a fixed offset of 8 is only right for cv_folds = 5)
    write_csv(as.data.frame(val_start), file = filenames[n_base + i],
              col_names = FALSE)
    write_csv(as.data.frame(val_end), file = filenames[n_base + n_val + i],
              col_names = FALSE)
  }

  return(filenames)
}

write_pred_obs <- function(pred_obs, out_dir){
  #' @description writes the date, segment, observation, and prediction
  #' columns to pred_obs.txt in out_dir, with the '.pred' column renamed
  #' to 'pred'.
  #'
  #' @param pred_obs dataframe containing columns "Date", "PRMS_segid", "obs", and ".pred"
  #' @param out_dir directory to save txt file
  #'
  #' @return path to the txt file

  out_file <- file.path(out_dir, 'pred_obs.txt')

  #keep only the columns of interest, in this order
  kept <- select(pred_obs, "Date", "PRMS_segid", "obs", ".pred")
  #drop the leading dot from the prediction column name
  renamed <- rename(kept, pred = '.pred')

  write_csv(renamed, out_file)

  out_file
}
6 changes: 4 additions & 2 deletions _targets.R
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,8 @@ predict_dir <- "4_predict/out"
train_test_other <- c("vip","hypopt")
train_test_splits <- c("temporal","random","spatial")
train_test_features <- c("RF_static", "RF_min_static", "RF_static_dynamic",
"RF_min_static_dynamic","RF_dynamic")
"RF_min_static_dynamic","RF_dynamic","RGCN_static_dynamic",
"RGCN_min_static_dynamic","RGCN_dynamic")
train_test_res <- c("pred_obs","spatial_res","monthly_res","annual_res","temporal_res")
rf_xai_plot_types <- c("shap","dependence")
rf_xai_shap_options <- c("seasonal","lulc", "physio")
Expand Down Expand Up @@ -80,7 +81,8 @@ for(i in seq_along(p4_dirs)){
dir.create(p4_dirs[i], recursive = TRUE, showWarnings = FALSE)
}
dir.create("4_predict/out/XAI_splits", showWarnings = FALSE)

dir.create("4_predict/out/spatial/RGCN", showWarnings = FALSE)
dir.create("4_predict/out/temporal/RGCN", showWarnings = FALSE)

# Define columns of interest for harmonized WQP data
wqp_vars_select <- c("MonitoringLocationIdentifier","MonitoringLocationName","LongitudeMeasure","LatitudeMeasure",
Expand Down
Loading

0 comments on commit 8e292d3

Please sign in to comment.