internetofwater · dblodgett-usgs · Oct 1, 2024 · Oct 1, 2024 · Oct 2, 2024 · Oct 3, 2024
diff --git a/R/gage_locations.R b/R/gage_locations.R
@@ -107,7 +107,8 @@ get_cdec_gage_locations <- function(gages) {
            nhdpv2_COMID = comid_medres,
            provider_id = id) |>
     mutate(nhdpv2_REACH_measure = rep(NA_real_, n()),
-           nhdpv2_COMID = as.numeric(nhdpv2_COMID))
+           nhdpv2_COMID = as.numeric(nhdpv2_COMID),
+           nhdpv2_link_source = "https://cdec.water.ca.gov")
 }
 
 # gages <- targets::tar_read("co_gage")
@@ -117,31 +118,37 @@ get_co_gage_locations <- function(gages) {
     select(provider_id = `DWR Abbrev`) |>
     mutate(nhdpv2_REACHCODE = rep(NA_character_, n()),
            nhdpv2_COMID = rep(NA_integer_, n()),
-           nhdpv2_REACH_measure = rep(NA_real_, n()))
+           nhdpv2_REACH_measure = rep(NA_real_, n()),
+           nhdpv2_link_source = rep(NA_character_, n()))
 
 }
 
 get_nwis_hydrolocations <- function(nhdpv2_gage, 
                                     swim_gage,
                                     nwis_hydrolocation) {
+
   nh <- read.csv(nwis_hydrolocation, colClasses = c("character", 
                                                     "integer", 
                                                     "character", 
                                                     "numeric"))
 
+  nh <- mutate(nh, nhdpv2_link_source = "https://github.com/internetofwater/ref_gages/blob/main/data/nwis_hydrolocations.csv")
+
   if(any(swim_gage$Gage_no %in% nh$provider_id)) stop("duplicates in override registry")
 
   swim_gage <- sf::st_drop_geometry(swim_gage) |>
     select(provider_id = Gage_no, 
            nhdpv2_COMID = COMID,
            nhdpv2_REACHCODE = REACHCODE,
            nhdpv2_REACH_measure = REACH_meas) |>
-    filter(nhdpv2_COMID != -9999)
+    filter(nhdpv2_COMID != -9999) |>
+    mutate(nhdpv2_link_source = "https://doi.org/10.5066/P9J5CK2Y")
 
   nh <- bind_rows(nh, swim_gage)
 
   nhdpv2_gage <- filter(st_drop_geometry(nhdpv2_gage), 
-                        !provider_id %in% nh$provider_id)
+                        !provider_id %in% nh$provider_id) |>
+    mutate(nhdpv2_link_source = "https://www.epa.gov/waterdata/nhdplus-national-hydrography-dataset-plus")
 
   bind_rows(nhdpv2_gage, nh)
 
@@ -154,7 +161,7 @@ get_nwis_hydrolocations <- function(nhdpv2_gage,
 get_hydrologic_locations <- function(all_gages, ref_locations, hydrologic_locations, nhdpv2_fline,
                                      da_diff_thresh = 0.5, search_radius_m = 500,
                                      max_matches_in_radius = 5) {
-  
+
   v2_area <- select(nhdplusTools::get_vaa(), 
                     nhdpv2_COMID = comid, 
                     nhdpv2_totdasqkm = totdasqkm)
@@ -171,6 +178,8 @@ get_hydrologic_locations <- function(all_gages, ref_locations, hydrologic_locati
   all_gages$nhdpv2_REACHCODE <- NA
   all_gages$nhdpv2_REACH_measure <- NA
   all_gages$nhdpv2_COMID <- NA
+  all_gages$nhdpv2_link_source <- NA
+  all_gages$nhdpv2_offset_m <- NA
 
   for(hl in hydrologic_locations) {
 
@@ -188,6 +197,8 @@ get_hydrologic_locations <- function(all_gages, ref_locations, hydrologic_locati
       hl$locations$nhdpv2_REACH_measure
     all_gages$nhdpv2_COMID[provider_selector][matcher] <- 
       hl$locations$nhdpv2_COMID
+    all_gages$nhdpv2_link_source[provider_selector][matcher] <- 
+      hl$locations$nhdpv2_link_source
 
     # Some gages missing reachcode/measure but have COMID
     update_index <- is.na(all_gages$nhdpv2_REACH_measure & !is.na(all_gages$nhdpv2_COMID))
@@ -204,13 +215,31 @@ get_hydrologic_locations <- function(all_gages, ref_locations, hydrologic_locati
 
   all_gages <- left_join(all_gages, v2_area, by = "nhdpv2_COMID")
 
-  diff_da <- abs(all_gages$nhdpv2_totdasqkm -
-                   all_gages$drainage_area_sqkm) / 
+  all_gages$da_diff <- all_gages$nhdpv2_totdasqkm - all_gages$drainage_area_sqkm
+
+  norm_diff_da <- all_gages$da_diff / 
     all_gages$drainage_area_sqkm
 
-  bad_da <- all_gages[!is.na(diff_da) & diff_da > da_diff_thresh, ]
+  abs_norm_diff_da <- abs(norm_diff_da)
+
+  bad_da <- all_gages[!is.na(all_gages$da_diff) & # has an estimate
+
+                        ((all_gages$drainage_area_sqkm <= 100 & 
+                            # use unnormalized because differences so quantized 
+                            # due to catchment resolution.
+                            # when da_diff is negative, use within 25%
+                            (all_gages$da_diff > 10 | (all_gages$da_diff < 0 & abs_norm_diff_da > 0.25))) |
+
+                           # is tens of catchments and within 10%
-                           # is tens of catchments and within 10%
+                           # is tens of kms and within 10%
-                           # is tens of catchments and within 10%
+                           # is tens of kms and within 10%
+                          (all_gages$drainage_area_sqkm > 100 & 
+                             abs_norm_diff_da > (0.1)) | 
+
+                           # is hundreds of catchments and within 5%
-                           # is hundreds of catchments and within 5%
+                           # is hundreds of kms and within 5%
-                           # is hundreds of catchments and within 5%
+                           # is hundreds of kms and within 5%
+                           (all_gages$drainage_area_sqkm > 500 & 
+                             abs_norm_diff_da > (0.05))), ] 
 
   update_index <- which(is.na(all_gages$nhdpv2_COMID) | 
+                          !all_gages$nhdpv2_COMID %in% nhdpv2_fline$COMID | 
                           all_gages$provider_id %in% bad_da$provider_id)
 
   no_location <- all_gages[update_index, ]
@@ -251,18 +280,55 @@ get_hydrologic_locations <- function(all_gages, ref_locations, hydrologic_locati
   linked_gages <- select(no_location, provider_id) |>
     mutate(id = seq_len(n())) |>
     left_join(select(linked_gages_dedup, 
-                     id, COMID, REACHCODE, REACH_meas), 
+                     id, COMID, REACHCODE, REACH_meas, nhdpv2_totdasqkm, offset), 
               by = "id")
 
   all_gages$nhdpv2_REACHCODE[update_index] <- linked_gages$REACHCODE
   all_gages$nhdpv2_REACH_measure[update_index] <- linked_gages$REACH_meas
   all_gages$nhdpv2_COMID[update_index] <- linked_gages$COMID
+  all_gages$nhdpv2_totdasqkm[update_index] <- linked_gages$nhdpv2_totdasqkm
+  all_gages$nhdpv2_link_source[update_index] <- rep("https://github.com/internetofwater/ref_gages", nrow(linked_gages))
+  all_gages$nhdpv2_offset_m[update_index] <- linked_gages$offset
+
+  all_gages$nhdpv2_totdasqkm <- round(all_gages$nhdpv2_totdasqkm, digits = 1)
+  all_gages$drainage_area_sqkm <- round(all_gages$drainage_area_sqkm, digits = 1)
+
+  all_gages$da_diff <- all_gages$nhdpv2_totdasqkm - all_gages$drainage_area_sqkm
 
-  all_gages
+  add_offset(all_gages, nhdpv2_fline)
 }
 
-add_mainstems <- function(gage_hydrologic_locations, mainstems, vaa) {
+add_offset <- function(all_gages, nhdpv2_fline) {
+
+  missing_offset <- all_gages |>
+    filter(is.na(nhdpv2_offset_m) & !is.na(nhdpv2_REACHCODE))
+
+  missing_offset <- sf::st_transform(missing_offset, sf::st_crs(nhdpv2_fline))
+
+  new_indexes <- hydroloom::index_points_to_lines(nhdpv2_fline, sf::st_geometry(missing_offset),
+                                                  search_radius =  units::set_units(1000, "meters"),
+                                                  ids = as.integer(missing_offset$nhdpv2_COMID))
+
+  missing_offset$point_id <- seq_len(nrow(missing_offset))
+
+  missing_offset <- left_join(missing_offset, new_indexes, by = "point_id")
+
+  missing_offset$nhdpv2_offset_m <- missing_offset$offset
+
+  missing_offset <- sf::st_transform(missing_offset, sf::st_crs(all_gages))
+
+  missing_offset$nhdpv2_REACHCODE[is.na(missing_offset$nhdpv2_offset_m)] <- NA
+  missing_offset$nhdpv2_REACH_measure[is.na(missing_offset$nhdpv2_offset_m)] <- NA
+  missing_offset$nhdpv2_COMID[is.na(missing_offset$nhdpv2_offset_m)] <- NA
+  missing_offset$nhdpv2_totdasqkm[is.na(missing_offset$nhdpv2_offset_m)] <- NA
+  missing_offset$nhdpv2_link_source[is.na(missing_offset$nhdpv2_offset_m)] <- NA
+
+  dplyr::bind_rows(filter(all_gages, !id %in% missing_offset$id),
+                          select(missing_offset, all_of(names(all_gages))))
+}
 
+add_mainstems <- function(gage_hydrologic_locations, mainstems, vaa) {
+
   mainstems <- mainstems[,c("head_nhdpv2_COMID", "uri"), drop = TRUE]
   mainstems$head_nhdpv2_COMID <- as.integer(gsub("https://geoconnex.us/nhdplusv2/comid/", "", 
                                                  mainstems$head_nhdpv2_COMID))

diff --git a/R/registry.R b/R/registry.R
@@ -93,7 +93,10 @@ build_reference_location <- function(gl, reference_locations, registry, provider
   if(nrow(loc) == 0) return(existing_locations)
 
   # else return all the old plus some new
-  bind_rows(existing_locations, loc)
+  out <- bind_rows(existing_locations, loc)
 
+  write_csv(out, reference_locations)
+
+  out
 }
 
diff --git a/R/validation.R b/R/validation.R
@@ -0,0 +1,48 @@
+validate_ref_gage <- function(registry_csv, reference_file, 
+                              reference_locations_csv, 
+                              providers_lookup_csv, reference_out) {
+
+  registry <- read_csv(registry_csv)
+
+  reference <- read_sf(reference_file)
+
+  ref_locations <- read_csv(reference_locations_csv)    
+
+  providers <- read_csv(providers_lookup_csv)
+
+  if(any(!reference$id %in% registry$id)) {
+    stop("missing ids in registry")
+  }
+
+  if(any(!reference$id %in% ref_locations$id)) {
+    stop("missing ids in ref locations")
+  }
+
+  if(any(!reference$provider %in% providers$provider)) {
+    stop("missing providers")
+  }
+
+  if(any(!registry$provider %in% providers$id)) {
+    stop("registry has unknown providers")
+  }
+
+  if(any(sapply(names(registry), \(x) any(is.na(registry[[x]]))))) {
+    stop("NAs not allowed in registry")
+  }
+
+  if(any(!is.na(reference$nhdpv2_COMID) & is.na(reference$nhdpv2_offset_m))) {
+    stop("if a comid is identified it must have an offset")
+  }
+
+  if(any(!is.na(reference$nhdpv2_COMID) & is.na(reference$nhdpv2_REACH_measure))) {
+    stop("if a comid is identified it must have a measure") 
+  }
+
+  if(any(!is.na(reference$nhdpv2_COMID) & is.na(reference$nhdpv2_link_source))) {
+    stop("if a comid is identified it must have a link source") 
+  }
+
+  if(any(sf::st_is_empty(sf::st_geometry(reference)))) {
+    stop("all geometry must not be empty")
+  }
+}
diff --git a/R/write_out.R b/R/write_out.R
@@ -1,6 +1,6 @@
 write_reference <- function(gage_hydrologic_locations, registry, providers, reference_file, nldi_file,
                             duplicate_locations) {
-  
+
   duplicate_locations$cluster_string <- unlist(lapply(duplicate_locations$cluster_id, \(x) {
     if(is.null(x)) return("")
     paste(paste0("https://geoconnex.us/ref/gages/", x), collapse = ",")
@@ -18,13 +18,16 @@ write_reference <- function(gage_hydrologic_locations, registry, providers, refe
     distinct()
 
   if(any(duplicated(out$identifier))) stop("duplicate identifiers?")
-  
+
   out <- out |>
     left_join(select(convert_provider_id(registry, providers), 
                      uri, identifier, id), by = "identifier") %>%
     select(id, uri, name, description, subjectOf, 
            provider, provider_id, nhdpv2_REACHCODE, 
-           nhdpv2_REACH_measure, nhdpv2_COMID, mainstem_uri) %>%
+           nhdpv2_REACH_measure, nhdpv2_COMID, nhdpv2_totdasqkm, 
+           nhdpv2_link_source, nhdpv2_offset_m,
+           gage_totdasqkm = drainage_area_sqkm, 
+           dasqkm_diff = da_diff, mainstem_uri) %>%
     mutate(id = as.integer(id)) |>
     left_join(dup, by = "uri")
 
@@ -43,7 +46,8 @@ write_usgs_reference <- function(gage_hydrologic_locations, registry, providers,
     left_join(select(convert_provider_id(registry, providers), 
                      uri, identifier, id), by = "identifier") %>%
     filter(provider == "https://waterdata.usgs.gov") %>%
-    select(id, uri, name, description, subjectOf, provider, provider_id, nhdpv2_REACHCODE, nhdpv2_REACH_measure, nhdpv2_COMID) %>%
+    select(id, uri, name, description, subjectOf, provider, provider_id, 
+           nhdpv2_REACHCODE, nhdpv2_REACH_measure, nhdpv2_COMID) %>%
     mutate(id = as.integer(id))
 
   out$id <- out$provider_id

diff --git a/_targets.R b/_targets.R
@@ -2,11 +2,21 @@ library(targets)
 
 tar_option_set(packages = c("nhdplusTools", "sf", "dplyr", "dataRetrieval", 
                             "sbtools", "readr", "knitr", "mapview", "data.table"),
-               memory = "transient", garbage_collection = TRUE,
-               debug = "gage_hydrologic_locations_with_mainstems")
+               memory = "transient", garbage_collection = TRUE)
 
+# primary output file for geoconnex reference server
 reference_file <- "out/ref_gages.gpkg"
 
+# registry csv file which is checked in
+registry_csv <- "reg/ref_gages.csv"
+
+# locations for all known reference gages
+# https://github.com/internetofwater/ref_gages/issues/33
+reference_locations_csv <- "reg/ref_locations.csv"
+
+# contains information for each gage provider
+providers_lookup_csv <- "reg/providers.csv"
+
 # this is a set of location overrides
 nwis_hydrolocation <- "data/nwis_hydrolocations.csv"
 
@@ -63,6 +73,8 @@ list(
                                                   pnw_gage)),
 
   ### location normalization ###
+  # these targets generate a normalized form set of gages from each source.
+
   # This Gage layer from NHDPlusV2 is a basic starting point for
   # NWIS gage locations.
   tar_target("nhdpv2_gage", select(read_sf(nat_db, "Gage"), 
@@ -84,19 +96,19 @@ list(
   # Each entry will have a provider and provider_id that acts as a unique
   # primary key. The existing registry file will have a unique attribute
   # that contains that primary key. 
-  tar_target("providers_csv", command = "reg/providers.csv", format = "file"),
+  tar_target("providers_csv", providers_lookup_csv, format = "file"),
   tar_target("providers", read_csv(providers_csv)),
 
 
   tar_target("registry", build_registry(gage_locations,
-                                        registry = "reg/ref_gages.csv",
+                                        registry = registry_csv,
                                         providers = providers)),
 
   # Also create a table of reference locations for the registered gages.
   # unlike the registry, this may update to have the "best" location of a gage. 
   tar_target("ref_locations", build_reference_location(gage_locations, 
-                                                       reference_locations = "reg/ref_locations.csv", 
-                                                       registry = "reg/ref_gages.csv", 
+                                                       reference_locations = reference_locations_csv, 
+                                                       registry = registry_csv, 
                                                        providers = providers)),
 
   ### spatial integration ###
@@ -134,4 +146,9 @@ list(
                                               registry, providers, reference_file, 
                                               nldi_file,
                                               duplicate_locations = duplicate_locations)),
-  tar_target("registry_out", write_registry(registry, "reg/ref_gages.csv")))
+  tar_target("registry_out", write_registry(registry, registry_csv)),
+
+  tar_target("validation", validate_ref_gage(registry_csv, reference_file, 
+                                             reference_locations_csv, 
+                                             providers_lookup_csv,
+                                             reference_out)))