-
Notifications
You must be signed in to change notification settings - Fork 4
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Gage metadata additions. #39
base: main
Are you sure you want to change the base?
Changes from 5 commits
db3d179
130d476
6aa7f51
794434e
21203f3
47d8261
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change | ||||
---|---|---|---|---|---|---|
|
@@ -107,7 +107,8 @@ get_cdec_gage_locations <- function(gages) { | |||||
nhdpv2_COMID = comid_medres, | ||||||
provider_id = id) |> | ||||||
mutate(nhdpv2_REACH_measure = rep(NA_real_, n()), | ||||||
nhdpv2_COMID = as.numeric(nhdpv2_COMID)) | ||||||
nhdpv2_COMID = as.numeric(nhdpv2_COMID), | ||||||
nhdpv2_link_source = "https://cdec.water.ca.gov") | ||||||
} | ||||||
|
||||||
# gages <- targets::tar_read("co_gage") | ||||||
|
@@ -117,31 +118,37 @@ get_co_gage_locations <- function(gages) { | |||||
select(provider_id = `DWR Abbrev`) |> | ||||||
mutate(nhdpv2_REACHCODE = rep(NA_character_, n()), | ||||||
nhdpv2_COMID = rep(NA_integer_, n()), | ||||||
nhdpv2_REACH_measure = rep(NA_real_, n())) | ||||||
nhdpv2_REACH_measure = rep(NA_real_, n()), | ||||||
nhdpv2_link_source = rep(NA_character_, n())) | ||||||
|
||||||
} | ||||||
|
||||||
get_nwis_hydrolocations <- function(nhdpv2_gage, | ||||||
swim_gage, | ||||||
nwis_hydrolocation) { | ||||||
|
||||||
nh <- read.csv(nwis_hydrolocation, colClasses = c("character", | ||||||
"integer", | ||||||
"character", | ||||||
"numeric")) | ||||||
|
||||||
nh <- mutate(nh, nhdpv2_link_source = "https://github.com/internetofwater/ref_gages/blob/main/data/nwis_hydrolocations.csv") | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The README file for this folder is blank. It looks like this file has corrections to the original data. If that is right, have you contacted the original source to make the corrections there? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. These are corrections on publications that aren't really subject to change. The hope is that the file in this repo becomes the place to track manual fixes if another source of truth doesn't become available. |
||||||
|
||||||
if(any(swim_gage$Gage_no %in% nh$provider_id)) stop("duplicates in override registry") | ||||||
|
||||||
swim_gage <- sf::st_drop_geometry(swim_gage) |> | ||||||
select(provider_id = Gage_no, | ||||||
nhdpv2_COMID = COMID, | ||||||
nhdpv2_REACHCODE = REACHCODE, | ||||||
nhdpv2_REACH_measure = REACH_meas) |> | ||||||
filter(nhdpv2_COMID != -9999) | ||||||
filter(nhdpv2_COMID != -9999) |> | ||||||
mutate(nhdpv2_link_source = "https://doi.org/10.5066/P9J5CK2Y") | ||||||
|
||||||
nh <- bind_rows(nh, swim_gage) | ||||||
|
||||||
nhdpv2_gage <- filter(st_drop_geometry(nhdpv2_gage), | ||||||
!provider_id %in% nh$provider_id) | ||||||
!provider_id %in% nh$provider_id) |> | ||||||
mutate(nhdpv2_link_source = "https://www.epa.gov/waterdata/nhdplus-national-hydrography-dataset-plus") | ||||||
|
||||||
bind_rows(nhdpv2_gage, nh) | ||||||
|
||||||
|
@@ -154,7 +161,7 @@ get_nwis_hydrolocations <- function(nhdpv2_gage, | |||||
get_hydrologic_locations <- function(all_gages, ref_locations, hydrologic_locations, nhdpv2_fline, | ||||||
da_diff_thresh = 0.5, search_radius_m = 500, | ||||||
max_matches_in_radius = 5) { | ||||||
|
||||||
v2_area <- select(nhdplusTools::get_vaa(), | ||||||
nhdpv2_COMID = comid, | ||||||
nhdpv2_totdasqkm = totdasqkm) | ||||||
|
@@ -171,6 +178,8 @@ get_hydrologic_locations <- function(all_gages, ref_locations, hydrologic_locati | |||||
all_gages$nhdpv2_REACHCODE <- NA | ||||||
all_gages$nhdpv2_REACH_measure <- NA | ||||||
all_gages$nhdpv2_COMID <- NA | ||||||
all_gages$nhdpv2_link_source <- NA | ||||||
all_gages$nhdpv2_offset_m <- NA | ||||||
|
||||||
for(hl in hydrologic_locations) { | ||||||
|
||||||
|
@@ -188,6 +197,8 @@ get_hydrologic_locations <- function(all_gages, ref_locations, hydrologic_locati | |||||
hl$locations$nhdpv2_REACH_measure | ||||||
all_gages$nhdpv2_COMID[provider_selector][matcher] <- | ||||||
hl$locations$nhdpv2_COMID | ||||||
all_gages$nhdpv2_link_source[provider_selector][matcher] <- | ||||||
hl$locations$nhdpv2_link_source | ||||||
|
||||||
# Some gages missing reachcode/measure but have COMID | ||||||
update_index <- is.na(all_gages$nhdpv2_REACH_measure & !is.na(all_gages$nhdpv2_COMID)) | ||||||
|
@@ -204,13 +215,31 @@ get_hydrologic_locations <- function(all_gages, ref_locations, hydrologic_locati | |||||
|
||||||
all_gages <- left_join(all_gages, v2_area, by = "nhdpv2_COMID") | ||||||
|
||||||
diff_da <- abs(all_gages$nhdpv2_totdasqkm - | ||||||
all_gages$drainage_area_sqkm) / | ||||||
all_gages$da_diff <- all_gages$nhdpv2_totdasqkm - all_gages$drainage_area_sqkm | ||||||
|
||||||
norm_diff_da <- all_gages$da_diff / | ||||||
all_gages$drainage_area_sqkm | ||||||
|
||||||
bad_da <- all_gages[!is.na(diff_da) & diff_da > da_diff_thresh, ] | ||||||
abs_norm_diff_da <- abs(norm_diff_da) | ||||||
|
||||||
bad_da <- all_gages[!is.na(all_gages$da_diff) & # has an estimate | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I see that your previous method considered any gage with a difference greater than 0.5 km^2 as bad, which may have identified more gages than this new filter. Have you compared the results of the old and new filters? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. |
||||||
|
||||||
((all_gages$drainage_area_sqkm <= 100 & | ||||||
# use unnormalized because differences so quantized | ||||||
# due to catchment resolution. | ||||||
# when da_diff is negative, use within 25% | ||||||
(all_gages$da_diff > 10 | (all_gages$da_diff < 0 & abs_norm_diff_da > 0.25))) | | ||||||
|
||||||
# is tens of catchments and within 10% | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I actually meant catchments here -- Since things are discretized to catchments for drainage area accumulation, the quanta is one catchment. The comment could read "is tens of catchments (~20-40sqkm) and within 10% drainage area match" since catchments are mostly in the range of 1-4sqkm. Does that make sense? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This cutoff is for the drainage area, not catchments, right? Do you know that all the gages that meet this criterion have multiple catchments? That's why I was confused by the comment There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I don't -- I'm just going on the knowledge of the typical size of a catchment. Do you think it would be worth investigating gages on the margins? When dealing with such large numbers of features, I tend to plow ahead and try not to get too stuck on details but may be going to far with that approach in this case. |
||||||
(all_gages$drainage_area_sqkm > 100 & | ||||||
abs_norm_diff_da > (0.1)) | | ||||||
|
||||||
# is hundreds of catchments and within 5% | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||
(all_gages$drainage_area_sqkm > 500 & | ||||||
abs_norm_diff_da > (0.05))), ] | ||||||
|
||||||
update_index <- which(is.na(all_gages$nhdpv2_COMID) | | ||||||
!all_gages$nhdpv2_COMID %in% nhdpv2_fline$COMID | | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'm not following this addition. What causes a gage in |
||||||
all_gages$provider_id %in% bad_da$provider_id) | ||||||
|
||||||
no_location <- all_gages[update_index, ] | ||||||
|
@@ -251,18 +280,55 @@ get_hydrologic_locations <- function(all_gages, ref_locations, hydrologic_locati | |||||
linked_gages <- select(no_location, provider_id) |> | ||||||
mutate(id = seq_len(n())) |> | ||||||
left_join(select(linked_gages_dedup, | ||||||
id, COMID, REACHCODE, REACH_meas), | ||||||
id, COMID, REACHCODE, REACH_meas, nhdpv2_totdasqkm, offset), | ||||||
by = "id") | ||||||
|
||||||
all_gages$nhdpv2_REACHCODE[update_index] <- linked_gages$REACHCODE | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Did you guarantee that the order of gages in There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. If the order is the same, you should be able to do these replacements in 1 line (see example for a different dataset in a comment below) |
||||||
all_gages$nhdpv2_REACH_measure[update_index] <- linked_gages$REACH_meas | ||||||
all_gages$nhdpv2_COMID[update_index] <- linked_gages$COMID | ||||||
all_gages$nhdpv2_totdasqkm[update_index] <- linked_gages$nhdpv2_totdasqkm | ||||||
all_gages$nhdpv2_link_source[update_index] <- rep("https://github.com/internetofwater/ref_gages", nrow(linked_gages)) | ||||||
all_gages$nhdpv2_offset_m[update_index] <- linked_gages$offset | ||||||
|
||||||
all_gages$nhdpv2_totdasqkm <- round(all_gages$nhdpv2_totdasqkm, digits = 1) | ||||||
all_gages$drainage_area_sqkm <- round(all_gages$drainage_area_sqkm, digits = 1) | ||||||
|
||||||
all_gages$da_diff <- all_gages$nhdpv2_totdasqkm - all_gages$drainage_area_sqkm | ||||||
|
||||||
all_gages | ||||||
add_offset(all_gages, nhdpv2_fline) | ||||||
} | ||||||
|
||||||
add_mainstems <- function(gage_hydrologic_locations, mainstems, vaa) { | ||||||
add_offset <- function(all_gages, nhdpv2_fline) { | ||||||
|
||||||
missing_offset <- all_gages |> | ||||||
filter(is.na(nhdpv2_offset_m) & !is.na(nhdpv2_REACHCODE)) | ||||||
|
||||||
missing_offset <- sf::st_transform(missing_offset, sf::st_crs(nhdpv2_fline)) | ||||||
|
||||||
new_indexes <- hydroloom::index_points_to_lines(nhdpv2_fline, sf::st_geometry(missing_offset), | ||||||
search_radius = units::set_units(1000, "meters"), | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 1 km is far to search. Is this based on known location precision issues? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It is far! It has to be this big because of large waterbodies that have artificial paths in the middle but the gage location on the shore. There's actually one in NHDPlusV2 gages that is >50km! on a Lake in upstate new york. |
||||||
ids = as.integer(missing_offset$nhdpv2_COMID)) | ||||||
|
||||||
missing_offset$point_id <- seq_len(nrow(missing_offset)) | ||||||
|
||||||
missing_offset <- left_join(missing_offset, new_indexes, by = "point_id") | ||||||
|
||||||
missing_offset$nhdpv2_offset_m <- missing_offset$offset | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. could use |
||||||
|
||||||
missing_offset <- sf::st_transform(missing_offset, sf::st_crs(all_gages)) | ||||||
|
||||||
missing_offset$nhdpv2_REACHCODE[is.na(missing_offset$nhdpv2_offset_m)] <- NA | ||||||
missing_offset$nhdpv2_REACH_measure[is.na(missing_offset$nhdpv2_offset_m)] <- NA | ||||||
missing_offset$nhdpv2_COMID[is.na(missing_offset$nhdpv2_offset_m)] <- NA | ||||||
missing_offset$nhdpv2_totdasqkm[is.na(missing_offset$nhdpv2_offset_m)] <- NA | ||||||
missing_offset$nhdpv2_link_source[is.na(missing_offset$nhdpv2_offset_m)] <- NA | ||||||
dblodgett-usgs marked this conversation as resolved.
Show resolved
Hide resolved
|
||||||
|
||||||
dplyr::bind_rows(filter(all_gages, !id %in% missing_offset$id), | ||||||
select(missing_offset, all_of(names(all_gages)))) | ||||||
} | ||||||
|
||||||
add_mainstems <- function(gage_hydrologic_locations, mainstems, vaa) { | ||||||
|
||||||
mainstems <- mainstems[,c("head_nhdpv2_COMID", "uri"), drop = TRUE] | ||||||
mainstems$head_nhdpv2_COMID <- as.integer(gsub("https://geoconnex.us/nhdplusv2/comid/", "", | ||||||
mainstems$head_nhdpv2_COMID)) | ||||||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,48 @@ | ||
validate_ref_gage <- function(registry_csv, reference_file, | ||
reference_locations_csv, | ||
providers_lookup_csv, reference_out) { | ||
|
||
registry <- read_csv(registry_csv) | ||
|
||
reference <- read_sf(reference_file) | ||
|
||
ref_locations <- read_csv(reference_locations_csv) | ||
|
||
providers <- read_csv(providers_lookup_csv) | ||
|
||
if(any(!reference$id %in% registry$id)) { | ||
stop("missing ids in registry") | ||
} | ||
|
||
if(any(!reference$id %in% ref_locations$id)) { | ||
stop("missing ids in ref locations") | ||
} | ||
|
||
if(any(!reference$provider %in% providers$provider)) { | ||
stop("missing providers") | ||
} | ||
|
||
if(any(!registry$provider %in% providers$id)) { | ||
stop("registry has unknown providers") | ||
} | ||
|
||
if(any(sapply(names(registry), \(x) any(is.na(registry[[x]]))))) { | ||
stop("NAs not allowed in registry") | ||
} | ||
|
||
if(any(!is.na(reference$nhdpv2_COMID) & is.na(reference$nhdpv2_offset_m))) { | ||
stop("if a comid is identified it must have an offset") | ||
} | ||
|
||
if(any(!is.na(reference$nhdpv2_COMID) & is.na(reference$nhdpv2_REACH_measure))) { | ||
stop("if a comid is identified it must have a measure") | ||
} | ||
|
||
if(any(!is.na(reference$nhdpv2_COMID) & is.na(reference$nhdpv2_link_source))) { | ||
stop("if a comid is identified it must have a link source") | ||
} | ||
|
||
if(any(sf::st_is_empty(sf::st_geometry(reference)))) { | ||
stop("all geometry must not be empty") | ||
} | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -2,11 +2,21 @@ library(targets) | |
|
||
tar_option_set(packages = c("nhdplusTools", "sf", "dplyr", "dataRetrieval", | ||
"sbtools", "readr", "knitr", "mapview", "data.table"), | ||
memory = "transient", garbage_collection = TRUE, | ||
debug = "gage_hydrologic_locations_with_mainstems") | ||
memory = "transient", garbage_collection = TRUE) | ||
|
||
# primary output file for geoconnex reference server | ||
reference_file <- "out/ref_gages.gpkg" | ||
|
||
# registry csv file which is checked in | ||
registry_csv <- "reg/ref_gages.csv" | ||
|
||
# locations for all known reference gages | ||
# https://github.com/internetofwater/ref_gages/issues/33 | ||
reference_locations_csv <- "reg/ref_locations.csv" | ||
|
||
# contains information for each gage provider | ||
providers_lookup_csv <- "reg/providers.csv" | ||
|
||
# this is a set of location overrides | ||
nwis_hydrolocation <- "data/nwis_hydrolocations.csv" | ||
Comment on lines
+7
to
21
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Do you want There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. providers, yes. registry and ref_locations I don't. I'll create an issue to fix that. |
||
|
||
|
@@ -63,6 +73,8 @@ list( | |
pnw_gage)), | ||
|
||
### location normalization ### | ||
# these targets generate a normalized form set of gages from each source. | ||
|
||
# This Gage layer from NHDPlusV2 is a basic starting point for | ||
# NWIS gage locations. | ||
tar_target("nhdpv2_gage", select(read_sf(nat_db, "Gage"), | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. general comment: the target name does not need quotes. I actually didn't know you could use quotes! There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. hahah -- I didn't know you didn't have to! |
||
|
@@ -84,19 +96,19 @@ list( | |
# Each entry will have a provider and provider_id that acts as a unique | ||
# primary key. The existing registry file will have a unique attribute | ||
# that contains that primary key. | ||
tar_target("providers_csv", command = "reg/providers.csv", format = "file"), | ||
tar_target("providers_csv", providers_lookup_csv, format = "file"), | ||
tar_target("providers", read_csv(providers_csv)), | ||
|
||
|
||
tar_target("registry", build_registry(gage_locations, | ||
registry = "reg/ref_gages.csv", | ||
registry = registry_csv, | ||
providers = providers)), | ||
|
||
# Also create a table of reference locations for the registered gages. | ||
# unlike the registry, this may update to have the "best" location of a gage. | ||
tar_target("ref_locations", build_reference_location(gage_locations, | ||
reference_locations = "reg/ref_locations.csv", | ||
registry = "reg/ref_gages.csv", | ||
reference_locations = reference_locations_csv, | ||
registry = registry_csv, | ||
providers = providers)), | ||
|
||
### spatial integration ### | ||
|
@@ -134,4 +146,9 @@ list( | |
registry, providers, reference_file, | ||
nldi_file, | ||
duplicate_locations = duplicate_locations)), | ||
tar_target("registry_out", write_registry(registry, "reg/ref_gages.csv"))) | ||
tar_target("registry_out", write_registry(registry, registry_csv)), | ||
|
||
tar_target("validation", validate_ref_gage(registry_csv, reference_file, | ||
reference_locations_csv, | ||
providers_lookup_csv, | ||
reference_out))) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Do any COMIDs start with a 0?
I like to read in COMID as a character because it is a unique identifier
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
No -- they are strictly integers. I use characters too but comid can be either one.