Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Gage metadata additions. #39

Open
wants to merge 6 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
88 changes: 77 additions & 11 deletions R/gage_locations.R
Original file line number Diff line number Diff line change
Expand Up @@ -107,7 +107,8 @@ get_cdec_gage_locations <- function(gages) {
nhdpv2_COMID = comid_medres,
provider_id = id) |>
mutate(nhdpv2_REACH_measure = rep(NA_real_, n()),
nhdpv2_COMID = as.numeric(nhdpv2_COMID))
nhdpv2_COMID = as.numeric(nhdpv2_COMID),
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do any COMIDs start with a 0?
I like to read in COMID as a character because it is a unique identifier

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No -- they are strictly integers. I use characters too but comid can be either one.

nhdpv2_link_source = "https://cdec.water.ca.gov")
}

# gages <- targets::tar_read("co_gage")
Expand All @@ -117,31 +118,37 @@ get_co_gage_locations <- function(gages) {
select(provider_id = `DWR Abbrev`) |>
mutate(nhdpv2_REACHCODE = rep(NA_character_, n()),
nhdpv2_COMID = rep(NA_integer_, n()),
nhdpv2_REACH_measure = rep(NA_real_, n()))
nhdpv2_REACH_measure = rep(NA_real_, n()),
nhdpv2_link_source = rep(NA_character_, n()))

}

get_nwis_hydrolocations <- function(nhdpv2_gage,
swim_gage,
nwis_hydrolocation) {

nh <- read.csv(nwis_hydrolocation, colClasses = c("character",
"integer",
"character",
"numeric"))

nh <- mutate(nh, nhdpv2_link_source = "https://github.com/internetofwater/ref_gages/blob/main/data/nwis_hydrolocations.csv")
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The README file for this folder is blank. It looks like this file has corrections to the original data. If that is right, have you contacted the original source to make the corrections there?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

These are corrections on publications that aren't really subject to change. The hope is that the file in this repo becomes the place to track manual fixes if another source of truth doesn't become available.


if(any(swim_gage$Gage_no %in% nh$provider_id)) stop("duplicates in override registry")

swim_gage <- sf::st_drop_geometry(swim_gage) |>
select(provider_id = Gage_no,
nhdpv2_COMID = COMID,
nhdpv2_REACHCODE = REACHCODE,
nhdpv2_REACH_measure = REACH_meas) |>
filter(nhdpv2_COMID != -9999)
filter(nhdpv2_COMID != -9999) |>
mutate(nhdpv2_link_source = "https://doi.org/10.5066/P9J5CK2Y")

nh <- bind_rows(nh, swim_gage)

nhdpv2_gage <- filter(st_drop_geometry(nhdpv2_gage),
!provider_id %in% nh$provider_id)
!provider_id %in% nh$provider_id) |>
mutate(nhdpv2_link_source = "https://www.epa.gov/waterdata/nhdplus-national-hydrography-dataset-plus")

bind_rows(nhdpv2_gage, nh)

Expand All @@ -154,7 +161,7 @@ get_nwis_hydrolocations <- function(nhdpv2_gage,
get_hydrologic_locations <- function(all_gages, ref_locations, hydrologic_locations, nhdpv2_fline,
da_diff_thresh = 0.5, search_radius_m = 500,
max_matches_in_radius = 5) {

v2_area <- select(nhdplusTools::get_vaa(),
nhdpv2_COMID = comid,
nhdpv2_totdasqkm = totdasqkm)
Expand All @@ -171,6 +178,8 @@ get_hydrologic_locations <- function(all_gages, ref_locations, hydrologic_locati
all_gages$nhdpv2_REACHCODE <- NA
all_gages$nhdpv2_REACH_measure <- NA
all_gages$nhdpv2_COMID <- NA
all_gages$nhdpv2_link_source <- NA
all_gages$nhdpv2_offset_m <- NA

for(hl in hydrologic_locations) {

Expand All @@ -188,6 +197,8 @@ get_hydrologic_locations <- function(all_gages, ref_locations, hydrologic_locati
hl$locations$nhdpv2_REACH_measure
all_gages$nhdpv2_COMID[provider_selector][matcher] <-
hl$locations$nhdpv2_COMID
all_gages$nhdpv2_link_source[provider_selector][matcher] <-
hl$locations$nhdpv2_link_source

# Some gages missing reachcode/measure but have COMID
update_index <- is.na(all_gages$nhdpv2_REACH_measure & !is.na(all_gages$nhdpv2_COMID))
Expand All @@ -204,13 +215,31 @@ get_hydrologic_locations <- function(all_gages, ref_locations, hydrologic_locati

all_gages <- left_join(all_gages, v2_area, by = "nhdpv2_COMID")

diff_da <- abs(all_gages$nhdpv2_totdasqkm -
all_gages$drainage_area_sqkm) /
all_gages$da_diff <- all_gages$nhdpv2_totdasqkm - all_gages$drainage_area_sqkm

norm_diff_da <- all_gages$da_diff /
all_gages$drainage_area_sqkm

bad_da <- all_gages[!is.na(diff_da) & diff_da > da_diff_thresh, ]
abs_norm_diff_da <- abs(norm_diff_da)

bad_da <- all_gages[!is.na(all_gages$da_diff) & # has an estimate
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I see that your previous method considered any gage with a difference greater than 0.5 km^2 as bad, which may have identified more gages than this new filter. Have you compared the results of the old and new filters?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It was actually difference greater than 50% were removed and recalculated. Either way, I did look at what the 50% threshold looked like vs the new scheme.

The lines here are the threshold:

image
image

This is the final filter I came up with

372610036-ab5d0016-1847-493e-be61-59f19f12edf2
372610162-00633aa9-a544-4e4f-b7ef-077b347fb183


((all_gages$drainage_area_sqkm <= 100 &
# use unnormalized because differences so quantized
# due to catchment resolution.
# when da_diff is negative, use within 25%
(all_gages$da_diff > 10 | (all_gages$da_diff < 0 & abs_norm_diff_da > 0.25))) |

# is tens of catchments and within 10%
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
# is tens of catchments and within 10%
# is tens of kms and within 10%

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I actually meant catchments here -- Since things are discretized to catchments for drainage area accumulation, the quanta is one catchment. The comment could read "is tens of catchments (~20-40sqkm) and within 10% drainage area match" since catchments are mostly in the range of 1-4sqkm. Does that make sense?

Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This cutoff is for the drainage area, not catchments, right? Do you know that all the gages that meet this criterion have multiple catchments? That's why I was confused by the comment

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't -- I'm just going on the knowledge of the typical size of a catchment. Do you think it would be worth investigating gages on the margins? When dealing with such large numbers of features, I tend to plow ahead and try not to get too stuck on details but may be going to far with that approach in this case.

(all_gages$drainage_area_sqkm > 100 &
abs_norm_diff_da > (0.1)) |

# is hundreds of catchments and within 5%
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
# is hundreds of catchments and within 5%
# is hundreds of kms and within 5%

(all_gages$drainage_area_sqkm > 500 &
abs_norm_diff_da > (0.05))), ]

update_index <- which(is.na(all_gages$nhdpv2_COMID) |
!all_gages$nhdpv2_COMID %in% nhdpv2_fline$COMID |
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not following this addition. What causes a gage in all_gages to have a COMID that is not an nhdpv2_fline? Could the gage be on a stream that is not mapped in the network? I'd suggest retaining the original all_gages$nhdpv2_COMID in a column

all_gages$provider_id %in% bad_da$provider_id)

no_location <- all_gages[update_index, ]
Expand Down Expand Up @@ -251,18 +280,55 @@ get_hydrologic_locations <- function(all_gages, ref_locations, hydrologic_locati
linked_gages <- select(no_location, provider_id) |>
mutate(id = seq_len(n())) |>
left_join(select(linked_gages_dedup,
id, COMID, REACHCODE, REACH_meas),
id, COMID, REACHCODE, REACH_meas, nhdpv2_totdasqkm, offset),
by = "id")

all_gages$nhdpv2_REACHCODE[update_index] <- linked_gages$REACHCODE
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Did you guarantee that the order of gages in linked_gages is the same as the update_index order?

Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If the order is the same, you should be able to do these replacements in 1 line (see example for a different dataset in a comment below)

all_gages$nhdpv2_REACH_measure[update_index] <- linked_gages$REACH_meas
all_gages$nhdpv2_COMID[update_index] <- linked_gages$COMID
all_gages$nhdpv2_totdasqkm[update_index] <- linked_gages$nhdpv2_totdasqkm
all_gages$nhdpv2_link_source[update_index] <- rep("https://github.com/internetofwater/ref_gages", nrow(linked_gages))
all_gages$nhdpv2_offset_m[update_index] <- linked_gages$offset

all_gages$nhdpv2_totdasqkm <- round(all_gages$nhdpv2_totdasqkm, digits = 1)
all_gages$drainage_area_sqkm <- round(all_gages$drainage_area_sqkm, digits = 1)

all_gages$da_diff <- all_gages$nhdpv2_totdasqkm - all_gages$drainage_area_sqkm

all_gages
add_offset(all_gages, nhdpv2_fline)
}

add_mainstems <- function(gage_hydrologic_locations, mainstems, vaa) {
add_offset <- function(all_gages, nhdpv2_fline) {

missing_offset <- all_gages |>
filter(is.na(nhdpv2_offset_m) & !is.na(nhdpv2_REACHCODE))

missing_offset <- sf::st_transform(missing_offset, sf::st_crs(nhdpv2_fline))

new_indexes <- hydroloom::index_points_to_lines(nhdpv2_fline, sf::st_geometry(missing_offset),
search_radius = units::set_units(1000, "meters"),
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

1 km is far to search. Is this based on known location precision issues?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It is far! It has to be this big because of large waterbodies that have artificial paths in the middle but the gage location on the shore. There's actually one in NHDPlusV2 gages that is >50km! on a Lake in upstate new york.

ids = as.integer(missing_offset$nhdpv2_COMID))

missing_offset$point_id <- seq_len(nrow(missing_offset))

missing_offset <- left_join(missing_offset, new_indexes, by = "point_id")

missing_offset$nhdpv2_offset_m <- missing_offset$offset
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

could use rename() here instead of creating a new column


missing_offset <- sf::st_transform(missing_offset, sf::st_crs(all_gages))

missing_offset$nhdpv2_REACHCODE[is.na(missing_offset$nhdpv2_offset_m)] <- NA
missing_offset$nhdpv2_REACH_measure[is.na(missing_offset$nhdpv2_offset_m)] <- NA
missing_offset$nhdpv2_COMID[is.na(missing_offset$nhdpv2_offset_m)] <- NA
missing_offset$nhdpv2_totdasqkm[is.na(missing_offset$nhdpv2_offset_m)] <- NA
missing_offset$nhdpv2_link_source[is.na(missing_offset$nhdpv2_offset_m)] <- NA
dblodgett-usgs marked this conversation as resolved.
Show resolved Hide resolved

dplyr::bind_rows(filter(all_gages, !id %in% missing_offset$id),
select(missing_offset, all_of(names(all_gages))))
}

add_mainstems <- function(gage_hydrologic_locations, mainstems, vaa) {

mainstems <- mainstems[,c("head_nhdpv2_COMID", "uri"), drop = TRUE]
mainstems$head_nhdpv2_COMID <- as.integer(gsub("https://geoconnex.us/nhdplusv2/comid/", "",
mainstems$head_nhdpv2_COMID))
Expand Down
5 changes: 4 additions & 1 deletion R/registry.R
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,10 @@ build_reference_location <- function(gl, reference_locations, registry, provider
if(nrow(loc) == 0) return(existing_locations)

# else return all the old plus some new
bind_rows(existing_locations, loc)
out <- bind_rows(existing_locations, loc)

write_csv(out, reference_locations)

out
}

48 changes: 48 additions & 0 deletions R/validation.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
validate_ref_gage <- function(registry_csv, reference_file,
reference_locations_csv,
providers_lookup_csv, reference_out) {

registry <- read_csv(registry_csv)

reference <- read_sf(reference_file)

ref_locations <- read_csv(reference_locations_csv)

providers <- read_csv(providers_lookup_csv)

if(any(!reference$id %in% registry$id)) {
stop("missing ids in registry")
}

if(any(!reference$id %in% ref_locations$id)) {
stop("missing ids in ref locations")
}

if(any(!reference$provider %in% providers$provider)) {
stop("missing providers")
}

if(any(!registry$provider %in% providers$id)) {
stop("registry has unknown providers")
}

if(any(sapply(names(registry), \(x) any(is.na(registry[[x]]))))) {
stop("NAs not allowed in registry")
}

if(any(!is.na(reference$nhdpv2_COMID) & is.na(reference$nhdpv2_offset_m))) {
stop("if a comid is identified it must have an offset")
}

if(any(!is.na(reference$nhdpv2_COMID) & is.na(reference$nhdpv2_REACH_measure))) {
stop("if a comid is identified it must have a measure")
}

if(any(!is.na(reference$nhdpv2_COMID) & is.na(reference$nhdpv2_link_source))) {
stop("if a comid is identified it must have a link source")
}

if(any(sf::st_is_empty(sf::st_geometry(reference)))) {
stop("all geometry must not be empty")
}
}
12 changes: 8 additions & 4 deletions R/write_out.R
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
write_reference <- function(gage_hydrologic_locations, registry, providers, reference_file, nldi_file,
duplicate_locations) {

duplicate_locations$cluster_string <- unlist(lapply(duplicate_locations$cluster_id, \(x) {
if(is.null(x)) return("")
paste(paste0("https://geoconnex.us/ref/gages/", x), collapse = ",")
Expand All @@ -18,13 +18,16 @@ write_reference <- function(gage_hydrologic_locations, registry, providers, refe
distinct()

if(any(duplicated(out$identifier))) stop("duplicate identifiers?")

out <- out |>
left_join(select(convert_provider_id(registry, providers),
uri, identifier, id), by = "identifier") %>%
select(id, uri, name, description, subjectOf,
provider, provider_id, nhdpv2_REACHCODE,
nhdpv2_REACH_measure, nhdpv2_COMID, mainstem_uri) %>%
nhdpv2_REACH_measure, nhdpv2_COMID, nhdpv2_totdasqkm,
nhdpv2_link_source, nhdpv2_offset_m,
gage_totdasqkm = drainage_area_sqkm,
dasqkm_diff = da_diff, mainstem_uri) %>%
mutate(id = as.integer(id)) |>
left_join(dup, by = "uri")

Expand All @@ -43,7 +46,8 @@ write_usgs_reference <- function(gage_hydrologic_locations, registry, providers,
left_join(select(convert_provider_id(registry, providers),
uri, identifier, id), by = "identifier") %>%
filter(provider == "https://waterdata.usgs.gov") %>%
select(id, uri, name, description, subjectOf, provider, provider_id, nhdpv2_REACHCODE, nhdpv2_REACH_measure, nhdpv2_COMID) %>%
select(id, uri, name, description, subjectOf, provider, provider_id,
nhdpv2_REACHCODE, nhdpv2_REACH_measure, nhdpv2_COMID) %>%
mutate(id = as.integer(id))

out$id <- out$provider_id
Expand Down
31 changes: 24 additions & 7 deletions _targets.R
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,21 @@ library(targets)

tar_option_set(packages = c("nhdplusTools", "sf", "dplyr", "dataRetrieval",
"sbtools", "readr", "knitr", "mapview", "data.table"),
memory = "transient", garbage_collection = TRUE,
debug = "gage_hydrologic_locations_with_mainstems")
memory = "transient", garbage_collection = TRUE)

# primary output file for geoconnex reference server
reference_file <- "out/ref_gages.gpkg"

# registry csv file which is checked in
registry_csv <- "reg/ref_gages.csv"

# locations for all known reference gages
# https://github.com/internetofwater/ref_gages/issues/33
reference_locations_csv <- "reg/ref_locations.csv"

# contains information for each gage provider
providers_lookup_csv <- "reg/providers.csv"

# this is a set of location overrides
nwis_hydrolocation <- "data/nwis_hydrolocations.csv"
Comment on lines +7 to 21
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do you want targets to track changes to these files? If so, you'll need to create file or file_fast targets for them. It looks like some of these are tracked.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

providers, yes. registry and ref_locations I don't. I'll create an issue to fix that.


Expand Down Expand Up @@ -63,6 +73,8 @@ list(
pnw_gage)),

### location normalization ###
# these targets generate a normalized form set of gages from each source.

# This Gage layer from NHDPlusV2 is a basic starting point for
# NWIS gage locations.
tar_target("nhdpv2_gage", select(read_sf(nat_db, "Gage"),
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

general comment: the target name does not need quotes. I actually didn't know you could use quotes!

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

hahah -- I didn't know you didn't have to!

Expand All @@ -84,19 +96,19 @@ list(
# Each entry will have a provider and provider_id that acts as a unique
# primary key. The existing registry file will have a unique attribute
# that contains that primary key.
tar_target("providers_csv", command = "reg/providers.csv", format = "file"),
tar_target("providers_csv", providers_lookup_csv, format = "file"),
tar_target("providers", read_csv(providers_csv)),


tar_target("registry", build_registry(gage_locations,
registry = "reg/ref_gages.csv",
registry = registry_csv,
providers = providers)),

# Also create a table of reference locations for the registered gages.
# unlike the registry, this may update to have the "best" location of a gage.
tar_target("ref_locations", build_reference_location(gage_locations,
reference_locations = "reg/ref_locations.csv",
registry = "reg/ref_gages.csv",
reference_locations = reference_locations_csv,
registry = registry_csv,
providers = providers)),

### spatial integration ###
Expand Down Expand Up @@ -134,4 +146,9 @@ list(
registry, providers, reference_file,
nldi_file,
duplicate_locations = duplicate_locations)),
tar_target("registry_out", write_registry(registry, "reg/ref_gages.csv")))
tar_target("registry_out", write_registry(registry, registry_csv)),

tar_target("validation", validate_ref_gage(registry_csv, reference_file,
reference_locations_csv,
providers_lookup_csv,
reference_out)))
Loading