From a59f1fbf8809b10458502d95f21300553d7f6729 Mon Sep 17 00:00:00 2001
From: Mantra <mantrakusumgar@outlook.com>
Date: Mon, 2 Sep 2024 16:42:09 +0100
Subject: [PATCH] This branch tries to (and ultimately fails) speed up the
 review inputs metadata endpoint in Hintr. Profiling the code revealed these
 Naomi functions were the slow parts. After optimising these they are many
 times faster on the test data and the MWI data however where we ultimately
 fail is DRC for which there is only a slight speed up because the majority of
 the time for it is spent calculating the other plot types like art_new_total,
 art_new_adult_m, etc. We have a solution for this that will probably get done
 in a PR after the plot refactor. For now, the main changes in this PR: *
 drop_geometry arg to aggregate functions to skip unnecessary work (these
 functions were trying to drop geometry after it had already been dropped so
 wasting a lot of time reading the files) * small test fix, we should be
 checking `check3` (defined in the lines just above the diff) and not `check1`
 * new algorithm for aggregating up data, it is just a for loop that works
 level by level to aggregate up the data * same new algorithm is also applied
 to calculating `missing_map`

---
 DESCRIPTION                             |   2 +-
 R/input-time-series.R                   | 399 ++++++++++++------------
 man/aggregate_anc.Rd                    |   5 +-
 man/aggregate_art.Rd                    |   5 +-
 tests/testthat/test-input-time-series.R |   2 +-
 5 files changed, 210 insertions(+), 203 deletions(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index f84e736a..1fdeafd9 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: naomi
 Title: Naomi Model for Subnational HIV Estimates
-Version: 2.9.27
+Version: 2.9.28
 Authors@R:
     person(given = "Jeff",
            family = "Eaton",
diff --git a/R/input-time-series.R b/R/input-time-series.R
index 2ef2eba5..0c717318 100644
--- a/R/input-time-series.R
+++ b/R/input-time-series.R
@@ -5,19 +5,25 @@
 ##'
 ##' @param art Path to file containing ART data or ART data object
 ##' @param shape Path to file containing geojson areas data or areas data object
+##' @param drop_geometry Setting this to FALSE will skip dropping geometry on
+##'   the shape file (default TRUE)
 ##'
 ##' @return Aggregated ART data containing columns area_id, area_name,
 ##' area_level, area_level_label, parent_area_id, sex, age_group, time_period,
 ##'  year, quarter,calendar_quarter and art_current
 ##'
 ##' @export
-aggregate_art <- function(art, shape) {
+aggregate_art <- function(art, shape, drop_geometry = TRUE) {
 
   ## Check if shape is object or file path
-  if (!inherits(shape, "sf")) {
-    areas <- sf::read_sf(shape) |> sf::st_drop_geometry()
+  if (drop_geometry) {
+    if(!inherits(shape, "sf")) {
+      areas <- sf::read_sf(shape) |> sf::st_drop_geometry()
+    } else {
+      areas <- shape |> sf::st_drop_geometry()
+    }
   } else {
-    areas <- shape |> sf::st_drop_geometry()
+    areas <- shape
   }
 
   ## Check if art is object or file path
@@ -29,85 +35,71 @@ aggregate_art <- function(art, shape) {
   cols_list <- c("art_current", "art_new", "vl_tested_12mos", "vl_suppressed_12mos")
   cols_keep <- intersect(cols_list, colnames(art))
 
-  art <- art |>
+  # make sure the ART data is the correct shape
+  clean_art <- art |>
     dplyr::select(area_id, sex, age_group, calendar_quarter, dplyr::any_of(cols_list))
 
-  art_number <- art |>
-    dplyr::left_join(areas |> dplyr::select(area_id, area_level), by = "area_id")
-
-  aggregate_art_by_level <- function(art_number, level) {
-
-    ## Recursively aggregate ART data up from lowest level of programme data provided
-    # Levels to aggregate up from
-    max_dat <- dplyr::filter(art_number, area_level == level)
-    max_shape <- dplyr::filter(areas, area_level == level)
-
-    # Ensure entries exist for all programme data age/sex/quarter combinations X
-    # shape file area_ids at finest stratification
-
-    age_sex_df <- max_dat |>
-      dplyr::group_by(sex, age_group, calendar_quarter) |>
-      dplyr::summarise(.groups = "drop")
-
-    art_full <- tidyr::crossing(area_id = unique(max_shape$area_id),
-                                age_sex_df) |>
-      dplyr::left_join(max_dat, by = c("area_id", "sex", "age_group", "calendar_quarter")) |>
-      dplyr::mutate(area_level = level)
-
-
-    art_number_wide <- spread_areas(areas |> dplyr::filter(area_level <= level)) |>
-      dplyr::right_join(art_full, by = "area_id", multiple = "all")
-
-
-    # Function to aggregate based on area_id[0-9]$ columns in hierarchy
-    aggregate_data_art <- function(col_name) {
-
-      max_col_name <- paste0("area_id", level)
-
-      if (col_name == max_col_name) {
-        # Don't aggregate lowest level of data to retain missing values
-        df <- art_number_wide |> dplyr::select(area_id, sex, age_group, calendar_quarter,
-                                                all_of(cols_keep))
-      } else {
-
-        df <- art_number_wide |>
-          dplyr::group_by(!!col_name := .data[[col_name]], sex, age_group, calendar_quarter) |>
-          dplyr::summarise_at(dplyr::vars(dplyr::all_of(cols_keep)), ~sum(., na.rm = TRUE),
-                              .groups = "drop") |>
-          dplyr::rename_with(~gsub("[0-9]$", "", .))
-
-      }
-
-    }
-
-
-    # Aggregated data frame for area levels > data provided
-    aggregate_cols <- grep("^area_id*\\s*[0-9]$", colnames(art_number_wide), value = TRUE)
-
-    aggregate_cols |>
-      lapply(function(x) aggregate_data_art(x)) |>
-      dplyr::bind_rows() |>
-      dplyr::ungroup() |>
-      dplyr::bind_rows()
-  }
-
-  max_levels <- art_number |> dplyr::summarise(area_level = max(area_level),
-                                              .by = "calendar_quarter")
+  # get all combinations of all stratifications:
+  # > every calendar_quarter will have all age_groups present within that
+  #   year in the data. If only one row is missing age_group "Y000_014" within
+  #   a given calendar_quarter then we fill it in with NAs. If all rows within
+  #   a calendar_quarter are missing "Y000_014" then we omit it
+  # > within a calendar_quarter and age_group we have consistent sex values which
+  #   are either "both" or one of "male" and "female". Validation of sex values is
+  #   done elsewhere
+  all_strat_art <- clean_art |>
+    dplyr::group_by(calendar_quarter, age_group) |>
+    dplyr::reframe(sex = unique(dplyr::cur_data()$sex)) |>
+    dplyr::ungroup()
+
+  # this gets the max area_level per quarter so we can get a complete list of
+  # area_ids within that area level and left join later
+  quarter_by_area_level <- clean_art |>
+    dplyr::left_join(dplyr::select(areas, area_id, area_level), by = "area_id") |>
+    dplyr::group_by(calendar_quarter) |>
+    dplyr::summarise(area_level = max(area_level), .groups = "drop")
+
+  # combine all_strat_art with quarter_by_area_level and clean_art to get a
+  # complete data with all stratification combinations and area_ids for level
+  # with NAs where data was omitted in the CSV
+  agg_art <- all_strat_art |>
+    dplyr::left_join(quarter_by_area_level, by = "calendar_quarter") |>
+    dplyr::left_join(dplyr::select(areas, area_id, area_level), by = "area_level") |>
+    dplyr::left_join(clean_art, by = c("area_id", "calendar_quarter", "age_group", "sex"))
+
+  area_with_parent_ids <- areas |>
+    dplyr::select(area_id, parent_area_id)
+
+  max_area_level <- max(agg_art$area_level)
+
+  # every iteration of the loop aggregates the area up one area_level and
+  # adds all those rows to agg_art
+  for (level in max_area_level:1) {
+    agg_art <- agg_art |>
+      dplyr::filter(area_level == level) |>
+      dplyr::left_join(area_with_parent_ids, by = "area_id") |>
+      dplyr::group_by(parent_area_id, calendar_quarter, sex, age_group) |>
+      # > area_level is one less since we are aggregating to the parent
+      # > sum across all the other numeric values ignoring NAs
+      dplyr::summarise(
+        area_level = level - 1,
+        dplyr::across(dplyr::any_of(cols_list), ~sum(.x, na.rm = TRUE)),
+        .groups = "drop"
+      ) |>
+      dplyr::rename(area_id = parent_area_id) |>
+      dplyr::bind_rows(agg_art)
+  }  
 
-  ## Note we can have the case here that different calendar quarters have
-  ## a different max level e.g. Mozambique
-  art_long <- lapply(unique(max_levels$area_level),
-                     function(level) aggregate_art_by_level(art_number, level)) |>
-    dplyr::bind_rows() |>
+  # add in extra columns and sort
+  art_long <- agg_art |>
+    dplyr::left_join(
+      areas |> dplyr::select(
+        area_id, area_name, area_level_label, parent_area_id, area_sort_order
+      ), by = "area_id"
+    ) |>
     dplyr::mutate(year = calendar_quarter_to_year(calendar_quarter),
                   quarter = calendar_quarter_to_quarter(calendar_quarter),
                   time_period = paste0(year, " ", quarter)) |>
-    dplyr::left_join(
-      areas |>
-        dplyr::select(area_id, area_name, area_level,
-                      area_level_label, parent_area_id, area_sort_order),
-      by = c("area_id")
-    ) |>
     dplyr::select(area_id, area_level, area_name, area_level_label,parent_area_id,
                   area_sort_order, sex, age_group,time_period, year, quarter,
                   calendar_quarter, dplyr::everything()) |>
@@ -115,7 +107,6 @@ aggregate_art <- function(art, shape) {
 
   art_long$area_hierarchy <- build_hierarchy_label(art_long)
   art_long
-
 }
 
 
@@ -144,7 +135,7 @@ prepare_input_time_series_art <- function(art, shape) {
 
   ## Recursively aggregate ART data up from lowest level of programme data provided
   # Levels to aggregate up from
-  art_long <- aggregate_art(art, shape)
+  art_long <- aggregate_art(art, areas, drop_geometry = FALSE)
   sex_level <- unique(art_long$sex)
   age_level <- unique(art_long$age_group)
   admin_level <- max(art_long$area_level)
@@ -264,36 +255,42 @@ prepare_input_time_series_art <- function(art, shape) {
     dplyr::arrange(area_sort_order, calendar_quarter)
 
   # Tag data with NAs at the lowest admin level
-  area_level <- max(art_plot_data_long$area_level)
-
-  hierarchy_wide <- spread_areas(areas |> dplyr::filter(area_level <= admin_level)) |>
-    dplyr::select(dplyr::starts_with("area_id"))
-
+  art_level <- max(art_plot_data_long$area_level)
 
+  # initialise missing_map with values that are missing, these will only show up
+  # at the max admin level per calendar_quarter
   missing_map <- art_plot_data_long |>
-    # For edge cases where data is provided at different admin levels for
-    # different years (get max area level by year)
-    dplyr::select(calendar_quarter, area_level) |>
-    dplyr::group_by(calendar_quarter) |>
-    dplyr::summarise(area_level = max(area_level)) |>
-    # Select lowest admin level for each year
-    dplyr::left_join(art_plot_data_long |>
-                       dplyr::select(area_id, area_name, area_level, calendar_quarter, plot, value),
-                     multiple = "all", by = dplyr::join_by(calendar_quarter, area_level)) |>
-    # Joint to wide hierarchy
-    dplyr::left_join(hierarchy_wide, by = dplyr::join_by(area_id)) |>
-    # Filter for districts with missing value
-    dplyr::filter(is.na(value)) |>
-    dplyr::select(missing = area_id, dplyr::everything()) |>
-    # Get into long format
-    tidyr::pivot_longer(cols = dplyr::starts_with("area_id"), values_to = "area_id") |>
-    # Aggregate missing observations by area_id at all levels
-    dplyr::group_by(area_id, plot, calendar_quarter) |>
-    dplyr::summarise(missing_ids = list(missing), .groups = "drop")
-
+    dplyr::select(area_id, calendar_quarter, value, plot, area_level) |>
+    # find NAs, also check it isn't a NaN because these can appear in some
+    # derived columns where we divide by 0
+    dplyr::filter(is.na(value) & !is.nan(value)) |>
+    dplyr::select(-value) |>
+    dplyr::mutate(missing_ids = as.list(area_id))
+  
+  area_with_parent_ids <- areas |>
+    dplyr::select(area_id, parent_area_id)
+
+  # same idea as in aggregate_art, every iteration of the loop aggregates
+  # up one admin level
+  for (level in art_level:1) {
+    missing_map <- missing_map |>
+      dplyr::filter(area_level == level) |>
+      dplyr::left_join(area_with_parent_ids, by = "area_id") |>
+      dplyr::group_by(parent_area_id, calendar_quarter, plot) |>
+      # > missing_ids merge the two lists together
+      # > area_level decrease area_level by one because we aggregate
+      #   up to the parent
+      dplyr::summarise(
+        missing_ids = list(unlist(missing_ids, FALSE, FALSE)),
+        area_level = level - 1,
+        .groups = "drop"
+      ) |>
+      dplyr::rename(area_id = parent_area_id) |>
+      dplyr::bind_rows(missing_map)
+  }
 
   df_final <- art_plot_data_long |>
-    dplyr::left_join(missing_map, by = dplyr::join_by(area_id, calendar_quarter, plot)) |>
+    dplyr::left_join(missing_map, by = dplyr::join_by(area_id, calendar_quarter, plot, area_level)) |>
     dplyr::mutate(value = tidyr::replace_na(value, 0), missing = NULL)
 
   return(df_final)
@@ -307,6 +304,8 @@ prepare_input_time_series_art <- function(art, shape) {
 ##'
 ##' @param anc Path to file containing ANC data or ANC data object
 ##' @param shape Path to file containing geojson areas data or areas data object
+##' @param drop_geometry Setting this to FALSE will skip dropping geometry on
+##'   the shape file (default TRUE)
 ##'
 ##' @return Aggregated ANC data containing columns area_id, area_name, area_level,
 ##' area_level_label, sex,age_group, time_period, year, quarter, calendar_quarter,
@@ -314,16 +313,20 @@ prepare_input_time_series_art <- function(art, shape) {
 ##' births_clients_ratio
 
 ##' @export
-aggregate_anc <- function(anc, shape) {
+aggregate_anc <- function(anc, shape, drop_geometry = TRUE) {
 
   ## Recursively aggregate ANC data up from lowest level of programm data provided
   # Level to aggregate from
 
   ## Check if shape is object or file path
-  if(!inherits(shape, "sf")) {
-    areas <- sf::read_sf(shape) |> sf::st_drop_geometry()
+  if (drop_geometry) {
+    if(!inherits(shape, "sf")) {
+      areas <- sf::read_sf(shape) |> sf::st_drop_geometry()
+    } else {
+      areas <- shape |> sf::st_drop_geometry()
+    }
   } else {
-    areas <- shape |> sf::st_drop_geometry()
+    areas <- shape
   }
 
   ## Check if anc is object or file path
@@ -336,79 +339,67 @@ aggregate_anc <- function(anc, shape) {
                  "anc_tested", "anc_tested_pos", "anc_known_neg", "births_facility")
   cols_keep <- intersect(cols_list, colnames(anc))
 
-  anc <- anc |>
+  # make sure the ANC data is the correct shape, note that all the
+  # variables in cols_list are requried expect births_facility
+  clean_anc <- anc |>
     dplyr::select(area_id, age_group, year, dplyr::any_of(cols_list))
 
-  anc_testing <- anc |>
-    dplyr::left_join( dplyr::select(areas, area_id, area_level), by = "area_id")
-
-  # Split data by year and aggregate from lowest level available
-  anc_dat <- split(anc_testing , f = anc_testing$year)
-
-  aggregate_anc_by_level <- function(anc_testing){
-
-    ## Recursively aggregate ANC data up from lowest level of programme data provided
-    # Level to aggregate from
-    anc_level <- max(anc_testing$area_level)
-    max_dat <- dplyr::filter(anc_testing, area_level == anc_level)
-    max_shape <- dplyr::filter(areas, area_level == anc_level)
-
-
-    # Ensure entries exist for all programme data age/sex/quarter combinations X
-    # shape file area_ids at finest stratification
-    anc_full <- tidyr::crossing(area_id = unique(max_shape$area_id),
-                                age_group = unique(max_dat$age_group),
-                                year = unique(max_dat$year)) |>
-      dplyr::left_join(max_dat, by = c("area_id", "age_group", "year"))
-
-    # Join ANC data to hierarchy
-    anc_testing_wide <- areas |>
-      dplyr::filter(area_level <= anc_level) |>
-      spread_areas() |>
-      dplyr::right_join(anc_full, by = "area_id", multiple = "all")
-
-
-    # Function to aggregate based on area_id[0-9]$ columns in hierarchy
-    aggregate_data_anc <- function(col_name) {
-
-      max_admin <- paste0("area_id", anc_level)
-
-      if(col_name == max_admin) {
-        # Don't aggregate lowest level of data to retain missing values
-        df <- anc_testing_wide |> dplyr::select(area_id, age_group, year,
-                                                 all_of(cols_keep))
-      } else {
-
-        df <- anc_testing_wide |>
-          dplyr::group_by(!!col_name := .data[[col_name]], age_group, year) |>
-          dplyr::summarise_at(dplyr::vars(dplyr::all_of(cols_keep)), ~sum(., na.rm = TRUE),
-                              .groups = "drop") |>
-          dplyr::rename_with(~gsub("[0-9]$", "", .))
-      }
-    }
-
-    # Aggregated data frame for area levels
-    aggregate_cols <- grep("^area_id*\\s*[0-9]$", colnames(anc_testing_wide), value = TRUE)
-
-    aggregated_anc <- aggregate_cols |>
-      lapply(aggregate_data_anc) |>
-      dplyr::bind_rows() |>
-      dplyr::ungroup() |>
-      dplyr::bind_rows()
+  # initialise aggregated anc with the max admin level (most fine grained)
+  # and fill in and missing rows - if a year has max admin level n (these can be
+  # different per year) then we fill in values for any missing areas at admin
+  # level n with NAs
+  agg_anc <- clean_anc |>
+    dplyr::left_join(dplyr::select(areas, area_id, area_level), by = "area_id") |>
+    dplyr::group_by(year) |>
+    # summarise to table with columns year, area_level (max area level for this year)
+    # and age_group
+    dplyr::summarize(
+      area_level = dplyr::first(area_level),
+      age_group = dplyr::first(age_group)
+    ) |>
+    # expand each year row to multiple rows with all area_ids for that admin level
+    # this is the complete list of area_ids that we need
+    dplyr::left_join(
+      areas |> dplyr::select(area_id, area_level), by = "area_level"
+    ) |>
+    # left join complete list of area_ids with our potentially missing area_ids in
+    # clean_anc to get rows of NAs if an area_id is missing
+    dplyr::left_join(clean_anc, by = c("year", "area_id", "age_group")) |>
+    dplyr::select(area_id, age_group, year, area_level, dplyr::any_of(cols_list))
+
+  area_with_parent_ids <- areas |>
+    dplyr::select(area_id, parent_area_id)
+
+  max_area_level <- max(agg_anc$area_level)
+
+  # every iteration of the loop aggregates the area up one area_level and
+  # adds all those rows to agg_anc
+  for (level in max_area_level:1) {
+    agg_anc <- agg_anc |>
+      dplyr::filter(area_level == level) |>
+      dplyr::left_join(area_with_parent_ids, by = "area_id") |>
+      dplyr::group_by(parent_area_id, year, age_group) |>
+      # > area_level is one less since we are aggregating to the parent
+      # > sum across all the other numeric values ignoring NAs
+      dplyr::summarise(
+        area_level = level - 1,
+        dplyr::across(dplyr::any_of(cols_list), ~sum(.x, na.rm = TRUE)),
+        .groups = "drop"
+      ) |>
+      dplyr::rename(area_id = parent_area_id) |>
+      dplyr::bind_rows(agg_anc)
   }
 
-  anc_long <- lapply(anc_dat, aggregate_anc_by_level) |>
-    dplyr::bind_rows() |>
+  # add in extra columns and sort
+  anc_long <- agg_anc |>
+    dplyr::left_join(
+      areas |> dplyr::select(
+        area_id, area_name, area_level_label, area_sort_order, parent_area_id
+      ),
+      by = "area_id"
+    ) |>
     dplyr::mutate(time_period = as.character(year), quarter = "Q4", sex = "female",
                   calendar_quarter = paste0("CY", time_period, quarter)) |>
-    dplyr::left_join(areas |>
-                       dplyr::select(area_id, area_name, area_level,area_level_label,
-                                     parent_area_id, area_sort_order),
-                     by = "area_id") |>
-    dplyr::select(area_id, area_name, area_level, area_level_label,parent_area_id,
-                  area_sort_order, sex, age_group, time_period, year, quarter,
-                  calendar_quarter, dplyr::all_of(cols_keep)) |>
-    dplyr::ungroup() |>
     dplyr::arrange(year, area_sort_order)
 
   anc_long$area_hierarchy <- build_hierarchy_label(anc_long)
@@ -439,7 +430,7 @@ prepare_input_time_series_anc <- function(anc, shape) {
   }
 
   ## Shape data for plot
-  anc_long <- aggregate_anc(anc, shape)
+  anc_long <- aggregate_anc(anc, areas, drop_geometry = FALSE)
 
   anc_plot_data_long <- anc_long |>
     dplyr::mutate(
@@ -449,7 +440,7 @@ prepare_input_time_series_anc <- function(anc, shape) {
       anc_art_among_known = anc_already_art / anc_known_pos,
       anc_art_coverage = anc_already_art / anc_total_pos,
       births_clients_ratio = births_facility / anc_clients
-      ) |>
+    ) |>
     dplyr::select(area_id, area_name, area_level, area_level_label, parent_area_id,
                   area_sort_order, age_group,  time_period, year, quarter,
                   calendar_quarter, anc_clients, anc_tested, anc_tested_pos,
@@ -463,31 +454,41 @@ prepare_input_time_series_anc <- function(anc, shape) {
 
   # Tag data with NAs at the lowest admin level
   anc_level <- max(anc_plot_data_long$area_level)
-  hierarchy_wide <- spread_areas(areas |> dplyr::filter(area_level <= anc_level))
 
+  # initialise missing_map with values that are missing, these will only show up
+  # at the max admin level per year
   missing_map <- anc_plot_data_long |>
-    # For edge cases where data is provided at different admin levels for
-    # different years (get max area level by year)
-    dplyr::select(calendar_quarter, area_level) |>
-    dplyr::group_by(calendar_quarter) |>
-    dplyr::summarise(area_level = max(area_level)) |>
-    # Select lowest admin level for each year
-    dplyr::left_join(anc_plot_data_long |>
-                       dplyr::select(area_id, area_name, area_level, calendar_quarter, plot, value),
-                     multiple = "all", by = dplyr::join_by(calendar_quarter, area_level)) |>
-    # Joint to wide hierarchy
-    dplyr::left_join(hierarchy_wide, by = dplyr::join_by(area_id)) |>
-    # Filter for districts with missing value
-    dplyr::filter(is.na(value)) |>
-    dplyr::select(missing = area_id, dplyr::everything()) |>
-    # Get into long format
-    tidyr::pivot_longer(cols = dplyr::starts_with("area_id"), values_to = "area_id") |>
-    # Aggregate missing observations by area_id at all levels
-    dplyr::group_by(area_id, plot, calendar_quarter) |>
-    dplyr::summarise(missing_ids = list(missing), .groups = "drop")
+    dplyr::select(area_id, year, value, plot, area_level) |>
+    # find NAs, also check it isn't a NaN because these can appear in some
+    # derived columns where we divide by 0
+    dplyr::filter(is.na(value) & !is.nan(value)) |>
+    dplyr::select(-value) |>
+    dplyr::mutate(missing_ids = as.list(area_id))
+  
+  area_with_parent_ids <- areas |>
+    dplyr::select(area_id, parent_area_id)
+
+  # same idea as in aggregate_anc, every iteration of the loop aggregates
+  # up one admin level
+  for (level in anc_level:1) {
+    missing_map <- missing_map |>
+      dplyr::filter(area_level == level) |>
+      dplyr::left_join(area_with_parent_ids, by = "area_id") |>
+      dplyr::group_by(parent_area_id, year, plot) |>
+      # > missing_ids merge the two lists together
+      # > area_level decrease area_level by one because we aggregate
+      #   up to the parent
+      dplyr::summarise(
+        missing_ids = list(unlist(missing_ids, FALSE, FALSE)),
+        area_level = level - 1,
+        .groups = "drop"
+      ) |>
+      dplyr::rename(area_id = parent_area_id) |>
+      dplyr::bind_rows(missing_map)
+  }
 
   df_final <- anc_plot_data_long |>
-    dplyr::left_join(missing_map, by = dplyr::join_by(area_id, calendar_quarter, plot)) |>
+    dplyr::left_join(missing_map, by = dplyr::join_by(area_id, year, plot, area_level)) |>
     dplyr::mutate(value = tidyr::replace_na(value, 0), missing = NULL)
 
   return(df_final)
diff --git a/man/aggregate_anc.Rd b/man/aggregate_anc.Rd
index deada0b7..715fdf72 100644
--- a/man/aggregate_anc.Rd
+++ b/man/aggregate_anc.Rd
@@ -4,12 +4,15 @@
 \alias{aggregate_anc}
 \title{Aggregate ANC data according to area hierarchy}
 \usage{
-aggregate_anc(anc, shape)
+aggregate_anc(anc, shape, drop_geometry = TRUE)
 }
 \arguments{
 \item{anc}{Path to file containing ANC data or ANC data object}
 
 \item{shape}{Path to file containing geojson areas data or areas data object}
+
+\item{drop_geometry}{Setting this to FALSE will skip dropping geometry on
+the shape file (default TRUE)}
 }
 \value{
 Aggregated ANC data containing columns area_id, area_name, area_level,
diff --git a/man/aggregate_art.Rd b/man/aggregate_art.Rd
index e507ed79..af922cf0 100644
--- a/man/aggregate_art.Rd
+++ b/man/aggregate_art.Rd
@@ -4,12 +4,15 @@
 \alias{aggregate_art}
 \title{Aggregate ART data according to area hierarchy}
 \usage{
-aggregate_art(art, shape)
+aggregate_art(art, shape, drop_geometry = TRUE)
 }
 \arguments{
 \item{art}{Path to file containing ART data or ART data object}
 
 \item{shape}{Path to file containing geojson areas data or areas data object}
+
+\item{drop_geometry}{Setting this to FALSE will skip dropping geometry on
+the shape file (default TRUE)}
 }
 \value{
 Aggregated ART data containing columns area_id, area_name,
diff --git a/tests/testthat/test-input-time-series.R b/tests/testthat/test-input-time-series.R
index 1d5542f8..196664e6 100644
--- a/tests/testthat/test-input-time-series.R
+++ b/tests/testthat/test-input-time-series.R
@@ -152,7 +152,7 @@ test_that("ART data can be aggregated when avalible at different admin levels",
     dplyr::group_by(area_level_label) %>%
     dplyr::summarise(n = dplyr::n(), .groups = "drop")
 
-  expect_equal(check1$n, c(10, 10, 6))
+  expect_equal(check3$n, c(10, 10, 6))
 
 
   # (4) Test that ART data can be aggregated with missing records