2024 HQTA Methodology Revisions (draft) #1252

Status: Draft. Wants to merge 45 commits into base: main. Changes shown are from all commits.

Commits (45)
7f87713  add fixed peaks, test out stop-level averaging (edasmalchi, Oct 3, 2024)
ff8443a  fix example (edasmalchi, Oct 3, 2024)
7051570  even clearer (edasmalchi, Oct 3, 2024)
b25c1ab  fix exploding merge per Tiff suggestion (edasmalchi, Oct 3, 2024)
05c9482  remove debug (edasmalchi, Oct 3, 2024)
62b1e5d  add distance to sb125 path examples (edasmalchi, Oct 4, 2024)
621b2cc  clear map (edasmalchi, Oct 4, 2024)
0f83492  test scenarios, proceed with keeping multi-route aggregation (edasmalchi, Oct 8, 2024)
c41c4e0  update script and vars with separate hq corr and major stop frequence… (edasmalchi, Oct 8, 2024)
5db76c5  switch prep pairwise to major stop precursors, also make threshold ho… (edasmalchi, Oct 8, 2024)
4f06b62  revise sjoin stops to use fixed peak (edasmalchi, Oct 8, 2024)
53211e1  debug script (edasmalchi, Oct 9, 2024)
6da262e  wip (edasmalchi, Oct 9, 2024)
a1911fe  reran gtfs digest portfolio w sept 2024 data (amandaha8, Oct 2, 2024)
b17d47c  timestamp issue when comparing scheduled and rt lags (amandaha8, Oct 1, 2024)
f814a3a  transit bunching 2 min approach, began work on agency metrics in pip… (amandaha8, Oct 1, 2024)
e9deea6  fixing some weird github thing (amandaha8, Oct 2, 2024)
edc9dc3  figuring out why merge_data segment speed portion won't run (amandaha8, Oct 2, 2024)
bb6f702  testing my script for 2024 dates (amandaha8, Oct 3, 2024)
fe0cd52  added agency metrics to makefile and concat func (amandaha8, Oct 4, 2024)
55065f1  do not drop duplicates for feed to organization_name (Oct 4, 2024)
f86e741  rerun crosswalk tables for all dates with additional integer coercing (Oct 4, 2024)
2a69e4e  (remove): empty script (Oct 4, 2024)
7ddeb71  use operator instead of agency for consistency in yml (Oct 4, 2024)
721b5ba  switch ref from helpers to publish_utils and remove it from segment_sp… (Oct 4, 2024)
3298609  break out segment speeds time-series into tabular and geometry (Oct 4, 2024)
6cc2365  add notebook for feeds to organizations (Oct 4, 2024)
d57d448  add new script to Makefile (Oct 4, 2024)
1e68877  turn sco list wide to long (KatrinaMKaiser, Oct 4, 2024)
ac9639c  remove printed contact info (KatrinaMKaiser, Oct 4, 2024)
0cca9e3  summer work dashboard refactor (shweta487, Oct 6, 2024)
b9c1c2f  schedule stop metrics, backfill all dates (Sep 30, 2024)
2f95b2f  remove flex and private datasets from published_operators.yml (Sep 30, 2024)
6e6fed3  deprecate old config.yml function (Sep 30, 2024)
b47f494  add publish_utils for patching in previous dates and test on stops file (Oct 3, 2024)
c0b656e  combine publish_utils and prep_traffic_ops and update data dict (Oct 3, 2024)
680c154  (remove): publish_utils, combined into open_data_utils (Oct 3, 2024)
e77233a  refactor create routes and add patching (Oct 4, 2024)
605e2e5  (remove): open_data script, work it into metadata_update_pro script (Oct 4, 2024)
bcd1eb3  add list of route_ids to scheduled stops, refactor geoportal routes l… (Oct 4, 2024)
13ca9c0  update metadata with new columns for stops added (Oct 8, 2024)
3d3eee0  clean up nb (edasmalchi, Oct 9, 2024)
4b2201b  change intersection buffer, use new trips_hr cols (edasmalchi, Oct 10, 2024)
007cf76  allow selecting either hq corridor or ms precursor (edasmalchi, Oct 10, 2024)
61731a0  run full pipeline, start qa (edasmalchi, Oct 10, 2024)
2 changes: 1 addition & 1 deletion _shared_utils/setup.py
```diff
@@ -4,7 +4,7 @@
 setup(
     name="shared_utils",
     packages=find_packages(),
-    version="2.6",
+    version="2.7",
     description="Shared utility functions for data analyses",
     author="Cal-ITP",
     license="Apache",
```
18 changes: 0 additions & 18 deletions _shared_utils/shared_utils/catalog_utils.py
```diff
@@ -5,7 +5,6 @@
 from typing import Literal

 import intake
-import yaml
 from omegaconf import OmegaConf  # this is yaml parser

 repo_name = "data-analyses/"
@@ -22,20 +21,3 @@ def get_catalog(catalog_name: Literal["shared_data_catalog", "gtfs_analytics_data"]

     else:
         return intake.open_catalog(catalog_path)
-
-
-def get_parameters(config_file: str, key: str) -> dict:
-    """
-    Parse the config.yml file to get the parameters needed
-    for working with route or stop segments.
-    These parameters will be passed through the scripts when working
-    with vehicle position data.
-
-    Returns a dictionary of parameters.
-    """
-    # https://aaltoscicomp.github.io/python-for-scicomp/scripts/
-    with open(config_file) as f:
-        my_dict = yaml.safe_load(f)
-    params_dict = my_dict[key]
-
-    return params_dict
```
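Note: with `get_parameters()` removed, config values are read through the intake/OmegaConf catalog instead of raw `yaml.safe_load`. A minimal sketch of the replacement pattern, assuming only the catalog name and keys that appear elsewhere in this PR:

```python
from shared_utils import catalog_utils

# OmegaConf allows attribute-style access, replacing yaml.safe_load + dict indexing.
GTFS_DATA_DICT = catalog_utils.get_catalog("gtfs_analytics_data")

# e.g. the crosswalk table name used in publish_utils below
crosswalk_file = GTFS_DATA_DICT.schedule_tables.gtfs_key_crosswalk
```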
5 changes: 5 additions & 0 deletions _shared_utils/shared_utils/gtfs_analytics_data.yml
```diff
@@ -52,14 +52,19 @@ rt_vs_schedule_tables:
   sched_route_direction_metrics: "schedule_route_dir/schedule_route_direction_metrics"
   vp_trip_metrics: "vp_trip/trip_metrics"
   vp_route_direction_metrics: "vp_route_dir/route_direction_metrics"
+  vp_operator_metrics: "vp_operator/operator_metrics"
+  sched_stop_metrics: "schedule_stop/schedule_stop_metrics"
+  #vp_stop_metrics: "vp_stop/vp_stop_metrics" # WIP: transit bunching
   schedule_rt_stop_times: "schedule_rt_stop_times"
   early_trip_minutes: -5
   late_trip_minutes: 5

+
 digest_tables:
   dir: ${gcs_paths.RT_SCHED_GCS}
   route_schedule_vp: "digest/schedule_vp_metrics"
   route_segment_speeds: "digest/segment_speeds"
+  route_segment_geometry: "digest/segment_speeds_geom"
   operator_profiles: "digest/operator_profiles"
   operator_routes_map: "digest/operator_routes"
   operator_sched_rt: "digest/operator_schedule_rt_category"
```
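Note: `dir` uses OmegaConf interpolation (`${gcs_paths.RT_SCHED_GCS}`), so a full parquet path can be assembled straight from the catalog. A sketch, with the analysis date below purely illustrative:

```python
from shared_utils import catalog_utils

GTFS_DATA_DICT = catalog_utils.get_catalog("gtfs_analytics_data")
DIGEST = GTFS_DATA_DICT.digest_tables

# The ${...} reference resolves when the value is read, so DIGEST.dir is
# already a concrete gs:// prefix here; "2024-10-16" is a placeholder date.
geom_path = f"{DIGEST.dir}{DIGEST.route_segment_geometry}_2024-10-16.parquet"
```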
32 changes: 31 additions & 1 deletion _shared_utils/shared_utils/publish_utils.py
```diff
@@ -1,12 +1,16 @@
 import os
 from pathlib import Path
-from typing import Union
+from typing import Literal, Union

 import gcsfs
 import geopandas as gpd
 import pandas as pd
+from shared_utils import catalog_utils

 fs = gcsfs.GCSFileSystem()
+SCHED_GCS = "gs://calitp-analytics-data/data-analyses/gtfs_schedule/"
+PUBLIC_BUCKET = "gs://calitp-publish-data-analysis/"
+GTFS_DATA_DICT = catalog_utils.get_catalog("gtfs_analytics_data")


 def write_to_public_gcs(
@@ -59,3 +63,29 @@ def exclude_private_datasets(
     Filter out private datasets.
     """
     return df[df[col].isin(public_gtfs_dataset_keys)].reset_index(drop=True)
+
+
+def subset_table_from_previous_date(
+    gcs_bucket: str,
+    filename: Union[str, Path],
+    operator_and_dates_dict: dict,
+    date: str,
+    crosswalk_col: str = "schedule_gtfs_dataset_key",
+    data_type: Literal["df", "gdf"] = "df",
+) -> pd.DataFrame:
+    CROSSWALK_FILE = GTFS_DATA_DICT.schedule_tables.gtfs_key_crosswalk
+
+    crosswalk = pd.read_parquet(f"{SCHED_GCS}{CROSSWALK_FILE}_{date}.parquet", columns=["name", crosswalk_col])
+
+    subset_keys = crosswalk[crosswalk.name.isin(operator_and_dates_dict[date])][crosswalk_col].unique()
+
+    if data_type == "df":
+        past_df = pd.read_parquet(
+            f"{gcs_bucket}{filename}_{date}.parquet", filters=[[(crosswalk_col, "in", subset_keys)]]
+        )
+    else:
+        past_df = gpd.read_parquet(
+            f"{gcs_bucket}{filename}_{date}.parquet", filters=[[(crosswalk_col, "in", subset_keys)]]
+        )
+
+    return past_df
```
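Note: a possible call pattern for `subset_table_from_previous_date()`. The operator names, date, and filename below are illustrative stand-ins (in practice they would come from `published_operators.yml`), not values taken from this PR:

```python
from shared_utils import publish_utils

# Hypothetical patch: pull two operators' stops from an earlier run date.
operator_and_dates_dict = {
    "2024-09-18": ["Foothill Transit", "City of Duarte"],
}

past_stops = publish_utils.subset_table_from_previous_date(
    gcs_bucket="gs://calitp-analytics-data/data-analyses/gtfs_schedule/",
    filename="stops",
    operator_and_dates_dict=operator_and_dates_dict,
    date="2024-09-18",
    data_type="gdf",  # returns a GeoDataFrame via gpd.read_parquet
)
```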
19 changes: 8 additions & 11 deletions _shared_utils/shared_utils/schedule_rt_utils.py
```diff
@@ -151,19 +151,16 @@ def get_organization_id(
     sorting = [True for c in merge_cols]
     keep_cols = ["organization_source_record_id"]

-    # Eventually, we need to move to 1 organization name, so there's
-    # no fanout when we merge it on
-    # Until then, handle it by dropping duplicates and pick 1 name
-    dim_provider_gtfs_data2 = (
-        dim_provider_gtfs_data2.sort_values(
-            merge_cols + ["_valid_to", "_valid_from"], ascending=sorting + [False, False]
-        )
-        .drop_duplicates(merge_cols)
-        .reset_index(drop=True)[merge_cols + keep_cols]
-    )
+    # We allow fanout when merging a feed to multiple organization names,
+    # but we should handle it by selecting a preferred organization name
+    # rather than an alphabetical one
+    # (e.g. organization names Foothill Transit and City of Duarte).
+    dim_provider_gtfs_data2 = dim_provider_gtfs_data2.sort_values(
+        merge_cols + ["_valid_to", "_valid_from"], ascending=sorting + [False, False]
+    ).reset_index(drop=True)[merge_cols + keep_cols]

     df2 = pd.merge(df, dim_provider_gtfs_data2, on=merge_cols, how="inner")
+    # return dim_provider_gtfs_data2

     return df2
```
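Note: the revised comment leaves "selecting a preferred" organization name as future work. One hedged sketch of what that selection could look like; the column names, feed key, and priority list here are assumptions for illustration, not part of this PR:

```python
import pandas as pd

# Assumed curated priority list; nothing like this is defined in the PR.
PREFERRED_ORG_NAMES = ["Foothill Transit"]

def pick_preferred_org(df: pd.DataFrame, feed_col: str = "feed_key") -> pd.DataFrame:
    """Collapse feed-to-organization fanout, preferring curated names
    over whichever name happens to sort first alphabetically."""
    ranked = df.assign(
        _not_preferred=(~df.organization_name.isin(PREFERRED_ORG_NAMES)).astype(int)
    )
    return (
        ranked.sort_values([feed_col, "_not_preferred", "organization_name"])
        .drop_duplicates(subset=feed_col)
        .drop(columns="_not_preferred")
        .reset_index(drop=True)
    )
```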
2 changes: 1 addition & 1 deletion ahsc_grant/ACS_eda.ipynb
```diff
@@ -6629,7 +6629,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.10.6"
+   "version": "3.9.13"
   }
  },
  "nbformat": 4,
```
2 changes: 1 addition & 1 deletion ahsc_grant/process_mst.ipynb
```diff
@@ -1498,7 +1498,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.10.5"
+   "version": "3.9.13"
   }
  },
  "nbformat": 4,
```