Performance Improvements in User Stats Processing (#124)
* Polars of 2 DB Utils Function

* Cut Load Time in half and added timers

* Delete docker-compose-dev.yml.bak

* Added Timers

* commit fix

* commit fix

* update gitignore

* update

* batch loading

* Batch Loading

* Removed Polars, Fixed Trajectory bug

* Batch

* Loads but refreshes

* Worksgit add .

* Reverted Changes for new pr

* Fix

* Fix

* Req being weird

* Revert last line change

---------

Co-authored-by: K. Shankari <[email protected]>
TeachMeTW and shankari authored Sep 18, 2024
1 parent 38a50dc commit b9b0c34
Showing 3 changed files with 202 additions and 86 deletions.
12 changes: 7 additions & 5 deletions app_sidebar_collapsible.py
@@ -210,17 +210,19 @@ def make_controls():
'flex-direction': 'column'}
)

page_content = dcc.Loading(
type='default',
fullscreen=True,
children=html.Div(dash.page_container, style={
# Dcc Loading removed for Data Page Lazy Loading.
# TODO Figure out how to enable Loading on everything BUT Data Page UUIDs Tab
page_content = html.Div(
dash.page_container,
style={
"margin-left": "5rem",
"margin-right": "2rem",
"padding": "2rem 1rem",
})
}
)



def make_home_page(): return [
sidebar,
html.Div([make_controls(), page_content])
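The comment in the hunk above leaves a TODO: re-enable a loading spinner everywhere except the Data page's UUIDs tab. One possible direction, sketched here under stated assumptions and not part of this commit, is to move dcc.Loading from the global page_container down into the individual page layouts that still want it:

# Hypothetical sketch (not part of this commit): instead of one global
# dcc.Loading around dash.page_container, pages that still want a spinner
# can wrap their own layout, leaving the Data page free to lazy-load.
from dash import dcc, html

def wrap_with_spinner(page_layout):
    """Wrap a single page's layout in dcc.Loading."""
    return dcc.Loading(type='default', children=html.Div(page_layout))

# In a page module that should keep the spinner (e.g. a hypothetical pages/home.py):
# layout = wrap_with_spinner(html.Div("Home page content"))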
143 changes: 98 additions & 45 deletions pages/data.py
@@ -3,12 +3,13 @@
Since the dcc.Location component is not in the layout when navigating to this page, it triggers the callback.
The workaround is to check if the input value is None.
"""
from dash import dcc, html, Input, Output, callback, register_page, dash_table, State
from dash import dcc, html, Input, Output, callback, register_page, dash_table, State, callback_context, Patch
# Etc
import logging
import time
import pandas as pd
from dash.exceptions import PreventUpdate

from concurrent.futures import ThreadPoolExecutor, as_completed
from utils import constants
from utils import permissions as perm_utils
from utils import db_utils
@@ -28,10 +29,14 @@
dcc.Tab(label='Trajectories', value='tab-trajectories-datatable'),
]),
html.Div(id='tabs-content'),
dcc.Interval(id='interval-load-more', interval=6000, n_intervals=0),
dcc.Store(id='store-uuids', data=[]), # Store to hold the original UUIDs data
dcc.Store(id='store-loaded-uuids', data={'data': [], 'loaded': False}) # Store to track loaded data
]
)



def clean_location_data(df):
if 'data.start_loc.coordinates' in df.columns:
df['data.start_loc.coordinates'] = df['data.start_loc.coordinates'].apply(lambda x: f'({x[0]}, {x[1]})')
@@ -51,6 +56,8 @@ def update_store_trajectories(start_date: str, end_date: str, tz: str, excluded_

@callback(
Output('tabs-content', 'children'),
Output('store-loaded-uuids', 'data'),
Output('interval-load-more', 'disabled'), # Disable interval when all data is loaded
Input('tabs-datatable', 'value'),
Input('store-uuids', 'data'),
Input('store-excluded-uuids', 'data'),
@@ -60,66 +67,110 @@ def update_store_trajectories(start_date: str, end_date: str, tz: str, excluded_
Input('date-picker', 'start_date'),
Input('date-picker', 'end_date'),
Input('date-picker-timezone', 'value'),
Input('interval-load-more', 'n_intervals'),  # Interval to trigger the loading of more data
State('store-loaded-uuids', 'data'), # Use State to track already loaded data
State('store-loaded-uuids', 'loaded'), # Keep track if we have finished loading all data
)
def render_content(tab, store_uuids, store_excluded_uuids, store_trips, store_demographics, store_trajectories, start_date, end_date, timezone):
data, columns, has_perm = None, [], False
if tab == 'tab-uuids-datatable':
data = store_uuids["data"]
data = db_utils.add_user_stats(data)
columns = perm_utils.get_uuids_columns()
has_perm = perm_utils.has_permission('data_uuids')
def render_content(tab, store_uuids, store_excluded_uuids, store_trips, store_demographics, store_trajectories, start_date, end_date, timezone, n_intervals, loaded_uuids_store, all_data_loaded):
initial_batch_size = 10 # Define the batch size for loading UUIDs

# Ensure store_uuids contains the key 'data' which is a list of dictionaries
if not isinstance(store_uuids, dict) or 'data' not in store_uuids:
logging.error(f"Expected store_uuids to be a dict with a 'data' key, but got {type(store_uuids)}")
return html.Div([html.P("Data structure error.")]), loaded_uuids_store, True

# Extract the list of UUIDs from the dict
uuids_list = store_uuids['data']

# Ensure uuids_list is a list for slicing
if not isinstance(uuids_list, list):
logging.error(f"Expected store_uuids['data'] to be a list but got {type(uuids_list)}")
return html.Div([html.P("Data structure error.")]), loaded_uuids_store, True

# Retrieve already loaded data from the store
loaded_data = loaded_uuids_store.get('data', [])
total_loaded = len(loaded_data)

# Handle the UUIDs tab with lazy loading
if tab == 'tab-uuids-datatable' and not loaded_uuids_store.get('loaded', False):
total_to_load = total_loaded + initial_batch_size
total_to_load = min(total_to_load, len(uuids_list)) # Avoid loading more than available

logging.debug(f"Loading next batch of UUIDs: {total_loaded} to {total_to_load}")

# Slice the list of UUIDs from the dict
new_data = uuids_list[total_loaded:total_to_load]

if new_data:
# Process and append the new data to the loaded store
processed_data = db_utils.add_user_stats(new_data, initial_batch_size)
loaded_data.extend(processed_data)

# Create a Patch object to append data progressively
patched_data = Patch()
patched_data['data'] = processed_data

# Update the store with the new data
loaded_uuids_store['data'] = loaded_data
loaded_uuids_store['loaded'] = len(loaded_data) >= len(uuids_list) # Mark all data as loaded if done

logging.debug(f"New batch loaded. Total loaded: {len(loaded_data)}")

# Prepare the data to be displayed
columns = perm_utils.get_uuids_columns() # Get the relevant columns
df = pd.DataFrame(loaded_data)

if df.empty or not perm_utils.has_permission('data_uuids'):
logging.debug("No data or permission issues.")
return html.Div([html.P("No data available or you don't have permission.")]), loaded_uuids_store, False

df = df.drop(columns=[col for col in df.columns if col not in columns])

# NOTE: the Patch() object built above is not returned yet; the table is
# currently rebuilt from the full list of loaded rows on every batch.
logging.debug("Returning updated UUID table to the UI.")
return html.Div([populate_datatable(df)]), loaded_uuids_store, loaded_uuids_store['loaded']


# Handle other tabs normally
elif tab == 'tab-trips-datatable':
data = store_trips["data"]
columns = perm_utils.get_allowed_trip_columns()
columns.update(
col['label'] for col in perm_utils.get_allowed_named_trip_columns()
)
columns.update(col['label'] for col in perm_utils.get_allowed_named_trip_columns())
columns.update(store_trips["userinputcols"])
has_perm = perm_utils.has_permission('data_trips')

df = pd.DataFrame(data)
if df.empty or not has_perm:
return None
return None, loaded_uuids_store, True

logging.debug(f"Final list of retained cols {columns=}")
logging.debug(f"Before dropping, {df.columns=}")
df = df.drop(columns=[col for col in df.columns if col not in columns])
logging.debug(f"After dropping, {df.columns=}")
df = clean_location_data(df)

trips_table = populate_datatable(df,'trips-table')
#Return an HTML Div containing a button (button-clicked) and the populated datatable
trips_table = populate_datatable(df, 'trips-table')
logging.debug(f"Returning 3 values: {trips_table}, {loaded_uuids_store}, True")
return html.Div([
html.Button(
'Display columns with raw units',
id='button-clicked', #identifier for the button
n_clicks=0, #initialize number of clicks to 0
style={'marginLeft':'5px'}
),
trips_table, #populated trips table component
])

html.Button('Display columns with raw units', id='button-clicked', n_clicks=0, style={'marginLeft': '5px'}),
trips_table
]), loaded_uuids_store, True

elif tab == 'tab-demographics-datatable':
data = store_demographics["data"]
has_perm = perm_utils.has_permission('data_demographics')
# if only one survey is available, process it without creating a subtab
if len(data) == 1:
# here data is a dictionary

if len(data) == 1:
data = list(data.values())[0]
columns = list(data[0].keys())
# for multiple survey, create subtabs for unique surveys
elif len(data) > 1:
#returns subtab only if has_perm is True
if not has_perm:
return None
return None, loaded_uuids_store, True
return html.Div([
dcc.Tabs(id='subtabs-demographics', value=list(data.keys())[0], children=[
dcc.Tab(label= key, value= key) for key in data
]),
dcc.Tab(label=key, value=key) for key in data
]),
html.Div(id='subtabs-demographics-content')
])
]), loaded_uuids_store, True

elif tab == 'tab-trajectories-datatable':
# Currently store_trajectories data is loaded only when the respective tab is selected
#Here we query for trajectory data once "Trajectories" tab is selected
(start_date, end_date) = iso_to_date_only(start_date, end_date)
if store_trajectories == {}:
store_trajectories = update_store_trajectories(start_date, end_date, timezone, store_excluded_uuids)
@@ -128,14 +179,17 @@ def render_content(tab, store_uuids, store_excluded_uuids, store_trips, store_de
columns = list(data[0].keys())
columns = perm_utils.get_trajectories_columns(columns)
has_perm = perm_utils.has_permission('data_trajectories')

df = pd.DataFrame(data)
if df.empty or not has_perm:
return None

df = df.drop(columns=[col for col in df.columns if col not in columns])
df = pd.DataFrame(data)
if df.empty or not has_perm:
return None, loaded_uuids_store, True

df = df.drop(columns=[col for col in df.columns if col not in columns])
return populate_datatable(df), loaded_uuids_store, True

# Default case: if no data is loaded or the tab is not handled
return None, loaded_uuids_store, True

return populate_datatable(df)

# handle subtabs for demographic table when there are multiple surveys
@callback(
Expand Down Expand Up @@ -177,7 +231,6 @@ def update_dropdowns_trips(n_clicks, button_label):
#return the list of hidden columns and the updated button label
return hidden_col, button_label


def populate_datatable(df, table_id=''):
if not isinstance(df, pd.DataFrame):
raise PreventUpdate
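Taken together, the data.py changes implement an interval-driven batch loader: a dcc.Interval ticks, the callback slices the next batch of UUIDs out of one dcc.Store, appends the processed rows to a second Store, and disables the interval once everything is loaded. A stripped-down, self-contained sketch of that pattern (the component ids, the synthetic row data, and the process_batch helper are illustrative, not the dashboard's real code):

from dash import Dash, Input, Output, State, callback, dash_table, dcc, html

BATCH_SIZE = 10  # illustrative; mirrors the initial_batch_size of 10 used above

app = Dash(__name__)
app.layout = html.Div([
    dcc.Store(id='all-rows', data=[{'uuid': f'user-{i}'} for i in range(47)]),
    dcc.Store(id='loaded-rows', data={'data': [], 'loaded': False}),
    dcc.Interval(id='load-more', interval=6000, n_intervals=0),  # ms, same cadence as the commit
    html.Div(id='table-container'),
])

def process_batch(rows):
    """Stand-in for the per-user stats lookup that db_utils.add_user_stats performs."""
    return [{**row, 'processed': True} for row in rows]

@callback(
    Output('table-container', 'children'),
    Output('loaded-rows', 'data'),
    Output('load-more', 'disabled'),   # stop polling once everything is loaded
    Input('load-more', 'n_intervals'),
    State('all-rows', 'data'),
    State('loaded-rows', 'data'),
)
def load_next_batch(_, all_rows, loaded):
    data = loaded.get('data', [])
    if not loaded.get('loaded', False):
        batch = all_rows[len(data):len(data) + BATCH_SIZE]
        data.extend(process_batch(batch))
        loaded = {'data': data, 'loaded': len(data) >= len(all_rows)}
    columns = [{'name': k, 'id': k} for k in data[0].keys()] if data else []
    table = dash_table.DataTable(data=data, columns=columns) if data else html.P("Loading...")
    return html.Div(table), loaded, loaded['loaded']

if __name__ == '__main__':
    app.run(debug=True)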
(Diff of the third changed file is collapsed and not shown here.)
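data.py now imports ThreadPoolExecutor/as_completed and calls db_utils.add_user_stats(new_data, initial_batch_size), which suggests the collapsed file batches and parallelizes the per-user stats queries. That implementation is not shown above; the following is only a hypothetical sketch of that kind of batched, threaded fan-out (fetch_stats_for_user is invented for illustration and may differ from the real code):

from concurrent.futures import ThreadPoolExecutor, as_completed

def fetch_stats_for_user(user):
    """Invented placeholder for whatever per-user queries the dashboard runs."""
    return {**user, 'total_trips': 0}

def add_user_stats(users, batch_size=10):
    """Hypothetical sketch: enrich each user dict with stats, one batch at a time,
    fanning the per-user work out to a small thread pool."""
    enriched = []
    for start in range(0, len(users), batch_size):
        batch = users[start:start + batch_size]
        with ThreadPoolExecutor(max_workers=len(batch) or 1) as pool:
            futures = {pool.submit(fetch_stats_for_user, u): u for u in batch}
            for future in as_completed(futures):
                enriched.append(future.result())
    return enriched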
