Skip to content

Commit

Permalink
Dimension search (#30)
Browse files Browse the repository at this point in the history
* Reverse meta index added

* Multiselection refactoring

* Multiselect widget rename & refactor

* Derived dataset workaround

* Fix default multiselect

* Fix multiselectall element counting

* Refactoring and dimension selection draft

* Stash fix

* WIP codes selector

* Codes selection

* Removed reference to widget output

* EU flag added

* Increased plots

* Better widget placement

* Default stashing

* Refactoring

* Fix index error
  • Loading branch information
lum4chi authored Jul 4, 2023
1 parent 55a2c54 commit 1e67ed3
Show file tree
Hide file tree
Showing 16 changed files with 396 additions and 202 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Eurostat Data Wizard
# 🇪🇺 Eurostat Data Wizard
A straightforward webapp to easily export multiple Eurostat datasets.

## Data
Expand Down
13 changes: 13 additions & 0 deletions datawizard/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -221,3 +221,16 @@ def fetch_metabase(session) -> pd.DataFrame:
names=["dataset", "dimension", "code"],
)
return df


def metabase2datasets(metabase: pd.DataFrame, codelist: pd.DataFrame) -> pd.DataFrame:
    """Reverse the metabase: map every labelled (code, dimension) pair to datasets.

    Args:
        metabase: Long-format frame with `dataset`, `dimension` and `code` columns.
        codelist: Frame indexed by (`dimension`, `code`) carrying `code_label`
            and `dimension_label` columns.

    Returns:
        Frame indexed by (code, code_label, dimension, dimension_label) whose
        single `dataset` column holds the list of datasets using that code.
    """
    # Align labels onto each (dimension, code) row of the metabase.
    labelled = metabase.set_index(["dimension", "code"]).join(codelist)
    # Collapse rows sharing the same labelled code into one list of datasets.
    group_keys = ["code", "code_label", "dimension", "dimension_label"]
    datasets_per_code = labelled.groupby(group_keys)["dataset"].apply(list)
    return datasets_per_code.to_frame()
2 changes: 1 addition & 1 deletion globals.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
}
DIMS_INDEX_PATH = f"{CACHE_PATH}/dimension_index.pkl"
CLUSTERING_PATH = f"{CACHE_PATH}/clustermap.csv.gz"
MAX_VARIABLES_PLOT = 25
MAX_VARIABLES_PLOT = 120


def get_last_index_update() -> datetime | None:
Expand Down
59 changes: 59 additions & 0 deletions pages/0_👀_Lookup.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
import streamlit as st
import pandas as pd

from st_widgets.commons import (
app_config,
get_logger,
load_dimensions_and_codes,
load_metabase2datasets,
reduce_multiselect_font_size,
)
from st_widgets.console import session_console
from st_widgets.stateful.data_editor import stateful_data_editor

logging = get_logger(__name__)
session = st.session_state
app_config("Data Import")


@st.cache_data
def load_dimensions(metabase2datasets: pd.DataFrame) -> pd.Series:
    """Return a Series of dimension labels keyed by dimension code, sorted by code."""
    pairs = metabase2datasets.reset_index()[["dimension", "dimension_label"]]
    unique_pairs = pairs.drop_duplicates().set_index("dimension")
    # One remaining column -> squeeze collapses the frame into a Series.
    return unique_pairs.squeeze().sort_index()


if __name__ == "__main__":
    reduce_multiselect_font_size()
    st.markdown(
        """ Select only dimensions of interest to filter the dataset list in the `Data` page. Click anywhere in the table and type `CMD+F` or `CTRL+F` to search."""
    )
    # Reverse metabase: rows keyed by (code, dimension) with the datasets using them.
    meta = load_metabase2datasets()
    codes = load_dimensions_and_codes(meta)
    # Add an editable boolean column so the user can tick codes of interest.
    codes = codes.reset_index().assign(selected=False)

    # Editable table: only the `selected` column can be changed by the user.
    selected_codes = stateful_data_editor(
        codes,
        disabled=["code", "dimension", "description"],
        use_container_width=True,
        key="_selected_codes",
    )

    # NOTE(review): this boolean mask is applied to `meta` positionally —
    # assumes the editor output keeps the same row order as `meta`; confirm.
    selected_codes_mask = selected_codes.set_index(["code", "dimension"])[
        "selected"
    ].values
    # Count, per dataset, how many of the selected codes it uses.
    dataset_counts = (
        meta[selected_codes_mask]["dataset"].explode("dataset").value_counts()
    )
    st.sidebar.dataframe(dataset_counts)

    # Expose matching dataset codes (upper-cased) to the Data page,
    # or None when no code is selected.
    session["lookup_datasets"] = (
        dataset_counts.index.str.upper().tolist() if not dataset_counts.empty else None
    )

    session_console()
174 changes: 67 additions & 107 deletions pages/1_🗄️_Data.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,20 @@
import logging

import pandas as pd
import streamlit as st

from datawizard.data import fetch_table_of_contents
from datawizard.data import (
fetch_codelist,
fetch_metabase,
fetch_table_of_contents,
get_cached_session,
metabase2datasets,
parse_codelist,
)
from st_widgets.commons import (
app_config,
get_logger,
global_download_lock,
load_dataset,
reduce_multiselect_font_size,
)
from st_widgets.console import session_console
from st_widgets.stateful.multiselect import stateful_multiselect
Expand All @@ -20,17 +26,6 @@
app_config("Data Import")


def reset_user_selections():
    """Drop session state tied to the current dataset list so stale picks vanish."""
    # NOTE Because the dataset list changed, reset the selected index state.
    if "_selected_dataset_options" in session:
        session.pop("_selected_dataset_options")
    if "_selected_dataset_index" in session:
        session.pop("_selected_dataset_index")
    # NOTE Override "Filter datasets by (map) selection"
    if "_selected_map_selection" in session:
        session["_selected_map_selection"] = False

@st.cache_data()
def load_toc() -> pd.Series | None:
# Return a series with datasets code as index and descriptions as values.
Expand All @@ -40,6 +35,9 @@ def load_toc() -> pd.Series | None:
with st.spinner(text="Fetching table of contents"):
with global_download_lock():
toc = fetch_table_of_contents()
# TODO Derived datasets are not found:
# HTTPError: 404 Client Error: Not Found for url: ...
toc = toc[~toc.index.str.contains("$", regex=False)]
toc = toc["title"]
return toc
except Exception as e:
Expand All @@ -49,111 +47,73 @@ def load_toc() -> pd.Series | None:
def save_datasets_to_stash():
toc = load_toc()

# Datasets search criteria
if toc is not None:
with st.sidebar:
# TODO Switching from lookup or no filter, must reset index
using_lookup = "lookup_datasets" in session and session["lookup_datasets"]
dataset_code = stateful_selectbox(
label="Choose a dataset",
options=toc.index,
label=f"Select dataset {'from `lookup` page ' if using_lookup else ''}(type to search)",
options=session["lookup_datasets"]
if using_lookup
else toc.index.tolist(),
format_func=lambda i: i + " | " + toc.loc[i],
key="_selected_dataset",
)
logging.info(f"Selectbox selection: {dataset_code}")

# Create or reuse a filtering history for this code
if dataset_code not in session["history"]:
session["history"][dataset_code] = dict()
history = session["history"][dataset_code]
history["stash"] = True

# Dataset filtering criteria
if dataset_code is not None:
with st.spinner(text="Downloading data"):
dataset = load_dataset(dataset_code)

# Create or reuse a filtering history for this code
if dataset_code not in session["history"]:
session["history"][dataset_code] = dict()
history = session["history"][dataset_code]

history["stash"] = st.sidebar.checkbox(
"Save into Stash",
value=history["stash"] if "stash" in history else False,
)

st.subheader(
f"Variable selection: {dataset_code + ' | ' + toc.loc[dataset_code]}"
)

# Flags management
flags = dataset.flag.fillna("<NA>").unique().tolist()

flags_container = st.container()

if st.button(
"Select all",
key=f"_{dataset_code}.flags_all",
):
del session[f"_{dataset_code}.flags_default"]
st.experimental_rerun()

with flags_container:
history["flags"] = stateful_multiselect(
label="Select FLAG",
options=flags,
default=flags,
key=f"_{dataset_code}.flags",
)
st.subheader(
f"Variable selection: {dataset_code + ' | ' + toc.loc[dataset_code]}"
)

dataset = load_dataset(dataset_code)

# Flags filtering handles
flags = dataset.flag.fillna("<NA>").unique().tolist()
history["flags"] = stateful_multiselect(
"Select FLAG", flags, default=flags, key=f"_{dataset_code}.flags"
)

# Indexes management
indexes = {n: dataset.index.levels[i].to_list() for i, n in enumerate(dataset.index.names)} # type: ignore
if "time" in indexes:
indexes["time"] = [
min(indexes["time"]).year,
max(indexes["time"]).year,
]

if "indexes" not in history:
history["indexes"] = dict()

for name in dataset.index.names:
index_container = st.container()

if st.button(
"Select all",
key=f"_{dataset_code}.indexes.{name}_all",
):
del session[f"_{dataset_code}.indexes.{name}_default"]
st.experimental_rerun()

if name == "time":
with index_container:
m, M = indexes["time"][0], indexes["time"][1]
M = M if m < M else M + 1 # RangeError fix
history["indexes"]["time"] = stateful_slider(
label="Select TIME [min: 1 year]",
min_value=m,
max_value=M,
value=(m, M),
key=f"_{dataset_code}.indexes.time",
)
else:
with index_container:
history["indexes"][name] = stateful_multiselect(
label=f"Select {name.upper()}",
options=indexes[name],
default=indexes[name],
key=f"_{dataset_code}.indexes.{name}",
)


def change_font_size():
st.markdown(
"""
<style>
.stMultiSelect [data-baseweb=select] span{
max-width: 500px;
font-size: 0.8rem;
}
</style>
""",
unsafe_allow_html=True,
)
# Indexes filtering handles (all the available dimensions)
indexes = {n: dataset.index.levels[i].to_list() for i, n in enumerate(dataset.index.names)} # type: ignore
if "time" in indexes:
indexes["time"] = [
min(indexes["time"]).year,
max(indexes["time"]).year,
]

if "indexes" not in history:
history["indexes"] = dict()

for name in dataset.index.names:
if name == "time":
codes_dims, M = indexes["time"][0], indexes["time"][1]
M = M if codes_dims < M else M + 1 # RangeError fix
history["indexes"]["time"] = stateful_slider(
label="Select TIME [min: 1 year]",
min_value=codes_dims,
max_value=M,
value=(codes_dims, M),
key=f"_{dataset_code}.indexes.time",
)
else:
history["indexes"][name] = stateful_multiselect(
f"Select {name.upper()}",
indexes[name],
default=indexes[name],
key=f"_{dataset_code}.indexes.{name}",
)


if __name__ == "__main__":
change_font_size()
reduce_multiselect_font_size()
save_datasets_to_stash()
session_console()
4 changes: 2 additions & 2 deletions pages/2_🛒_Stash.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import streamlit as st

from st_widgets.commons import app_config, load_stash
from st_widgets.commons import app_config, load_stash, read_stash_from_history
from st_widgets.console import session_console
from st_widgets.dataframe import (
empty_eurostat_dataframe,
Expand All @@ -13,7 +13,7 @@

def show_stash():
if "history" in st.session_state:
stash = st.session_state.history
stash = read_stash_from_history(st.session_state.history)
dataset = empty_eurostat_dataframe()

remove_code = st.sidebar.selectbox(
Expand Down
4 changes: 2 additions & 2 deletions pages/3_📈_Timeseries.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

from datawizard.utils import trim_code, tuple2str
from globals import MAX_VARIABLES_PLOT
from st_widgets.commons import app_config, load_stash
from st_widgets.commons import app_config, load_stash, read_stash_from_history
from st_widgets.console import session_console
from st_widgets.dataframe import empty_eurostat_dataframe
from st_widgets.stateful.number_input import stateful_number_input
Expand Down Expand Up @@ -33,7 +33,7 @@ def plot_column_idx(df, i, annotate=False):
try:
with st.spinner(text="Fetching data"):
if "history" in st.session_state:
stash = load_stash(st.session_state.history)
stash = load_stash(read_stash_from_history(st.session_state.history))
else:
st.warning("No stash found. Select some data to plot.")
except ValueError as ve:
Expand Down
4 changes: 2 additions & 2 deletions pages/4_📊_Correlations.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@

from datawizard.utils import trim_code, tuple2str
from globals import MAX_VARIABLES_PLOT
from st_widgets.commons import app_config, load_stash
from st_widgets.commons import app_config, load_stash, read_stash_from_history
from st_widgets.console import session_console
from st_widgets.dataframe import empty_eurostat_dataframe
from st_widgets.stateful.number_input import stateful_number_input
Expand Down Expand Up @@ -87,7 +87,7 @@ def plot_heatmap(corr: pd.DataFrame, figsize: Tuple[int, int] = (18, 16)):
try:
with st.spinner(text="Fetching data"):
if "history" in st.session_state:
stash = load_stash(st.session_state.history)
stash = load_stash(read_stash_from_history(st.session_state.history))
else:
st.warning("No stash found. Select some data to plot.")
except ValueError as ve:
Expand Down
Loading

0 comments on commit 1e67ed3

Please sign in to comment.