Skip to content

Commit

Permalink
Stash management (#31)
Browse files Browse the repository at this point in the history
* Todo removed

* Removed redundant codelist query

* Lookup improved

* Stash selection

* test fix
  • Loading branch information
lum4chi authored Jul 7, 2023
1 parent 1e67ed3 commit b17a91a
Show file tree
Hide file tree
Showing 7 changed files with 174 additions and 89 deletions.
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
# 🇪🇺 Eurostat Data Wizard
A straightforward webapp to easily export multiple Eurostat datasets.

## Lookup
1. Search and select all variables that you find interesting: datasets containing them will be available in the `Data` page.
## Data
1. Choose a Eurostat dataset of interest (or start typing the dataset code or title).
2. After loading, you can inspect the dataset and filter indexes, flags and time-span with the controls provided in the sidebar.
Expand Down
34 changes: 7 additions & 27 deletions datawizard/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,22 +71,6 @@ def fetch_dataset(code: str, caching_days: int = 7) -> pd.DataFrame:
return dataset


def fetch_dataset_codelist(request: sdmx.Request, dataset: str) -> pd.DataFrame:
    """Return the codelist for `dataset` fetched from Eurostat via SDMX.

    Args:
        request: an sdmx Request object used to query the datastructure endpoint.
        dataset: Eurostat dataset code whose codelists are retrieved.

    Returns:
        A DataFrame indexed by (dimension, code) with a `label` column,
        or an empty DataFrame when no codelist is found.
    """
    metadata = request.datastructure(dataset)
    codelist = pd.DataFrame()
    # Accumulate every codelist of the datastructure into a single frame.
    for codes in metadata.codelist.values():  # type: ignore
        codes = sdmx.to_pandas(codes)
        codelist = pd.concat([codelist, codes])
    codelist.index.name = "code"
    if codelist.empty:
        # No codelists at all: return a fresh empty frame (no index name set).
        return pd.DataFrame()
    # Normalize sdmx column names: `name` -> `label`, `parent` -> `dimension`.
    codelist = codelist.rename(columns={"name": "label", "parent": "dimension"})
    # Dimensions are lower-cased so lookups elsewhere are case-insensitive.
    codelist["dimension"] = codelist["dimension"].str.lower()
    # Final index order is (dimension, code).
    codelist = codelist.set_index("dimension", append=True).swaplevel()
    return codelist


def preprocess_dataset(df: pd.DataFrame) -> pd.DataFrame:
"""Preprocess dataset by mangling it in a convenient DataFrame."""
df = df.rename(columns={"geo\\TIME_PERIOD": "geo"})
Expand All @@ -106,14 +90,13 @@ def preprocess_dataset(df: pd.DataFrame) -> pd.DataFrame:
return pd.concat([values, flags], axis=1).stack("time", dropna=False)[["value", "flag"]].dropna(how="all", axis=0) # type: ignore


def fetch_dataset_and_metadata(
def fetch_and_preprocess_dataset(
code: str,
) -> Tuple[pd.DataFrame, pd.DataFrame]:
) -> pd.DataFrame:
# TODO remove dependency from `eurostat_sdmx_request``
data = fetch_dataset(code)
data = None if data is None else preprocess_dataset(data)
metadata = fetch_dataset_codelist(eurostat_sdmx_request(), code)
return data, metadata # type: ignore TODO Type checking fails
return data # type: ignore TODO Type checking fails


def cast_time_to_datetimeindex(data: pd.DataFrame):
Expand All @@ -135,18 +118,15 @@ def cast_time_to_datetimeindex(data: pd.DataFrame):


def append_code_descriptions(data: pd.DataFrame, codelist: pd.DataFrame):
df = data.reset_index()
cols_to_transform = data.index.names.difference(["time"]).union(["flag"]) # type: ignore
df = data.reset_index()
code2level = codelist["code_label"]
for dimension in cols_to_transform:
if dimension == "flag":
# `flag` is served with a different name in codelist
code2description = quote_sanitizer(
codelist.squeeze().loc["obs_flag"]
).to_dict()
code2description = quote_sanitizer(code2level.loc["obs_flag"]).to_dict()
else:
code2description = quote_sanitizer(
codelist.squeeze().loc[dimension]
).to_dict()
code2description = quote_sanitizer(code2level.loc[dimension]).to_dict()
code2code_pipe_description = concat_keys_to_values(code2description)
df[dimension] = df[dimension].map(code2code_pipe_description)
data = df.set_index(data.index.names)
Expand Down
17 changes: 13 additions & 4 deletions pages/0_👀_Lookup.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,11 +44,20 @@ def load_dimensions(metabase2datasets: pd.DataFrame) -> pd.Series:
key="_selected_codes",
)

selected_codes_mask = selected_codes.set_index(["code", "dimension"])[
"selected"
].values
selected_codes_mask = selected_codes["selected"].values

st.markdown("Selected dimension overview:")
selected_datasets_by_code = meta.reset_index()[selected_codes_mask]
st.dataframe(
selected_datasets_by_code[
["dimension", "dimension_label", "code", "code_label", "dataset"]
],
hide_index=False,
use_container_width=True,
)

dataset_counts = (
meta[selected_codes_mask]["dataset"].explode("dataset").value_counts()
selected_datasets_by_code["dataset"].explode("dataset").value_counts()
)
st.sidebar.dataframe(dataset_counts)

Expand Down
6 changes: 3 additions & 3 deletions pages/1_🗄️_Data.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
app_config,
get_logger,
global_download_lock,
load_codelist,
load_dataset,
reduce_multiselect_font_size,
)
Expand Down Expand Up @@ -50,7 +51,6 @@ def save_datasets_to_stash():
# Datasets search criteria
if toc is not None:
with st.sidebar:
# TODO Switching from lookup or no filter, must reset index
using_lookup = "lookup_datasets" in session and session["lookup_datasets"]
dataset_code = stateful_selectbox(
label=f"Select dataset {'from `lookup` page ' if using_lookup else ''}(type to search)",
Expand All @@ -60,7 +60,6 @@ def save_datasets_to_stash():
format_func=lambda i: i + " | " + toc.loc[i],
key="_selected_dataset",
)
logging.info(f"Selectbox selection: {dataset_code}")

# Create or reuse a filtering history for this code
if dataset_code not in session["history"]:
Expand All @@ -74,7 +73,8 @@ def save_datasets_to_stash():
f"Variable selection: {dataset_code + ' | ' + toc.loc[dataset_code]}"
)

dataset = load_dataset(dataset_code)
codelist = load_codelist()
dataset = load_dataset(dataset_code, codelist)

# Flags filtering handles
flags = dataset.flag.fillna("<NA>").unique().tolist()
Expand Down
34 changes: 26 additions & 8 deletions pages/2_🛒_Stash.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import pandas as pd
import streamlit as st

from st_widgets.commons import app_config, load_stash, read_stash_from_history
Expand All @@ -13,16 +14,33 @@

def show_stash():
if "history" in st.session_state:
stash = read_stash_from_history(st.session_state.history)
dataset = empty_eurostat_dataframe()
history = st.session_state.history
history_frame = (
pd.Series(
{
dataset_code: values["stash"]
for dataset_code, values in history.items()
},
)
.to_frame("stash")
.reset_index()
.rename(columns={"index": "dataset"})
)

remove_code = st.sidebar.selectbox(
"Remove a dataset",
["-"] + [code for code, p in stash.items() if p["stash"]],
# TODO Refresh twice fix needed
history_frame = st.sidebar.data_editor(
history_frame,
disabled=["dataset"],
use_container_width=True,
)
if remove_code != "-":
stash.pop(remove_code)
st.experimental_rerun()

for dataset_code, is_stashed in (
history_frame.set_index("dataset")["stash"].to_dict().items()
):
history[dataset_code]["stash"] = is_stashed

stash = read_stash_from_history(history)
dataset = empty_eurostat_dataframe()

try:
with st.spinner(text="Fetching data"):
Expand Down
28 changes: 20 additions & 8 deletions st_widgets/commons.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
append_code_descriptions,
cast_time_to_datetimeindex,
fetch_codelist,
fetch_dataset_and_metadata,
fetch_and_preprocess_dataset,
fetch_metabase,
get_cached_session,
metabase2datasets,
Expand Down Expand Up @@ -73,9 +73,13 @@ def load_metabase2datasets() -> pd.DataFrame:
@st.cache_data()
def load_dimensions_and_codes(metabase2datasets: pd.DataFrame) -> pd.Series:
# Arrage metabase as an index of dimensions + descriptions
codes_dims = metabase2datasets.reset_index()[
["dimension", "code", "dimension_label", "code_label"]
].set_index(["dimension", "code"])
codes_dims = (
metabase2datasets.reset_index()[
["dimension", "code", "dimension_label", "code_label"]
]
.set_index(["dimension", "code"])
.sort_index()
)
codes_dims = codes_dims["dimension_label"].str.cat(
codes_dims["code_label"], sep=": "
)
Expand All @@ -84,12 +88,19 @@ def load_dimensions_and_codes(metabase2datasets: pd.DataFrame) -> pd.Series:


@st.cache_data()
def load_dataset(code: str) -> pd.DataFrame:
def load_codelist() -> pd.DataFrame:
req = get_cached_session()
codelist = parse_codelist(fetch_codelist(req))
return codelist


@st.cache_data()
def load_dataset(code: str, codelist: pd.DataFrame) -> pd.DataFrame:
# Return desiderd dataset by code in `long-format` (time as index)
with global_download_lock():
data, meta = fetch_dataset_and_metadata(code)
data = fetch_and_preprocess_dataset(code)
data = cast_time_to_datetimeindex(data)
data = append_code_descriptions(data, meta)
data = append_code_descriptions(data, codelist)
# `flag` shown before `value` to be near others filter key
return data[["flag", "value"]]

Expand All @@ -104,7 +115,8 @@ def load_stash(stash: dict) -> pd.DataFrame:
properties["stash"],
)
if stash:
df = load_dataset(code)
codelist = load_codelist()
df = load_dataset(code, codelist)
df = filter_dataset_replacing_NA(
df,
indexes,
Expand Down
Loading

0 comments on commit b17a91a

Please sign in to comment.