diff --git a/README.md b/README.md index 25d8a20..ff30375 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,8 @@ # 🇪🇺 Eurostat Data Wizard A straightforward webapp to export easily multiple Eurostat datasets. +## Lookup +1. Search and select all variables that you find interesting: dataset containing them will be available in the `Data` page. ## Data 1. Choose an Eurostat dataset of interest (or start typing dataset code or title). 2. After loading, you can inspect the dataset and filter indexes, flags and time-span with the controls provided in the sidebar. diff --git a/datawizard/data.py b/datawizard/data.py index cf85754..c66e29a 100644 --- a/datawizard/data.py +++ b/datawizard/data.py @@ -71,22 +71,6 @@ def fetch_dataset(code: str, caching_days: int = 7) -> pd.DataFrame: return dataset -def fetch_dataset_codelist(request: sdmx.Request, dataset: str) -> pd.DataFrame: - """Returns codelist found from eurostat""" - metadata = request.datastructure(dataset) - codelist = pd.DataFrame() - for codes in metadata.codelist.values(): # type: ignore - codes = sdmx.to_pandas(codes) - codelist = pd.concat([codelist, codes]) - codelist.index.name = "code" - if codelist.empty: - return pd.DataFrame() - codelist = codelist.rename(columns={"name": "label", "parent": "dimension"}) - codelist["dimension"] = codelist["dimension"].str.lower() - codelist = codelist.set_index("dimension", append=True).swaplevel() - return codelist - - def preprocess_dataset(df: pd.DataFrame) -> pd.DataFrame: """Preprocess dataset by mangling it in a convenient DataFrame.""" df = df.rename(columns={"geo\\TIME_PERIOD": "geo"}) @@ -106,14 +90,13 @@ def preprocess_dataset(df: pd.DataFrame) -> pd.DataFrame: return pd.concat([values, flags], axis=1).stack("time", dropna=False)[["value", "flag"]].dropna(how="all", axis=0) # type: ignore -def fetch_dataset_and_metadata( +def fetch_and_preprocess_dataset( code: str, -) -> Tuple[pd.DataFrame, pd.DataFrame]: +) -> pd.DataFrame: # TODO remove dependency from `eurostat_sdmx_request`` data = fetch_dataset(code) data = None if data is None else preprocess_dataset(data) - metadata = fetch_dataset_codelist(eurostat_sdmx_request(), code) - return data, metadata # type: ignore TODO Type checking fails + return data # type: ignore TODO Type checking fails def cast_time_to_datetimeindex(data: pd.DataFrame): @@ -135,18 +118,15 @@ def cast_time_to_datetimeindex(data: pd.DataFrame): def append_code_descriptions(data: pd.DataFrame, codelist: pd.DataFrame): - df = data.reset_index() cols_to_transform = data.index.names.difference(["time"]).union(["flag"]) # type: ignore + df = data.reset_index() + code2level = codelist["code_label"] for dimension in cols_to_transform: if dimension == "flag": # `flag` is served with a different name in codelist - code2description = quote_sanitizer( - codelist.squeeze().loc["obs_flag"] - ).to_dict() + code2description = quote_sanitizer(code2level.loc["obs_flag"]).to_dict() else: - code2description = quote_sanitizer( - codelist.squeeze().loc[dimension] - ).to_dict() + code2description = quote_sanitizer(code2level.loc[dimension]).to_dict() code2code_pipe_description = concat_keys_to_values(code2description) df[dimension] = df[dimension].map(code2code_pipe_description) data = df.set_index(data.index.names) diff --git "a/pages/0_\360\237\221\200_Lookup.py" "b/pages/0_\360\237\221\200_Lookup.py" index 3bc8e9b..cf60f9f 100644 --- "a/pages/0_\360\237\221\200_Lookup.py" +++ "b/pages/0_\360\237\221\200_Lookup.py" @@ -44,11 +44,20 @@ def load_dimensions(metabase2datasets: pd.DataFrame) -> pd.Series: key="_selected_codes", ) - selected_codes_mask = selected_codes.set_index(["code", "dimension"])[ - "selected" - ].values + selected_codes_mask = selected_codes["selected"].values + + st.markdown("Selected dimension overview:") + selected_datasets_by_code = meta.reset_index()[selected_codes_mask] + st.dataframe( + selected_datasets_by_code[ + ["dimension", "dimension_label", "code", "code_label", "dataset"] + ], + hide_index=False, + use_container_width=True, + ) + dataset_counts = ( - meta[selected_codes_mask]["dataset"].explode("dataset").value_counts() + selected_datasets_by_code["dataset"].explode("dataset").value_counts() ) st.sidebar.dataframe(dataset_counts) diff --git "a/pages/1_\360\237\227\204\357\270\217_Data.py" "b/pages/1_\360\237\227\204\357\270\217_Data.py" index 4d8e7fd..d25d8f3 100644 --- "a/pages/1_\360\237\227\204\357\270\217_Data.py" +++ "b/pages/1_\360\237\227\204\357\270\217_Data.py" @@ -13,6 +13,7 @@ app_config, get_logger, global_download_lock, + load_codelist, load_dataset, reduce_multiselect_font_size, ) @@ -50,7 +51,6 @@ def save_datasets_to_stash(): # Datasets search criteria if toc is not None: with st.sidebar: - # TODO Switching from lookup or no filter, must reset index using_lookup = "lookup_datasets" in session and session["lookup_datasets"] dataset_code = stateful_selectbox( label=f"Select dataset {'from `lookup` page ' if using_lookup else ''}(type to search)", @@ -60,7 +60,6 @@ def save_datasets_to_stash(): format_func=lambda i: i + " | " + toc.loc[i], key="_selected_dataset", ) - logging.info(f"Selectbox selection: {dataset_code}") # Create or reuse a filtering history for this code if dataset_code not in session["history"]: @@ -74,7 +73,8 @@ def save_datasets_to_stash(): f"Variable selection: {dataset_code + ' | ' + toc.loc[dataset_code]}" ) - dataset = load_dataset(dataset_code) + codelist = load_codelist() + dataset = load_dataset(dataset_code, codelist) # Flags filtering handles flags = dataset.flag.fillna("").unique().tolist() diff --git "a/pages/2_\360\237\233\222_Stash.py" "b/pages/2_\360\237\233\222_Stash.py" index 9057d9d..5ea4929 100644 --- "a/pages/2_\360\237\233\222_Stash.py" +++ "b/pages/2_\360\237\233\222_Stash.py" @@ -1,3 +1,4 @@ +import pandas as pd import streamlit as st from st_widgets.commons import app_config, load_stash, read_stash_from_history @@ -13,16 +14,33 @@ def show_stash(): if "history" in st.session_state: - stash = read_stash_from_history(st.session_state.history) - dataset = empty_eurostat_dataframe() + history = st.session_state.history + history_frame = ( + pd.Series( + { + dataset_code: values["stash"] + for dataset_code, values in history.items() + }, + ) + .to_frame("stash") + .reset_index() + .rename(columns={"index": "dataset"}) + ) - remove_code = st.sidebar.selectbox( - "Remove a dataset", - ["-"] + [code for code, p in stash.items() if p["stash"]], + # TODO Refresh twice fix needed + history_frame = st.sidebar.data_editor( + history_frame, + disabled=["dataset"], + use_container_width=True, ) - if remove_code != "-": - stash.pop(remove_code) - st.experimental_rerun() + + for dataset_code, is_stashed in ( + history_frame.set_index("dataset")["stash"].to_dict().items() + ): + history[dataset_code]["stash"] = is_stashed + + stash = read_stash_from_history(history) + dataset = empty_eurostat_dataframe() try: with st.spinner(text="Fetching data"): diff --git a/st_widgets/commons.py b/st_widgets/commons.py index ca39216..9ee178d 100644 --- a/st_widgets/commons.py +++ b/st_widgets/commons.py @@ -9,7 +9,7 @@ append_code_descriptions, cast_time_to_datetimeindex, fetch_codelist, - fetch_dataset_and_metadata, + fetch_and_preprocess_dataset, fetch_metabase, get_cached_session, metabase2datasets, @@ -73,9 +73,13 @@ def load_metabase2datasets() -> pd.DataFrame: @st.cache_data() def load_dimensions_and_codes(metabase2datasets: pd.DataFrame) -> pd.Series: # Arrage metabase as an index of dimensions + descriptions - codes_dims = metabase2datasets.reset_index()[ - ["dimension", "code", "dimension_label", "code_label"] - ].set_index(["dimension", "code"]) + codes_dims = ( + metabase2datasets.reset_index()[ + ["dimension", "code", "dimension_label", "code_label"] + ] + .set_index(["dimension", "code"]) + .sort_index() + ) codes_dims = codes_dims["dimension_label"].str.cat( codes_dims["code_label"], sep=": " ) @@ -84,12 +88,19 @@ def load_dimensions_and_codes(metabase2datasets: pd.DataFrame) -> pd.Series: @st.cache_data() -def load_dataset(code: str) -> pd.DataFrame: +def load_codelist() -> pd.DataFrame: + req = get_cached_session() + codelist = parse_codelist(fetch_codelist(req)) + return codelist + + +@st.cache_data() +def load_dataset(code: str, codelist: pd.DataFrame) -> pd.DataFrame: # Return desiderd dataset by code in `long-format` (time as index) with global_download_lock(): - data, meta = fetch_dataset_and_metadata(code) + data = fetch_and_preprocess_dataset(code) data = cast_time_to_datetimeindex(data) - data = append_code_descriptions(data, meta) + data = append_code_descriptions(data, codelist) # `flag` shown before `value` to be near others filter key return data[["flag", "value"]] @@ -104,7 +115,8 @@ def load_stash(stash: dict) -> pd.DataFrame: properties["stash"], ) if stash: - df = load_dataset(code) + codelist = load_codelist() + df = load_dataset(code, codelist) df = filter_dataset_replacing_NA( df, indexes, diff --git a/tests/test_Data.py b/tests/test_Data.py index 52c5699..918af3f 100644 --- a/tests/test_Data.py +++ b/tests/test_Data.py @@ -8,7 +8,7 @@ append_code_descriptions, cast_time_to_datetimeindex, fetch_dataset, - fetch_dataset_and_metadata, + fetch_and_preprocess_dataset, fetch_table_of_contents, filter_dataset, parse_codelist, @@ -190,7 +190,70 @@ def codelist_response(): }, "label": "Occurence", "extension": {"lang": "EN", "id": "OCCUR", "version": "1.1"}, - } + }, + { + "class": "dimension", + "source": "ESTAT", + "category": { + "label": { + "CB_EU_FOR": "Individuals who are born in another EU Member State", + }, + "index": ["CB_EU_FOR"], + }, + "label": "Individual type", + "extension": {"lang": "EN", "id": "IND_TYPE", "version": "1.1"}, + }, + { + "class": "dimension", + "source": "ESTAT", + "category": { + "label": { + "I_IUG_DKPC": "Individuals used the internet on a desktop computer", + }, + "index": ["I_IUG_DKPC"], + }, + "label": "Information society indicator", + "extension": {"lang": "EN", "id": "INDIC_IS", "version": "1.1"}, + }, + { + "class": "dimension", + "source": "ESTAT", + "category": { + "label": { + "PC_IND": "Percentage of individuals", + }, + "index": ["PC_IND"], + }, + "label": "Unit of measure", + "extension": {"lang": "EN", "id": "UNIT", "version": "1.1"}, + }, + { + "class": "dimension", + "source": "ESTAT", + "category": { + "label": { + "AL": "Albania", + "IT": "Italy", + }, + "index": ["AL", "IT"], + }, + "label": "Geopolitical entity (reporting)", + "extension": {"lang": "EN", "id": "GEO", "version": "1.1"}, + }, + { + "class": "dimension", + "source": "ESTAT", + "category": { + "label": { + "d": "definition differs, see metadata", + "f": "forecast", + "u": "low reliability", + }, + "index": ["d", "f", "u"], + }, + "label": "Observation status (Flag)", + "extension": {"lang": "EN", "id": "OBS_FLAG", "version": "1.1"}, + }, ] }, } @@ -198,14 +261,47 @@ def codelist_response(): @pytest.fixture def codelist(): + # TODO order alphabetically return pd.DataFrame.from_dict( { - "index": [("occur", "ADLH"), ("occur", "M12"), ("occur", "Y5")], + "index": [ + ("geo", "AL"), + ("geo", "IT"), + ( + "ind_type", + "CB_EU_FOR", + ), + ( + "indic_is", + "I_IUG_DKPC", + ), + ("obs_flag", "d"), + ("obs_flag", "f"), + ("obs_flag", "u"), + ("occur", "ADLH"), + ("occur", "M12"), + ("occur", "Y5"), + ("unit", "PC_IND"), + ], "columns": ["dimension_label", "code_label"], "data": [ + ["Geopolitical entity (reporting)", "Albania"], + ["Geopolitical entity (reporting)", "Italy"], + [ + "Individual type", + "Individuals who are born in another EU Member State", + ], + [ + "Information society indicator", + "Individuals used the internet on a desktop computer", + ], + ["Observation status (Flag)", "definition differs, see metadata"], + ["Observation status (Flag)", "forecast"], + ["Observation status (Flag)", "low reliability"], ["Occurence", "Adulthood"], ["Occurence", "Last 12 months"], ["Occurence", "Last 5 years"], + ["Unit of measure", "Percentage of individuals"], ], "index_names": ["dimension", "code"], "column_names": [None], @@ -246,33 +342,6 @@ def metabase(): ) -@pytest.fixture() -def dataset_codelist(): - # Emulate a processed dataset codelist - codes = pd.DataFrame( - { - "label": { - ( - "ind_type", - "CB_EU_FOR", - ): "Individuals who are born in another EU Member State", - ( - "indic_is", - "I_IUG_DKPC", - ): "Individuals used the internet on a desktop computer", - ("unit", "PC_IND"): "Percentage of individuals", - ("geo", "AL"): "Albania", - ("geo", "IT"): "Italy", - ("obs_flag", "f"): "forecast", - ("obs_flag", "u"): "low reliability", - ("obs_flag", "d"): "definition differs, see metadata", - } - } - ) - codes.index = codes.index.set_names(["dimension", "code"]) - return codes - - @pytest.fixture def reverse_index(): # Emulate metabase2dataset output (a metabase enriched with descriptions) @@ -332,18 +401,13 @@ def test_preprocess_dataset(mocker, raw_dataset, dataset): assert_frame_equal(data, dataset) -def test_fetch_dataset_and_metadata(mocker, raw_dataset, dataset_codelist): +def test_fetch_and_preprocess_dataset(mocker, raw_dataset): mocker.patch( "datawizard.data.fetch_dataset", return_value=raw_dataset, ) - mocker.patch( - "datawizard.data.fetch_dataset_codelist", - return_value=dataset_codelist, - ) - data, metadata = fetch_dataset_and_metadata("fake-code") + data = fetch_and_preprocess_dataset("fake-code") assert isinstance(data, pd.DataFrame) - assert isinstance(metadata, pd.DataFrame) def test_cast_time_to_datetimeindex( @@ -369,8 +433,8 @@ def test_cast_time_to_datetimeindex( cast_time_to_datetimeindex(weekly_dataset) -def test_append_code_descriptions(dataset, dataset_codelist): - df = append_code_descriptions(dataset, dataset_codelist) +def test_append_code_descriptions(dataset, codelist): + df = append_code_descriptions(dataset, codelist) assert_index_equal( df.index.get_level_values("geo"), pd.Index(