diff --git a/API_REFERENCE_LINKS.yaml b/API_REFERENCE_LINKS.yaml index b083b6e01..2c8b71bc7 100644 --- a/API_REFERENCE_LINKS.yaml +++ b/API_REFERENCE_LINKS.yaml @@ -16,6 +16,12 @@ python: write_parquet: https://pola-rs.github.io/polars/py-polars/html/reference/api/polars.DataFrame.write_parquet.html min: https://pola-rs.github.io/polars/py-polars/html/reference/series/api/polars.Series.min.html max: https://pola-rs.github.io/polars/py-polars/html/reference/series/api/polars.Series.max.html + value_counts: https://pola-rs.github.io/polars/py-polars/html/reference/expressions/api/polars.Expr.value_counts.html + unnest: https://pola-rs.github.io/polars/py-polars/html/reference/dataframe/api/polars.DataFrame.unnest.html + field: https://pola-rs.github.io/polars/py-polars/html/reference/expressions/api/polars.Expr.struct.field.html + struct: https://pola-rs.github.io/polars/py-polars/html/reference/expressions/api/polars.struct.html + rename_fields: https://pola-rs.github.io/polars/py-polars/html/reference/expressions/api/polars.Expr.struct.rename_fields.html + is_duplicated: https://pola-rs.github.io/polars/py-polars/html/reference/expressions/api/polars.Expr.is_duplicated.html replace: https://pola-rs.github.io/polars/py-polars/html/reference/series/api/polars.Series.str.replace.html sample: https://pola-rs.github.io/polars/py-polars/html/reference/dataframe/api/polars.DataFrame.sample.html day: https://pola-rs.github.io/polars/py-polars/html/reference/series/api/polars.Series.dt.day.html @@ -117,7 +123,6 @@ python: name: log link: https://numpy.org/doc/stable/reference/generated/numpy.log.html feature_flags: ['numpy'] - struct: https://pola-rs.github.io/polars/py-polars/html/reference/expressions/api/polars.struct.html# lengths: https://pola-rs.github.io/polars/py-polars/html/reference/expressions/api/polars.Expr.str.lengths.html n_chars: https://pola-rs.github.io/polars/py-polars/html/reference/expressions/api/polars.Expr.str.n_chars.html str.contains: diff --git 
a/docs/src/python/user-guide/expressions/structs.py b/docs/src/python/user-guide/expressions/structs.py new file mode 100644 index 000000000..f209420a3 --- /dev/null +++ b/docs/src/python/user-guide/expressions/structs.py @@ -0,0 +1,66 @@ +# --8<-- [start:setup] +import polars as pl + +# --8<-- [end:setup] + +# --8<-- [start:ratings_df] +ratings = pl.DataFrame( + { + "Movie": ["Cars", "IT", "ET", "Cars", "Up", "IT", "Cars", "ET", "Up", "ET"], + "Theatre": ["NE", "ME", "IL", "ND", "NE", "SD", "NE", "IL", "IL", "SD"], + "Avg_Rating": [4.5, 4.4, 4.6, 4.3, 4.8, 4.7, 4.7, 4.9, 4.7, 4.6], + "Count": [30, 27, 26, 29, 31, 28, 28, 26, 33, 26], + } +) +print(ratings) +# --8<-- [end:ratings_df] + +# --8<-- [start:state_value_counts] +out = ratings.select(pl.col("Theatre").value_counts(sort=True)) +print(out) +# --8<-- [end:state_value_counts] + +# --8<-- [start:struct_unnest] +out = ratings.select(pl.col("Theatre").value_counts(sort=True)).unnest("Theatre") +print(out) +# --8<-- [end:struct_unnest] + +# --8<-- [start:series_struct] +rating_Series = pl.Series( + "ratings", + [ + {"Movie": "Cars", "Theatre": "NE", "Avg_Rating": 4.5}, + {"Movie": "Toy Story", "Theatre": "ME", "Avg_Rating": 4.9}, + ], +) +print(rating_Series) +# --8<-- [end:series_struct] + +# --8<-- [start:series_struct_extract] +out = rating_Series.struct.field("Movie") +print(out) +# --8<-- [end:series_struct_extract] + +# --8<-- [start:series_struct_rename] +out = ( + rating_Series.to_frame() + .select(pl.col("ratings").struct.rename_fields(["Film", "State", "Value"])) + .unnest("ratings") +) +print(out) +# --8<-- [end:series_struct_rename] + +# --8<-- [start:struct_duplicates] +out = ratings.filter(pl.struct("Movie", "Theatre").is_duplicated()) +print(out) +# --8<-- [end:struct_duplicates] + +# --8<-- [start:struct_ranking] +out = ratings.with_columns( + pl.struct("Count", "Avg_Rating") + .rank("dense", descending=True) + .over("Movie", "Theatre") + .alias("Rank") +).filter(pl.struct("Movie", 
"Theatre").is_duplicated()) +print(out) +# --8<-- [end:struct_ranking] diff --git a/docs/user-guide/expressions/structs.md b/docs/user-guide/expressions/structs.md new file mode 100644 index 000000000..750ce83a4 --- /dev/null +++ b/docs/user-guide/expressions/structs.md @@ -0,0 +1,92 @@ +# The Struct datatype + +Polars `Struct`s are the idiomatic way of working with multiple columns. It is also a free operation, i.e. moving columns into `Struct`s does not copy any data! + +For this section, let's start with a `DataFrame` that captures the average rating of a few movies across some states in the U.S.: + +{{code_block('user-guide/expressions/structs','ratings_df',['DataFrame'])}} +```python exec="on" result="text" session="user-guide/structs" +--8<-- "python/user-guide/expressions/structs.py:setup" +--8<-- "python/user-guide/expressions/structs.py:ratings_df" +``` + +## Encountering the `Struct` type + +A common operation that will lead to a `Struct` column is the ever so popular `value_counts` function that is commonly used in exploratory data analysis. Checking the number of times a state appears in the data can be done like so: + +{{code_block('user-guide/expressions/structs','state_value_counts',['value_counts'])}} +```python exec="on" result="text" session="user-guide/structs" +--8<-- "python/user-guide/expressions/structs.py:state_value_counts" +``` + +Quite an unexpected output, especially if coming from tools that do not have such a data type. We're not in peril though; to get back to a more familiar output, all we need to do is `unnest` the `Struct` column into its constituent columns: + +{{code_block('user-guide/expressions/structs','struct_unnest',['unnest'])}} +```python exec="on" result="text" session="user-guide/structs" +--8<-- "python/user-guide/expressions/structs.py:struct_unnest" +``` + +!!! 
note "Why `value_counts` returns a `Struct`" + + Polars expressions always have a `Fn(Series) -> Series` signature and `Struct` is thus the data type that allows us to provide multiple columns as input/output of an expression. In other words, all expressions have to return a `Series` object, and `Struct` allows us to stay consistent with that requirement. + +## Structs as `dict`s + +Polars will interpret a `dict` sent to the `Series` constructor as a `Struct`: + +{{code_block('user-guide/expressions/structs','series_struct',['Series'])}} +```python exec="on" result="text" session="user-guide/structs" +--8<-- "python/user-guide/expressions/structs.py:series_struct" +``` + +!!! note "Constructing `Series` objects" + + Note that `Series` here was constructed with the `name` of the series in the beginning, followed by the `values`. Providing the latter first + is considered an anti-pattern in Polars, and must be avoided. + +### Extracting individual values of a `Struct` + +Let's say that we needed to obtain just the `Movie` value in the `Series` that we created above. We can use the `field` method to do so: + +{{code_block('user-guide/expressions/structs','series_struct_extract',['field'])}} +```python exec="on" result="text" session="user-guide/structs" +--8<-- "python/user-guide/expressions/structs.py:series_struct_extract" +``` + +### Renaming individual keys of a `Struct` + +What if we need to rename individual `field`s of a `Struct` column? We first convert the `rating_Series` object to a `DataFrame` so that we can view the changes easily, and then use the `rename_fields` method: + +{{code_block('user-guide/expressions/structs','series_struct_rename',['rename_fields'])}} +```python exec="on" result="text" session="user-guide/structs" +--8<-- "python/user-guide/expressions/structs.py:series_struct_rename" +``` + +## Practical use-cases of `Struct` columns + +### Identifying duplicate rows + +Let's get back to the `ratings` data. 
We want to identify cases where there are duplicates at the `Movie` and `Theatre` level. This is where the `Struct` datatype shines: + +{{code_block('user-guide/expressions/structs','struct_duplicates',['is_duplicated', 'struct'])}} +```python exec="on" result="text" session="user-guide/structs" +--8<-- "python/user-guide/expressions/structs.py:struct_duplicates" +``` + +We can also identify the unique cases at this level with `is_unique`! + +### Multi-column ranking + +Suppose, given that we know there are duplicates, we want to choose which of the duplicated ratings gets a higher priority. We define the `Count` of ratings to be more important than the actual `Avg_Rating` itself, and only use `Avg_Rating` to break a tie. We can then do: + +{{code_block('user-guide/expressions/structs','struct_ranking',['is_duplicated', 'struct'])}} +```python exec="on" result="text" session="user-guide/structs" +--8<-- "python/user-guide/expressions/structs.py:struct_ranking" +``` + +That's a pretty complex set of requirements done very elegantly in Polars! + +### Using multi-column apply + +This was discussed in the previous section on *User Defined Functions*. + diff --git a/docs/user-guide/expressions/user-defined-functions.md b/docs/user-guide/expressions/user-defined-functions.md index aa7a4f81c..40de817c4 100644 --- a/docs/user-guide/expressions/user-defined-functions.md +++ b/docs/user-guide/expressions/user-defined-functions.md @@ -151,6 +151,9 @@ In Python, those would be passed as `dict` to the calling python function and ca ```python exec="on" result="text" session="user-guide/udf" --8<-- "python/user-guide/expressions/user-defined-functions.py:combine" ``` + +`Struct`s are covered in detail in the next section. + ### Return types? Custom python functions are black boxes for polars. 
We really don't know what kind of black arts you are doing, so we have diff --git a/mkdocs.yml b/mkdocs.yml index f786eaf00..6af6fff36 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -95,6 +95,7 @@ nav: - user-guide/expressions/folds.md - user-guide/expressions/lists.md - user-guide/expressions/user-defined-functions.md + - user-guide/expressions/structs.md - user-guide/expressions/numpy.md - Transformations: - user-guide/transformations/joins.md