Skip to content
This repository has been archived by the owner on Sep 26, 2023. It is now read-only.

Commit

Permalink
Add expression expansion and selectors tutorial (#364)
Browse files Browse the repository at this point in the history
  • Loading branch information
avimallu authored Jul 5, 2023
1 parent 6b9eafe commit 1980708
Show file tree
Hide file tree
Showing 6 changed files with 263 additions and 40 deletions.
28 changes: 28 additions & 0 deletions API_REFERENCE_LINKS.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,34 @@ python:
apply: https://pola-rs.github.io/polars/py-polars/html/reference/expressions/api/polars.apply.html
over: https://pola-rs.github.io/polars/py-polars/html/reference/expressions/api/polars.Expr.over.html
implode: https://pola-rs.github.io/polars/py-polars/html/reference/expressions/api/polars.Expr.implode.html
dt_to_string:
link: https://pola-rs.github.io/polars/py-polars/html/reference/expressions/api/polars.Expr.dt.to_string.html
name: dt.to_string
selectors: https://pola-rs.github.io/polars/py-polars/html/reference/selectors.html
cs_numeric:
link: https://pola-rs.github.io/polars/py-polars/html/reference/selectors.html#polars.selectors.numeric
name: cs.numeric
cs_by_name:
link: https://pola-rs.github.io/polars/py-polars/html/reference/selectors.html#polars.selectors.by_name
name: cs.by_name
cs_first:
link: https://pola-rs.github.io/polars/py-polars/html/reference/selectors.html#polars.selectors.first
name: cs.first
cs_temporal:
link: https://pola-rs.github.io/polars/py-polars/html/reference/selectors.html#polars.selectors.temporal
name: cs.temporal
cs_contains:
link: https://pola-rs.github.io/polars/py-polars/html/reference/selectors.html#polars.selectors.contains
name: cs.contains
cs_matches:
link: https://pola-rs.github.io/polars/py-polars/html/reference/selectors.html#polars.selectors.matches
name: cs.matches
is_selector:
link: https://pola-rs.github.io/polars/py-polars/html/reference/selectors.html#polars.selectors.is_selector
name: is_selector
selector_column_names:
link: https://pola-rs.github.io/polars/py-polars/html/reference/selectors.html#polars.selectors.selector_column_names
name: selector_column_names
DataFrame.explode: https://pola-rs.github.io/polars/py-polars/html/reference/dataframe/api/polars.DataFrame.explode.html
read_database_connectorx:
name: read_database
Expand Down
96 changes: 96 additions & 0 deletions docs/src/python/user-guide/expressions/column_selections.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
# --8<-- [start:setup]
import polars as pl

# --8<-- [end:setup]

# Sample frame shared by every snippet below. It holds one integer, string,
# date, float, boolean and datetime column, and `with_row_count("rn")` adds a
# row-number column named "rn" (the guide refers to it as the first column).
# --8<-- [start:selectors_df]
from datetime import date, datetime

df = pl.DataFrame(
{
"id": [9, 4, 2],
"place": ["Mars", "Earth", "Saturn"],
"date": pl.date_range(date(2022, 1, 1), date(2022, 1, 3), "1d", eager=True),
"sales": [33.4, 2142134.1, 44.7],
"has_people": [False, True, False],
"logged_at": pl.date_range(
datetime(2022, 12, 1), datetime(2022, 12, 1, 0, 0, 2), "1s", eager=True
),
}
).with_row_count("rn")
print(df)
# --8<-- [end:selectors_df]

# Select every column of `df`, first via the "*" wildcard and then via
# `pl.all()`; only the second result is printed.
# --8<-- [start:all]
out = df.select(pl.col("*"))

# Is equivalent to
out = df.select(pl.all())
print(out)
# --8<-- [end:all]

# Select every column except "logged_at" and "rn".
# --8<-- [start:exclude]
out = df.select(pl.col("*").exclude("logged_at", "rn"))
print(out)
# --8<-- [end:exclude]

# One expression applied to two named columns: "date" and "logged_at" are both
# rendered as strings with the same format.
# NOTE(review): `%h` is presumably the abbreviated month name (an alias of
# `%b` in chrono-style format strings) - confirm against the Polars docs.
# --8<-- [start:expansion_by_names]
out = df.select(pl.col("date", "logged_at").dt.to_string("%Y-%h-%d"))
print(out)
# --8<-- [end:expansion_by_names]

# Regex selection: the string is wrapped in ^...$ so `pl.col` treats it as a
# pattern. It matches column names containing "as" or "sa"; in `df` those are
# "sales" and "has_people".
# --8<-- [start:expansion_by_regex]
out = df.select(pl.col("^.*(as|sa).*$"))
print(out)
# --8<-- [end:expansion_by_regex]

# Select by data type: every Int64, UInt32 and Boolean column, reporting the
# number of unique values in each.
# (This section was previously defined twice verbatim; a single definition is
# kept so the query runs once and the section name is unique.)
# --8<-- [start:expansion_by_dtype]
out = df.select(pl.col(pl.Int64, pl.UInt32, pl.Boolean).n_unique())
print(out)
# --8<-- [end:expansion_by_dtype]

# Introduce the selectors module under its alias `cs`, then select all integer
# columns together with all string columns.
# --8<-- [start:selectors_intro]
import polars.selectors as cs

out = df.select(cs.integer(), cs.string())
print(out)
# --8<-- [end:selectors_intro]

# Set difference between selectors: the numeric columns minus the first column
# of the frame.
# --8<-- [start:selectors_diff]
out = df.select(cs.numeric() - cs.first())
print(out)
# --8<-- [end:selectors_diff]

# Union with a complement: the column named "rn", or any column that is not
# numeric (`~` negates the selector).
# --8<-- [start:selectors_union]
out = df.select(cs.by_name("rn") | ~cs.numeric())
print(out)
# --8<-- [end:selectors_union]

# Name-based selectors: column names containing the substring "rn", plus column
# names matching the regex ".*_.*" (i.e. names that contain an underscore).
# --8<-- [start:selectors_by_name]
out = df.select(cs.contains("rn"), cs.matches(".*_.*"))
print(out)
# --8<-- [end:selectors_by_name]

# Convert the temporal selector to an expression with `as_expr()` so that the
# `.dt` namespace can be applied to the selected columns.
# --8<-- [start:selectors_to_expr]
out = df.select(cs.temporal().as_expr().dt.to_string("%Y-%h-%d"))
print(out)
# --8<-- [end:selectors_to_expr]

# Utility check: pass the object returned by `cs.temporal()` to `is_selector`
# and print the result.
# --8<-- [start:selectors_is_selector_utility]
from polars.selectors import is_selector

out = cs.temporal()
print(is_selector(out))
# --8<-- [end:selectors_is_selector_utility]

# Utility: print the column names of `df` that the given selection resolves to.
# NOTE(review): `out` is an expression derived from a selector (via `as_expr()`
# and `.dt.to_string`), not a selector itself - confirm that
# `selector_column_names` accepts this in the Polars version the docs pin.
# --8<-- [start:selectors_colnames_utility]
from polars.selectors import selector_column_names

out = cs.temporal().as_expr().dt.to_string("%Y-%h-%d")
print(selector_column_names(df, out))
# --8<-- [end:selectors_colnames_utility]
18 changes: 0 additions & 18 deletions docs/src/python/user-guide/expressions/functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@
np.random.seed(12)
# --8<-- [end:setup]


# --8<-- [start:dataframe]
df = pl.DataFrame(
{
Expand All @@ -19,23 +18,6 @@
print(df)
# --8<-- [end:dataframe]

# --8<-- [start:all]

df_all = df.select([pl.col("*")])

# Is equivalent to
df_all = df.select([pl.all()])
print(df_all)
# --8<-- [end:all]


# --8<-- [start:exclude]

df_exclude = df.select([pl.exclude("groups")])
print(df_exclude)
# --8<-- [end:exclude]


# --8<-- [start:samename]
df_samename = df.select([pl.col("nrs") + 5])
print(df_samename)
Expand Down
134 changes: 134 additions & 0 deletions docs/user-guide/expressions/column_selections.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,134 @@

# Column Selections

Let's create a dataset to use in this section:

{{code_block('user-guide/expressions/column_selections','selectors_df',['DataFrame'])}}

```python exec="on" result="text" session="user-guide/column_selections"
--8<-- "python/user-guide/expressions/column_selections.py:setup"
--8<-- "python/user-guide/expressions/column_selections.py:selectors_df"
```

## Expression Expansion

As we've seen in the previous section, we can select specific columns using the `pl.col` function. `pl.col` can also select multiple columns at once - both as a convenience, and as a way to *expand* a single expression across several columns.

This kind of convenience feature isn't just decorative or syntactic sugar. It allows for a very powerful application of [DRY](https://en.wikipedia.org/wiki/Don%27t_repeat_yourself) principles in your code: a single expression that specifies multiple columns expands into a list of expressions (depending on the DataFrame schema), so you can select multiple columns and run the same computation on each of them!

### Select all, or all but some

We can select all columns in the `DataFrame` object by passing the wildcard `"*"` to `pl.col`, or equivalently by calling `pl.all()`:

{{code_block('user-guide/expressions/column_selections', 'all',['all'])}}

```python exec="on" result="text" session="user-guide/column_selections"
--8<-- "python/user-guide/expressions/column_selections.py:all"
```

Often, we don't just want to include all columns, but include all *while* excluding a few. This can be done easily as well:

{{code_block('user-guide/expressions/column_selections','exclude',['exclude'])}}

```python exec="on" result="text" session="user-guide/column_selections"
--8<-- "python/user-guide/expressions/column_selections.py:exclude"
```

### By multiple strings

Specifying multiple strings allows expressions to *expand* to all matching columns:

{{code_block('user-guide/expressions/column_selections','expansion_by_names',['dt_to_string'])}}

```python exec="on" result="text" session="user-guide/column_selections"
--8<-- "python/user-guide/expressions/column_selections.py:expansion_by_names"
```

### By regular expressions

Regular expressions can also be used to select multiple columns. Wrap the pattern in `^` and `$` so that `pl.col` knows to treat the string as a regex rather than as a literal column name:

{{code_block('user-guide/expressions/column_selections','expansion_by_regex',[''])}}

```python exec="on" result="text" session="user-guide/column_selections"
--8<-- "python/user-guide/expressions/column_selections.py:expansion_by_regex"
```

### By data type

`pl.col` can select multiple columns using Polars data types:

{{code_block('user-guide/expressions/column_selections','expansion_by_dtype',['n_unique'])}}

```python exec="on" result="text" session="user-guide/column_selections"
--8<-- "python/user-guide/expressions/column_selections.py:expansion_by_dtype"
```

## Using `selectors`

Polars also provides *selectors*: an intuitive way to choose columns based on their name, `dtype` or other properties. Selectors are built on top of the `pl.col` functionality described above. The recommended way to use them is to import `polars.selectors` under the alias `cs`.

### By `dtype`

To select just the integer and string columns, we can do:

{{code_block('user-guide/expressions/column_selections','selectors_intro',['selectors'])}}

```python exec="on" result="text" session="user-guide/column_selections"
--8<-- "python/user-guide/expressions/column_selections.py:selectors_intro"
```

### Applying set operations

These *selectors* also allow for set based selection operations. For instance, to select the **numeric** columns **except** the **first** column that indicates row numbers:

{{code_block('user-guide/expressions/column_selections','selectors_diff',['cs_first', 'cs_numeric'])}}

```python exec="on" result="text" session="user-guide/column_selections"
--8<-- "python/user-guide/expressions/column_selections.py:selectors_diff"
```

We can also select the row number by name **and** any **non**-numeric columns:

{{code_block('user-guide/expressions/column_selections','selectors_union',['cs_by_name', 'cs_numeric'])}}

```python exec="on" result="text" session="user-guide/column_selections"
--8<-- "python/user-guide/expressions/column_selections.py:selectors_union"
```

### By patterns and substrings

*Selectors* can also match column names by substring or by regex pattern:

{{code_block('user-guide/expressions/column_selections','selectors_by_name',['cs_contains', 'cs_matches'])}}

```python exec="on" result="text" session="user-guide/column_selections"
--8<-- "python/user-guide/expressions/column_selections.py:selectors_by_name"
```

### Converting to expressions

What if we want to apply a specific operation on the selected columns (i.e. get back to representing them as **expressions** to operate upon)? We can simply convert them using `as_expr` and then proceed as normal:

{{code_block('user-guide/expressions/column_selections','selectors_to_expr',['cs_temporal'])}}

```python exec="on" result="text" session="user-guide/column_selections"
--8<-- "python/user-guide/expressions/column_selections.py:selectors_to_expr"
```

### Debugging `selectors`

Polars also provides two helpful utility functions to aid with using selectors: `is_selector` and `selector_column_names`:

{{code_block('user-guide/expressions/column_selections','selectors_is_selector_utility',['is_selector'])}}

```python exec="on" result="text" session="user-guide/column_selections"
--8<-- "python/user-guide/expressions/column_selections.py:selectors_is_selector_utility"
```

`selector_column_names` determines ahead of time which column names will be selected, which is especially useful for a `LazyFrame` object:

{{code_block('user-guide/expressions/column_selections','selectors_colnames_utility',['selector_column_names'])}}

```python exec="on" result="text" session="user-guide/column_selections"
--8<-- "python/user-guide/expressions/column_selections.py:selectors_colnames_utility"
```
26 changes: 4 additions & 22 deletions docs/user-guide/expressions/functions.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,25 +11,7 @@ In the examples below we will use the following `DataFrame`:
--8<-- "python/user-guide/expressions/functions.py:dataframe"
```


#### Column Selection

There are various convenience methods to select multiple or all columns.

##### Select All Columns

{{code_block('user-guide/expressions/functions','all',['all'])}}


##### Select All Columns Except

{{code_block('user-guide/expressions/functions','exclude',['exclude'])}}

```python exec="on" result="text" session="user-guide/functions"
--8<-- "python/user-guide/expressions/functions.py:exclude"
```

#### Column Naming
## Column Naming

By default, an expression keeps the name of the original column. In the example below we perform an expression on the `nrs` column. Note that the output column in the resulting `DataFrame` still has the same name.

Expand Down Expand Up @@ -67,7 +49,7 @@ In case of multiple columns for example when using `all()` or `col(*)` you can a
[:material-api: `suffix`](https://pola-rs.github.io/polars/py-polars/html/reference/expressions/api/polars.Expr.suffix.html)
[:material-api: `map_alias`](https://pola-rs.github.io/polars/py-polars/html/reference/expressions/api/polars.Expr.map_alias.html)

#### Count Unique Values
## Count Unique Values

There are two ways to count unique values in `Polars`: an exact methodology and an approximation. The approximation uses the [HyperLogLog++](https://en.wikipedia.org/wiki/HyperLogLog) algorithm to approximate the cardinality and is especially useful for very large datasets where an approximation is good enough.

Expand All @@ -78,9 +60,9 @@ There are two ways to count unique values in `Polars`: an exact methodology and
--8<-- "python/user-guide/expressions/functions.py:countunique"
```

#### Conditionals
## Conditionals

`Polars` supports if-like conditions in expression with the `when`, `then`, `otherwise` syntax. The predicate is placed in the `when` clause and when this evaluates to `true` the `then` expression is applied otherwise the `otherwise` expression is applied (row-wise).
`Polars` supports if-else like conditions in expressions with the `when`, `then`, `otherwise` syntax. The predicate is placed in the `when` clause and when this evaluates to `true` the `then` expression is applied otherwise the `otherwise` expression is applied (row-wise).

{{code_block('user-guide/expressions/functions','conditional',['when'])}}

Expand Down
1 change: 1 addition & 0 deletions mkdocs.yml
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,7 @@ nav:
- user-guide/concepts/streaming.md
- Expressions:
- user-guide/expressions/operators.md
- user-guide/expressions/column_selections.md
- user-guide/expressions/functions.md
- user-guide/expressions/casting.md
- user-guide/expressions/strings.md
Expand Down

0 comments on commit 1980708

Please sign in to comment.