diff --git a/API_REFERENCE_LINKS.yaml b/API_REFERENCE_LINKS.yaml index 2c8b71bc7..d82f11221 100644 --- a/API_REFERENCE_LINKS.yaml +++ b/API_REFERENCE_LINKS.yaml @@ -58,6 +58,34 @@ python: apply: https://pola-rs.github.io/polars/py-polars/html/reference/expressions/api/polars.apply.html over: https://pola-rs.github.io/polars/py-polars/html/reference/expressions/api/polars.Expr.over.html implode: https://pola-rs.github.io/polars/py-polars/html/reference/expressions/api/polars.Expr.implode.html + dt_to_string: + link: https://pola-rs.github.io/polars/py-polars/html/reference/expressions/api/polars.Expr.dt.to_string.html + name: dt.to_string + selectors: https://pola-rs.github.io/polars/py-polars/html/reference/selectors.html + cs_numeric: + link: https://pola-rs.github.io/polars/py-polars/html/reference/selectors.html#polars.selectors.numeric + name: cs.numeric + cs_by_name: + link: https://pola-rs.github.io/polars/py-polars/html/reference/selectors.html#polars.selectors.by_name + name: cs.by_name + cs_first: + link: https://pola-rs.github.io/polars/py-polars/html/reference/selectors.html#polars.selectors.first + name: cs.first + cs_temporal: + link: https://pola-rs.github.io/polars/py-polars/html/reference/selectors.html#polars.selectors.temporal + name: cs.temporal + cs_contains: + link: https://pola-rs.github.io/polars/py-polars/html/reference/selectors.html#polars.selectors.contains + name: cs.contains + cs_matches: + link: https://pola-rs.github.io/polars/py-polars/html/reference/selectors.html#polars.selectors.matches + name: cs.matches + is_selector: + link: https://pola-rs.github.io/polars/py-polars/html/reference/selectors.html#polars.selectors.is_selector + name: is_selector + selector_column_names: + link: https://pola-rs.github.io/polars/py-polars/html/reference/selectors.html#polars.selectors.selector_column_names + name: selector_column_names DataFrame.explode: 
https://pola-rs.github.io/polars/py-polars/html/reference/dataframe/api/polars.DataFrame.explode.html read_database_connectorx: name: read_database diff --git a/docs/src/python/user-guide/expressions/column_selections.py b/docs/src/python/user-guide/expressions/column_selections.py new file mode 100644 index 000000000..c17dac485 --- /dev/null +++ b/docs/src/python/user-guide/expressions/column_selections.py @@ -0,0 +1,96 @@ +# --8<-- [start:setup] +import polars as pl + +# --8<-- [end:setup] + +# --8<-- [start:selectors_df] +from datetime import date, datetime + +df = pl.DataFrame( + { + "id": [9, 4, 2], + "place": ["Mars", "Earth", "Saturn"], + "date": pl.date_range(date(2022, 1, 1), date(2022, 1, 3), "1d", eager=True), + "sales": [33.4, 2142134.1, 44.7], + "has_people": [False, True, False], + "logged_at": pl.date_range( + datetime(2022, 12, 1), datetime(2022, 12, 1, 0, 0, 2), "1s", eager=True + ), + } +).with_row_count("rn") +print(df) +# --8<-- [end:selectors_df] + +# --8<-- [start:all] +out = df.select(pl.col("*")) + +# Is equivalent to +out = df.select(pl.all()) +print(out) +# --8<-- [end:all] + +# --8<-- [start:exclude] +out = df.select(pl.col("*").exclude("logged_at", "rn")) +print(out) +# --8<-- [end:exclude] + +# --8<-- [start:expansion_by_names] +out = df.select(pl.col("date", "logged_at").dt.to_string("%Y-%h-%d")) +print(out) +# --8<-- [end:expansion_by_names] + +# --8<-- [start:expansion_by_regex] +out = df.select(pl.col("^.*(as|sa).*$")) +print(out) +# --8<-- [end:expansion_by_regex] + +# --8<-- [start:expansion_by_dtype] +out = df.select(pl.col(pl.Int64, pl.UInt32, pl.Boolean).n_unique()) +print(out) +# --8<-- [end:expansion_by_dtype] + +# --8<-- [start:expansion_by_dtype] +out = df.select(pl.col(pl.Int64, pl.UInt32, pl.Boolean).n_unique()) +print(out) +# --8<-- [end:expansion_by_dtype] + +# --8<-- [start:selectors_intro] +import polars.selectors as cs + +out = df.select(cs.integer(), cs.string()) +print(out) +# --8<-- [end:selectors_intro] + +# 
--8<-- [start:selectors_diff] +out = df.select(cs.numeric() - cs.first()) +print(out) +# --8<-- [end:selectors_diff] + +# --8<-- [start:selectors_union] +out = df.select(cs.by_name("rn") | ~cs.numeric()) +print(out) +# --8<-- [end:selectors_union] + +# --8<-- [start:selectors_by_name] +out = df.select(cs.contains("rn"), cs.matches(".*_.*")) +print(out) +# --8<-- [end:selectors_by_name] + +# --8<-- [start:selectors_to_expr] +out = df.select(cs.temporal().as_expr().dt.to_string("%Y-%h-%d")) +print(out) +# --8<-- [end:selectors_to_expr] + +# --8<-- [start:selectors_is_selector_utility] +from polars.selectors import is_selector + +out = cs.temporal() +print(is_selector(out)) +# --8<-- [end:selectors_is_selector_utility] + +# --8<-- [start:selectors_colnames_utility] +from polars.selectors import selector_column_names + +out = cs.temporal().as_expr().dt.to_string("%Y-%h-%d") +print(selector_column_names(df, out)) +# --8<-- [end:selectors_colnames_utility] diff --git a/docs/src/python/user-guide/expressions/functions.py b/docs/src/python/user-guide/expressions/functions.py index 28d439c47..c75b4c3c3 100644 --- a/docs/src/python/user-guide/expressions/functions.py +++ b/docs/src/python/user-guide/expressions/functions.py @@ -6,7 +6,6 @@ np.random.seed(12) # --8<-- [end:setup] - # --8<-- [start:dataframe] df = pl.DataFrame( { @@ -19,23 +18,6 @@ print(df) # --8<-- [end:dataframe] -# --8<-- [start:all] - -df_all = df.select([pl.col("*")]) - -# Is equivalent to -df_all = df.select([pl.all()]) -print(df_all) -# --8<-- [end:all] - - -# --8<-- [start:exclude] - -df_exclude = df.select([pl.exclude("groups")]) -print(df_exclude) -# --8<-- [end:exclude] - - # --8<-- [start:samename] df_samename = df.select([pl.col("nrs") + 5]) print(df_samename) diff --git a/docs/user-guide/expressions/column_selections.md b/docs/user-guide/expressions/column_selections.md new file mode 100644 index 000000000..298f49014 --- /dev/null +++ b/docs/user-guide/expressions/column_selections.md @@ -0,0 
+1,134 @@ + +# Column Selections + +Let's create a dataset to use in this section: + +{{code_block('user-guide/expressions/column_selections','selectors_df',['DataFrame'])}} + +```python exec="on" result="text" session="user-guide/column_selections" +--8<-- "python/user-guide/expressions/column_selections.py:setup" +--8<-- "python/user-guide/expressions/column_selections.py:selectors_df" +``` + +## Expression Expansion + +As we've seen in the previous section, we can select specific columns using the `pl.col` method. It can also select multiple columns - both as a means of convenience, and to *expand* the expression. + +This kind of convenience feature isn't just decorative or syntactic sugar. It allows for a very powerful application of [DRY](https://en.wikipedia.org/wiki/Don%27t_repeat_yourself) principles in your code: a single expression that specifies multiple columns expands into a list of expressions (depending on the DataFrame schema), so you can select multiple columns and run a computation on all of them at once! + +### Select all, or all but some + +We can select all columns in the `DataFrame` object by providing the argument `*`: + +{{code_block('user-guide/expressions/column_selections', 'all',['all'])}} +```python exec="on" result="text" session="user-guide/column_selections" +--8<-- "python/user-guide/expressions/column_selections.py:all" +``` + +Often, we don't just want to include all columns, but include all *while* excluding a few.
This can be done easily as well: + +{{code_block('user-guide/expressions/column_selections','exclude',['exclude'])}} + +```python exec="on" result="text" session="user-guide/column_selections" +--8<-- "python/user-guide/expressions/column_selections.py:exclude" +``` + +### By multiple strings + +Specifying multiple strings allows expressions to *expand* to all matching columns: + +{{code_block('user-guide/expressions/column_selections','expansion_by_names',['dt_to_string'])}} + +```python exec="on" result="text" session="user-guide/column_selections" +--8<-- "python/user-guide/expressions/column_selections.py:expansion_by_names" +``` + +### By regular expressions + +Multiple columns can also be selected with a regular expression; wrap the pattern in `^` and `$` so that `pl.col` knows a regex selection is expected: + +{{code_block('user-guide/expressions/column_selections','expansion_by_regex',[''])}} + +```python exec="on" result="text" session="user-guide/column_selections" +--8<-- "python/user-guide/expressions/column_selections.py:expansion_by_regex" +``` + +### By data type + +`pl.col` can select multiple columns using Polars data types: + +{{code_block('user-guide/expressions/column_selections','expansion_by_dtype',['n_unique'])}} + +```python exec="on" result="text" session="user-guide/column_selections" +--8<-- "python/user-guide/expressions/column_selections.py:expansion_by_dtype" +``` + +## Using `selectors` + +Polars also allows for the use of intuitive selections for columns based on their name, `dtype` or other properties; and this is built on top of existing functionality outlined in `col` used above. It is recommended to use them by importing and aliasing `polars.selectors` as `cs`.
+ +### By `dtype` + +To select just the integer and string columns, we can do: + +{{code_block('user-guide/expressions/column_selections','selectors_intro',['selectors'])}} + +```python exec="on" result="text" session="user-guide/column_selections" +--8<-- "python/user-guide/expressions/column_selections.py:selectors_intro" +``` + +### Applying set operations + +These *selectors* also allow for set based selection operations. For instance, to select the **numeric** columns **except** the **first** column that indicates row numbers: + +{{code_block('user-guide/expressions/column_selections','selectors_diff',['cs_first', 'cs_numeric'])}} + +```python exec="on" result="text" session="user-guide/column_selections" +--8<-- "python/user-guide/expressions/column_selections.py:selectors_diff" +``` + +We can also select the row number by name **and** any **non**-numeric columns: + +{{code_block('user-guide/expressions/column_selections','selectors_union',['cs_by_name', 'cs_numeric'])}} + +```python exec="on" result="text" session="user-guide/column_selections" +--8<-- "python/user-guide/expressions/column_selections.py:selectors_union" +``` + +### By patterns and substrings + +*Selectors* can also match columns by substring and regex patterns: + +{{code_block('user-guide/expressions/column_selections','selectors_by_name',['cs_contains', 'cs_matches'])}} + +```python exec="on" result="text" session="user-guide/column_selections" +--8<-- "python/user-guide/expressions/column_selections.py:selectors_by_name" +``` + +### Converting to expressions + +What if we want to apply a specific operation on the selected columns (i.e. get back to representing them as **expressions** to operate upon)?
We can simply convert them using `as_expr` and then proceed as normal: + +{{code_block('user-guide/expressions/column_selections','selectors_to_expr',['cs_temporal'])}} + +```python exec="on" result="text" session="user-guide/column_selections" +--8<-- "python/user-guide/expressions/column_selections.py:selectors_to_expr" +``` + +### Debugging `selectors` + +Polars also provides two helpful utility functions to aid with using selectors: `is_selector` and `selector_column_names`: + +{{code_block('user-guide/expressions/column_selections','selectors_is_selector_utility',['is_selector'])}} + +```python exec="on" result="text" session="user-guide/column_selections" +--8<-- "python/user-guide/expressions/column_selections.py:selectors_is_selector_utility" +``` + +To determine ahead of time which column names are selected, which is especially useful for a `LazyFrame` object, use `selector_column_names`: + +{{code_block('user-guide/expressions/column_selections','selectors_colnames_utility',['selector_column_names'])}} + +```python exec="on" result="text" session="user-guide/column_selections" +--8<-- "python/user-guide/expressions/column_selections.py:selectors_colnames_utility" +``` diff --git a/docs/user-guide/expressions/functions.md b/docs/user-guide/expressions/functions.md index be12822cb..40d6e1ac5 100644 --- a/docs/user-guide/expressions/functions.md +++ b/docs/user-guide/expressions/functions.md @@ -11,25 +11,7 @@ In the examples below we will use the following `DataFrame`: --8<-- "python/user-guide/expressions/functions.py:dataframe" ``` - -#### Column Selection - -There are various convenience methods to select multiple or all columns.
- -##### Select All Columns - -{{code_block('user-guide/expressions/functions','all',['all'])}} - - -##### Select All Columns Except - -{{code_block('user-guide/expressions/functions','exclude',['exclude'])}} - -```python exec="on" result="text" session="user-guide/functions" ---8<-- "python/user-guide/expressions/functions.py:exclude" -``` - -#### Column Naming +## Column Naming By default if you perform an expression it will keep the same name as the original column. In the example below we perform an expression on the `nrs` column. Note that the output `DataFrame` still has the same name. @@ -67,7 +49,7 @@ In case of multiple columns for example when using `all()` or `col(*)` you can a [:material-api: `suffix`](https://pola-rs.github.io/polars/py-polars/html/reference/expressions/api/polars.Expr.suffix.html) [:material-api: `map_alias`](https://pola-rs.github.io/polars/py-polars/html/reference/expressions/api/polars.Expr.map_alias.html) -#### Count Unique Values +## Count Unique Values There are two ways to count unique values in `Polars`: an exact methodology and an approximation. The approximation uses the [HyperLogLog++](https://en.wikipedia.org/wiki/HyperLogLog) algorithm to approximate the cardinality and is especially useful for very large datasets where an approximation is good enough. @@ -78,9 +60,9 @@ There are two ways to count unique values in `Polars`: an exact methodology and --8<-- "python/user-guide/expressions/functions.py:countunique" ``` -#### Conditionals +## Conditionals -`Polars` supports if-like conditions in expression with the `when`, `then`, `otherwise` syntax. The predicate is placed in the `when` clause and when this evaluates to `true` the `then` expression is applied otherwise the `otherwise` expression is applied (row-wise). +`Polars` supports if-else like conditions in expressions with the `when`, `then`, `otherwise` syntax. 
The predicate is placed in the `when` clause and when this evaluates to `true` the `then` expression is applied otherwise the `otherwise` expression is applied (row-wise). {{code_block('user-guide/expressions/functions','conditional',['when'])}} diff --git a/mkdocs.yml b/mkdocs.yml index 6af6fff36..0aab7ffae 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -86,6 +86,7 @@ nav: - user-guide/concepts/streaming.md - Expressions: - user-guide/expressions/operators.md + - user-guide/expressions/column_selections.md - user-guide/expressions/functions.md - user-guide/expressions/casting.md - user-guide/expressions/strings.md