diff --git a/crates/polars-lazy/src/frame/mod.rs b/crates/polars-lazy/src/frame/mod.rs index d7b11f7ab7fa..92efbd1d2173 100644 --- a/crates/polars-lazy/src/frame/mod.rs +++ b/crates/polars-lazy/src/frame/mod.rs @@ -252,7 +252,7 @@ impl LazyFrame { /// Return a String describing the logical plan. /// - /// If `optimized` is `true`, explains the optimized plan. If `optimized` is `false, + /// If `optimized` is `true`, explains the optimized plan. If `optimized` is `false`, /// explains the naive, un-optimized plan. pub fn explain(&self, optimized: bool) -> PolarsResult { if optimized { diff --git a/docs/source/_build/API_REFERENCE_LINKS.yml b/docs/source/_build/API_REFERENCE_LINKS.yml index 2a9bc80237dc..1fea34db3465 100644 --- a/docs/source/_build/API_REFERENCE_LINKS.yml +++ b/docs/source/_build/API_REFERENCE_LINKS.yml @@ -26,6 +26,7 @@ python: is_duplicated: https://docs.pola.rs/api/python/stable/reference/expressions/api/polars.Expr.is_duplicated.html sample: https://docs.pola.rs/api/python/stable/reference/dataframe/api/polars.DataFrame.sample.html head: https://docs.pola.rs/api/python/stable/reference/dataframe/api/polars.DataFrame.head.html + glimpse: https://docs.pola.rs/api/python/stable/reference/dataframe/api/polars.DataFrame.glimpse.html tail: https://docs.pola.rs/api/python/stable/reference/dataframe/api/polars.DataFrame.tail.html describe: https://docs.pola.rs/api/python/stable/reference/dataframe/api/polars.DataFrame.describe.html col: https://docs.pola.rs/api/python/stable/reference/expressions/col.html diff --git a/docs/source/src/python/user-guide/concepts/data-types-and-structures.py b/docs/source/src/python/user-guide/concepts/data-types-and-structures.py index e6cd13245d2f..3d08edcbcec9 100644 --- a/docs/source/src/python/user-guide/concepts/data-types-and-structures.py +++ b/docs/source/src/python/user-guide/concepts/data-types-and-structures.py @@ -12,16 +12,16 @@ # --8<-- [end:series-dtype] # --8<-- [start:df] -import datetime as dt +from datetime import date df = pl.DataFrame( { "name": ["Alice Archer", "Ben Brown", "Chloe Cooper", "Daniel Donovan"], "birthdate": [ - dt.date(1997, 1, 10), - dt.date(1985, 2, 15), - dt.date(1983, 3, 22), - dt.date(1981, 4, 30), + date(1997, 1, 10), + date(1985, 2, 15), + date(1983, 3, 22), + date(1981, 4, 30), ], "weight": [57.9, 72.5, 53.6, 83.1], # (kg) "height": [1.56, 1.77, 1.65, 1.75], # (m) @@ -39,6 +39,10 @@ print(df.head(3)) # --8<-- [end:head] +# --8<-- [start:glimpse] +print(df.glimpse(return_as_string=True)) +# --8<-- [end:glimpse] + # --8<-- [start:tail] print(df.tail(3)) # --8<-- [end:tail] diff --git a/docs/source/src/python/user-guide/concepts/expressions.py b/docs/source/src/python/user-guide/concepts/expressions.py index fb057ccec389..7a4cb0637ba3 100644 --- a/docs/source/src/python/user-guide/concepts/expressions.py +++ b/docs/source/src/python/user-guide/concepts/expressions.py @@ -10,16 +10,16 @@ # --8<-- [end:print-expr] # --8<-- [start:df] -import datetime as dt +from datetime import date df = pl.DataFrame( { "name": ["Alice Archer", "Ben Brown", "Chloe Cooper", "Daniel Donovan"], "birthdate": [ - dt.date(1997, 1, 10), - dt.date(1985, 2, 15), - dt.date(1983, 3, 22), - dt.date(1981, 4, 30), + date(1997, 1, 10), + date(1985, 2, 15), + date(1983, 3, 22), + date(1981, 4, 30), ], "weight": [57.9, 72.5, 53.6, 83.1], # (kg) "height": [1.56, 1.77, 1.65, 1.75], # (m) @@ -54,16 +54,16 @@ # --8<-- [start:filter-1] result = df.filter( - pl.col("birthdate").is_between(dt.date(1982, 12, 31), dt.date(1996, 1, 1)), + pl.col("birthdate").is_between(date(1982, 12, 31), date(1996, 1, 1)), pl.col("height") > 1.7, ) print(result) # --8<-- [end:filter-1] # --8<-- [start:group_by-1] -result = df.group_by((pl.col("birthdate").dt.year() // 10 * 10).alias("decade")).agg( - pl.col("name") -) +result = df.group_by( + (pl.col("birthdate").dt.year() // 10 * 10).alias("decade"), +).agg(pl.col("name")) print(result) # --8<-- [end:group_by-1] diff --git a/docs/source/src/python/user-guide/concepts/lazy-vs-eager.py b/docs/source/src/python/user-guide/concepts/lazy-vs-eager.py index ebd684cf1a1d..dd48c65b2378 100644 --- a/docs/source/src/python/user-guide/concepts/lazy-vs-eager.py +++ b/docs/source/src/python/user-guide/concepts/lazy-vs-eager.py @@ -1,5 +1,8 @@ +# --8<-- [start:import] import polars as pl +# --8<-- [end:import] + # --8<-- [start:eager] df = pl.read_csv("docs/assets/data/iris.csv") @@ -18,3 +21,25 @@ df = q.collect() # --8<-- [end:lazy] + +# --8<-- [start:explain] +print(q.explain()) +# --8<-- [end:explain] + +# --8<-- [start:explain-expression-expansion] +schema = pl.Schema( + { + "int_1": pl.Int16, + "int_2": pl.Int32, + "float_1": pl.Float64, + "float_2": pl.Float64, + "float_3": pl.Float64, + } +) + +print( + pl.LazyFrame(schema=schema) + .select((pl.col(pl.Float64) * 1.1).name.suffix("*1.1")) + .explain() +) +# --8<-- [end:explain-expression-expansion] diff --git a/docs/source/src/rust/user-guide/concepts/lazy-vs-eager.rs b/docs/source/src/rust/user-guide/concepts/lazy-vs-eager.rs index cbebb6a46a3f..955111ac2c11 100644 --- a/docs/source/src/rust/user-guide/concepts/lazy-vs-eager.rs +++ b/docs/source/src/rust/user-guide/concepts/lazy-vs-eager.rs @@ -28,5 +28,15 @@ fn main() -> Result<(), Box> { println!("{}", df); // --8<-- [end:lazy] + // --8<-- [start:explain] + let q = LazyCsvReader::new("docs/assets/data/iris.csv") + .with_has_header(true) + .finish()? + .filter(col("sepal_length").gt(lit(5))) + .group_by(vec![col("species")]) + .agg([col("sepal_width").mean()]); + println!("{:?}", q.explain(true)); + // --8<-- [end:explain] + Ok(()) } diff --git a/docs/source/user-guide/concepts/streaming.md b/docs/source/user-guide/concepts/_streaming.md similarity index 91% rename from docs/source/user-guide/concepts/streaming.md rename to docs/source/user-guide/concepts/_streaming.md index 2cce3100ac8e..e4427c10481a 100644 --- a/docs/source/user-guide/concepts/streaming.md +++ b/docs/source/user-guide/concepts/_streaming.md @@ -1,5 +1,7 @@ # Streaming + + One additional benefit of the lazy API is that it allows queries to be executed in a streaming manner. Instead of processing all the data at once, Polars can execute the query in batches allowing you to process datasets that do not fit in memory. To tell Polars we want to execute a query in streaming mode we pass the `streaming=True` argument to `collect` diff --git a/docs/source/user-guide/concepts/data-types-and-structures.md b/docs/source/user-guide/concepts/data-types-and-structures.md index 04e3609bca76..2de8120f05a3 100644 --- a/docs/source/user-guide/concepts/data-types-and-structures.md +++ b/docs/source/user-guide/concepts/data-types-and-structures.md @@ -4,13 +4,13 @@ Polars supports a variety of data types that fall broadly under the following categories: -- Numeric data types: signed integers, unsigned integers, and floating point numbers. +- Numeric data types: signed integers, unsigned integers, floating point numbers, and decimals. - Nested data types: lists, structs, and arrays. -- Temporal: dates, times, and time deltas. -- Miscellaneous: structs, strings, categoricals, enums, and more. +- Temporal: dates, datetimes, times, and time deltas. +- Miscellaneous: strings, binary data, Booleans, categoricals, enums, and objects. All types support missing values represented by the special value `null`. -The numerical types also support the special value `NaN`, about which you can read more in the [section about floating point numbers](#floating-point-numbers). +This is not to be conflated with the special value `NaN` in floating number data types; see the [section about floating point numbers](#floating-point-numbers) for more information. You can also find a [full table with all data types supported in the appendix](#appendix-full-data-types-table) with notes on when to use each data type and with links to relevant parts of the documentation. @@ -66,6 +66,25 @@ By default, you get the first 5 rows but you can also specify the number of rows --8<-- "python/user-guide/concepts/data-types-and-structures.py:head" ``` +#### Glimpse + +The function `glimpse` is another function that shows the values of the first few rows of a dataframe, but formats the output differently from `head`. +Here, each line of the output corresponds to a single column, making it easier to take inspect wider dataframes: + +=== ":fontawesome-brands-python: Python" +[:material-api: `glimpse`](https://docs.pola.rs/api/python/stable/reference/dataframe/api/polars.DataFrame.glimpse.html) + +```python +--8<-- "python/user-guide/concepts/data-types-and-structures.py:glimpse" +``` + +```python exec="on" result="text" session="user-guide/data-types-and-structures" +--8<-- "python/user-guide/concepts/data-types-and-structures.py:glimpse" +``` + +!!! info +`glimpse` is only available for Python users. + #### Tail The function `tail` shows the last rows of a dataframe. @@ -114,10 +133,9 @@ You can check the schema of a dataframe with `schema`: ## Data types internals -Polars is entirely based on the [Arrow Columnar Format](https://arrow.apache.org/docs/format/Columnar.html), an in-memory data structure specification. +Polars utilizes the [Arrow Columnar Format](https://arrow.apache.org/docs/format/Columnar.html) for its data orientation. Following this specification allows Polars to transfer data to/from other tools that also use the Arrow specification with little to no overhead. -Contrary to popular belief, Polars does not draw its performance from Arrow. Polars gets most of its performance from its query engine, the optimizations it performs on your query plans, and from the parallelization that it employs when running [your expressions](expressions-and-contexts.md#expressions). ## Floating point numbers @@ -136,23 +154,23 @@ much larger internal representations than 64-bit floats), and thus some error is ## Appendix: full data types table -| Type(s) | Details | -| ------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `Boolean` | Boolean type that is bit packed efficiently. | -| `Int8`, `Int16`, `Int32`, `Int64` | Varying-precision signed integer types. | -| `UInt8`, `UInt16`, `UInt32`, `UInt64` | Varying-precision unsigned integer types. | -| `Float32`, `Float64` | Varying-precision signed floating point numbers. | -| `Decimal` | Decimal 128-bit type with optional precision and non-negative scale. Use this if you need fine-grained control over the precision of your floats and the operations you make on them. Similar to [Python's `decimal.Decimal`](https://docs.python.org/3/library/decimal.html), but with a different API. | -| `String` | UTF-8 encoded. | -| `Binary` | Stores arbitrary raw binary data. | -| `Date` | Represents a calendar date. | -| `Time` | Represents a time of day. | -| `Datetime` | Represents a calendar date and time of day. | -| `Duration` | Represents a time duration. | -| `Array` | Arrays with a known, fixed shape per series; akin to numpy arrays. [Learn more about how arrays and lists differ and how to work with both](../expressions/lists.md). | -| `List` | Homogeneous 1D container with variable length. [Learn more about how arrays and lists differ and how to work with both](../expressions/lists.md). | -| `Object` | Wraps arbitrary Python objects. | -| `Categorical` | Efficient encoding of string data where the categories are inferred at runtime. [Learn more about how categoricals and enums differ and how to work with both](../expressions/categorical-data-and-enums.md). | -| `Enum` | Efficient ordered encoding of a set of predetermined string categories. [Learn more about how categoricals and enums differ and how to work with both](../expressions/categorical-data-and-enums.md). | -| `Struct` | Composite type that is useful when you need to pass multiple parameters into a single expression. [Learn more about the data type `Struct` in its dedicated documentation section.](../expressions/structs.md). | -| `Null` | Represents null values. | +| Type(s) | Details | +| ------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| `Boolean` | Boolean type that is bit packed efficiently. | +| `Int8`, `Int16`, `Int32`, `Int64` | Varying-precision signed integer types. | +| `UInt8`, `UInt16`, `UInt32`, `UInt64` | Varying-precision unsigned integer types. | +| `Float32`, `Float64` | Varying-precision signed floating point numbers. | +| `Decimal` | Decimal 128-bit type with optional precision and non-negative scale. Use this if you need fine-grained control over the precision of your floats and the operations you make on them. See [Python's `decimal.Decimal`](https://docs.python.org/3/library/decimal.html) for documentation on what a decimal data type is. | +| `String` | Variable length UTF-8 encoded string data, typically Human-readable. | +| `Binary` | Stores arbitrary, varying length raw binary data. | +| `Date` | Represents a calendar date. | +| `Time` | Represents a time of day. | +| `Datetime` | Represents a calendar date and time of day. | +| `Duration` | Represents a time duration. | +| `Array` | Arrays with a known, fixed shape per series; akin to numpy arrays. [Learn more about how arrays and lists differ and how to work with both](../expressions/lists.md). | +| `List` | Homogeneous 1D container with variable length. [Learn more about how arrays and lists differ and how to work with both](../expressions/lists.md). | +| `Object` | Wraps arbitrary Python objects. | +| `Categorical` | Efficient encoding of string data where the categories are inferred at runtime. [Learn more about how categoricals and enums differ and how to work with both](../expressions/categorical-data-and-enums.md). | +| `Enum` | Efficient ordered encoding of a set of predetermined string categories. [Learn more about how categoricals and enums differ and how to work with both](../expressions/categorical-data-and-enums.md). | +| `Struct` | Composite product type that can store multiple fields. [Learn more about the data type `Struct` in its dedicated documentation section.](../expressions/structs.md). | +| `Null` | Represents null values. | diff --git a/docs/source/user-guide/concepts/expressions-and-contexts.md b/docs/source/user-guide/concepts/expressions-and-contexts.md index eed722b6cd0a..4ec537b71fb9 100644 --- a/docs/source/user-guide/concepts/expressions-and-contexts.md +++ b/docs/source/user-guide/concepts/expressions-and-contexts.md @@ -27,14 +27,14 @@ The code above expresses an abstract computation that we can save in a variable, --8<-- "python/user-guide/concepts/expressions.py:print-expr" ``` -The output above shows that Polars creates an intermediate representation of your expressions. -This intermediate representation can then be parsed and optimized by the Polars query engine when it is time to actually run the expression. +Because expressions are lazy, no computations have taken place yet. +That's what we need contexts for. ## Contexts Polars expressions need a _context_ in which they are executed to produce a result. Depending on the context it is used in, the same Polars expression can produce different results. -In this section, we will learn about the four simplest contexts that Polars provides[^1]: +In this section, we will learn about the four most common contexts that Polars provides[^1]: 1. `select` 2. `with_columns` @@ -60,9 +60,9 @@ The context `select` may produce new columns that are aggregations, combinations --8<-- "python/user-guide/concepts/expressions.py:select-1" ``` -The expressions in a context `select` must produce series that are all the same length or have length 1. -Series that have length 1 will be broadcast to match the length of the remaining series. -Literals, like the number used above, are treated as series of length 1 and are also broadcast. +The expressions in a context `select` must produce series that are all the same length or they must produce a scalar. +Scalars will be broadcast to match the length of the remaining series. +Literals, like the number used above, are also broadcast. Note that broadcasting can also occur within expressions. For instance, consider the expression below: @@ -81,7 +81,7 @@ This is also true of the other contexts that we will see next. ### `with_columns` The context `with_columns` is very similar to the context `select`. -The main difference between the two is that the context `with_columns` retains the original columns and adds new ones according to its input expressions, whereas the context `select` drops the original columns: +The main difference between the two is that the context `with_columns` creates a new dataframe that contains the columns from the original dataframe and the new columns according to its input expressions, whereas the context `select` only includes the columns selected by its input expressions: {{code_block('user-guide/concepts/expressions','with_columns-1',['with_columns'])}} @@ -89,6 +89,8 @@ The main difference between the two is that the context `with_columns` retains t --8<-- "python/user-guide/concepts/expressions.py:with_columns-1" ``` +Because of this difference between `select` and `with_columns`, the expressions used in a context `with_columns` must produce series that have the same length as the original columns in the dataframe, whereas it is enough for the expressions in the context `select` to produce series that have the same length among them. + ### `filter` The context `filter` filters the rows of a dataframe based on one or more expressions that evaluate to the Boolean data type. @@ -149,6 +151,14 @@ pl.col("weight", "height").mean().name.prefix("avg_") ``` will compute the mean value of the columns “weight” and “height” and will rename them as “avg_weight” and “avg_height”, respectively. +In fact, the expression above is equivalent to using the two following expressions: + +```python +[ + pl.col("weight").mean().alias("avg_weight"), + pl.col("height").mean().alias("avg_height"), +] +``` In this case, this expression expands into two independent expressions that Polars can execute in parallel. In other cases, we may not be able to know in advance how many independent expressions an expression will unfold into. @@ -169,7 +179,7 @@ In the case of the dataframe we have been using, it applies to two columns: --8<-- "python/user-guide/concepts/expressions.py:expression-expansion-1" ``` -In the case of the dataframe `df2` below, the same expression expands to 0 columns: +In the case of the dataframe `df2` below, the same expression expands to 0 columns because no column has the data type `Float64`: {{code_block('user-guide/concepts/expressions','expression-expansion-2',['group_by'])}} @@ -179,6 +189,8 @@ In the case of the dataframe `df2` below, the same expression expands to 0 colum It is equally easy to imagine a scenario where the same expression would expand to dozens of columns. +Next, you will learn about [the lazy API and the function `explain`](lazy-api.md#previewing-the-query-plan), which you can use to preview what an expression will expand to given a schema. + ## Conclusion Because expressions are lazy, when you use an expression inside a context Polars can try to simplify your expression before running the data transformation it expresses. diff --git a/docs/source/user-guide/concepts/index.md b/docs/source/user-guide/concepts/index.md index c1064c1ababb..c4b28e50721f 100644 --- a/docs/source/user-guide/concepts/index.md +++ b/docs/source/user-guide/concepts/index.md @@ -5,4 +5,3 @@ This chapter describes the core concepts of the Polars API. Understanding these - [Data types and structures](data-types-and-structures.md) - [Expressions and contexts](expressions-and-contexts.md) - [Lazy API](lazy-api.md) -- [Streaming](streaming.md) diff --git a/docs/source/user-guide/concepts/lazy-api.md b/docs/source/user-guide/concepts/lazy-api.md index f6c7255da2df..85b985e1a74c 100644 --- a/docs/source/user-guide/concepts/lazy-api.md +++ b/docs/source/user-guide/concepts/lazy-api.md @@ -24,6 +24,42 @@ These will significantly lower the load on memory & CPU thus allowing you to fit In many cases the eager API is actually calling the lazy API under the hood and immediately collecting the result. This has the benefit that within the query itself optimization(s) made by the query planner can still take place. -### When to use which +## When to use which In general, the lazy API should be preferred unless you are either interested in the intermediate results or are doing exploratory work and don't know yet what your query is going to look like. + +## Previewing the query plan + +When using the lazy API you can use the function `explain` to ask Polars to create a description of the query plan that will be executed once you collect the results. +This can be useful if you want to see what types of optimizations Polars performs on your queries. +We can ask Polars to explain the query `q` we defined above: + +{{code_block('user-guide/concepts/lazy-vs-eager','explain',['explain'])}} + +```python exec="on" result="text" session="user-guide/concepts/lazy-api" +--8<-- "python/user-guide/concepts/lazy-vs-eager.py:import" +--8<-- "python/user-guide/concepts/lazy-vs-eager.py:lazy" +--8<-- "python/user-guide/concepts/lazy-vs-eager.py:explain" +``` + +Immediately, we can see in the explanation that Polars did apply predicate pushdown, as it is only reading rows where the sepal length is greater than 5, and it did apply projection pushdown, as it is only reading the columns that are needed by the query. + +The function `explain` can also be used to see how expression expansion will unfold in the context of a given schema. +Consider the example expression from the [section on expression expansion](expressions-and-contexts.md#expression-expansion): + +```python +(pl.col(pl.Float64) * 1.1).name.suffix("*1.1") +``` + +We can use `explain` to see how this expression would evaluate against an arbitrary schema: + +=== ":fontawesome-brands-python: Python" +[:material-api: `explain`](https://docs.pola.rs/api/python/stable/reference/lazyframe/api/polars.LazyFrame.explain.html) + +```python +--8<-- "python/user-guide/concepts/lazy-vs-eager.py:explain-expression-expansion" +``` + +```python exec="on" result="text" session="user-guide/concepts/lazy-api" +--8<-- "python/user-guide/concepts/lazy-vs-eager.py:explain-expression-expansion" +``` diff --git a/docs/source/user-guide/expressions/casting.md b/docs/source/user-guide/expressions/casting.md index 3fe9b174e900..f0c625d19f28 100644 --- a/docs/source/user-guide/expressions/casting.md +++ b/docs/source/user-guide/expressions/casting.md @@ -1,6 +1,7 @@ # Casting -Casting converts the [underlying `DataType` of a column](../concepts/data-types-and-structures.md) to a new one. Polars uses Arrow to manage the data in memory and relies on the compute kernels in the [Rust implementation](https://github.com/jorgecarleitao/arrow2) to do the conversion. Casting is available with the `cast()` method. +Casting converts the [underlying `DataType` of a column](../concepts/data-types-and-structures.md) to a new one. +Casting is available with the `cast()` method. The `cast` method includes a `strict` parameter that determines how Polars behaves when it encounters a value that can't be converted from the source `DataType` to the target `DataType`. By default, `strict=True`, which means that Polars will throw an error to notify the user of the failed conversion and provide details on the values that couldn't be cast. On the other hand, if `strict=False`, any values that can't be converted to the target `DataType` will be quietly converted to `null`. diff --git a/mkdocs.yml b/mkdocs.yml index b8eedac80e8b..c180bbfc6b8e 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -19,7 +19,6 @@ nav: - user-guide/concepts/data-types-and-structures.md - user-guide/concepts/expressions-and-contexts.md - user-guide/concepts/lazy-api.md - - user-guide/concepts/streaming.md - Expressions: - user-guide/expressions/index.md - user-guide/expressions/operators.md