diff --git a/docs/src/python/user-guide/concepts/contexts.py b/docs/src/python/user-guide/concepts/contexts.py index 7b13fabc4..91ef2a73e 100644 --- a/docs/src/python/user-guide/concepts/contexts.py +++ b/docs/src/python/user-guide/concepts/contexts.py @@ -20,12 +20,10 @@ # --8<-- [start:select] out = df.select( - [ - pl.sum("nrs"), - pl.col("names").sort(), - pl.col("names").first().alias("first name"), - (pl.mean("nrs") * 10).alias("10xnrs"), - ] + pl.sum("nrs"), + pl.col("names").sort(), + pl.col("names").first().alias("first name"), + (pl.mean("nrs") * 10).alias("10xnrs"), ) print(out) # --8<-- [end:select] @@ -38,10 +36,8 @@ # --8<-- [start:with_columns] df = df.with_columns( - [ - pl.sum("nrs").alias("nrs_sum"), - pl.col("random").count().alias("count"), - ] + pl.sum("nrs").alias("nrs_sum"), + pl.col("random").count().alias("count"), ) print(df) # --8<-- [end:with_columns] @@ -49,13 +45,11 @@ # --8<-- [start:groupby] out = df.groupby("groups").agg( - [ - pl.sum("nrs"), # sum nrs by groups - pl.col("random").count().alias("count"), # count group members - # sum random where name != null - pl.col("random").filter(pl.col("names").is_not_null()).sum().suffix("_sum"), - pl.col("names").reverse().alias(("reversed names")), - ] + pl.sum("nrs"), # sum nrs by groups + pl.col("random").count().alias("count"), # count group members + # sum random where name != null + pl.col("random").filter(pl.col("names").is_not_null()).sum().suffix("_sum"), + pl.col("names").reverse().alias(("reversed names")), ) print(out) # --8<-- [end:groupby] diff --git a/docs/src/python/user-guide/concepts/expressions.py b/docs/src/python/user-guide/concepts/expressions.py index b56296f37..83e6c4514 100644 --- a/docs/src/python/user-guide/concepts/expressions.py +++ b/docs/src/python/user-guide/concepts/expressions.py @@ -12,10 +12,5 @@ # --8<-- [end:example1] # --8<-- [start:example2] -df.select( - [ - pl.col("foo").sort().head(2), - pl.col("bar").filter(pl.col("foo") == 1).sum(), - ] -) +df.select(pl.col("foo").sort().head(2), pl.col("bar").filter(pl.col("foo") == 1).sum()) # --8<-- [end:example2] diff --git a/docs/src/python/user-guide/expressions/aggregation.py b/docs/src/python/user-guide/expressions/aggregation.py index 99b175833..1ea17b433 100644 --- a/docs/src/python/user-guide/expressions/aggregation.py +++ b/docs/src/python/user-guide/expressions/aggregation.py @@ -25,11 +25,9 @@ dataset.lazy() .groupby("first_name") .agg( - [ - pl.count(), - pl.col("gender"), - pl.first("last_name"), - ] + pl.count(), + pl.col("gender"), + pl.first("last_name"), ) .sort("count", descending=True) .limit(5) @@ -44,10 +42,8 @@ dataset.lazy() .groupby("state") .agg( - [ - (pl.col("party") == "Anti-Administration").sum().alias("anti"), - (pl.col("party") == "Pro-Administration").sum().alias("pro"), - ] + (pl.col("party") == "Anti-Administration").sum().alias("anti"), + (pl.col("party") == "Pro-Administration").sum().alias("pro"), ) .sort("pro", descending=True) .limit(5) @@ -60,8 +56,8 @@ # --8<-- [start:nested] q = ( dataset.lazy() - .groupby(["state", "party"]) - .agg([pl.count("party").alias("count")]) + .groupby("state", "party") + .agg(pl.count("party").alias("count")) .filter( (pl.col("party") == "Anti-Administration") | (pl.col("party") == "Pro-Administration") @@ -91,14 +87,12 @@ def avg_birthday(gender: str) -> pl.Expr: q = ( dataset.lazy() - .groupby(["state"]) + .groupby("state") .agg( - [ - avg_birthday("M"), - avg_birthday("F"), - (pl.col("gender") == "M").sum().alias("# male"), - (pl.col("gender") == "F").sum().alias("# female"), - ] + avg_birthday("M"), + avg_birthday("F"), + (pl.col("gender") == "M").sum().alias("# male"), + (pl.col("gender") == "F").sum().alias("# female"), ) .limit(5) ) @@ -116,12 +110,10 @@ def get_person() -> pl.Expr: q = ( dataset.lazy() .sort("birthday", descending=True) - .groupby(["state"]) + .groupby("state") .agg( - [ - get_person().first().alias("youngest"), - get_person().last().alias("oldest"), - ] + get_person().first().alias("youngest"), + get_person().last().alias("oldest"), ) .limit(5) ) @@ -139,13 +131,11 @@ def get_person() -> pl.Expr: q = ( dataset.lazy() .sort("birthday", descending=True) - .groupby(["state"]) + .groupby("state") .agg( - [ - get_person().first().alias("youngest"), - get_person().last().alias("oldest"), - get_person().sort().first().alias("alphabetical_first"), - ] + get_person().first().alias("youngest"), + get_person().last().alias("oldest"), + get_person().sort().first().alias("alphabetical_first"), ) .limit(5) ) @@ -163,14 +153,12 @@ def get_person() -> pl.Expr: q = ( dataset.lazy() .sort("birthday", descending=True) - .groupby(["state"]) + .groupby("state") .agg( - [ - get_person().first().alias("youngest"), - get_person().last().alias("oldest"), - get_person().sort().first().alias("alphabetical_first"), - pl.col("gender").sort_by("first_name").first().alias("gender"), - ] + get_person().first().alias("youngest"), + get_person().last().alias("oldest"), + get_person().sort().first().alias("alphabetical_first"), + pl.col("gender").sort_by("first_name").first().alias("gender"), ) .sort("state") .limit(5) diff --git a/docs/src/python/user-guide/expressions/casting.py b/docs/src/python/user-guide/expressions/casting.py index d403faa80..189e72f55 100644 --- a/docs/src/python/user-guide/expressions/casting.py +++ b/docs/src/python/user-guide/expressions/casting.py @@ -19,13 +19,11 @@ # --8<-- [start:castnum] out = df.select( - [ - pl.col("integers").cast(pl.Float32).alias("integers_as_floats"), - pl.col("floats").cast(pl.Int32).alias("floats_as_integers"), - pl.col("floats_with_decimal") - .cast(pl.Int32) - .alias("floats_with_decimal_as_integers"), - ] + pl.col("integers").cast(pl.Float32).alias("integers_as_floats"), + pl.col("floats").cast(pl.Int32).alias("floats_as_integers"), + pl.col("floats_with_decimal") + .cast(pl.Int32) + .alias("floats_with_decimal_as_integers"), ) print(out) # --8<-- [end:castnum] @@ -33,24 +31,22 @@ # --8<-- [start:downcast] out = df.select( - [ - pl.col("integers").cast(pl.Int16).alias("integers_smallfootprint"), - pl.col("floats").cast(pl.Float32).alias("floats_smallfootprint"), - ] + pl.col("integers").cast(pl.Int16).alias("integers_smallfootprint"), + pl.col("floats").cast(pl.Float32).alias("floats_smallfootprint"), ) print(out) # --8<-- [end:downcast] # --8<-- [start:overflow] try: - out = df.select([pl.col("big_integers").cast(pl.Int8)]) + out = df.select(pl.col("big_integers").cast(pl.Int8)) print(out) except Exception as e: print(e) # --8<-- [end:overflow] # --8<-- [start:overflow2] -out = df.select([pl.col("big_integers").cast(pl.Int8, strict=False)]) +out = df.select(pl.col("big_integers").cast(pl.Int8, strict=False)) print(out) # --8<-- [end:overflow2] @@ -65,24 +61,18 @@ ) out = df.select( - [ - pl.col("integers").cast(pl.Utf8), - pl.col("float").cast(pl.Utf8), - pl.col("floats_as_string").cast(pl.Float64), - ] + pl.col("integers").cast(pl.Utf8), + pl.col("float").cast(pl.Utf8), + pl.col("floats_as_string").cast(pl.Float64), ) print(out) # --8<-- [end:strings] # --8<-- [start:strings2] -df = pl.DataFrame( - { - "strings_not_float": ["4.0", "not_a_number", "6.0", "7.0", "8.0"], - } -) +df = pl.DataFrame({"strings_not_float": ["4.0", "not_a_number", "6.0", "7.0", "8.0"]}) try: - out = df.select([pl.col("strings_not_float").cast(pl.Float64)]) + out = df.select(pl.col("strings_not_float").cast(pl.Float64)) print(out) except Exception as e: print(e) @@ -97,12 +87,7 @@ } ) -out = df.select( - [ - pl.col("integers").cast(pl.Boolean), - pl.col("floats").cast(pl.Boolean), - ] -) +out = df.select(pl.col("integers").cast(pl.Boolean), pl.col("floats").cast(pl.Boolean)) print(out) # --8<-- [end:bool] @@ -118,7 +103,7 @@ } ) -out = df.select([pl.col("date").cast(pl.Int64), pl.col("datetime").cast(pl.Int64)]) +out = df.select(pl.col("date").cast(pl.Int64), pl.col("datetime").cast(pl.Int64)) print(out) # --8<-- [end:dates] @@ -137,10 +122,8 @@ ) out = df.select( - [ - pl.col("date").dt.strftime("%Y-%m-%d"), - pl.col("string").str.strptime(pl.Datetime, "%Y-%m-%d"), - ] + pl.col("date").dt.strftime("%Y-%m-%d"), + pl.col("string").str.strptime(pl.Datetime, "%Y-%m-%d"), ) print(out) # --8<-- [end:dates2] diff --git a/docs/src/python/user-guide/expressions/folds.py b/docs/src/python/user-guide/expressions/folds.py index f2fd50a4f..803591b5b 100644 --- a/docs/src/python/user-guide/expressions/folds.py +++ b/docs/src/python/user-guide/expressions/folds.py @@ -45,10 +45,6 @@ } ) -out = df.select( - [ - pl.concat_str(["a", "b"]), - ] -) +out = df.select(pl.concat_str(["a", "b"])) print(out) # --8<-- [end:string] diff --git a/docs/src/python/user-guide/expressions/functions.py b/docs/src/python/user-guide/expressions/functions.py index c75b4c3c3..15905fb30 100644 --- a/docs/src/python/user-guide/expressions/functions.py +++ b/docs/src/python/user-guide/expressions/functions.py @@ -19,14 +19,14 @@ # --8<-- [end:dataframe] # --8<-- [start:samename] -df_samename = df.select([pl.col("nrs") + 5]) +df_samename = df.select(pl.col("nrs") + 5) print(df_samename) # --8<-- [end:samename] # --8<-- [start:samenametwice] try: - df_samename2 = df.select([pl.col("nrs") + 5, pl.col("nrs") - 5]) + df_samename2 = df.select(pl.col("nrs") + 5, pl.col("nrs") - 5) print(df_samename2) except Exception as e: print(e) @@ -34,33 +34,27 @@ # --8<-- [start:samenamealias] df_alias = df.select( - [ - (pl.col("nrs") + 5).alias("nrs + 5"), - (pl.col("nrs") - 5).alias("nrs - 5"), - ] + (pl.col("nrs") + 5).alias("nrs + 5"), + (pl.col("nrs") - 5).alias("nrs - 5"), ) print(df_alias) # --8<-- [end:samenamealias] # --8<-- [start:countunique] df_alias = df.select( - [ - pl.col("names").n_unique().alias("unique"), - pl.approx_unique("names").alias("unique_approx"), - ] + pl.col("names").n_unique().alias("unique"), + pl.approx_unique("names").alias("unique_approx"), ) print(df_alias) # --8<-- [end:countunique] # --8<-- [start:conditional] df_conditional = df.select( - [ - pl.col("nrs"), - pl.when(pl.col("nrs") > 2) - .then(pl.lit(True)) - .otherwise(pl.lit(False)) - .alias("conditional"), - ] + pl.col("nrs"), + pl.when(pl.col("nrs") > 2) + .then(pl.lit(True)) + .otherwise(pl.lit(False)) + .alias("conditional"), ) print(df_conditional) # --8<-- [end:conditional] diff --git a/docs/src/python/user-guide/expressions/numpy-example.py b/docs/src/python/user-guide/expressions/numpy-example.py index e276eb0ce..d3300591c 100644 --- a/docs/src/python/user-guide/expressions/numpy-example.py +++ b/docs/src/python/user-guide/expressions/numpy-example.py @@ -3,9 +3,5 @@ df = pl.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) -out = df.select( - [ - np.log(pl.all()).suffix("_log"), - ] -) +out = df.select(np.log(pl.all()).suffix("_log")) print(out) diff --git a/docs/src/python/user-guide/expressions/operators.py b/docs/src/python/user-guide/expressions/operators.py index e02157d2a..6f617487c 100644 --- a/docs/src/python/user-guide/expressions/operators.py +++ b/docs/src/python/user-guide/expressions/operators.py @@ -22,12 +22,10 @@ # --8<-- [start:numerical] df_numerical = df.select( - [ - (pl.col("nrs") + 5).alias("nrs + 5"), - (pl.col("nrs") - 5).alias("nrs - 5"), - (pl.col("nrs") * pl.col("random")).alias("nrs * random"), - (pl.col("nrs") / pl.col("random")).alias("nrs / random"), - ] + (pl.col("nrs") + 5).alias("nrs + 5"), + (pl.col("nrs") - 5).alias("nrs - 5"), + (pl.col("nrs") * pl.col("random")).alias("nrs * random"), + (pl.col("nrs") / pl.col("random")).alias("nrs / random"), ) print(df_numerical) @@ -35,14 +33,12 @@ # --8<-- [start:logical] df_logical = df.select( - [ - (pl.col("nrs") > 1).alias("nrs > 1"), - (pl.col("random") <= 0.5).alias("random < .5"), - (pl.col("nrs") != 1).alias("nrs != 1"), - (pl.col("nrs") == 1).alias("nrs == 1"), - ((pl.col("random") <= 0.5) & (pl.col("nrs") > 1)).alias("and_expr"), # and - ((pl.col("random") <= 0.5) | (pl.col("nrs") > 1)).alias("or_expr"), # or - ] + (pl.col("nrs") > 1).alias("nrs > 1"), + (pl.col("random") <= 0.5).alias("random < .5"), + (pl.col("nrs") != 1).alias("nrs != 1"), + (pl.col("nrs") == 1).alias("nrs == 1"), + ((pl.col("random") <= 0.5) & (pl.col("nrs") > 1)).alias("and_expr"), # and + ((pl.col("random") <= 0.5) | (pl.col("nrs") > 1)).alias("or_expr"), # or ) print(df_logical) # --8<-- [end:logical] diff --git a/docs/src/python/user-guide/expressions/strings.py b/docs/src/python/user-guide/expressions/strings.py index 6b3eef3f5..9bec188f8 100644 --- a/docs/src/python/user-guide/expressions/strings.py +++ b/docs/src/python/user-guide/expressions/strings.py @@ -8,23 +8,19 @@ df = pl.DataFrame({"animal": ["Crab", "cat and dog", "rab$bit", None]}) out = df.select( - [ - pl.col("animal").str.lengths().alias("byte_count"), - pl.col("animal").str.n_chars().alias("letter_count"), - ] + pl.col("animal").str.lengths().alias("byte_count"), + pl.col("animal").str.n_chars().alias("letter_count"), ) print(out) # --8<-- [end:df] # --8<-- [start:existence] out = df.select( - [ - pl.col("animal"), - pl.col("animal").str.contains("cat|bit").alias("regex"), - pl.col("animal").str.contains("rab$", literal=True).alias("literal"), - pl.col("animal").str.starts_with("rab").alias("starts_with"), - pl.col("animal").str.ends_with("dog").alias("ends_with"), - ] + pl.col("animal"), + pl.col("animal").str.contains("cat|bit").alias("regex"), + pl.col("animal").str.contains("rab$", literal=True).alias("literal"), + pl.col("animal").str.starts_with("rab").alias("starts_with"), + pl.col("animal").str.ends_with("dog").alias("ends_with"), ) print(out) # --8<-- [end:existence] @@ -40,9 +36,7 @@ } ) out = df.select( - [ - pl.col("a").str.extract(r"candidate=(\w+)", group_index=1), - ] + pl.col("a").str.extract(r"candidate=(\w+)", group_index=1), ) print(out) # --8<-- [end:extract] @@ -51,9 +45,7 @@ # --8<-- [start:extract_all] df = pl.DataFrame({"foo": ["123 bla 45 asd", "xyz 678 910t"]}) out = df.select( - [ - pl.col("foo").str.extract_all(r"(\d+)").alias("extracted_nrs"), - ] + pl.col("foo").str.extract_all(r"(\d+)").alias("extracted_nrs"), ) print(out) # --8<-- [end:extract_all] diff --git a/docs/src/python/user-guide/expressions/user-defined-functions.py b/docs/src/python/user-guide/expressions/user-defined-functions.py index f4eb77707..3df6b3a16 100644 --- a/docs/src/python/user-guide/expressions/user-defined-functions.py +++ b/docs/src/python/user-guide/expressions/user-defined-functions.py @@ -13,10 +13,8 @@ ) out = df.groupby("keys", maintain_order=True).agg( - [ - pl.col("values").map(lambda s: s.shift()).alias("shift_map"), - pl.col("values").shift().alias("shift_expression"), - ] + pl.col("values").map(lambda s: s.shift()).alias("shift_map"), + pl.col("values").shift().alias("shift_expression"), ) print(df) # --8<-- [end:dataframe] @@ -24,10 +22,8 @@ # --8<-- [start:apply] out = df.groupby("keys", maintain_order=True).agg( - [ - pl.col("values").apply(lambda s: s.shift()).alias("shift_map"), - pl.col("values").shift().alias("shift_expression"), - ] + pl.col("values").apply(lambda s: s.shift()).alias("shift_map"), + pl.col("values").shift().alias("shift_expression"), ) print(out) # --8<-- [end:apply] @@ -43,22 +39,18 @@ def add_counter(val: int) -> int: out = df.select( - [ - pl.col("values").apply(add_counter).alias("solution_apply"), - (pl.col("values") + pl.arange(1, pl.count() + 1)).alias("solution_expr"), - ] + pl.col("values").apply(add_counter).alias("solution_apply"), + (pl.col("values") + pl.arange(1, pl.count() + 1)).alias("solution_expr"), ) print(out) # --8<-- [end:counter] # --8<-- [start:combine] out = df.select( - [ - pl.struct(["keys", "values"]) - .apply(lambda x: len(x["keys"]) + x["values"]) - .alias("solution_apply"), - (pl.col("keys").str.lengths() + pl.col("values")).alias("solution_expr"), - ] + pl.struct(["keys", "values"]) + .apply(lambda x: len(x["keys"]) + x["values"]) + .alias("solution_apply"), + (pl.col("keys").str.lengths() + pl.col("values")).alias("solution_expr"), ) print(out) # --8<-- [end:combine] diff --git a/docs/src/python/user-guide/expressions/window.py b/docs/src/python/user-guide/expressions/window.py index 1117b955c..88d73ebc7 100644 --- a/docs/src/python/user-guide/expressions/window.py +++ b/docs/src/python/user-guide/expressions/window.py @@ -11,36 +11,30 @@ # --8<-- [start:groupby] out = df.select( - [ - "Type 1", - "Type 2", - pl.col("Attack").mean().over("Type 1").alias("avg_attack_by_type"), - pl.col("Defense") - .mean() - .over(["Type 1", "Type 2"]) - .alias("avg_defense_by_type_combination"), - pl.col("Attack").mean().alias("avg_attack"), - ] + "Type 1", + "Type 2", + pl.col("Attack").mean().over("Type 1").alias("avg_attack_by_type"), + pl.col("Defense") + .mean() + .over(["Type 1", "Type 2"]) + .alias("avg_defense_by_type_combination"), + pl.col("Attack").mean().alias("avg_attack"), ) print(out) # --8<-- [end:groupby] # --8<-- [start:operations] filtered = df.filter(pl.col("Type 2") == "Psychic").select( - [ - "Name", - "Type 1", - "Speed", - ] + "Name", + "Type 1", + "Speed", ) print(filtered) # --8<-- [end:operations] # --8<-- [start:sort] out = filtered.with_columns( - [ - pl.col(["Name", "Speed"]).sort_by("Speed", descending=True).over("Type 1"), - ] + pl.col(["Name", "Speed"]).sort_by("Speed", descending=True).over("Type 1"), ) print(out) # --8<-- [end:sort] diff --git a/docs/user-guide/migration/pandas.md b/docs/user-guide/migration/pandas.md index 79b7e353c..89cd7c663 100644 --- a/docs/user-guide/migration/pandas.md +++ b/docs/user-guide/migration/pandas.md @@ -83,7 +83,7 @@ df.loc[:,'a'] but in `Polars` you would use the `.select` method: ```python -df.select(['a']) +df.select('a') ``` If you want to select rows based on the values then in `Polars` you use the `.filter` @@ -158,10 +158,10 @@ In `Polars` we add columns to `df` using the `.with_columns` method and name the the `.alias` method: ```python -df.with_columns([ +df.with_columns( (pl.col("value") * 10).alias("tenXValue"), (pl.col("value") * 100).alias("hundredXValue"), -]) +) ``` These column assignments are executed in parallel. @@ -250,10 +250,10 @@ and then joins the result back to the original `DataFrame` producing: In `Polars` the same can be achieved with `window` functions: ```python -df.select([ +df.select( pl.all(), pl.col("type").count().over("c").alias("size") -]) +) ``` ``` @@ -287,12 +287,12 @@ them in a single `select` is both convenient **and** optimal. In the following e we look at a case where we are calculating group statistics over `"c"` twice: ```python -df.select([ +df.select( pl.all(), pl.col("c").count().over("c").alias("size"), pl.col("c").sum().over("type").alias("sum"), pl.col("c").reverse().over("c").flatten().alias("reverse_type") -]) +) ``` ``` diff --git a/docs/user-guide/migration/spark.md b/docs/user-guide/migration/spark.md index ce73803ed..156693f43 100644 --- a/docs/user-guide/migration/spark.md +++ b/docs/user-guide/migration/spark.md @@ -31,10 +31,10 @@ dfs = spark.createDataFrame( In `Polars` you can write something like this: ```python -df.select([ +df.select( pl.col("foo").sort().head(2), pl.col("bar").filter(pl.col("foo") == "d").sum() -]) +) ``` Output: @@ -92,10 +92,10 @@ Output: In `Polars` you can combine two different `head` expressions on the same DataFrame, provided that they return the same number of values. ```python -df.select([ +df.select( pl.col("foo").sort().head(2), pl.col("bar").sort(descending=True).head(2), -]) +) ``` Output: