narwhals.LazyFrame
Narwhals LazyFrame, backed by a native lazyframe.
Warning
This class is not meant to be instantiated directly. Instead, use narwhals.from_native with a native object that is a lazy dataframe from one of the supported backends (e.g. polars.LazyFrame, dask_expr._collection.DataFrame): narwhals.from_native(native_lazyframe).
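For example, wrapping a Polars LazyFrame (a minimal sketch, assuming Polars is installed; the data is illustrative):
>>> import polars as pl
>>> import narwhals as nw
>>> lf = nw.from_native(pl.LazyFrame({"a": [1, 2, 3]}))
>>> lf.columns
['a']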
columns
property
Get column names.
Returns:

Type | Description
---|---
`list[str]` | The column names stored in a list.
Examples:
>>> import polars as pl
>>> import dask.dataframe as dd
>>> import narwhals as nw
>>> from narwhals.typing import IntoFrame
>>>
>>> data = {"foo": [1, 2, 3], "bar": [6.0, 7.0, 8.0], "ham": ["a", "b", "c"]}
>>> lf_pl = pl.LazyFrame(data)
>>> lf_dask = dd.from_dict(data, npartitions=2)
We define a library-agnostic function:
>>> def agnostic_columns(df_native: IntoFrame) -> list[str]:
... df = nw.from_native(df_native)
... return df.columns
We can then pass any supported library such as Polars or Dask to agnostic_columns:
>>> agnostic_columns(lf_pl)
['foo', 'bar', 'ham']
>>> agnostic_columns(lf_dask)
['foo', 'bar', 'ham']
implementation
property
Return the implementation of the native frame.
This can be useful when you need to use special-casing for features outside of Narwhals' scope - for example, when dealing with pandas' Period Dtype.
Returns:

Type | Description
---|---
`Implementation` | The implementation of the native frame.
Examples:
>>> import narwhals as nw
>>> import polars as pl
>>> import dask.dataframe as dd
>>> lf_pl = pl.LazyFrame({"a": [1, 2, 3]})
>>> lf_dask = dd.from_dict({"a": [1, 2, 3]}, npartitions=2)
>>> lf = nw.from_native(lf_pl)
>>> lf.implementation
<Implementation.POLARS: 6>
>>> lf.implementation.is_pandas()
False
>>> lf.implementation.is_polars()
True
>>> lf = nw.from_native(lf_dask)
>>> lf.implementation
<Implementation.DASK: 7>
>>> lf.implementation.is_dask()
True
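As a sketch of the special-casing this enables (the branch taken for pandas here is purely illustrative):
>>> def agnostic_is_pandas_backed(df_native) -> bool:
...     df = nw.from_native(df_native)
...     if df.implementation.is_pandas():
...         # pandas-specific handling (e.g. Period dtypes) would go here.
...         return True
...     return False
>>> agnostic_is_pandas_backed(lf_dask)
False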
schema
property
Get an ordered mapping of column names to their data type.
Returns:

Type | Description
---|---
`Schema` | A Narwhals Schema object that maps column names to their data types.
Examples:
>>> import polars as pl
>>> import dask.dataframe as dd
>>> import narwhals as nw
>>> data = {
... "foo": [1, 2, 3],
... "bar": [6.0, 7.0, 8.0],
... "ham": ["a", "b", "c"],
... }
>>> lf_pl = pl.LazyFrame(data)
>>> lf_dask = dd.from_dict(data, npartitions=2)
>>> lf = nw.from_native(lf_pl)
>>> lf.schema
Schema({'foo': Int64, 'bar': Float64, 'ham': String})
>>> lf = nw.from_native(lf_dask)
>>> lf.schema
Schema({'foo': Int64, 'bar': Float64, 'ham': String})
clone()
Create a copy of this LazyFrame.
Returns:

Type | Description
---|---
`Self` | An identical copy of the original LazyFrame.
Examples:
>>> import narwhals as nw
>>> import polars as pl
>>> from narwhals.typing import IntoFrameT
>>>
>>> data = {"a": [1, 2], "b": [3, 4]}
>>> lf_pl = pl.LazyFrame(data)
Let's define a dataframe-agnostic function in which we copy the DataFrame:
>>> def agnostic_clone(df_native: IntoFrameT) -> IntoFrameT:
... df = nw.from_native(df_native)
... return df.clone().collect().to_native()
We can then pass any supported library such as Polars to agnostic_clone:
>>> agnostic_clone(lf_pl)
shape: (2, 2)
┌─────┬─────┐
│ a ┆ b │
│ --- ┆ --- │
│ i64 ┆ i64 │
╞═════╪═════╡
│ 1 ┆ 3 │
│ 2 ┆ 4 │
└─────┴─────┘
collect()
Materialize this LazyFrame into a DataFrame.
Returns:

Type | Description
---|---
`DataFrame[Any]` | The materialized DataFrame.
Examples:
>>> import narwhals as nw
>>> import polars as pl
>>> import dask.dataframe as dd
>>> data = {
... "a": ["a", "b", "a", "b", "b", "c"],
... "b": [1, 2, 3, 4, 5, 6],
... "c": [6, 5, 4, 3, 2, 1],
... }
>>> lf_pl = pl.LazyFrame(data)
>>> lf_dask = dd.from_dict(data, npartitions=2)
>>> lf = nw.from_native(lf_pl)
>>> lf
┌─────────────────────────────┐
| Narwhals LazyFrame |
|-----------------------------|
|<LazyFrame at ...
└─────────────────────────────┘
>>> df = lf.group_by("a").agg(nw.all().sum()).collect()
>>> df.to_native().sort("a")
shape: (3, 3)
┌─────┬─────┬─────┐
│ a ┆ b ┆ c │
│ --- ┆ --- ┆ --- │
│ str ┆ i64 ┆ i64 │
╞═════╪═════╪═════╡
│ a ┆ 4 ┆ 10 │
│ b ┆ 11 ┆ 10 │
│ c ┆ 6 ┆ 1 │
└─────┴─────┴─────┘
>>> lf = nw.from_native(lf_dask)
>>> lf
┌───────────────────────────────────┐
| Narwhals LazyFrame |
|-----------------------------------|
|Dask DataFrame Structure: |
| a b c|
|npartitions=2 |
|0 string int64 int64|
|3 ... ... ...|
|5 ... ... ...|
|Dask Name: frompandas, 1 expression|
|Expr=df |
└───────────────────────────────────┘
>>> df = lf.group_by("a").agg(nw.col("b", "c").sum()).collect()
>>> df.to_native()
a b c
0 a 4 10
1 b 11 10
2 c 6 1
collect_schema()
Get an ordered mapping of column names to their data type.
Returns:

Type | Description
---|---
`Schema` | A Narwhals Schema object that maps column names to their data types.
Examples:
>>> import polars as pl
>>> import dask.dataframe as dd
>>> import narwhals as nw
>>> data = {
... "foo": [1, 2, 3],
... "bar": [6.0, 7.0, 8.0],
... "ham": ["a", "b", "c"],
... }
>>> lf_pl = pl.LazyFrame(data)
>>> lf_dask = dd.from_dict(data, npartitions=2)
>>> lf = nw.from_native(lf_pl)
>>> lf.collect_schema()
Schema({'foo': Int64, 'bar': Float64, 'ham': String})
>>> lf = nw.from_native(lf_dask)
>>> lf.collect_schema()
Schema({'foo': Int64, 'bar': Float64, 'ham': String})
drop(*columns, strict=True)
Remove columns from the LazyFrame.
Parameters:

Name | Type | Description | Default
---|---|---|---
*columns | `str | Iterable[str]` | Names of the columns that should be removed from the dataframe. | ()
strict | `bool` | Validate that all column names exist in the schema and throw an exception if a column name does not exist in the schema. | True

Returns:

Type | Description
---|---
`Self` | The LazyFrame with the specified columns removed.
Warning
The strict argument is ignored for polars<1.0.0.
Please consider upgrading to a newer version, or using eager mode instead.
Examples:
>>> import polars as pl
>>> import dask.dataframe as dd
>>> import narwhals as nw
>>> from narwhals.typing import IntoFrameT
>>>
>>> data = {"foo": [1, 2, 3], "bar": [6.0, 7.0, 8.0], "ham": ["a", "b", "c"]}
>>> lf_pl = pl.LazyFrame(data)
>>> lf_dask = dd.from_dict(data, npartitions=2)
We define a library-agnostic function:
>>> def agnostic_drop(df_native: IntoFrameT) -> IntoFrameT:
... df = nw.from_native(df_native)
... return df.drop("ham").collect().to_native()
We can then pass any supported library such as Polars or Dask to agnostic_drop:
>>> agnostic_drop(lf_pl)
shape: (3, 2)
┌─────┬─────┐
│ foo ┆ bar │
│ --- ┆ --- │
│ i64 ┆ f64 │
╞═════╪═════╡
│ 1 ┆ 6.0 │
│ 2 ┆ 7.0 │
│ 3 ┆ 8.0 │
└─────┴─────┘
>>> agnostic_drop(lf_dask)
foo bar
0 1 6.0
1 2 7.0
2 3 8.0
Use positional arguments to drop multiple columns.
>>> def agnostic_drop(df_native: IntoFrameT) -> IntoFrameT:
... df = nw.from_native(df_native)
... return df.drop("foo", "ham").collect().to_native()
>>> agnostic_drop(lf_pl)
shape: (3, 1)
┌─────┐
│ bar │
│ --- │
│ f64 │
╞═════╡
│ 6.0 │
│ 7.0 │
│ 8.0 │
└─────┘
>>> agnostic_drop(lf_dask)
bar
0 6.0
1 7.0
2 8.0
drop_nulls(subset=None)
Drop rows that contain null values.
Parameters:

Name | Type | Description | Default
---|---|---|---
subset | `str | list[str] | None` | Column name(s) for which null values are considered. If set to None (default), use all columns. | None

Returns:

Type | Description
---|---
`Self` | The original object with the rows removed that contained the null values.
Notes
pandas handles null values differently from Polars and PyArrow. See null_handling for reference.
Examples:
>>> import polars as pl
>>> import dask.dataframe as dd
>>> import narwhals as nw
>>> from narwhals.typing import IntoFrameT
>>>
>>> data = {"a": [1.0, 2.0, None], "ba": [1.0, None, 2.0]}
>>> lf_pl = pl.LazyFrame(data)
>>> lf_dask = dd.from_dict(data, npartitions=2)
Let's define a dataframe-agnostic function:
>>> def agnostic_drop_nulls(df_native: IntoFrameT) -> IntoFrameT:
... df = nw.from_native(df_native)
... return df.drop_nulls().collect().to_native()
We can then pass any supported library such as Polars or Dask to agnostic_drop_nulls:
>>> agnostic_drop_nulls(lf_pl)
shape: (1, 2)
┌─────┬─────┐
│ a ┆ ba │
│ --- ┆ --- │
│ f64 ┆ f64 │
╞═════╪═════╡
│ 1.0 ┆ 1.0 │
└─────┴─────┘
>>> agnostic_drop_nulls(lf_dask)
a ba
0 1.0 1.0
explode(columns, *more_columns)
Explode the dataframe to long format by exploding the given columns.
Notes
It is possible to explode multiple columns only if these columns have matching element counts.
Parameters:

Name | Type | Description | Default
---|---|---|---
columns | `str | Sequence[str]` | Column names. The underlying columns being exploded must be of the List data type. | required
*more_columns | `str` | Additional names of columns to explode, specified as positional arguments. | ()

Returns:

Type | Description
---|---
`Self` | A new LazyFrame with the exploded columns.
Examples:
>>> import narwhals as nw
>>> from narwhals.typing import IntoFrameT
>>> import polars as pl
>>> data = {
... "a": ["x", "y", "z", "w"],
... "lst1": [[1, 2], None, [None], []],
... "lst2": [[3, None], None, [42], []],
... }
We define a library-agnostic function:
>>> def agnostic_explode(df_native: IntoFrameT) -> IntoFrameT:
... return (
... nw.from_native(df_native)
... .with_columns(nw.col("lst1", "lst2").cast(nw.List(nw.Int32())))
... .explode("lst1", "lst2")
... .collect()
... .to_native()
... )
We can then pass any supported library such as Polars to agnostic_explode:
>>> agnostic_explode(pl.LazyFrame(data))
shape: (5, 3)
┌─────┬──────┬──────┐
│ a ┆ lst1 ┆ lst2 │
│ --- ┆ --- ┆ --- │
│ str ┆ i32 ┆ i32 │
╞═════╪══════╪══════╡
│ x ┆ 1 ┆ 3 │
│ x ┆ 2 ┆ null │
│ y ┆ null ┆ null │
│ z ┆ null ┆ 42 │
│ w ┆ null ┆ null │
└─────┴──────┴──────┘
filter(*predicates, **constraints)
Filter the rows in the LazyFrame based on a predicate expression.
The original order of the remaining rows is preserved.
Parameters:

Name | Type | Description | Default
---|---|---|---
*predicates | `IntoExpr | Iterable[IntoExpr] | list[bool]` | Expression that evaluates to a boolean Series. Can also be a (single!) boolean list. | ()
**constraints | `Any` | Column filters; use name=value to filter a column by the supplied value. Constraints are combined with the other filters using &. | {}

Returns:

Type | Description
---|---
`Self` | The filtered LazyFrame.
Examples:
>>> import polars as pl
>>> import dask.dataframe as dd
>>> import narwhals as nw
>>> from narwhals.typing import IntoFrameT
>>>
>>> data = {
... "foo": [1, 2, 3],
... "bar": [6, 7, 8],
... "ham": ["a", "b", "c"],
... }
>>> lf_pl = pl.LazyFrame(data)
>>> lf_dask = dd.from_dict(data, npartitions=2)
Let's define a dataframe-agnostic function in which we filter on one condition.
>>> def agnostic_filter(df_native: IntoFrameT) -> IntoFrameT:
... df = nw.from_native(df_native)
... return df.filter(nw.col("foo") > 1).collect().to_native()
We can then pass any supported library such as Polars or Dask to agnostic_filter:
>>> agnostic_filter(lf_pl)
shape: (2, 3)
┌─────┬─────┬─────┐
│ foo ┆ bar ┆ ham │
│ --- ┆ --- ┆ --- │
│ i64 ┆ i64 ┆ str │
╞═════╪═════╪═════╡
│ 2 ┆ 7 ┆ b │
│ 3 ┆ 8 ┆ c │
└─────┴─────┴─────┘
>>> agnostic_filter(lf_dask)
foo bar ham
1 2 7 b
2 3 8 c
Filter on multiple conditions:
>>> def agnostic_filter(df_native: IntoFrameT) -> IntoFrameT:
... df = nw.from_native(df_native)
... return (
... df.filter((nw.col("foo") < 3) & (nw.col("ham") == "a"))
... .collect()
... .to_native()
... )
>>> agnostic_filter(lf_pl)
shape: (1, 3)
┌─────┬─────┬─────┐
│ foo ┆ bar ┆ ham │
│ --- ┆ --- ┆ --- │
│ i64 ┆ i64 ┆ str │
╞═════╪═════╪═════╡
│ 1 ┆ 6 ┆ a │
└─────┴─────┴─────┘
>>> agnostic_filter(lf_dask)
foo bar ham
0 1 6 a
Provide multiple filters using *args syntax:
>>> def agnostic_filter(df_native: IntoFrameT) -> IntoFrameT:
... df = nw.from_native(df_native)
... return (
... df.filter(
... nw.col("foo") == 1,
... nw.col("ham") == "a",
... )
... .collect()
... .to_native()
... )
>>> agnostic_filter(lf_pl)
shape: (1, 3)
┌─────┬─────┬─────┐
│ foo ┆ bar ┆ ham │
│ --- ┆ --- ┆ --- │
│ i64 ┆ i64 ┆ str │
╞═════╪═════╪═════╡
│ 1 ┆ 6 ┆ a │
└─────┴─────┴─────┘
>>> agnostic_filter(lf_dask)
foo bar ham
0 1 6 a
Filter on an OR condition:
>>> def agnostic_filter(df_native: IntoFrameT) -> IntoFrameT:
... df = nw.from_native(df_native)
... return (
... df.filter((nw.col("foo") == 1) | (nw.col("ham") == "c"))
... .collect()
... .to_native()
... )
>>> agnostic_filter(lf_pl)
shape: (2, 3)
┌─────┬─────┬─────┐
│ foo ┆ bar ┆ ham │
│ --- ┆ --- ┆ --- │
│ i64 ┆ i64 ┆ str │
╞═════╪═════╪═════╡
│ 1 ┆ 6 ┆ a │
│ 3 ┆ 8 ┆ c │
└─────┴─────┴─────┘
>>> agnostic_filter(lf_dask)
foo bar ham
0 1 6 a
2 3 8 c
Provide multiple filters using **kwargs syntax:
>>> def agnostic_filter(df_native: IntoFrameT) -> IntoFrameT:
... df = nw.from_native(df_native)
... return df.filter(foo=2, ham="b").collect().to_native()
>>> agnostic_filter(lf_pl)
shape: (1, 3)
┌─────┬─────┬─────┐
│ foo ┆ bar ┆ ham │
│ --- ┆ --- ┆ --- │
│ i64 ┆ i64 ┆ str │
╞═════╪═════╪═════╡
│ 2 ┆ 7 ┆ b │
└─────┴─────┴─────┘
>>> agnostic_filter(lf_dask)
foo bar ham
1 2 7 b
gather_every(n, offset=0)
Take every nth row in the LazyFrame and return as a new LazyFrame.
Parameters:

Name | Type | Description | Default
---|---|---|---
n | `int` | Gather every n-th row. | required
offset | `int` | Starting index. | 0

Returns:

Type | Description
---|---
`Self` | The LazyFrame containing only the selected rows.
Examples:
>>> import narwhals as nw
>>> import polars as pl
>>> import dask.dataframe as dd
>>> from narwhals.typing import IntoFrameT
>>>
>>> data = {"a": [1, 2, 3, 4], "b": [5, 6, 7, 8]}
>>> lf_pl = pl.LazyFrame(data)
>>> lf_dask = dd.from_dict(data, npartitions=2)
Let's define a dataframe-agnostic function in which we gather every 2 rows, starting from an offset of 1:
>>> def agnostic_gather_every(df_native: IntoFrameT) -> IntoFrameT:
... df = nw.from_native(df_native)
... return df.gather_every(n=2, offset=1).collect().to_native()
We can then pass any supported library such as Polars or Dask to agnostic_gather_every:
>>> agnostic_gather_every(lf_pl)
shape: (2, 2)
┌─────┬─────┐
│ a ┆ b │
│ --- ┆ --- │
│ i64 ┆ i64 │
╞═════╪═════╡
│ 2 ┆ 6 │
│ 4 ┆ 8 │
└─────┴─────┘
>>> agnostic_gather_every(lf_dask)
a b
1 2 6
3 4 8
group_by(*keys, drop_null_keys=False)
Start a group by operation.
Parameters:

Name | Type | Description | Default
---|---|---|---
*keys | `str | Iterable[str]` | Column(s) to group by. Accepts expression input. Strings are parsed as column names. | ()
drop_null_keys | `bool` | If True, then groups where any key is null won't be included in the result. | False

Returns:

Type | Description
---|---
`LazyGroupBy[Self]` | Object which can be used to perform aggregations.
Examples:
Group by one column and call agg to compute the grouped sum of another column.
>>> import polars as pl
>>> import dask.dataframe as dd
>>> import narwhals as nw
>>> from narwhals.typing import IntoFrameT
>>>
>>> data = {
... "a": ["a", "b", "a", "b", "c"],
... "b": [1, 2, 1, 3, 3],
... "c": [5, 4, 3, 2, 1],
... }
>>> lf_pl = pl.LazyFrame(data)
>>> lf_dask = dd.from_dict(data, npartitions=2)
Let's define a dataframe-agnostic function in which we group by one column and call agg to compute the grouped sum of another column.
>>> def agnostic_group_by_agg(df_native: IntoFrameT) -> IntoFrameT:
... df = nw.from_native(df_native)
... return (
... df.group_by("a")
... .agg(nw.col("b").sum())
... .sort("a")
... .collect()
... .to_native()
... )
We can then pass any supported library such as Polars or Dask to agnostic_group_by_agg:
>>> agnostic_group_by_agg(lf_pl)
shape: (3, 2)
┌─────┬─────┐
│ a ┆ b │
│ --- ┆ --- │
│ str ┆ i64 │
╞═════╪═════╡
│ a ┆ 2 │
│ b ┆ 5 │
│ c ┆ 3 │
└─────┴─────┘
>>> agnostic_group_by_agg(lf_dask)
a b
0 a 2
1 b 5
2 c 3
Group by multiple columns by passing a list of column names.
>>> def agnostic_group_by_agg(df_native: IntoFrameT) -> IntoFrameT:
... df = nw.from_native(df_native)
... return (
... df.group_by(["a", "b"])
... .agg(nw.max("c"))
... .sort(["a", "b"])
... .collect()
... .to_native()
... )
>>> agnostic_group_by_agg(lf_pl)
shape: (4, 3)
┌─────┬─────┬─────┐
│ a ┆ b ┆ c │
│ --- ┆ --- ┆ --- │
│ str ┆ i64 ┆ i64 │
╞═════╪═════╪═════╡
│ a ┆ 1 ┆ 5 │
│ b ┆ 2 ┆ 4 │
│ b ┆ 3 ┆ 2 │
│ c ┆ 3 ┆ 1 │
└─────┴─────┴─────┘
>>> agnostic_group_by_agg(lf_dask)
a b c
0 a 1 5
1 b 2 4
2 b 3 2
3 c 3 1
head(n=5)
Get the first n rows.
Parameters:

Name | Type | Description | Default
---|---|---|---
n | `int` | Number of rows to return. | 5

Returns:

Type | Description
---|---
`Self` | A subset of the LazyFrame of shape (n, n_columns).
Examples:
>>> import narwhals as nw
>>> import polars as pl
>>> import dask.dataframe as dd
>>> from narwhals.typing import IntoFrameT
>>>
>>> data = {
... "a": [1, 2, 3, 4, 5, 6],
... "b": [7, 8, 9, 10, 11, 12],
... }
>>> lf_pl = pl.LazyFrame(data)
>>> lf_dask = dd.from_dict(data, npartitions=2)
Let's define a dataframe-agnostic function that gets the first 3 rows.
>>> def agnostic_head(df_native: IntoFrameT) -> IntoFrameT:
... df = nw.from_native(df_native)
... return df.head(3).collect().to_native()
We can then pass any supported library such as Polars or Dask to agnostic_head:
>>> agnostic_head(lf_pl)
shape: (3, 2)
┌─────┬─────┐
│ a ┆ b │
│ --- ┆ --- │
│ i64 ┆ i64 │
╞═════╪═════╡
│ 1 ┆ 7 │
│ 2 ┆ 8 │
│ 3 ┆ 9 │
└─────┴─────┘
>>> agnostic_head(lf_dask)
a b
0 1 7
1 2 8
2 3 9
join(other, on=None, how='inner', *, left_on=None, right_on=None, suffix='_right')
Add a join operation to the Logical Plan.
Parameters:

Name | Type | Description | Default
---|---|---|---
other | `Self` | Lazy DataFrame to join with. | required
on | `str | list[str] | None` | Name(s) of the join columns in both DataFrames. If set, left_on and right_on should be None. | None
how | `Literal['inner', 'left', 'cross', 'semi', 'anti']` | Join strategy. | 'inner'
left_on | `str | list[str] | None` | Join column of the left DataFrame. | None
right_on | `str | list[str] | None` | Join column of the right DataFrame. | None
suffix | `str` | Suffix to append to columns with a duplicate name. | '_right'

Returns:

Type | Description
---|---
`Self` | A new joined LazyFrame.
Examples:
>>> import narwhals as nw
>>> import polars as pl
>>> import dask.dataframe as dd
>>> from narwhals.typing import IntoFrameT
>>>
>>> data = {
... "foo": [1, 2, 3],
... "bar": [6.0, 7.0, 8.0],
... "ham": ["a", "b", "c"],
... }
>>> data_other = {
... "apple": ["x", "y", "z"],
... "ham": ["a", "b", "d"],
... }
>>> lf_pl = pl.LazyFrame(data)
>>> other_pl = pl.LazyFrame(data_other)
>>> lf_dask = dd.from_dict(data, npartitions=2)
>>> other_dask = dd.from_dict(data_other, npartitions=2)
Let's define a dataframe-agnostic function in which we join on the "ham" column:
>>> def agnostic_join_on_ham(
... df_native: IntoFrameT,
... other_native: IntoFrameT,
... ) -> IntoFrameT:
... df = nw.from_native(df_native)
... other = nw.from_native(other_native)
... return (
... df.join(other, left_on="ham", right_on="ham")
... .sort("ham")
... .collect()
... .to_native()
... )
We can then pass any supported library such as Polars or Dask to agnostic_join_on_ham:
>>> agnostic_join_on_ham(lf_pl, other_pl)
shape: (2, 4)
┌─────┬─────┬─────┬───────┐
│ foo ┆ bar ┆ ham ┆ apple │
│ --- ┆ --- ┆ --- ┆ --- │
│ i64 ┆ f64 ┆ str ┆ str │
╞═════╪═════╪═════╪═══════╡
│ 1 ┆ 6.0 ┆ a ┆ x │
│ 2 ┆ 7.0 ┆ b ┆ y │
└─────┴─────┴─────┴───────┘
>>> agnostic_join_on_ham(lf_dask, other_dask)
foo bar ham apple
0 1 6.0 a x
0 2 7.0 b y
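Other strategies follow the same pattern; for example, an anti-join keeps only the rows whose key has no match in the other frame (a minimal sketch reusing the frames defined above):
>>> def agnostic_anti_join_on_ham(
...     df_native: IntoFrameT,
...     other_native: IntoFrameT,
... ) -> IntoFrameT:
...     df = nw.from_native(df_native)
...     other = nw.from_native(other_native)
...     return df.join(other, on="ham", how="anti").collect().to_native()
>>> agnostic_anti_join_on_ham(lf_pl, other_pl)
shape: (1, 3)
┌─────┬─────┬─────┐
│ foo ┆ bar ┆ ham │
│ --- ┆ --- ┆ --- │
│ i64 ┆ f64 ┆ str │
╞═════╪═════╪═════╡
│ 3 ┆ 8.0 ┆ c │
└─────┴─────┴─────┘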
join_asof(other, *, left_on=None, right_on=None, on=None, by_left=None, by_right=None, by=None, strategy='backward')
Perform an asof join.
This is similar to a left-join except that we match on nearest key rather than equal keys.
Both DataFrames must be sorted by the asof join key.
Parameters:

Name | Type | Description | Default
---|---|---|---
other | `Self` | DataFrame to join with. | required
left_on | `str | None` | Name(s) of the left join column(s). | None
right_on | `str | None` | Name(s) of the right join column(s). | None
on | `str | None` | Join column of both DataFrames. If set, left_on and right_on should be None. | None
by_left | `str | list[str] | None` | Join on these columns before doing the asof join. | None
by_right | `str | list[str] | None` | Join on these columns before doing the asof join. | None
by | `str | list[str] | None` | Join on these columns before doing the asof join. | None
strategy | `Literal['backward', 'forward', 'nearest']` | Join strategy. The default is "backward". | 'backward'

Returns:

Type | Description
---|---
`Self` | A new joined LazyFrame.
Examples:
>>> from datetime import datetime
>>> import narwhals as nw
>>> import polars as pl
>>> import dask.dataframe as dd
>>> from typing import Literal
>>> from narwhals.typing import IntoFrameT
>>>
>>> data_gdp = {
... "datetime": [
... datetime(2016, 1, 1),
... datetime(2017, 1, 1),
... datetime(2018, 1, 1),
... datetime(2019, 1, 1),
... datetime(2020, 1, 1),
... ],
... "gdp": [4164, 4411, 4566, 4696, 4827],
... }
>>> data_population = {
... "datetime": [
... datetime(2016, 3, 1),
... datetime(2018, 8, 1),
... datetime(2019, 1, 1),
... ],
... "population": [82.19, 82.66, 83.12],
... }
>>> gdp_pl = pl.LazyFrame(data_gdp)
>>> population_pl = pl.LazyFrame(data_population)
>>> gdp_dask = dd.from_dict(data_gdp, npartitions=2)
>>> population_dask = dd.from_dict(data_population, npartitions=2)
Let's define a dataframe-agnostic function in which we join on the "datetime" column:
>>> def agnostic_join_asof_datetime(
... df_native: IntoFrameT,
... other_native: IntoFrameT,
... strategy: Literal["backward", "forward", "nearest"],
... ) -> IntoFrameT:
... df = nw.from_native(df_native)
... other = nw.from_native(other_native)
... return (
... df.sort("datetime")
... .join_asof(other, on="datetime", strategy=strategy)
... .collect()
... .to_native()
... )
We can then pass any supported library such as Polars or Dask to agnostic_join_asof_datetime:
>>> agnostic_join_asof_datetime(population_pl, gdp_pl, strategy="backward")
shape: (3, 3)
┌─────────────────────┬────────────┬──────┐
│ datetime ┆ population ┆ gdp │
│ --- ┆ --- ┆ --- │
│ datetime[μs] ┆ f64 ┆ i64 │
╞═════════════════════╪════════════╪══════╡
│ 2016-03-01 00:00:00 ┆ 82.19 ┆ 4164 │
│ 2018-08-01 00:00:00 ┆ 82.66 ┆ 4566 │
│ 2019-01-01 00:00:00 ┆ 83.12 ┆ 4696 │
└─────────────────────┴────────────┴──────┘
>>> agnostic_join_asof_datetime(population_dask, gdp_dask, strategy="backward")
datetime population gdp
0 2016-03-01 82.19 4164
1 2018-08-01 82.66 4566
0 2019-01-01 83.12 4696
Here is a real-world time-series example that uses the by argument.
>>> from datetime import datetime
>>> import narwhals as nw
>>> import polars as pl
>>> import dask.dataframe as dd
>>> from narwhals.typing import IntoFrameT
>>>
>>> data_quotes = {
... "datetime": [
... datetime(2016, 5, 25, 13, 30, 0, 23),
... datetime(2016, 5, 25, 13, 30, 0, 23),
... datetime(2016, 5, 25, 13, 30, 0, 30),
... datetime(2016, 5, 25, 13, 30, 0, 41),
... datetime(2016, 5, 25, 13, 30, 0, 48),
... datetime(2016, 5, 25, 13, 30, 0, 49),
... datetime(2016, 5, 25, 13, 30, 0, 72),
... datetime(2016, 5, 25, 13, 30, 0, 75),
... ],
... "ticker": [
... "GOOG",
... "MSFT",
... "MSFT",
... "MSFT",
... "GOOG",
... "AAPL",
... "GOOG",
... "MSFT",
... ],
... "bid": [720.50, 51.95, 51.97, 51.99, 720.50, 97.99, 720.50, 52.01],
... "ask": [720.93, 51.96, 51.98, 52.00, 720.93, 98.01, 720.88, 52.03],
... }
>>> data_trades = {
... "datetime": [
... datetime(2016, 5, 25, 13, 30, 0, 23),
... datetime(2016, 5, 25, 13, 30, 0, 38),
... datetime(2016, 5, 25, 13, 30, 0, 48),
... datetime(2016, 5, 25, 13, 30, 0, 49),
... datetime(2016, 5, 25, 13, 30, 0, 48),
... ],
... "ticker": ["MSFT", "MSFT", "GOOG", "GOOG", "AAPL"],
... "price": [51.95, 51.95, 720.77, 720.92, 98.0],
... "quantity": [75, 155, 100, 100, 100],
... }
>>> quotes_pl = pl.LazyFrame(data_quotes)
>>> trades_pl = pl.LazyFrame(data_trades)
>>> quotes_dask = dd.from_dict(data_quotes, npartitions=2)
>>> trades_dask = dd.from_dict(data_trades, npartitions=2)
Let's define a dataframe-agnostic function in which we join on the "datetime" column and by the "ticker" column:
>>> def agnostic_join_asof_datetime_by_ticker(
... df_native: IntoFrameT,
... other_native: IntoFrameT,
... ) -> IntoFrameT:
... df = nw.from_native(df_native)
... other = nw.from_native(other_native)
... return (
... df.sort("datetime", "ticker")
... .join_asof(other, on="datetime", by="ticker")
... .sort("datetime", "ticker")
... .collect()
... .to_native()
... )
We can then pass any supported library such as Polars or Dask to agnostic_join_asof_datetime_by_ticker:
>>> agnostic_join_asof_datetime_by_ticker(trades_pl, quotes_pl)
shape: (5, 6)
┌────────────────────────────┬────────┬────────┬──────────┬───────┬────────┐
│ datetime ┆ ticker ┆ price ┆ quantity ┆ bid ┆ ask │
│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
│ datetime[μs] ┆ str ┆ f64 ┆ i64 ┆ f64 ┆ f64 │
╞════════════════════════════╪════════╪════════╪══════════╪═══════╪════════╡
│ 2016-05-25 13:30:00.000023 ┆ MSFT ┆ 51.95 ┆ 75 ┆ 51.95 ┆ 51.96 │
│ 2016-05-25 13:30:00.000038 ┆ MSFT ┆ 51.95 ┆ 155 ┆ 51.97 ┆ 51.98 │
│ 2016-05-25 13:30:00.000048 ┆ AAPL ┆ 98.0 ┆ 100 ┆ null ┆ null │
│ 2016-05-25 13:30:00.000048 ┆ GOOG ┆ 720.77 ┆ 100 ┆ 720.5 ┆ 720.93 │
│ 2016-05-25 13:30:00.000049 ┆ GOOG ┆ 720.92 ┆ 100 ┆ 720.5 ┆ 720.93 │
└────────────────────────────┴────────┴────────┴──────────┴───────┴────────┘
>>> agnostic_join_asof_datetime_by_ticker(trades_dask, quotes_dask)
datetime ticker price quantity bid ask
0 2016-05-25 13:30:00.000023 MSFT 51.95 75 51.95 51.96
0 2016-05-25 13:30:00.000038 MSFT 51.95 155 51.97 51.98
1 2016-05-25 13:30:00.000048 AAPL 98.00 100 NaN NaN
2 2016-05-25 13:30:00.000048 GOOG 720.77 100 720.50 720.93
3 2016-05-25 13:30:00.000049 GOOG 720.92 100 720.50 720.93
lazy()
Lazify the DataFrame (if possible).
If a library does not support lazy execution, then this is a no-op.
Returns:

Type | Description
---|---
`Self` | A LazyFrame.
Examples:
Construct pandas and Polars objects:
>>> import pandas as pd
>>> import polars as pl
>>> import narwhals as nw
>>> from narwhals.typing import IntoFrameT
>>>
>>> df = {"foo": [1, 2, 3], "bar": [6.0, 7.0, 8.0], "ham": ["a", "b", "c"]}
>>> df_pd = pd.DataFrame(df)
>>> lf_pl = pl.LazyFrame(df)
We define a library-agnostic function:
>>> def agnostic_lazy(df_native: IntoFrameT) -> IntoFrameT:
... df = nw.from_native(df_native)
... return df.lazy().to_native()
Note that the pandas DataFrame stays eager, while the Polars LazyFrame stays lazy:
>>> agnostic_lazy(df_pd)
foo bar ham
0 1 6.0 a
1 2 7.0 b
2 3 8.0 c
>>> agnostic_lazy(lf_pl)
<LazyFrame ...>
pipe(function, *args, **kwargs)
Pipe function call.
Parameters:

Name | Type | Description | Default
---|---|---|---
function | `Callable[[Any], Self]` | Function to apply. | required
args | `Any` | Positional arguments to pass to function. | ()
kwargs | `Any` | Keyword arguments to pass to function. | {}

Returns:

Type | Description
---|---
`Self` | The original object with the function applied.
Examples:
>>> import polars as pl
>>> import dask.dataframe as dd
>>> import narwhals as nw
>>> from narwhals.typing import IntoFrameT
>>>
>>> data = {"a": [1, 2, 3], "ba": [4, 5, 6]}
>>> lf_pl = pl.LazyFrame(data)
>>> lf_dask = dd.from_dict(data, npartitions=2)
Let's define a dataframe-agnostic function:
>>> def agnostic_pipe(df_native: IntoFrameT) -> IntoFrameT:
... df = nw.from_native(df_native)
... return df.pipe(lambda _df: _df.select("a")).collect().to_native()
We can then pass any supported library such as Polars or Dask to agnostic_pipe:
>>> agnostic_pipe(lf_pl)
shape: (3, 1)
┌─────┐
│ a │
│ --- │
│ i64 │
╞═════╡
│ 1 │
│ 2 │
│ 3 │
└─────┘
>>> agnostic_pipe(lf_dask)
a
0 1
1 2
2 3
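Extra positional and keyword arguments are forwarded to the function (a minimal sketch; the column_name parameter is illustrative):
>>> def agnostic_pipe_with_args(df_native: IntoFrameT) -> IntoFrameT:
...     df = nw.from_native(df_native)
...     return (
...         df.pipe(lambda _df, column_name: _df.select(column_name), column_name="a")
...         .collect()
...         .to_native()
...     )
>>> agnostic_pipe_with_args(lf_dask)
a
0 1
1 2
2 3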
rename(mapping)
Rename column names.
Parameters:

Name | Type | Description | Default
---|---|---|---
mapping | `dict[str, str]` | Key value pairs that map from old name to new name, or a function that takes the old name as input and returns the new name. | required

Returns:

Type | Description
---|---
`Self` | The LazyFrame with the specified columns renamed.
Examples:
>>> import polars as pl
>>> import dask.dataframe as dd
>>> import narwhals as nw
>>> from narwhals.typing import IntoFrameT
>>>
>>> data = {"foo": [1, 2, 3], "bar": [6, 7, 8], "ham": ["a", "b", "c"]}
>>> lf_pl = pl.LazyFrame(data)
>>> lf_dask = dd.from_dict(data, npartitions=2)
We define a library-agnostic function:
>>> def agnostic_rename(df_native: IntoFrameT) -> IntoFrameT:
... df = nw.from_native(df_native)
... return df.rename({"foo": "apple"}).collect().to_native()
We can then pass any supported library such as Polars or Dask to agnostic_rename:
>>> agnostic_rename(lf_pl)
shape: (3, 3)
┌───────┬─────┬─────┐
│ apple ┆ bar ┆ ham │
│ --- ┆ --- ┆ --- │
│ i64 ┆ i64 ┆ str │
╞═══════╪═════╪═════╡
│ 1 ┆ 6 ┆ a │
│ 2 ┆ 7 ┆ b │
│ 3 ┆ 8 ┆ c │
└───────┴─────┴─────┘
>>> agnostic_rename(lf_dask)
apple bar ham
0 1 6 a
1 2 7 b
2 3 8 c
select(*exprs, **named_exprs)
Select columns from this LazyFrame.
Parameters:

Name | Type | Description | Default
---|---|---|---
*exprs | `IntoExpr | Iterable[IntoExpr]` | Column(s) to select, specified as positional arguments. Accepts expression input. Strings are parsed as column names. | ()
**named_exprs | `IntoExpr` | Additional columns to select, specified as keyword arguments. The columns will be renamed to the keyword used. | {}

Returns:

Type | Description
---|---
`Self` | The LazyFrame containing only the selected columns.
Notes
If you'd like to select a column whose name isn't a string (for example, if you're working with pandas) then you should explicitly use nw.col instead of just passing the column name. For example, to select a column named 0, use df.select(nw.col(0)), not df.select(0).
Examples:
>>> import polars as pl
>>> import dask.dataframe as dd
>>> import narwhals as nw
>>> from narwhals.typing import IntoFrameT
>>>
>>> data = {
... "foo": [1, 2, 3],
... "bar": [6, 7, 8],
... "ham": ["a", "b", "c"],
... }
>>> lf_pl = pl.LazyFrame(data)
>>> lf_dask = dd.from_dict(data, npartitions=2)
Let's define a dataframe-agnostic function in which we pass the name of a column to select that column.
>>> def agnostic_select(df_native: IntoFrameT) -> IntoFrameT:
... df = nw.from_native(df_native)
... return df.select("foo").collect().to_native()
We can then pass any supported library such as Polars or Dask to agnostic_select:
>>> agnostic_select(lf_pl)
shape: (3, 1)
┌─────┐
│ foo │
│ --- │
│ i64 │
╞═════╡
│ 1 │
│ 2 │
│ 3 │
└─────┘
>>> agnostic_select(lf_dask)
foo
0 1
1 2
2 3
Multiple columns can be selected by passing a list of column names.
>>> def agnostic_select(df_native: IntoFrameT) -> IntoFrameT:
... df = nw.from_native(df_native)
... return df.select(["foo", "bar"]).collect().to_native()
>>> agnostic_select(lf_pl)
shape: (3, 2)
┌─────┬─────┐
│ foo ┆ bar │
│ --- ┆ --- │
│ i64 ┆ i64 │
╞═════╪═════╡
│ 1 ┆ 6 │
│ 2 ┆ 7 │
│ 3 ┆ 8 │
└─────┴─────┘
>>> agnostic_select(lf_dask)
foo bar
0 1 6
1 2 7
2 3 8
Multiple columns can also be selected using positional arguments instead of a list. Expressions are also accepted.
>>> def agnostic_select(df_native: IntoFrameT) -> IntoFrameT:
... df = nw.from_native(df_native)
... return df.select(nw.col("foo"), nw.col("bar") + 1).collect().to_native()
>>> agnostic_select(lf_pl)
shape: (3, 2)
┌─────┬─────┐
│ foo ┆ bar │
│ --- ┆ --- │
│ i64 ┆ i64 │
╞═════╪═════╡
│ 1 ┆ 7 │
│ 2 ┆ 8 │
│ 3 ┆ 9 │
└─────┴─────┘
>>> agnostic_select(lf_dask)
foo bar
0 1 7
1 2 8
2 3 9
Use keyword arguments to easily name your expression inputs.
>>> def agnostic_select(df_native: IntoFrameT) -> IntoFrameT:
... df = nw.from_native(df_native)
... return df.select(threshold=nw.col("foo") * 2).collect().to_native()
>>> agnostic_select(lf_pl)
shape: (3, 1)
┌───────────┐
│ threshold │
│ --- │
│ i64 │
╞═══════════╡
│ 2 │
│ 4 │
│ 6 │
└───────────┘
>>> agnostic_select(lf_dask)
threshold
0 2
1 4
2 6
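As mentioned in the notes above, a column whose name isn't a string can be selected via nw.col. A minimal, hypothetical sketch with a pandas DataFrame whose only column is named 0:
>>> import pandas as pd
>>> df_pd = pd.DataFrame({0: [1, 2, 3]})
>>> nw.from_native(df_pd).select(nw.col(0)).to_native()
0
0 1
1 2
2 3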
sort(by, *more_by, descending=False, nulls_last=False)
Sort the LazyFrame by the given columns.
Parameters:

Name | Type | Description | Default
---|---|---|---
by | `str | Iterable[str]` | Name(s) of the column(s) to sort by. | required
*more_by | `str` | Additional columns to sort by, specified as positional arguments. | ()
descending | `bool | Sequence[bool]` | Sort in descending order. When sorting by multiple columns, can be specified per column by passing a sequence of booleans. | False
nulls_last | `bool` | Place null values last. A single boolean is applied to all sort columns (see the warning below). | False

Returns:

Type | Description
---|---
`Self` | The sorted LazyFrame.
Warning
Unlike Polars, it is not possible to specify a sequence of booleans for nulls_last in order to control per-column behaviour. Instead, a single boolean is applied for all by columns.
Examples:
>>> import narwhals as nw
>>> import polars as pl
>>> import dask.dataframe as dd
>>> from narwhals.typing import IntoFrameT
>>>
>>> data = {
... "a": [1, 2, None],
... "b": [6.0, 5.0, 4.0],
... "c": ["a", "c", "b"],
... }
>>> lf_pl = pl.LazyFrame(data)
>>> lf_dask = dd.from_dict(data, npartitions=2)
Let's define a dataframe-agnostic function in which we sort by multiple columns in different orders:
>>> def agnostic_sort(df_native: IntoFrameT) -> IntoFrameT:
... df = nw.from_native(df_native)
... return df.sort("c", "a", descending=[False, True]).collect().to_native()
We can then pass any supported library such as Polars or Dask to agnostic_sort:
>>> agnostic_sort(lf_pl)
shape: (3, 3)
┌──────┬─────┬─────┐
│ a ┆ b ┆ c │
│ --- ┆ --- ┆ --- │
│ i64 ┆ f64 ┆ str │
╞══════╪═════╪═════╡
│ 1 ┆ 6.0 ┆ a │
│ null ┆ 4.0 ┆ b │
│ 2 ┆ 5.0 ┆ c │
└──────┴─────┴─────┘
>>> agnostic_sort(lf_dask)
a b c
0 1.0 6.0 a
2 NaN 4.0 b
1 2.0 5.0 c
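nulls_last accepts a single boolean applied to all sort columns (a minimal sketch reusing the frames above):
>>> def agnostic_sort_nulls_last(df_native: IntoFrameT) -> IntoFrameT:
...     df = nw.from_native(df_native)
...     return df.sort("a", nulls_last=True).collect().to_native()
>>> agnostic_sort_nulls_last(lf_pl)
shape: (3, 3)
┌──────┬─────┬─────┐
│ a ┆ b ┆ c │
│ --- ┆ --- ┆ --- │
│ i64 ┆ f64 ┆ str │
╞══════╪═════╪═════╡
│ 1 ┆ 6.0 ┆ a │
│ 2 ┆ 5.0 ┆ c │
│ null ┆ 4.0 ┆ b │
└──────┴─────┴─────┘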
tail(n=5)
Get the last n rows.
Warning
LazyFrame.tail is deprecated and will be removed in a future version.
Note: this will remain available in narwhals.stable.v1.
See the stable API documentation for more information.
Parameters:

Name | Type | Description | Default
---|---|---|---
n | `int` | Number of rows to return. | 5

Returns:

Type | Description
---|---
`Self` | A subset of the LazyFrame of shape (n, n_columns).
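A minimal sketch, using narwhals.stable.v1 (where tail remains available) and illustrative data:
>>> import polars as pl
>>> import narwhals.stable.v1 as nw_v1
>>> lf = nw_v1.from_native(pl.LazyFrame({"a": [1, 2, 3, 4, 5, 6]}))
>>> lf.tail(2).collect().to_native()
shape: (2, 1)
┌─────┐
│ a │
│ --- │
│ i64 │
╞═════╡
│ 5 │
│ 6 │
└─────┘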
to_native()
Convert a Narwhals LazyFrame to its native counterpart.
Returns:

Type | Description
---|---
`FrameT` | Object of the class that the user started with.
Examples:
>>> import polars as pl
>>> import dask.dataframe as dd
>>> import narwhals as nw
>>>
>>> data = {"foo": [1, 2, 3], "bar": [6.0, 7.0, 8.0], "ham": ["a", "b", "c"]}
>>> lf_pl = pl.LazyFrame(data)
>>> lf_dask = dd.from_dict(data, npartitions=2)
Calling to_native on a Narwhals LazyFrame returns the native object:
>>> nw.from_native(lf_pl).to_native().collect()
shape: (3, 3)
┌─────┬─────┬─────┐
│ foo ┆ bar ┆ ham │
│ --- ┆ --- ┆ --- │
│ i64 ┆ f64 ┆ str │
╞═════╪═════╪═════╡
│ 1 ┆ 6.0 ┆ a │
│ 2 ┆ 7.0 ┆ b │
│ 3 ┆ 8.0 ┆ c │
└─────┴─────┴─────┘
>>> nw.from_native(lf_dask).to_native().compute()
foo bar ham
0 1 6.0 a
1 2 7.0 b
2 3 8.0 c
unique(subset=None, *, keep='any', maintain_order=None)
Drop duplicate rows from this LazyFrame.
Parameters:

Name | Type | Description | Default
---|---|---|---
subset | `str | list[str] | None` | Column name(s) to consider when identifying duplicate rows. If set to None, use all columns. | None
keep | `Literal['any', 'none']` | Which of the duplicate rows to keep: 'any' keeps one (unspecified) row per duplicate group, 'none' drops all rows that have duplicates. | 'any'
maintain_order | `bool | None` | Has no effect and is kept around only for backwards-compatibility. | None

Returns:

Type | Description
---|---
`Self` | The LazyFrame with unique rows.
Examples:
>>> import polars as pl
>>> import dask.dataframe as dd
>>> import narwhals as nw
>>> from narwhals.typing import IntoFrameT
>>>
>>> data = {
... "foo": [1, 2, 3, 1],
... "bar": ["a", "a", "a", "a"],
... "ham": ["b", "b", "b", "b"],
... }
>>> lf_pl = pl.LazyFrame(data)
>>> lf_dask = dd.from_dict(data, npartitions=2)
We define a library-agnostic function:
>>> def agnostic_unique(df_native: IntoFrameT) -> IntoFrameT:
... df = nw.from_native(df_native)
... return df.unique(["bar", "ham"]).collect().to_native()
We can then pass any supported library such as Polars or Dask to agnostic_unique:
>>> agnostic_unique(lf_pl)
shape: (1, 3)
┌─────┬─────┬─────┐
│ foo ┆ bar ┆ ham │
│ --- ┆ --- ┆ --- │
│ i64 ┆ str ┆ str │
╞═════╪═════╪═════╡
│ 1 ┆ a ┆ b │
└─────┴─────┴─────┘
>>> agnostic_unique(lf_dask)
foo bar ham
0 1 a b
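Passing keep="none" instead drops every row that has a duplicate (a minimal sketch reusing the frames above):
>>> def agnostic_unique_keep_none(df_native: IntoFrameT) -> IntoFrameT:
...     df = nw.from_native(df_native)
...     return df.unique("foo", keep="none").sort("foo").collect().to_native()
>>> agnostic_unique_keep_none(lf_pl)
shape: (2, 3)
┌─────┬─────┬─────┐
│ foo ┆ bar ┆ ham │
│ --- ┆ --- ┆ --- │
│ i64 ┆ str ┆ str │
╞═════╪═════╪═════╡
│ 2 ┆ a ┆ b │
│ 3 ┆ a ┆ b │
└─────┴─────┴─────┘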
unpivot(on=None, *, index=None, variable_name=None, value_name=None)
Unpivot a DataFrame from wide to long format.
Optionally leaves identifiers set.
This function is useful to massage a DataFrame into a format where one or more columns are identifier variables (index) while all other columns, considered measured variables (on), are "unpivoted" to the row axis leaving just two non-identifier columns, 'variable' and 'value'.
Parameters:

Name | Type | Description | Default
---|---|---|---
on | `str | list[str] | None` | Column(s) to use as values variables; if None, all columns that are not in index will be used. | None
index | `str | list[str] | None` | Column(s) to use as identifier variables. | None
variable_name | `str | None` | Name to give to the variable column. Defaults to "variable". | None
value_name | `str | None` | Name to give to the value column. Defaults to "value". | None

Returns:

Type | Description
---|---
`Self` | The unpivoted LazyFrame.
Notes
If you're coming from pandas, this is similar to pandas.DataFrame.melt, but with index replacing id_vars and on replacing value_vars. In other frameworks, you might know this operation as pivot_longer.
Examples:
>>> import narwhals as nw
>>> import polars as pl
>>> import dask.dataframe as dd
>>> from narwhals.typing import IntoFrameT
>>>
>>> data = {
... "a": ["x", "y", "z"],
... "b": [1, 3, 5],
... "c": [2, 4, 6],
... }
>>> lf_pl = pl.LazyFrame(data)
>>> lf_dask = dd.from_dict(data, npartitions=2)
We define a library-agnostic function:
>>> def agnostic_unpivot(df_native: IntoFrameT) -> IntoFrameT:
... df = nw.from_native(df_native)
... return (
... (df.unpivot(on=["b", "c"], index="a").sort(["variable", "a"]))
... .collect()
... .to_native()
... )
We can then pass any supported library such as Polars or Dask to agnostic_unpivot:
>>> agnostic_unpivot(lf_pl)
shape: (6, 3)
┌─────┬──────────┬───────┐
│ a ┆ variable ┆ value │
│ --- ┆ --- ┆ --- │
│ str ┆ str ┆ i64 │
╞═════╪══════════╪═══════╡
│ x ┆ b ┆ 1 │
│ y ┆ b ┆ 3 │
│ z ┆ b ┆ 5 │
│ x ┆ c ┆ 2 │
│ y ┆ c ┆ 4 │
│ z ┆ c ┆ 6 │
└─────┴──────────┴───────┘
>>> agnostic_unpivot(lf_dask)
a variable value
0 x b 1
1 y b 3
0 z b 5
2 x c 2
3 y c 4
1 z c 6
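The names of the output columns can be customised with variable_name and value_name (a minimal sketch reusing the frames above; "measure" and "reading" are illustrative names):
>>> def agnostic_unpivot_named(df_native: IntoFrameT) -> IntoFrameT:
...     df = nw.from_native(df_native)
...     return (
...         df.unpivot(on=["b", "c"], index="a", variable_name="measure", value_name="reading")
...         .sort(["measure", "a"])
...         .collect()
...         .to_native()
...     )
>>> agnostic_unpivot_named(lf_pl)
shape: (6, 3)
┌─────┬─────────┬─────────┐
│ a ┆ measure ┆ reading │
│ --- ┆ --- ┆ --- │
│ str ┆ str ┆ i64 │
╞═════╪═════════╪═════════╡
│ x ┆ b ┆ 1 │
│ y ┆ b ┆ 3 │
│ z ┆ b ┆ 5 │
│ x ┆ c ┆ 2 │
│ y ┆ c ┆ 4 │
│ z ┆ c ┆ 6 │
└─────┴─────────┴─────────┘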
with_columns(*exprs, **named_exprs)
Add columns to this LazyFrame.
Added columns will replace existing columns with the same name.
Parameters:

Name | Type | Description | Default
---|---|---|---
*exprs | `IntoExpr | Iterable[IntoExpr]` | Column(s) to add, specified as positional arguments. Accepts expression input. Strings are parsed as column names, other non-expression inputs are parsed as literals. | ()
**named_exprs | `IntoExpr` | Additional columns to add, specified as keyword arguments. The columns will be renamed to the keyword used. | {}

Returns:

Type | Description
---|---
`Self` | A new LazyFrame with the columns added.
Note
Creating a new LazyFrame using this method does not create a new copy of existing data.
Examples:
>>> import polars as pl
>>> import dask.dataframe as dd
>>> import narwhals as nw
>>> from narwhals.typing import IntoFrameT
>>>
>>> data = {
... "a": [1, 2, 3, 4],
... "b": [0.5, 4, 10, 13],
... "c": [True, True, False, True],
... }
>>> lf_pl = pl.LazyFrame(data)
>>> lf_dask = dd.from_dict(data, npartitions=2)
Let's define a dataframe-agnostic function in which we pass an expression to add it as a new column:
>>> def agnostic_with_columns(df_native: IntoFrameT) -> IntoFrameT:
... df = nw.from_native(df_native)
... return (
... df.with_columns((nw.col("a") * 2).alias("2a")).collect().to_native()
... )
We can then pass any supported library such as Polars or Dask to agnostic_with_columns:
>>> agnostic_with_columns(lf_pl)
shape: (4, 4)
┌─────┬──────┬───────┬─────┐
│ a ┆ b ┆ c ┆ 2a │
│ --- ┆ --- ┆ --- ┆ --- │
│ i64 ┆ f64 ┆ bool ┆ i64 │
╞═════╪══════╪═══════╪═════╡
│ 1 ┆ 0.5 ┆ true ┆ 2 │
│ 2 ┆ 4.0 ┆ true ┆ 4 │
│ 3 ┆ 10.0 ┆ false ┆ 6 │
│ 4 ┆ 13.0 ┆ true ┆ 8 │
└─────┴──────┴───────┴─────┘
>>> agnostic_with_columns(lf_dask)
a b c 2a
0 1 0.5 True 2
1 2 4.0 True 4
2 3 10.0 False 6
3 4 13.0 True 8
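Keyword arguments can be used to name the added column directly (a minimal sketch reusing the frames above; "a_doubled" is an illustrative name):
>>> def agnostic_with_columns_named(df_native: IntoFrameT) -> IntoFrameT:
...     df = nw.from_native(df_native)
...     return df.with_columns(a_doubled=nw.col("a") * 2).collect().to_native()
>>> agnostic_with_columns_named(lf_dask)
a b c a_doubled
0 1 0.5 True 2
1 2 4.0 True 4
2 3 10.0 False 6
3 4 13.0 True 8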
with_row_index(name='index')
Insert a column which enumerates rows.
Parameters:

Name | Type | Description | Default
---|---|---|---
name | `str` | The name of the column as a string. The default is "index". | 'index'

Returns:

Type | Description
---|---
`Self` | The original object with the column added.
Examples:
>>> import polars as pl
>>> import dask.dataframe as dd
>>> import narwhals as nw
>>> from narwhals.typing import IntoFrameT
>>>
>>> data = {"a": [1, 2, 3], "b": [4, 5, 6]}
>>> lf_pl = pl.LazyFrame(data)
>>> lf_dask = dd.from_dict(data, npartitions=2)
Let's define a dataframe-agnostic function:
>>> def agnostic_with_row_index(df_native: IntoFrameT) -> IntoFrameT:
... df = nw.from_native(df_native)
... return df.with_row_index().collect().to_native()
We can then pass any supported library such as Polars or Dask to agnostic_with_row_index:
>>> agnostic_with_row_index(lf_pl)
shape: (3, 3)
┌───────┬─────┬─────┐
│ index ┆ a ┆ b │
│ --- ┆ --- ┆ --- │
│ u32 ┆ i64 ┆ i64 │
╞═══════╪═════╪═════╡
│ 0 ┆ 1 ┆ 4 │
│ 1 ┆ 2 ┆ 5 │
│ 2 ┆ 3 ┆ 6 │
└───────┴─────┴─────┘
>>> agnostic_with_row_index(lf_dask)
index a b
0 0 1 4
1 1 2 5
2 2 3 6