Null/NaN handling
pandas doesn't distinguish between null and NaN values the way Polars and PyArrow do.
Depending on the data type of the underlying data structure, `np.nan`, `pd.NaT`, `None` and `pd.NA` all encode missing data in pandas.
Polars and PyArrow, instead, treat `NaN` as a valid floating-point value, one that is rarely encountered and is more often produced as the result of a computation than set explicitly during data initialization; they treat `null` as the missing data indicator, regardless of the data type.
In Narwhals, then, `is_null` behaves differently across backends (and so do `drop_nulls`, `fill_null` and `null_count`):
import narwhals as nw
import numpy as np
from narwhals.typing import IntoFrameT
# Sample column mixing ordinary floats, two NaN values (float("nan") and
# np.nan) and one missing value (None).
data = {"a": [1.4, float("nan"), np.nan, 4.2, None]}
def check_null_behavior(df: IntoFrameT) -> IntoFrameT:
    """Add a boolean ``a_is_null`` column flagging the null entries of ``a``.

    The frame is wrapped with ``nw.from_native`` so the same code runs on
    every backend shown in this page, and the result is handed back via
    ``to_native``. Which entries count as null is decided by the backend.
    """
    return nw.from_native(df).with_columns(a_is_null=nw.col("a").is_null()).to_native()
import pandas as pd
# pandas backend: the two NaN entries and the None entry are all reported as
# null (see the printed output).
df = pd.DataFrame(data)
print(check_null_behavior(df))
a a_is_null
0 1.4 False
1 NaN True
2 NaN True
3 4.2 False
4 NaN True
import polars as pl
# Polars backend: only the None entry (shown as null) is reported as null;
# the NaN entries are not.
df = pl.DataFrame(data)
print(check_null_behavior(df))
shape: (5, 2)
┌──────┬───────────┐
│ a ┆ a_is_null │
│ --- ┆ --- │
│ f64 ┆ bool │
╞══════╪═══════════╡
│ 1.4 ┆ false │
│ NaN ┆ false │
│ NaN ┆ false │
│ 4.2 ┆ false │
│ null ┆ true │
└──────┴───────────┘
import pyarrow as pa
# PyArrow backend: only the None entry (shown as null) is reported as null;
# the NaN entries are not.
df = pa.table(data)
print(check_null_behavior(df))
pyarrow.Table
a: double
a_is_null: bool
----
a: [[1.4,nan,nan,4.2,null]]
a_is_null: [[false,false,false,false,true]]
Conversely, `is_nan` is consistent across backends. This consistency comes from Narwhals relying on the native `is_nan` implementations in Polars and PyArrow, while ensuring that pandas only identifies the floating-point NaN values and not those encoding the missing value indicator.
import narwhals as nw
from narwhals.typing import IntoFrameT
# 0.0 divided by itself yields NaN, None is a missing value, and 2.0 divided
# by itself yields an ordinary 1.0.
data = {"a": [0.0, None, 2.0]}
def check_nan_behavior(df: IntoFrameT) -> IntoFrameT:
    """Add ``a_div_a`` (``a / a``) and ``a_div_a_is_nan`` columns to ``df``.

    Dividing column ``a`` by itself produces NaN where ``a`` is 0.0, which
    gives ``is_nan`` a computed NaN to detect. The frame is wrapped with
    ``nw.from_native`` and handed back via ``to_native``.
    """
    # Build the ratio expression once and reuse it for both new columns.
    a_div_a = nw.col("a") / nw.col("a")
    return (
        nw.from_native(df)
        .with_columns(
            a_div_a=a_div_a,
            a_div_a_is_nan=a_div_a.is_nan(),
        )
        .to_native()
    )
import pandas as pd
# pandas backend, cast to the nullable "Float64" dtype: the missing entry is
# shown as <NA> and stays <NA> in both new columns, while the NaN produced by
# 0.0 / 0.0 is flagged True (see the printed output).
df = pd.DataFrame(data).astype({"a": "Float64"})
print(check_nan_behavior(df))
a a_div_a a_div_a_is_nan
0 0.0 NaN True
1 <NA> <NA> <NA>
2 2.0 1.0 False
import polars as pl
# Polars backend: the computed NaN is flagged true and the null entry stays
# null in both new columns.
df = pl.DataFrame(data)
print(check_nan_behavior(df))
shape: (3, 3)
┌──────┬─────────┬────────────────┐
│ a ┆ a_div_a ┆ a_div_a_is_nan │
│ --- ┆ --- ┆ --- │
│ f64 ┆ f64 ┆ bool │
╞══════╪═════════╪════════════════╡
│ 0.0 ┆ NaN ┆ true │
│ null ┆ null ┆ null │
│ 2.0 ┆ 1.0 ┆ false │
└──────┴─────────┴────────────────┘
import pyarrow as pa
# PyArrow backend: the computed NaN is flagged true and the null entry stays
# null in both new columns.
df = pa.table(data)
print(check_nan_behavior(df))
pyarrow.Table
a: double
a_div_a: double
a_div_a_is_nan: bool
----
a: [[0,null,2]]
a_div_a: [[nan,null,1]]
a_div_a_is_nan: [[true,null,false]]