API: consistent NaN treatment for pyarrow dtypes #61732

Open · wants to merge 25 commits into base: main
5 changes: 5 additions & 0 deletions pandas/_config/__init__.py
@@ -33,3 +33,8 @@
def using_string_dtype() -> bool:
_mode_options = _global_config["future"]
return _mode_options["infer_string"]


def is_nan_na() -> bool:
_mode_options = _global_config["mode"]
return _mode_options["nan_is_na"]
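
The new `is_nan_na()` accessor mirrors `using_string_dtype()` above: a cheap read of the `mode.nan_is_na` option that this PR registers in `pandas/core/config_init.py` (see below). A minimal usage sketch, assuming this branch is installed:

```python
import pandas as pd
from pandas._config import is_nan_na  # accessor added by this PR

# The option defaults to True, i.e. NaN stays interchangeable with pd.NA.
assert is_nan_na() is True

with pd.option_context("mode.nan_is_na", False):
    # Library code branches on this flag, e.g. in _box_pa_scalar below.
    assert is_nan_na() is False
```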
1 change: 1 addition & 0 deletions pandas/_libs/missing.pyi
@@ -14,3 +14,4 @@ def isneginf_scalar(val: object) -> bool: ...
def checknull(val: object) -> bool: ...
def isnaobj(arr: np.ndarray) -> npt.NDArray[np.bool_]: ...
def is_numeric_na(values: np.ndarray) -> npt.NDArray[np.bool_]: ...
def is_pdna_or_none(values: np.ndarray) -> npt.NDArray[np.bool_]: ...
18 changes: 18 additions & 0 deletions pandas/_libs/missing.pyx
@@ -249,6 +249,24 @@ cdef bint checknull_with_nat_and_na(object obj):
return checknull_with_nat(obj) or obj is C_NA


@cython.wraparound(False)
@cython.boundscheck(False)
def is_pdna_or_none(values: ndarray) -> ndarray:
cdef:
ndarray[uint8_t] result
Py_ssize_t i, N
object val

N = len(values)
result = np.zeros(N, dtype=np.uint8)

for i in range(N):
val = values[i]
if val is None or val is C_NA:
result[i] = True
return result.view(bool)


@cython.wraparound(False)
@cython.boundscheck(False)
def is_numeric_na(values: ndarray) -> ndarray:
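Unlike `checknull` above, `is_pdna_or_none` matches only `None` and `pd.NA`; `NaN` and `NaT` are deliberately left out so that float NaN can survive as a real value when `mode.nan_is_na` is off. A pure-Python sketch of the same semantics (the Cython version above is the actual implementation):

```python
import numpy as np
import pandas as pd

def is_pdna_or_none_py(values: np.ndarray) -> np.ndarray:
    # True only for None and pd.NA; NaN and NaT stay False.
    return np.fromiter(
        (v is None or v is pd.NA for v in values), dtype=bool, count=len(values)
    )

arr = np.array([1.5, None, pd.NA, float("nan")], dtype=object)
print(is_pdna_or_none_py(arr))  # [False  True  True False]
```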
2 changes: 1 addition & 1 deletion pandas/_libs/parsers.pyx
@@ -1461,7 +1461,7 @@ def _maybe_upcast(
if isinstance(arr, IntegerArray) and arr.isna().all():
# use null instead of int64 in pyarrow
arr = arr.to_numpy(na_value=None)
arr = ArrowExtensionArray(pa.array(arr, from_pandas=True))
arr = ArrowExtensionArray(pa.array(arr))

return arr

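This one-line change is the pattern repeated throughout the PR: `from_pandas=True` asks pyarrow to coerce NaN to null, while dropping the flag (or passing an explicit `mask=`) keeps NaN and null distinct. A small standalone illustration of the difference:

```python
import numpy as np
import pyarrow as pa

data = np.array([1.0, np.nan])

pa.array(data, from_pandas=True)              # [1, null]  NaN coerced to null
pa.array(data)                                # [1, nan]   NaN kept as a float
pa.array(data, mask=np.array([False, True]))  # [1, null]  caller decides
```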
7 changes: 7 additions & 0 deletions pandas/conftest.py
@@ -2122,3 +2122,10 @@ def temp_file(tmp_path):
def monkeysession():
with pytest.MonkeyPatch.context() as mp:
yield mp


@pytest.fixture(params=[True, False])
def using_nan_is_na(request):
opt = request.param
with pd.option_context("mode.nan_is_na", opt):
yield opt
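
A sketch of how a test could consume the new fixture; pytest runs the test once per parameter, so the body sees both modes (hypothetical test, not part of this diff):

```python
import numpy as np
import pandas as pd

def test_nan_is_missing(using_nan_is_na):
    ser = pd.Series([1.0, np.nan], dtype="float64[pyarrow]")
    if using_nan_is_na:
        # Legacy behavior: NaN is treated as missing.
        assert ser.isna().tolist() == [False, True]
```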
15 changes: 13 additions & 2 deletions pandas/core/arrays/_utils.py
@@ -7,7 +7,10 @@

import numpy as np

from pandas._config import is_nan_na

from pandas._libs import lib
from pandas._libs.missing import NA
from pandas.errors import LossySetitemError

from pandas.core.dtypes.cast import np_can_hold_element
@@ -21,7 +24,11 @@


def to_numpy_dtype_inference(
arr: ArrayLike, dtype: npt.DTypeLike | None, na_value, hasna: bool
arr: ArrayLike,
dtype: npt.DTypeLike | None,
na_value,
hasna: bool,
is_pyarrow: bool = True,
) -> tuple[npt.DTypeLike, Any]:
if dtype is None and is_numeric_dtype(arr.dtype):
dtype_given = False
@@ -34,7 +41,11 @@ def to_numpy_dtype_inference(
else:
dtype = arr.dtype.numpy_dtype # type: ignore[union-attr]
if na_value is lib.no_default:
na_value = np.nan
if is_pyarrow and not is_nan_na():
na_value = NA
dtype = np.dtype(object)
else:
na_value = np.nan
else:
dtype = arr.dtype.numpy_dtype # type: ignore[union-attr]
elif dtype is not None:
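With the new `is_pyarrow` flag, a pyarrow-backed array whose `na_value` was left unspecified now falls back to `pd.NA` with object dtype when `mode.nan_is_na` is off, instead of silently mapping missing values to NaN. The expected effect on this branch, as a sketch (not output copied from the PR):

```python
import pandas as pd

arr = pd.array([1, None], dtype="int64[pyarrow]")

with pd.option_context("mode.nan_is_na", True):
    print(arr.to_numpy())  # [ 1. nan]  float64, NA mapped to np.nan

with pd.option_context("mode.nan_is_na", False):
    print(arr.to_numpy())  # [1 <NA>]   object dtype, NA preserved
```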
91 changes: 75 additions & 16 deletions pandas/core/arrays/arrow/array.py
@@ -16,7 +16,10 @@

import numpy as np

from pandas._config import is_nan_na

from pandas._libs import lib
from pandas._libs.missing import is_pdna_or_none
from pandas._libs.tslibs import (
Timedelta,
Timestamp,
@@ -32,6 +35,7 @@

from pandas.core.dtypes.cast import (
can_hold_element,
construct_1d_object_array_from_listlike,
infer_dtype_from_scalar,
)
from pandas.core.dtypes.common import (
@@ -325,6 +329,11 @@ def _from_sequence_of_strings(
"""
Construct a new ExtensionArray from a sequence of strings.
"""
mask = isna(strings)

if isinstance(strings, cls):
strings = strings._pa_array

pa_type = to_pyarrow_type(dtype)
if (
pa_type is None
@@ -343,17 +352,21 @@
from pandas.core.tools.datetimes import to_datetime

scalars = to_datetime(strings, errors="raise").date

scalars = pa.array(scalars, mask=mask.view(bool), type=pa_type)

elif pa.types.is_duration(pa_type):
from pandas.core.tools.timedeltas import to_timedelta

scalars = to_timedelta(strings, errors="raise")

if pa_type.unit != "ns":
# GH51175: test_from_sequence_of_strings_pa_array
# attempt to parse as int64 reflecting pyarrow's
# duration to string casting behavior
mask = isna(scalars)
if not isinstance(strings, (pa.Array, pa.ChunkedArray)):
strings = pa.array(strings, type=pa.string(), from_pandas=True)
strings = pa.array(strings, type=pa.string(), mask=mask)
strings = pc.if_else(mask, None, strings)
try:
scalars = strings.cast(pa.int64())
@@ -374,7 +387,7 @@ def _from_sequence_of_strings(
if isinstance(strings, (pa.Array, pa.ChunkedArray)):
scalars = strings
else:
scalars = pa.array(strings, type=pa.string(), from_pandas=True)
scalars = pa.array(strings, type=pa.string(), mask=mask)
scalars = pc.if_else(pc.equal(scalars, "1.0"), "1", scalars)
scalars = pc.if_else(pc.equal(scalars, "0.0"), "0", scalars)
scalars = scalars.cast(pa.bool_())
@@ -386,6 +399,11 @@
from pandas.core.tools.numeric import to_numeric

scalars = to_numeric(strings, errors="raise")
if isinstance(strings, (pa.Array, pa.ChunkedArray)):
scalars = strings.cast(pa_type)
elif mask is not None:
scalars = pa.array(scalars, mask=mask, type=pa_type)

else:
raise NotImplementedError(
f"Converting strings to {pa_type} is not implemented."
@@ -428,7 +446,7 @@ def _box_pa_scalar(cls, value, pa_type: pa.DataType | None = None) -> pa.Scalar:
"""
if isinstance(value, pa.Scalar):
pa_scalar = value
elif isna(value):
elif isna(value) and not (lib.is_float(value) and not is_nan_na()):
pa_scalar = pa.scalar(None, type=pa_type)
else:
# Workaround https://github.com/apache/arrow/issues/37291
@@ -445,7 +463,7 @@ def _box_pa_scalar(cls, value, pa_type: pa.DataType | None = None) -> pa.Scalar:
value = value.as_unit(pa_type.unit)
value = value._value

pa_scalar = pa.scalar(value, type=pa_type, from_pandas=True)
pa_scalar = pa.scalar(value, type=pa_type)

if pa_type is not None and pa_scalar.type != pa_type:
pa_scalar = pa_scalar.cast(pa_type)
@@ -477,6 +495,13 @@ def _box_pa_array(
if copy:
value = value.copy()
pa_array = value.__arrow_array__()

elif hasattr(value, "__arrow_array__"):
# e.g. StringArray
if copy:
value = value.copy()
pa_array = value.__arrow_array__()

else:
if (
isinstance(value, np.ndarray)
@@ -530,19 +555,40 @@
pa_array = pa.array(dta._ndarray, type=pa_type, mask=dta_mask)
return pa_array

mask = None
if is_nan_na():
try:
arr_value = np.asarray(value)
if arr_value.ndim > 1:
# e.g. test_fixed_size_list we have list data. ndim > 1
# means there were no scalar (NA) entries.
mask = np.zeros(len(value), dtype=np.bool_)
else:
mask = isna(arr_value)
except ValueError:
# Ragged data that numpy raises on
arr_value = construct_1d_object_array_from_listlike(value)
mask = isna(arr_value)
elif (
getattr(value, "dtype", None) is None or value.dtype.kind not in "iumMf"
):
arr_value = np.asarray(value, dtype=object)
# similar to isna(value) but exclude NaN, NaT, nat-like, nan-like
mask = is_pdna_or_none(arr_value) # type: ignore[assignment]

try:
pa_array = pa.array(value, type=pa_type, from_pandas=True)
pa_array = pa.array(value, type=pa_type, mask=mask)
except (pa.ArrowInvalid, pa.ArrowTypeError):
# GH50430: let pyarrow infer type, then cast
pa_array = pa.array(value, from_pandas=True)
pa_array = pa.array(value, mask=mask)

if pa_type is None and pa.types.is_duration(pa_array.type):
# Workaround https://github.com/apache/arrow/issues/37291
from pandas.core.tools.timedeltas import to_timedelta

value = to_timedelta(value)
value = value.to_numpy()
pa_array = pa.array(value, type=pa_type, from_pandas=True)
pa_array = pa.array(value, type=pa_type)

if pa.types.is_duration(pa_array.type) and pa_array.null_count > 0:
# GH52843: upstream bug for duration types when originally
@@ -877,7 +923,13 @@ def _logical_method(self, other, op) -> Self:
return self._evaluate_op_method(other, op, ARROW_LOGICAL_FUNCS)

def _arith_method(self, other, op) -> Self:
return self._evaluate_op_method(other, op, ARROW_ARITHMETIC_FUNCS)
result = self._evaluate_op_method(other, op, ARROW_ARITHMETIC_FUNCS)
if is_nan_na() and result.dtype.kind == "f":
parr = result._pa_array
mask = pc.is_nan(parr).to_numpy()
arr = pc.replace_with_mask(parr, mask, pa.scalar(None, type=parr.type))
result = type(self)(arr)
return result

def equals(self, other) -> bool:
if not isinstance(other, ArrowExtensionArray):
@@ -1208,7 +1260,7 @@ def isin(self, values: ArrayLike) -> npt.NDArray[np.bool_]:
if not len(values):
return np.zeros(len(self), dtype=bool)

result = pc.is_in(self._pa_array, value_set=pa.array(values, from_pandas=True))
result = pc.is_in(self._pa_array, value_set=pa.array(values))
# pyarrow 2.0.0 returned nulls, so we explicitly specify dtype to convert nulls
# to False
return np.array(result, dtype=np.bool_)
@@ -1460,7 +1512,9 @@ def to_numpy(
na_value: object = lib.no_default,
) -> np.ndarray:
original_na_value = na_value
dtype, na_value = to_numpy_dtype_inference(self, dtype, na_value, self._hasna)
dtype, na_value = to_numpy_dtype_inference(
self, dtype, na_value, self._hasna, is_pyarrow=True
)
pa_type = self._pa_array.type
if not self._hasna or isna(na_value) or pa.types.is_null(pa_type):
data = self
@@ -1489,7 +1543,11 @@
pa.types.is_floating(pa_type)
and (
na_value is np.nan
or (original_na_value is lib.no_default and is_float_dtype(dtype))
or (
original_na_value is lib.no_default
and is_float_dtype(dtype)
and is_nan_na()
)
)
):
result = data._pa_array.to_numpy()
@@ -2015,7 +2073,7 @@ def __setitem__(self, key, value) -> None:
raise ValueError("Length of indexer and values mismatch")
chunks = [
*self._pa_array[:key].chunks,
pa.array([value], type=self._pa_array.type, from_pandas=True),
pa.array([value], type=self._pa_array.type),
*self._pa_array[key + 1 :].chunks,
]
data = pa.chunked_array(chunks).combine_chunks()
@@ -2069,7 +2127,7 @@ def _rank_calc(
pa_type = pa.float64()
else:
pa_type = pa.uint64()
result = pa.array(ranked, type=pa_type, from_pandas=True)
result = pa.array(ranked, type=pa_type)
return result

data = self._pa_array.combine_chunks()
@@ -2321,7 +2379,7 @@ def _to_numpy_and_type(value) -> tuple[np.ndarray, pa.DataType | None]:
right, right_type = _to_numpy_and_type(right)
pa_type = left_type or right_type
result = np.where(cond, left, right)
return pa.array(result, type=pa_type, from_pandas=True)
return pa.array(result, type=pa_type)

@classmethod
def _replace_with_mask(
@@ -2362,9 +2420,10 @@ def _replace_with_mask(
replacements = np.array(replacements, dtype=object)
elif isinstance(replacements, pa.Scalar):
replacements = replacements.as_py()

result = np.array(values, dtype=object)
result[mask] = replacements
return pa.array(result, type=values.type, from_pandas=True)
return pa.array(result, type=values.type)

# ------------------------------------------------------------------
# GroupBy Methods
@@ -2443,7 +2502,7 @@ def _groupby_op(
return type(self)(pa_result)
else:
# DatetimeArray, TimedeltaArray
pa_result = pa.array(result, from_pandas=True)
pa_result = pa.array(result)
return type(self)(pa_result)

def _apply_elementwise(self, func: Callable) -> list[list[Any]]:
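The `_arith_method` override above is the write path for the new semantics: while `mode.nan_is_na` is on, any NaN produced by float arithmetic is folded back into null so the legacy equivalence holds. The underlying pyarrow pattern, as a self-contained sketch (here `fill_null` keeps the pre-existing null out of the mask):

```python
import pyarrow as pa
import pyarrow.compute as pc

arr = pa.chunked_array([[1.0, float("nan"), None]])

# Mask only the NaN slots, leaving the real null untouched.
mask = pc.fill_null(pc.is_nan(arr), False).to_numpy()
cleaned = pc.replace_with_mask(arr, mask, pa.scalar(None, type=arr.type))
print(cleaned)  # [1, null, null]: the NaN slot became null
```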
4 changes: 3 additions & 1 deletion pandas/core/arrays/masked.py
@@ -496,7 +496,9 @@ def to_numpy(
array([ True, False, False])
"""
hasna = self._hasna
dtype, na_value = to_numpy_dtype_inference(self, dtype, na_value, hasna)
dtype, na_value = to_numpy_dtype_inference(
self, dtype, na_value, hasna, is_pyarrow=False
Comment on lines +499 to +500 (reviewer, Member):

This change means to use object dtype instead of converting NA to NaNs?

We initially did that for the masked arrays conversion to numpy, but then changed it to use NaNs, because constantly getting object dtype was too annoying (there is an issue discussing this IIRC).

Reply (Member, Author): Yes.

)
if dtype is None:
dtype = object

8 changes: 7 additions & 1 deletion pandas/core/arrays/string_.py
@@ -502,6 +502,12 @@ def _str_map_str_or_object(
if self.dtype.storage == "pyarrow":
import pyarrow as pa

# TODO: shouldn't this already be caught by the passed mask?
# it isn't in test_extract_expand_capture_groups_index
# mask = mask | np.array(
# [x is libmissing.NA for x in result], dtype=bool
# )

result = pa.array(
result, mask=mask, type=pa.large_string(), from_pandas=True
)
@@ -754,7 +760,7 @@ def __arrow_array__(self, type=None):

values = self._ndarray.copy()
values[self.isna()] = None
return pa.array(values, type=type, from_pandas=True)
return pa.array(values, type=type)

def _values_for_factorize(self) -> tuple[np.ndarray, libmissing.NAType | float]: # type: ignore[override]
arr = self._ndarray
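`__arrow_array__` now follows the same recipe: missing entries are materialized as `None` before the data is handed to pyarrow, so `from_pandas=True` is unnecessary. The equivalent standalone conversion, as a sketch:

```python
import numpy as np
import pandas as pd
import pyarrow as pa

values = np.array(["a", pd.NA, "b"], dtype=object)
values[pd.isna(values)] = None  # normalize NA-likes to None
print(pa.array(values, type=pa.large_string()))  # ["a", null, "b"]
```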
9 changes: 9 additions & 0 deletions pandas/core/config_init.py
@@ -427,6 +427,15 @@ def is_terminal() -> bool:
validator=is_one_of_factory([True, False, "warn"]),
)

with cf.config_prefix("mode"):
cf.register_option(
"nan_is_na",
True,
"Whether to make ArrowDtype arrays consistently treat NaN as "
"interchangeable with pd.NA",
validator=is_one_of_factory([True, False]),
)


# user warnings
chained_assignment = """
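End to end, the option gates whether NaN counts as missing for ArrowDtype data. A sketch of the intended semantics on this branch (not output copied from the PR):

```python
import numpy as np
import pandas as pd

with pd.option_context("mode.nan_is_na", True):
    ser = pd.Series([1.0, np.nan], dtype="float64[pyarrow]")
    print(ser.isna().tolist())  # [False, True]   NaN coerced to null

with pd.option_context("mode.nan_is_na", False):
    ser = pd.Series([1.0, np.nan], dtype="float64[pyarrow]")
    print(ser.isna().tolist())  # [False, False]  NaN kept as a real float
```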