chatExcel

series.py•42.5 KiB

from __future__ import annotations from typing import ( TYPE_CHECKING, Any, Iterable, Iterator, Mapping, Sequence, cast, overload, ) import pyarrow as pa import pyarrow.compute as pc from narwhals._arrow.series_cat import ArrowSeriesCatNamespace from narwhals._arrow.series_dt import ArrowSeriesDateTimeNamespace from narwhals._arrow.series_list import ArrowSeriesListNamespace from narwhals._arrow.series_str import ArrowSeriesStringNamespace from narwhals._arrow.series_struct import ArrowSeriesStructNamespace from narwhals._arrow.utils import ( cast_for_truediv, chunked_array, extract_native, floordiv_compat, lit, narwhals_to_native_dtype, native_to_narwhals_dtype, nulls_like, pad_series, ) from narwhals._compliant import EagerSeries from narwhals._expression_parsing import ExprKind from narwhals._utils import ( Implementation, generate_temporary_column_name, is_list_of, not_implemented, requires, validate_backend_version, ) from narwhals.dependencies import is_numpy_array_1d from narwhals.exceptions import InvalidOperationError if TYPE_CHECKING: from types import ModuleType import pandas as pd import polars as pl from typing_extensions import Self, TypeIs from narwhals._arrow.dataframe import ArrowDataFrame from narwhals._arrow.namespace import ArrowNamespace from narwhals._arrow.typing import ( # type: ignore[attr-defined] ArrayAny, ArrayOrChunkedArray, ArrayOrScalar, ChunkedArrayAny, Incomplete, NullPlacement, Order, TieBreaker, _AsPyType, _BasicDataType, ) from narwhals._utils import Version, _FullContext from narwhals.dtypes import DType from narwhals.typing import ( ClosedInterval, FillNullStrategy, Into1DArray, NonNestedLiteral, NumericLiteral, PythonLiteral, RankMethod, RollingInterpolationMethod, SizedMultiIndexSelector, TemporalLiteral, _1DArray, _2DArray, _SliceIndex, ) # TODO @dangotbanned: move into `_arrow.utils` # Lots of modules are importing inline @overload def maybe_extract_py_scalar( value: pa.Scalar[_BasicDataType[_AsPyType]], return_py_scalar: bool, # noqa: FBT001 ) -> _AsPyType: ... @overload def maybe_extract_py_scalar( value: pa.Scalar[pa.StructType], return_py_scalar: bool, # noqa: FBT001 ) -> list[dict[str, Any]]: ... @overload def maybe_extract_py_scalar( value: pa.Scalar[pa.ListType[_BasicDataType[_AsPyType]]], return_py_scalar: bool, # noqa: FBT001 ) -> list[_AsPyType]: ... @overload def maybe_extract_py_scalar( value: pa.Scalar[Any] | Any, return_py_scalar: bool, # noqa: FBT001 ) -> Any: ... def maybe_extract_py_scalar(value: Any, return_py_scalar: bool) -> Any: # noqa: FBT001 if TYPE_CHECKING: return value.as_py() if return_py_scalar: return getattr(value, "as_py", lambda: value)() return value class ArrowSeries(EagerSeries["ChunkedArrayAny"]): def __init__( self, native_series: ChunkedArrayAny, *, name: str, backend_version: tuple[int, ...], version: Version, ) -> None: self._name = name self._native_series: ChunkedArrayAny = native_series self._implementation = Implementation.PYARROW self._backend_version = backend_version self._version = version validate_backend_version(self._implementation, self._backend_version) self._broadcast = False @property def native(self) -> ChunkedArrayAny: return self._native_series def _with_version(self, version: Version) -> Self: return self.__class__( self.native, name=self._name, backend_version=self._backend_version, version=version, ) def _with_native( self, series: ArrayOrScalar, *, preserve_broadcast: bool = False ) -> Self: result = self.from_native(chunked_array(series), name=self.name, context=self) if preserve_broadcast: result._broadcast = self._broadcast return result @classmethod def from_iterable( cls, data: Iterable[Any], *, context: _FullContext, name: str = "", dtype: DType | type[DType] | None = None, ) -> Self: version = context._version dtype_pa = narwhals_to_native_dtype(dtype, version) if dtype else None return cls.from_native( chunked_array([data], dtype_pa), name=name, context=context ) def _from_scalar(self, value: Any) -> Self: if self._backend_version < (13,) and hasattr(value, "as_py"): value = value.as_py() return super()._from_scalar(value) @staticmethod def _is_native(obj: ChunkedArrayAny | Any) -> TypeIs[ChunkedArrayAny]: return isinstance(obj, pa.ChunkedArray) @classmethod def from_native( cls, data: ChunkedArrayAny, /, *, context: _FullContext, name: str = "" ) -> Self: return cls( data, backend_version=context._backend_version, version=context._version, name=name, ) @classmethod def from_numpy(cls, data: Into1DArray, /, *, context: _FullContext) -> Self: return cls.from_iterable( data if is_numpy_array_1d(data) else [data], context=context ) def __narwhals_namespace__(self) -> ArrowNamespace: from narwhals._arrow.namespace import ArrowNamespace return ArrowNamespace( backend_version=self._backend_version, version=self._version ) def __eq__(self, other: object) -> Self: # type: ignore[override] other = cast("PythonLiteral | ArrowSeries | None", other) ser, rhs = extract_native(self, other) return self._with_native(pc.equal(ser, rhs)) def __ne__(self, other: object) -> Self: # type: ignore[override] other = cast("PythonLiteral | ArrowSeries | None", other) ser, rhs = extract_native(self, other) return self._with_native(pc.not_equal(ser, rhs)) def __ge__(self, other: Any) -> Self: ser, other = extract_native(self, other) return self._with_native(pc.greater_equal(ser, other)) def __gt__(self, other: Any) -> Self: ser, other = extract_native(self, other) return self._with_native(pc.greater(ser, other)) def __le__(self, other: Any) -> Self: ser, other = extract_native(self, other) return self._with_native(pc.less_equal(ser, other)) def __lt__(self, other: Any) -> Self: ser, other = extract_native(self, other) return self._with_native(pc.less(ser, other)) def __and__(self, other: Any) -> Self: ser, other = extract_native(self, other) return self._with_native(pc.and_kleene(ser, other)) # type: ignore[arg-type] def __rand__(self, other: Any) -> Self: ser, other = extract_native(self, other) return self._with_native(pc.and_kleene(other, ser)) # type: ignore[arg-type] def __or__(self, other: Any) -> Self: ser, other = extract_native(self, other) return self._with_native(pc.or_kleene(ser, other)) # type: ignore[arg-type] def __ror__(self, other: Any) -> Self: ser, other = extract_native(self, other) return self._with_native(pc.or_kleene(other, ser)) # type: ignore[arg-type] def __add__(self, other: Any) -> Self: ser, other = extract_native(self, other) return self._with_native(pc.add(ser, other)) def __radd__(self, other: Any) -> Self: return self + other def __sub__(self, other: Any) -> Self: ser, other = extract_native(self, other) return self._with_native(pc.subtract(ser, other)) def __rsub__(self, other: Any) -> Self: return (self - other) * (-1) def __mul__(self, other: Any) -> Self: ser, other = extract_native(self, other) return self._with_native(pc.multiply(ser, other)) def __rmul__(self, other: Any) -> Self: return self * other def __pow__(self, other: Any) -> Self: ser, other = extract_native(self, other) return self._with_native(pc.power(ser, other)) def __rpow__(self, other: Any) -> Self: ser, other = extract_native(self, other) return self._with_native(pc.power(other, ser)) def __floordiv__(self, other: Any) -> Self: ser, other = extract_native(self, other) return self._with_native(floordiv_compat(ser, other)) def __rfloordiv__(self, other: Any) -> Self: ser, other = extract_native(self, other) return self._with_native(floordiv_compat(other, ser)) def __truediv__(self, other: Any) -> Self: ser, other = extract_native(self, other) return self._with_native(pc.divide(*cast_for_truediv(ser, other))) # type: ignore[type-var] def __rtruediv__(self, other: Any) -> Self: ser, other = extract_native(self, other) return self._with_native(pc.divide(*cast_for_truediv(other, ser))) # type: ignore[type-var] def __mod__(self, other: Any) -> Self: floor_div = (self // other).native ser, other = extract_native(self, other) res = pc.subtract(ser, pc.multiply(floor_div, other)) return self._with_native(res) def __rmod__(self, other: Any) -> Self: floor_div = (other // self).native ser, other = extract_native(self, other) res = pc.subtract(other, pc.multiply(floor_div, ser)) return self._with_native(res) def __invert__(self) -> Self: return self._with_native(pc.invert(self.native)) @property def _type(self) -> pa.DataType: return self.native.type def len(self, *, _return_py_scalar: bool = True) -> int: return maybe_extract_py_scalar(len(self.native), _return_py_scalar) def filter(self, predicate: ArrowSeries | list[bool | None]) -> Self: other_native: Any if not is_list_of(predicate, bool): _, other_native = extract_native(self, predicate) else: other_native = predicate return self._with_native(self.native.filter(other_native)) def mean(self, *, _return_py_scalar: bool = True) -> float: return maybe_extract_py_scalar(pc.mean(self.native), _return_py_scalar) def median(self, *, _return_py_scalar: bool = True) -> float: from narwhals.exceptions import InvalidOperationError if not self.dtype.is_numeric(): msg = "`median` operation not supported for non-numeric input type." raise InvalidOperationError(msg) return maybe_extract_py_scalar( pc.approximate_median(self.native), _return_py_scalar ) def min(self, *, _return_py_scalar: bool = True) -> Any: return maybe_extract_py_scalar(pc.min(self.native), _return_py_scalar) def max(self, *, _return_py_scalar: bool = True) -> Any: return maybe_extract_py_scalar(pc.max(self.native), _return_py_scalar) def arg_min(self, *, _return_py_scalar: bool = True) -> int: index_min = pc.index(self.native, pc.min(self.native)) return maybe_extract_py_scalar(index_min, _return_py_scalar) def arg_max(self, *, _return_py_scalar: bool = True) -> int: index_max = pc.index(self.native, pc.max(self.native)) return maybe_extract_py_scalar(index_max, _return_py_scalar) def sum(self, *, _return_py_scalar: bool = True) -> float: return maybe_extract_py_scalar( pc.sum(self.native, min_count=0), _return_py_scalar ) def drop_nulls(self) -> Self: return self._with_native(self.native.drop_null()) def shift(self, n: int) -> Self: if n > 0: arrays = [nulls_like(n, self), *self.native[:-n].chunks] elif n < 0: arrays = [*self.native[-n:].chunks, nulls_like(-n, self)] else: return self._with_native(self.native) return self._with_native(pa.concat_arrays(arrays)) def std(self, ddof: int, *, _return_py_scalar: bool = True) -> float: return maybe_extract_py_scalar( pc.stddev(self.native, ddof=ddof), _return_py_scalar ) def var(self, ddof: int, *, _return_py_scalar: bool = True) -> float: return maybe_extract_py_scalar( pc.variance(self.native, ddof=ddof), _return_py_scalar ) def skew(self, *, _return_py_scalar: bool = True) -> float | None: ser_not_null = self.native.drop_null() if len(ser_not_null) == 0: return None elif len(ser_not_null) == 1: return float("nan") elif len(ser_not_null) == 2: return 0.0 else: m = pc.subtract(ser_not_null, pc.mean(ser_not_null)) m2 = pc.mean(pc.power(m, lit(2))) m3 = pc.mean(pc.power(m, lit(3))) biased_population_skewness = pc.divide(m3, pc.power(m2, lit(1.5))) return maybe_extract_py_scalar(biased_population_skewness, _return_py_scalar) def count(self, *, _return_py_scalar: bool = True) -> int: return maybe_extract_py_scalar(pc.count(self.native), _return_py_scalar) def n_unique(self, *, _return_py_scalar: bool = True) -> int: return maybe_extract_py_scalar( pc.count(self.native.unique(), mode="all"), _return_py_scalar ) def __native_namespace__(self) -> ModuleType: if self._implementation is Implementation.PYARROW: return self._implementation.to_native_namespace() msg = f"Expected pyarrow, got: {type(self._implementation)}" # pragma: no cover raise AssertionError(msg) @property def name(self) -> str: return self._name def _gather(self, rows: SizedMultiIndexSelector[ChunkedArrayAny]) -> Self: if len(rows) == 0: return self._with_native(self.native.slice(0, 0)) if self._backend_version < (18,) and isinstance(rows, tuple): rows = list(rows) return self._with_native(self.native.take(rows)) def _gather_slice(self, rows: _SliceIndex | range) -> Self: start = rows.start or 0 stop = rows.stop if rows.stop is not None else len(self.native) if start < 0: start = len(self.native) + start if stop < 0: stop = len(self.native) + stop if rows.step is not None and rows.step != 1: msg = "Slicing with step is not supported on PyArrow tables" raise NotImplementedError(msg) return self._with_native(self.native.slice(start, stop - start)) def scatter(self, indices: int | Sequence[int], values: Any) -> Self: import numpy as np # ignore-banned-import values_native: ArrayAny if isinstance(indices, int): indices_native = pa.array([indices]) values_native = pa.array([values]) else: # TODO(unassigned): we may also want to let `indices` be a Series. # https://github.com/narwhals-dev/narwhals/issues/2155 indices_native = pa.array(indices) if isinstance(values, self.__class__): values_native = values.native.combine_chunks() else: # NOTE: Requires fixes in https://github.com/zen-xu/pyarrow-stubs/pull/209 pa_array: Incomplete = pa.array values_native = pa_array(values) sorting_indices = pc.sort_indices(indices_native) indices_native = indices_native.take(sorting_indices) values_native = values_native.take(sorting_indices) mask: _1DArray = np.zeros(self.len(), dtype=bool) mask[indices_native] = True # NOTE: Multiple issues # - Missing `values` type # - `mask` accepts a `np.ndarray`, but not mentioned in stubs # - Missing `replacements` type # - Missing return type pc_replace_with_mask: Incomplete = pc.replace_with_mask return self._with_native( pc_replace_with_mask(self.native, mask, values_native.take(indices_native)) ) def to_list(self) -> list[Any]: return self.native.to_pylist() def __array__(self, dtype: Any = None, *, copy: bool | None = None) -> _1DArray: return self.native.__array__(dtype=dtype, copy=copy) def to_numpy(self, dtype: Any = None, *, copy: bool | None = None) -> _1DArray: return self.native.to_numpy() def alias(self, name: str) -> Self: result = self.__class__( self.native, name=name, backend_version=self._backend_version, version=self._version, ) result._broadcast = self._broadcast return result @property def dtype(self) -> DType: return native_to_narwhals_dtype(self.native.type, self._version) def abs(self) -> Self: return self._with_native(pc.abs(self.native)) def cum_sum(self, *, reverse: bool) -> Self: cum_sum = pc.cumulative_sum result = ( cum_sum(self.native, skip_nulls=True) if not reverse else cum_sum(self.native[::-1], skip_nulls=True)[::-1] ) return self._with_native(result) def round(self, decimals: int) -> Self: return self._with_native( pc.round(self.native, decimals, round_mode="half_towards_infinity") ) def diff(self) -> Self: return self._with_native(pc.pairwise_diff(self.native.combine_chunks())) def any(self, *, _return_py_scalar: bool = True) -> bool: return maybe_extract_py_scalar( pc.any(self.native, min_count=0), _return_py_scalar ) def all(self, *, _return_py_scalar: bool = True) -> bool: return maybe_extract_py_scalar( pc.all(self.native, min_count=0), _return_py_scalar ) def is_between( self, lower_bound: Any, upper_bound: Any, closed: ClosedInterval ) -> Self: _, lower_bound = extract_native(self, lower_bound) _, upper_bound = extract_native(self, upper_bound) if closed == "left": ge = pc.greater_equal(self.native, lower_bound) lt = pc.less(self.native, upper_bound) res = pc.and_kleene(ge, lt) elif closed == "right": gt = pc.greater(self.native, lower_bound) le = pc.less_equal(self.native, upper_bound) res = pc.and_kleene(gt, le) elif closed == "none": gt = pc.greater(self.native, lower_bound) lt = pc.less(self.native, upper_bound) res = pc.and_kleene(gt, lt) elif closed == "both": ge = pc.greater_equal(self.native, lower_bound) le = pc.less_equal(self.native, upper_bound) res = pc.and_kleene(ge, le) else: # pragma: no cover raise AssertionError return self._with_native(res) def is_null(self) -> Self: return self._with_native(self.native.is_null(), preserve_broadcast=True) def is_nan(self) -> Self: return self._with_native(pc.is_nan(self.native), preserve_broadcast=True) def cast(self, dtype: DType | type[DType]) -> Self: data_type = narwhals_to_native_dtype(dtype, self._version) return self._with_native(pc.cast(self.native, data_type), preserve_broadcast=True) def null_count(self, *, _return_py_scalar: bool = True) -> int: return maybe_extract_py_scalar(self.native.null_count, _return_py_scalar) def head(self, n: int) -> Self: if n >= 0: return self._with_native(self.native.slice(0, n)) else: num_rows = len(self) return self._with_native(self.native.slice(0, max(0, num_rows + n))) def tail(self, n: int) -> Self: if n >= 0: num_rows = len(self) return self._with_native(self.native.slice(max(0, num_rows - n))) else: return self._with_native(self.native.slice(abs(n))) def is_in(self, other: Any) -> Self: if self._is_native(other): value_set: ArrayOrChunkedArray = other else: value_set = pa.array(other) return self._with_native(pc.is_in(self.native, value_set=value_set)) def arg_true(self) -> Self: import numpy as np # ignore-banned-import res = np.flatnonzero(self.native) return self.from_iterable(res, name=self.name, context=self) def item(self, index: int | None = None) -> Any: if index is None: if len(self) != 1: msg = ( "can only call '.item()' if the Series is of length 1," f" or an explicit index is provided (Series is of length {len(self)})" ) raise ValueError(msg) return maybe_extract_py_scalar(self.native[0], return_py_scalar=True) return maybe_extract_py_scalar(self.native[index], return_py_scalar=True) def value_counts( self, *, sort: bool, parallel: bool, name: str | None, normalize: bool ) -> ArrowDataFrame: """Parallel is unused, exists for compatibility.""" from narwhals._arrow.dataframe import ArrowDataFrame index_name_ = "index" if self._name is None else self._name value_name_ = name or ("proportion" if normalize else "count") val_counts = pc.value_counts(self.native) values = val_counts.field("values") counts = cast("ChunkedArrayAny", val_counts.field("counts")) if normalize: arrays = [values, pc.divide(*cast_for_truediv(counts, pc.sum(counts)))] else: arrays = [values, counts] val_count = pa.Table.from_arrays(arrays, names=[index_name_, value_name_]) if sort: val_count = val_count.sort_by([(value_name_, "descending")]) return ArrowDataFrame( val_count, backend_version=self._backend_version, version=self._version, validate_column_names=True, ) def zip_with(self, mask: Self, other: Self) -> Self: cond = mask.native.combine_chunks() return self._with_native(pc.if_else(cond, self.native, other.native)) def sample( self, n: int | None, *, fraction: float | None, with_replacement: bool, seed: int | None, ) -> Self: import numpy as np # ignore-banned-import num_rows = len(self) if n is None and fraction is not None: n = int(num_rows * fraction) rng = np.random.default_rng(seed=seed) idx = np.arange(0, num_rows) mask = rng.choice(idx, size=n, replace=with_replacement) return self._with_native(self.native.take(mask)) def fill_null( self, value: Self | NonNestedLiteral, strategy: FillNullStrategy | None, limit: int | None, ) -> Self: import numpy as np # ignore-banned-import def fill_aux( arr: ChunkedArrayAny, limit: int, direction: FillNullStrategy | None ) -> ArrayAny: # this algorithm first finds the indices of the valid values to fill all the null value positions # then it calculates the distance of each new index and the original index # if the distance is equal to or less than the limit and the original value is null, it is replaced valid_mask = pc.is_valid(arr) indices = pa.array(np.arange(len(arr)), type=pa.int64()) if direction == "forward": valid_index = np.maximum.accumulate(np.where(valid_mask, indices, -1)) distance = indices - valid_index else: valid_index = np.minimum.accumulate( np.where(valid_mask[::-1], indices[::-1], len(arr)) )[::-1] distance = valid_index - indices return pc.if_else( pc.and_(pc.is_null(arr), pc.less_equal(distance, lit(limit))), # pyright: ignore[reportArgumentType, reportCallIssue] arr.take(valid_index), arr, ) if value is not None: _, native_value = extract_native(self, value) series: ArrayOrScalar = pc.fill_null(self.native, native_value) elif limit is None: fill_func = ( pc.fill_null_forward if strategy == "forward" else pc.fill_null_backward ) series = fill_func(self.native) else: series = fill_aux(self.native, limit, strategy) return self._with_native(series, preserve_broadcast=True) def to_frame(self) -> ArrowDataFrame: from narwhals._arrow.dataframe import ArrowDataFrame df = pa.Table.from_arrays([self.native], names=[self.name]) return ArrowDataFrame( df, backend_version=self._backend_version, version=self._version, validate_column_names=False, ) def to_pandas(self) -> pd.Series[Any]: import pandas as pd # ignore-banned-import() return pd.Series(self.native, name=self.name) def to_polars(self) -> pl.Series: import polars as pl # ignore-banned-import return cast("pl.Series", pl.from_arrow(self.native)) def is_unique(self) -> ArrowSeries: return self.to_frame().is_unique().alias(self.name) def is_first_distinct(self) -> Self: import numpy as np # ignore-banned-import row_number = pa.array(np.arange(len(self))) col_token = generate_temporary_column_name(n_bytes=8, columns=[self.name]) first_distinct_index = ( pa.Table.from_arrays([self.native], names=[self.name]) .append_column(col_token, row_number) .group_by(self.name) .aggregate([(col_token, "min")]) .column(f"{col_token}_min") ) return self._with_native(pc.is_in(row_number, first_distinct_index)) def is_last_distinct(self) -> Self: import numpy as np # ignore-banned-import row_number = pa.array(np.arange(len(self))) col_token = generate_temporary_column_name(n_bytes=8, columns=[self.name]) last_distinct_index = ( pa.Table.from_arrays([self.native], names=[self.name]) .append_column(col_token, row_number) .group_by(self.name) .aggregate([(col_token, "max")]) .column(f"{col_token}_max") ) return self._with_native(pc.is_in(row_number, last_distinct_index)) def is_sorted(self, *, descending: bool) -> bool: if not isinstance(descending, bool): msg = f"argument 'descending' should be boolean, found {type(descending)}" raise TypeError(msg) if descending: result = pc.all(pc.greater_equal(self.native[:-1], self.native[1:])) else: result = pc.all(pc.less_equal(self.native[:-1], self.native[1:])) return maybe_extract_py_scalar(result, return_py_scalar=True) def unique(self, *, maintain_order: bool) -> Self: # TODO(marco): `pc.unique` seems to always maintain order, is that guaranteed? return self._with_native(self.native.unique()) def replace_strict( self, old: Sequence[Any] | Mapping[Any, Any], new: Sequence[Any], *, return_dtype: DType | type[DType] | None, ) -> Self: # https://stackoverflow.com/a/79111029/4451315 idxs = pc.index_in(self.native, pa.array(old)) result_native = pc.take(pa.array(new), idxs) if return_dtype is not None: result_native.cast(narwhals_to_native_dtype(return_dtype, self._version)) result = self._with_native(result_native) if result.is_null().sum() != self.is_null().sum(): msg = ( "replace_strict did not replace all non-null values.\n\n" "The following did not get replaced: " f"{self.filter(~self.is_null() & result.is_null()).unique(maintain_order=False).to_list()}" ) raise ValueError(msg) return result def sort(self, *, descending: bool, nulls_last: bool) -> Self: order: Order = "descending" if descending else "ascending" null_placement: NullPlacement = "at_end" if nulls_last else "at_start" sorted_indices = pc.array_sort_indices( self.native, order=order, null_placement=null_placement ) return self._with_native(self.native.take(sorted_indices)) def to_dummies(self, *, separator: str, drop_first: bool) -> ArrowDataFrame: import numpy as np # ignore-banned-import from narwhals._arrow.dataframe import ArrowDataFrame name = self._name # NOTE: stub is missing attributes (https://arrow.apache.org/docs/python/generated/pyarrow.DictionaryArray.html) da: Incomplete = self.native.combine_chunks().dictionary_encode("encode") columns: _2DArray = np.zeros((len(da.dictionary), len(da)), np.int8) columns[da.indices, np.arange(len(da))] = 1 null_col_pa, null_col_pl = f"{name}{separator}None", f"{name}{separator}null" cols = [ {null_col_pa: null_col_pl}.get( f"{name}{separator}{v}", f"{name}{separator}{v}" ) for v in da.dictionary ] output_order = ( [ null_col_pl, *sorted([c for c in cols if c != null_col_pl])[int(drop_first) :], ] if null_col_pl in cols else sorted(cols)[int(drop_first) :] ) return ArrowDataFrame( pa.Table.from_arrays(columns, names=cols), backend_version=self._backend_version, version=self._version, validate_column_names=True, ).simple_select(*output_order) def quantile( self, quantile: float, interpolation: RollingInterpolationMethod, *, _return_py_scalar: bool = True, ) -> float: return maybe_extract_py_scalar( pc.quantile(self.native, q=quantile, interpolation=interpolation)[0], _return_py_scalar, ) def gather_every(self, n: int, offset: int = 0) -> Self: return self._with_native(self.native[offset::n]) def clip( self, lower_bound: Self | NumericLiteral | TemporalLiteral | None, upper_bound: Self | NumericLiteral | TemporalLiteral | None, ) -> Self: _, lower = extract_native(self, lower_bound) if lower_bound else (None, None) _, upper = extract_native(self, upper_bound) if upper_bound else (None, None) if lower is None: return self._with_native(pc.min_element_wise(self.native, upper)) if upper is None: return self._with_native(pc.max_element_wise(self.native, lower)) return self._with_native( pc.max_element_wise(pc.min_element_wise(self.native, upper), lower) ) def to_arrow(self) -> ArrayAny: return self.native.combine_chunks() def mode(self) -> ArrowSeries: plx = self.__narwhals_namespace__() col_token = generate_temporary_column_name(n_bytes=8, columns=[self.name]) counts = self.value_counts( name=col_token, normalize=False, sort=False, parallel=False ) return counts.filter( plx.col(col_token) == plx.col(col_token).max().broadcast(kind=ExprKind.AGGREGATION) ).get_column(self.name) def is_finite(self) -> Self: return self._with_native(pc.is_finite(self.native)) def cum_count(self, *, reverse: bool) -> Self: dtypes = self._version.dtypes return (~self.is_null()).cast(dtypes.UInt32()).cum_sum(reverse=reverse) @requires.backend_version((13,)) def cum_min(self, *, reverse: bool) -> Self: result = ( pc.cumulative_min(self.native, skip_nulls=True) if not reverse else pc.cumulative_min(self.native[::-1], skip_nulls=True)[::-1] ) return self._with_native(result) @requires.backend_version((13,)) def cum_max(self, *, reverse: bool) -> Self: result = ( pc.cumulative_max(self.native, skip_nulls=True) if not reverse else pc.cumulative_max(self.native[::-1], skip_nulls=True)[::-1] ) return self._with_native(result) @requires.backend_version((13,)) def cum_prod(self, *, reverse: bool) -> Self: result = ( pc.cumulative_prod(self.native, skip_nulls=True) if not reverse else pc.cumulative_prod(self.native[::-1], skip_nulls=True)[::-1] ) return self._with_native(result) def rolling_sum(self, window_size: int, *, min_samples: int, center: bool) -> Self: min_samples = min_samples if min_samples is not None else window_size padded_series, offset = pad_series(self, window_size=window_size, center=center) cum_sum = padded_series.cum_sum(reverse=False).fill_null( value=None, strategy="forward", limit=None ) rolling_sum = ( cum_sum - cum_sum.shift(window_size).fill_null(value=0, strategy=None, limit=None) if window_size != 0 else cum_sum ) valid_count = padded_series.cum_count(reverse=False) count_in_window = valid_count - valid_count.shift(window_size).fill_null( value=0, strategy=None, limit=None ) result = self._with_native( pc.if_else((count_in_window >= min_samples).native, rolling_sum.native, None) ) return result._gather_slice(slice(offset, None)) def rolling_mean(self, window_size: int, *, min_samples: int, center: bool) -> Self: min_samples = min_samples if min_samples is not None else window_size padded_series, offset = pad_series(self, window_size=window_size, center=center) cum_sum = padded_series.cum_sum(reverse=False).fill_null( value=None, strategy="forward", limit=None ) rolling_sum = ( cum_sum - cum_sum.shift(window_size).fill_null(value=0, strategy=None, limit=None) if window_size != 0 else cum_sum ) valid_count = padded_series.cum_count(reverse=False) count_in_window = valid_count - valid_count.shift(window_size).fill_null( value=0, strategy=None, limit=None ) result = ( self._with_native( pc.if_else( (count_in_window >= min_samples).native, rolling_sum.native, None ) ) / count_in_window ) return result._gather_slice(slice(offset, None)) def rolling_var( self, window_size: int, *, min_samples: int, center: bool, ddof: int ) -> Self: min_samples = min_samples if min_samples is not None else window_size padded_series, offset = pad_series(self, window_size=window_size, center=center) cum_sum = padded_series.cum_sum(reverse=False).fill_null( value=None, strategy="forward", limit=None ) rolling_sum = ( cum_sum - cum_sum.shift(window_size).fill_null(value=0, strategy=None, limit=None) if window_size != 0 else cum_sum ) cum_sum_sq = ( pow(padded_series, 2) .cum_sum(reverse=False) .fill_null(value=None, strategy="forward", limit=None) ) rolling_sum_sq = ( cum_sum_sq - cum_sum_sq.shift(window_size).fill_null(value=0, strategy=None, limit=None) if window_size != 0 else cum_sum_sq ) valid_count = padded_series.cum_count(reverse=False) count_in_window = valid_count - valid_count.shift(window_size).fill_null( value=0, strategy=None, limit=None ) result = self._with_native( pc.if_else( (count_in_window >= min_samples).native, (rolling_sum_sq - (rolling_sum**2 / count_in_window)).native, None, ) ) / self._with_native(pc.max_element_wise((count_in_window - ddof).native, 0)) return result._gather_slice(slice(offset, None, None)) def rolling_std( self, window_size: int, *, min_samples: int, center: bool, ddof: int ) -> Self: return ( self.rolling_var( window_size=window_size, min_samples=min_samples, center=center, ddof=ddof ) ** 0.5 ) def rank(self, method: RankMethod, *, descending: bool) -> Self: if method == "average": msg = ( "`rank` with `method='average' is not supported for pyarrow backend. " "The available methods are {'min', 'max', 'dense', 'ordinal'}." ) raise ValueError(msg) sort_keys: Order = "descending" if descending else "ascending" tiebreaker: TieBreaker = "first" if method == "ordinal" else method native_series: ArrayOrChunkedArray if self._backend_version < (14, 0, 0): # pragma: no cover native_series = self.native.combine_chunks() else: native_series = self.native null_mask = pc.is_null(native_series) rank = pc.rank(native_series, sort_keys=sort_keys, tiebreaker=tiebreaker) result = pc.if_else(null_mask, lit(None, native_series.type), rank) return self._with_native(result) @requires.backend_version((13,)) def hist( # noqa: C901, PLR0912, PLR0915 self, bins: list[float | int] | None, *, bin_count: int | None, include_breakpoint: bool, ) -> ArrowDataFrame: import numpy as np # ignore-banned-import from narwhals._arrow.dataframe import ArrowDataFrame def _hist_from_bin_count(bin_count: int): # type: ignore[no-untyped-def] # noqa: ANN202 d = pc.min_max(self.native) lower, upper = d["min"].as_py(), d["max"].as_py() if lower == upper: lower -= 0.5 upper += 0.5 bins = np.linspace(lower, upper, bin_count + 1) return _hist_from_bins(bins) def _hist_from_bins(bins: Sequence[int | float]): # type: ignore[no-untyped-def] # noqa: ANN202 bin_indices = np.searchsorted(bins, self.native, side="left") bin_indices = pc.if_else( # lowest bin is inclusive pc.equal(self.native, lit(bins[0])), 1, bin_indices ) # align unique categories and counts appropriately obs_cats, obs_counts = np.unique(bin_indices, return_counts=True) obj_cats = np.arange(1, len(bins)) counts = np.zeros_like(obj_cats) counts[np.isin(obj_cats, obs_cats)] = obs_counts[np.isin(obs_cats, obj_cats)] bin_right = bins[1:] return counts, bin_right counts: Sequence[int | float | pa.Scalar[Any]] | np.typing.ArrayLike bin_right: Sequence[int | float | pa.Scalar[Any]] | np.typing.ArrayLike data_count = pc.sum( pc.invert(pc.or_(pc.is_nan(self.native), pc.is_null(self.native))).cast( pa.uint8() ), min_count=0, ) if bins is not None: if len(bins) < 2: counts, bin_right = [], [] elif data_count == pa.scalar(0, type=pa.uint64()): # type:ignore[comparison-overlap] counts = np.zeros(len(bins) - 1) bin_right = bins[1:] elif len(bins) == 2: counts = [ pc.sum( pc.and_( pc.greater_equal(self.native, lit(float(bins[0]))), pc.less_equal(self.native, lit(float(bins[1]))), ).cast(pa.uint8()) ) ] bin_right = [bins[-1]] else: counts, bin_right = _hist_from_bins(bins) elif bin_count is not None: if bin_count == 0: counts, bin_right = [], [] elif data_count == pa.scalar(0, type=pa.uint64()): # type:ignore[comparison-overlap] counts, bin_right = ( np.zeros(bin_count), np.linspace(0, 1, bin_count + 1)[1:], ) elif bin_count == 1: d = pc.min_max(self.native) lower, upper = d["min"], d["max"] if lower == upper: counts, bin_right = [data_count], [pc.add(upper, pa.scalar(0.5))] else: counts, bin_right = [data_count], [upper] else: counts, bin_right = _hist_from_bin_count(bin_count) else: # pragma: no cover # caller guarantees that either bins or bin_count is specified msg = "must provide one of `bin_count` or `bins`" raise InvalidOperationError(msg) data: dict[str, Any] = {} if include_breakpoint: data["breakpoint"] = bin_right data["count"] = counts return ArrowDataFrame( pa.Table.from_pydict(data), backend_version=self._backend_version, version=self._version, validate_column_names=True, ) def __iter__(self) -> Iterator[Any]: for x in self.native: yield maybe_extract_py_scalar(x, return_py_scalar=True) def __contains__(self, other: Any) -> bool: from pyarrow import ( ArrowInvalid, # ignore-banned-imports ArrowNotImplementedError, # ignore-banned-imports ArrowTypeError, # ignore-banned-imports ) try: other_ = lit(other) if other is not None else lit(None, type=self._type) return maybe_extract_py_scalar( pc.is_in(other_, self.native), return_py_scalar=True ) except (ArrowInvalid, ArrowNotImplementedError, ArrowTypeError) as exc: from narwhals.exceptions import InvalidOperationError msg = f"Unable to compare other of type {type(other)} with series of type {self.dtype}." raise InvalidOperationError(msg) from exc def log(self, base: float) -> Self: return self._with_native(pc.logb(self.native, lit(base))) @property def dt(self) -> ArrowSeriesDateTimeNamespace: return ArrowSeriesDateTimeNamespace(self) @property def cat(self) -> ArrowSeriesCatNamespace: return ArrowSeriesCatNamespace(self) @property def str(self) -> ArrowSeriesStringNamespace: return ArrowSeriesStringNamespace(self) @property def list(self) -> ArrowSeriesListNamespace: return ArrowSeriesListNamespace(self) @property def struct(self) -> ArrowSeriesStructNamespace: return ArrowSeriesStructNamespace(self) ewm_mean = not_implemented()

Loading blob content...

Latest Blog Posts

Redis vs ioredis vs valkey-glide
By punkpeye on January 26, 2026.
benchmark
Redis
valkey
Quickstart: Publish an MCP Server to the MCP Registry
By punkpeye on January 24, 2026.
mcp
official reference mirror
Official MCP Registry Server.json Requirements
By punkpeye on January 24, 2026.
mcp
official reference mirror

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/Lillard01/chatExcel-mcp'

If you have feedback or need assistance with the MCP directory API, please join our Discord server

series.py•42.5 KiB