Commit c43918d

Fix dtype of series result for DataFrame.apply (#2978)
1 parent a057995 commit c43918d

File tree: 7 files changed (+35, −30 lines)

azure-pipelines.yml

Lines changed: 3 additions & 5 deletions
@@ -77,10 +77,8 @@ jobs:
     conda install -n test --quiet --yes -c pkgs/main python=$PYTHON certifi

     if [[ "$(mars.test.module)" == "learn" ]]; then
-      # remove version limit when blue-yonder/tsfresh#897 is fixed.
-      # remove keras version after https://github.com/tensorflow/tensorflow/issues/52922 is fixed.
-      pip install xgboost lightgbm keras==2.6.0 tensorflow faiss-cpu torch torchvision \
-        statsmodels\<0.13.0 tsfresh
+      pip install xgboost lightgbm keras tensorflow faiss-cpu torch torchvision \
+        statsmodels tsfresh
     fi
   fi
   conda list -n test
@@ -95,7 +93,7 @@ jobs:

   # do compatibility test for earliest supported pandas release
   if [[ "$(mars.test.module)" == "dataframe" ]]; then
-    pip install pandas==1.0.5
+    pip install -i https://pkgs.dev.azure.com/mars-project/mars/_packaging/pandas/pypi/simple/ pandas==1.0.5
     pytest $PYTEST_CONFIG -m pd_compat mars/dataframe
     mv .coverage build/.coverage.pd_compat.file
   fi

mars/dataframe/base/apply.py

Lines changed: 4 additions & 2 deletions
@@ -295,7 +295,7 @@ def _infer_df_func_returns(self, df, dtypes, dtype=None, name=None, index=None):
         if self.output_types is not None and (
             dtypes is not None or dtype is not None
         ):
-            ret_dtypes = dtypes if dtypes is not None else (dtype, name)
+            ret_dtypes = dtypes if dtypes is not None else (name, dtype)
             ret_index_value = parse_index(index) if index is not None else None
             self._elementwise = False
             return ret_dtypes, ret_index_value
@@ -473,7 +473,9 @@ def __call__(self, df_or_series, dtypes=None, dtype=None, name=None, index=None)
         self._axis = validate_axis(axis, df_or_series)

         if df_or_series.op.output_types[0] == OutputType.dataframe:
-            return self._call_dataframe(df_or_series, dtypes=dtypes, index=index)
+            return self._call_dataframe(
+                df_or_series, dtypes=dtypes, dtype=dtype, name=name, index=index
+            )
         else:
             return self._call_series(
                 df_or_series, dtypes=dtypes, dtype=dtype, name=name, index=index
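Together, these two hunks make DataFrame.apply honour the dtype and name arguments when the result is a series: the (name, dtype) pair is now built in the expected order, and both values are forwarded to _call_dataframe instead of being dropped. A minimal usage sketch of the fixed behaviour, mirroring the new test case below — the frame contents and the summing lambda are illustrative, not from the commit:

# Minimal sketch, assuming Mars is installed; data and lambda are illustrative.
import numpy as np
import pandas as pd
import mars.dataframe as md

df = md.DataFrame(pd.DataFrame({"A": range(20), "B": range(20)}))

# With output_type="series", the caller-declared dtype (and name) are now
# applied to the resulting Mars series.
r = df.apply(lambda col: col.sum(), output_type="series", dtype=object, name="output")
assert r.dtype == np.dtype("O")   # declared dtype is kept
assert r.shape == (df.shape[1],)  # one value per column (axis=0)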

mars/dataframe/base/tests/test_base.py

Lines changed: 14 additions & 1 deletion
@@ -217,7 +217,7 @@ def test_rechunk():
     assert series2.nsplits == series.nsplits


-def test_data_frame_apply():
+def test_dataframe_apply():
     cols = [chr(ord("A") + i) for i in range(10)]
     df_raw = pd.DataFrame(dict((c, [i**2 for i in range(20)]) for c in cols))

@@ -231,6 +231,10 @@ def df_func_with_err(v):
         assert len(v) > 2
         return v.sort_values()

+    def df_series_func_with_err(v):
+        assert len(v) > 2
+        return 0
+
     with pytest.raises(TypeError):
         df.apply(df_func_with_err)

@@ -240,6 +244,15 @@ def df_func_with_err(v):
     assert r.op.output_types[0] == OutputType.dataframe
     assert r.op.elementwise is False

+    r = df.apply(
+        df_series_func_with_err, output_type="series", dtype=object, name="output"
+    )
+    assert r.dtype == np.dtype("O")
+    assert r.shape == (df.shape[-1],)
+    assert r.op._op_type_ == opcodes.APPLY
+    assert r.op.output_types[0] == OutputType.series
+    assert r.op.elementwise is False
+
     r = df.apply("ffill")
     assert r.op._op_type_ == opcodes.FILL_NA


mars/dataframe/groupby/tests/test_groupby.py

Lines changed: 12 additions & 12 deletions
@@ -396,28 +396,28 @@ def test_groupby_fill():
     )
     mdf = md.DataFrame(df1, chunk_size=3)

-    r = tile(getattr(mdf.groupby(["one", "two"]), "ffill")())
+    r = tile(mdf.groupby(["one", "two"]).ffill())
     assert r.op.output_types[0] == OutputType.dataframe
     assert r.shape == (len(df1), 1)
     assert len(r.chunks) == 3
     assert r.chunks[0].shape == (np.nan, 1)
     assert r.dtypes.index.tolist() == ["three"]

-    r = tile(getattr(mdf.groupby(["two"]), "bfill")())
+    r = tile(mdf.groupby(["two"]).bfill())
     assert r.op.output_types[0] == OutputType.dataframe
     assert r.shape == (len(df1), 2)
     assert len(r.chunks) == 3
     assert r.chunks[0].shape == (np.nan, 2)
     assert r.dtypes.index.tolist() == ["one", "three"]

-    r = tile(getattr(mdf.groupby(["two"]), "backfill")())
+    r = tile(mdf.groupby(["two"]).backfill())
     assert r.op.output_types[0] == OutputType.dataframe
     assert r.shape == (len(df1), 2)
     assert len(r.chunks) == 3
     assert r.chunks[0].shape == (np.nan, 2)
     assert r.dtypes.index.tolist() == ["one", "three"]

-    r = tile(getattr(mdf.groupby(["one"]), "fillna")(5))
+    r = tile(mdf.groupby(["one"]).fillna(5))
     assert r.op.output_types[0] == OutputType.dataframe
     assert r.shape == (len(df1), 2)
     assert len(r.chunks) == 3
@@ -426,25 +426,25 @@ def test_groupby_fill():

     s1 = pd.Series([4, 3, 9, np.nan, np.nan, 7, 10, 8, 1, 6])
     ms1 = md.Series(s1, chunk_size=3)
-    r = tile(getattr(ms1.groupby(lambda x: x % 2), "ffill")())
+    r = tile(ms1.groupby(lambda x: x % 2).ffill())
     assert r.op.output_types[0] == OutputType.series
     assert len(r.chunks) == 4
     assert r.shape == (len(s1),)
     assert r.chunks[0].shape == (np.nan,)

-    r = tile(getattr(ms1.groupby(lambda x: x % 2), "bfill")())
+    r = tile(ms1.groupby(lambda x: x % 2).bfill())
     assert r.op.output_types[0] == OutputType.series
     assert len(r.chunks) == 4
     assert r.shape == (len(s1),)
     assert r.chunks[0].shape == (np.nan,)

-    r = tile(getattr(ms1.groupby(lambda x: x % 2), "backfill")())
+    r = tile(ms1.groupby(lambda x: x % 2).backfill())
     assert r.op.output_types[0] == OutputType.series
     assert len(r.chunks) == 4
     assert r.shape == (len(s1),)
     assert r.chunks[0].shape == (np.nan,)

-    r = tile(getattr(ms1.groupby(lambda x: x % 2), "fillna")(5))
+    r = tile(ms1.groupby(lambda x: x % 2).fillna(5))
     assert r.op.output_types[0] == OutputType.series
     assert len(r.chunks) == 4
     assert r.shape == (len(s1),)
@@ -453,25 +453,25 @@ def test_groupby_fill():
     s1 = pd.Series([4, 3, 9, np.nan, np.nan, 7, 10, 8, 1, 6])
     ms1 = md.Series(s1, chunk_size=3)

-    r = tile(getattr(ms1.groupby(lambda x: x % 2), "ffill")())
+    r = tile(ms1.groupby(lambda x: x % 2).ffill())
     assert r.op.output_types[0] == OutputType.series
     assert len(r.chunks) == 4
     assert r.shape == (len(s1),)
     assert r.chunks[0].shape == (np.nan,)

-    r = tile(getattr(ms1.groupby(lambda x: x % 2), "bfill")())
+    r = tile(ms1.groupby(lambda x: x % 2).bfill())
     assert r.op.output_types[0] == OutputType.series
     assert len(r.chunks) == 4
     assert r.shape == (len(s1),)
     assert r.chunks[0].shape == (np.nan,)

-    r = tile(getattr(ms1.groupby(lambda x: x % 2), "backfill")())
+    r = tile(ms1.groupby(lambda x: x % 2).backfill())
     assert r.op.output_types[0] == OutputType.series
     assert len(r.chunks) == 4
     assert r.shape == (len(s1),)
     assert r.chunks[0].shape == (np.nan,)

-    r = tile(getattr(ms1.groupby(lambda x: x % 2), "fillna")(5))
+    r = tile(ms1.groupby(lambda x: x % 2).fillna(5))
     assert r.op.output_types[0] == OutputType.series
     assert len(r.chunks) == 4
     assert r.shape == (len(s1),)

mars/dataframe/reduction/custom_reduction.py

Lines changed: 1 addition & 8 deletions
@@ -23,14 +23,7 @@ class DataFrameCustomReduction(DataFrameReductionOperand, DataFrameReductionMixi
     _op_type_ = OperandDef.CUSTOM_REDUCTION
     _func_name = "custom_reduction"

-    _custom_reduction = AnyField("custom_reduction")
-
-    def __init__(self, custom_reduction=None, **kw):
-        super().__init__(_custom_reduction=custom_reduction, **kw)
-
-    @property
-    def custom_reduction(self):
-        return self._custom_reduction
+    custom_reduction = AnyField("custom_reduction")

     @property
     def is_atomic(self):
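The hunk above replaces the hand-written __init__/property pair with a single declarative custom_reduction = AnyField(...) attribute; the field declaration itself supplies attribute access through Mars' serializable-field machinery. A rough, illustrative sketch of the underlying descriptor idea follows — FieldSketch is hypothetical and is not Mars' actual AnyField implementation:

# Hypothetical descriptor-based field, showing why the explicit
# __init__/property boilerplate becomes unnecessary. NOT Mars' real AnyField.
class FieldSketch:
    def __init__(self, tag):
        self.tag = tag                 # serialization tag, e.g. "custom_reduction"

    def __set_name__(self, owner, name):
        self.attr = "_" + name         # backing slot on the instance

    def __get__(self, obj, objtype=None):
        if obj is None:
            return self
        return getattr(obj, self.attr, None)

    def __set__(self, obj, value):
        setattr(obj, self.attr, value)


class CustomReductionOperandSketch:
    # One declaration replaces the removed __init__ assignment and property.
    custom_reduction = FieldSketch("custom_reduction")


op = CustomReductionOperandSketch()
op.custom_reduction = sum              # stored via the descriptor
assert op.custom_reduction is sum      # read back through the same descriptor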

mars/dataframe/reduction/tests/test_reduction_execution.py

Lines changed: 1 addition & 1 deletion
@@ -673,7 +673,7 @@ def test_nunique(setup, check_ref_counts):


 @pytest.mark.skipif(pa is None, reason="pyarrow not installed")
-def test_use_arrow_dtype_n_unique(setup, check_ref_counts):
+def test_use_arrow_dtype_nunique(setup, check_ref_counts):
     with option_context({"dataframe.use_arrow_dtype": True, "combine_size": 2}):
         rs = np.random.RandomState(0)
         data1 = pd.DataFrame(

mars/oscar/backends/message.pyx

Lines changed: 0 additions & 1 deletion
@@ -556,7 +556,6 @@ cpdef reset_random_seed():
     global _rnd_is_seed_set

     seed_bytes = getrandbits(64).to_bytes(8, "little")
-    # memcpy(&seed, <char *>seed_bytes, 8)
     _rnd_gen.seed((<uint_fast64_t *><char *>seed_bytes)[0])
     _rnd_is_seed_set = True
562561
