Commit c43918d

Fix dtype of series result for DataFrame.apply (#2978)
1 parent a057995 commit c43918d

File tree: 7 files changed (+35, −30 lines)

azure-pipelines.yml

Lines changed: 3 additions & 5 deletions
@@ -77,10 +77,8 @@ jobs:
     conda install -n test --quiet --yes -c pkgs/main python=$PYTHON certifi

     if [[ "$(mars.test.module)" == "learn" ]]; then
-      # remove version limit when blue-yonder/tsfresh#897 is fixed.
-      # remove keras version after https://github.com/tensorflow/tensorflow/issues/52922 is fixed.
-      pip install xgboost lightgbm keras==2.6.0 tensorflow faiss-cpu torch torchvision \
-        statsmodels\<0.13.0 tsfresh
+      pip install xgboost lightgbm keras tensorflow faiss-cpu torch torchvision \
+        statsmodels tsfresh
     fi
   fi
   conda list -n test
@@ -95,7 +93,7 @@ jobs:

   # do compatibility test for earliest supported pandas release
   if [[ "$(mars.test.module)" == "dataframe" ]]; then
-    pip install pandas==1.0.5
+    pip install -i https://pkgs.dev.azure.com/mars-project/mars/_packaging/pandas/pypi/simple/ pandas==1.0.5
     pytest $PYTEST_CONFIG -m pd_compat mars/dataframe
     mv .coverage build/.coverage.pd_compat.file
   fi

mars/dataframe/base/apply.py

Lines changed: 4 additions & 2 deletions
@@ -295,7 +295,7 @@ def _infer_df_func_returns(self, df, dtypes, dtype=None, name=None, index=None):
         if self.output_types is not None and (
             dtypes is not None or dtype is not None
         ):
-            ret_dtypes = dtypes if dtypes is not None else (dtype, name)
+            ret_dtypes = dtypes if dtypes is not None else (name, dtype)
             ret_index_value = parse_index(index) if index is not None else None
             self._elementwise = False
             return ret_dtypes, ret_index_value
@@ -473,7 +473,9 @@ def __call__(self, df_or_series, dtypes=None, dtype=None, name=None, index=None)
         self._axis = validate_axis(axis, df_or_series)

         if df_or_series.op.output_types[0] == OutputType.dataframe:
-            return self._call_dataframe(df_or_series, dtypes=dtypes, index=index)
+            return self._call_dataframe(
+                df_or_series, dtypes=dtypes, dtype=dtype, name=name, index=index
+            )
         else:
             return self._call_series(
                 df_or_series, dtypes=dtypes, dtype=dtype, name=name, index=index
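Together, these two hunks make DataFrame.apply honour the dtype and name arguments when the result is a series: the (name, dtype) pair is now built in the expected order, and both values are forwarded to _call_dataframe instead of being dropped. A minimal usage sketch of the fixed behaviour, mirroring the new test case below — the frame contents and the summing lambda are illustrative, not from the commit:

# Minimal sketch, assuming Mars is installed; data and lambda are illustrative.
import numpy as np
import pandas as pd
import mars.dataframe as md

df = md.DataFrame(pd.DataFrame({"A": range(20), "B": range(20)}))

# With output_type="series", the caller-declared dtype (and name) are now
# applied to the resulting Mars series.
r = df.apply(lambda col: col.sum(), output_type="series", dtype=object, name="output")
assert r.dtype == np.dtype("O")   # declared dtype is kept
assert r.shape == (df.shape[1],)  # one value per column (axis=0)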

mars/dataframe/base/tests/test_base.py

Lines changed: 14 additions & 1 deletion
@@ -217,7 +217,7 @@ def test_rechunk():
     assert series2.nsplits == series.nsplits


-def test_data_frame_apply():
+def test_dataframe_apply():
     cols = [chr(ord("A") + i) for i in range(10)]
     df_raw = pd.DataFrame(dict((c, [i**2 for i in range(20)]) for c in cols))

@@ -231,6 +231,10 @@ def df_func_with_err(v):
         assert len(v) > 2
         return v.sort_values()

+    def df_series_func_with_err(v):
+        assert len(v) > 2
+        return 0
+
     with pytest.raises(TypeError):
         df.apply(df_func_with_err)

@@ -240,6 +244,15 @@ def df_func_with_err(v):
     assert r.op.output_types[0] == OutputType.dataframe
     assert r.op.elementwise is False

+    r = df.apply(
+        df_series_func_with_err, output_type="series", dtype=object, name="output"
+    )
+    assert r.dtype == np.dtype("O")
+    assert r.shape == (df.shape[-1],)
+    assert r.op._op_type_ == opcodes.APPLY
+    assert r.op.output_types[0] == OutputType.series
+    assert r.op.elementwise is False
+
     r = df.apply("ffill")
     assert r.op._op_type_ == opcodes.FILL_NA


mars/dataframe/groupby/tests/test_groupby.py

Lines changed: 12 additions & 12 deletions
@@ -396,28 +396,28 @@ def test_groupby_fill():
     )
     mdf = md.DataFrame(df1, chunk_size=3)

-    r = tile(getattr(mdf.groupby(["one", "two"]), "ffill")())
+    r = tile(mdf.groupby(["one", "two"]).ffill())
     assert r.op.output_types[0] == OutputType.dataframe
     assert r.shape == (len(df1), 1)
     assert len(r.chunks) == 3
     assert r.chunks[0].shape == (np.nan, 1)
     assert r.dtypes.index.tolist() == ["three"]

-    r = tile(getattr(mdf.groupby(["two"]), "bfill")())
+    r = tile(mdf.groupby(["two"]).bfill())
     assert r.op.output_types[0] == OutputType.dataframe
     assert r.shape == (len(df1), 2)
     assert len(r.chunks) == 3
     assert r.chunks[0].shape == (np.nan, 2)
     assert r.dtypes.index.tolist() == ["one", "three"]

-    r = tile(getattr(mdf.groupby(["two"]), "backfill")())
+    r = tile(mdf.groupby(["two"]).backfill())
     assert r.op.output_types[0] == OutputType.dataframe
     assert r.shape == (len(df1), 2)
     assert len(r.chunks) == 3
     assert r.chunks[0].shape == (np.nan, 2)
     assert r.dtypes.index.tolist() == ["one", "three"]

-    r = tile(getattr(mdf.groupby(["one"]), "fillna")(5))
+    r = tile(mdf.groupby(["one"]).fillna(5))
     assert r.op.output_types[0] == OutputType.dataframe
     assert r.shape == (len(df1), 2)
     assert len(r.chunks) == 3
@@ -426,25 +426,25 @@ def test_groupby_fill():

     s1 = pd.Series([4, 3, 9, np.nan, np.nan, 7, 10, 8, 1, 6])
     ms1 = md.Series(s1, chunk_size=3)
-    r = tile(getattr(ms1.groupby(lambda x: x % 2), "ffill")())
+    r = tile(ms1.groupby(lambda x: x % 2).ffill())
     assert r.op.output_types[0] == OutputType.series
     assert len(r.chunks) == 4
     assert r.shape == (len(s1),)
     assert r.chunks[0].shape == (np.nan,)

-    r = tile(getattr(ms1.groupby(lambda x: x % 2), "bfill")())
+    r = tile(ms1.groupby(lambda x: x % 2).bfill())
     assert r.op.output_types[0] == OutputType.series
     assert len(r.chunks) == 4
     assert r.shape == (len(s1),)
     assert r.chunks[0].shape == (np.nan,)

-    r = tile(getattr(ms1.groupby(lambda x: x % 2), "backfill")())
+    r = tile(ms1.groupby(lambda x: x % 2).backfill())
     assert r.op.output_types[0] == OutputType.series
     assert len(r.chunks) == 4
     assert r.shape == (len(s1),)
     assert r.chunks[0].shape == (np.nan,)

-    r = tile(getattr(ms1.groupby(lambda x: x % 2), "fillna")(5))
+    r = tile(ms1.groupby(lambda x: x % 2).fillna(5))
     assert r.op.output_types[0] == OutputType.series
     assert len(r.chunks) == 4
     assert r.shape == (len(s1),)
@@ -453,25 +453,25 @@ def test_groupby_fill():
     s1 = pd.Series([4, 3, 9, np.nan, np.nan, 7, 10, 8, 1, 6])
     ms1 = md.Series(s1, chunk_size=3)

-    r = tile(getattr(ms1.groupby(lambda x: x % 2), "ffill")())
+    r = tile(ms1.groupby(lambda x: x % 2).ffill())
     assert r.op.output_types[0] == OutputType.series
     assert len(r.chunks) == 4
     assert r.shape == (len(s1),)
     assert r.chunks[0].shape == (np.nan,)

-    r = tile(getattr(ms1.groupby(lambda x: x % 2), "bfill")())
+    r = tile(ms1.groupby(lambda x: x % 2).bfill())
     assert r.op.output_types[0] == OutputType.series
     assert len(r.chunks) == 4
     assert r.shape == (len(s1),)
     assert r.chunks[0].shape == (np.nan,)

-    r = tile(getattr(ms1.groupby(lambda x: x % 2), "backfill")())
+    r = tile(ms1.groupby(lambda x: x % 2).backfill())
     assert r.op.output_types[0] == OutputType.series
     assert len(r.chunks) == 4
     assert r.shape == (len(s1),)
     assert r.chunks[0].shape == (np.nan,)

-    r = tile(getattr(ms1.groupby(lambda x: x % 2), "fillna")(5))
+    r = tile(ms1.groupby(lambda x: x % 2).fillna(5))
     assert r.op.output_types[0] == OutputType.series
     assert len(r.chunks) == 4
     assert r.shape == (len(s1),)

mars/dataframe/reduction/custom_reduction.py

Lines changed: 1 addition & 8 deletions
@@ -23,14 +23,7 @@ class DataFrameCustomReduction(DataFrameReductionOperand, DataFrameReductionMixi
     _op_type_ = OperandDef.CUSTOM_REDUCTION
     _func_name = "custom_reduction"

-    _custom_reduction = AnyField("custom_reduction")
-
-    def __init__(self, custom_reduction=None, **kw):
-        super().__init__(_custom_reduction=custom_reduction, **kw)
-
-    @property
-    def custom_reduction(self):
-        return self._custom_reduction
+    custom_reduction = AnyField("custom_reduction")

     @property
     def is_atomic(self):
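The hunk above replaces the hand-written __init__/property pair with a single declarative custom_reduction = AnyField(...) attribute; the field declaration itself supplies attribute access through Mars' serializable-field machinery. A rough, illustrative sketch of the underlying descriptor idea follows — FieldSketch is hypothetical and is not Mars' actual AnyField implementation:

# Hypothetical descriptor-based field, showing why the explicit
# __init__/property boilerplate becomes unnecessary. NOT Mars' real AnyField.
class FieldSketch:
    def __init__(self, tag):
        self.tag = tag                 # serialization tag, e.g. "custom_reduction"

    def __set_name__(self, owner, name):
        self.attr = "_" + name         # backing slot on the instance

    def __get__(self, obj, objtype=None):
        if obj is None:
            return self
        return getattr(obj, self.attr, None)

    def __set__(self, obj, value):
        setattr(obj, self.attr, value)


class CustomReductionOperandSketch:
    # One declaration replaces the removed __init__ assignment and property.
    custom_reduction = FieldSketch("custom_reduction")


op = CustomReductionOperandSketch()
op.custom_reduction = sum              # stored via the descriptor
assert op.custom_reduction is sum      # read back through the same descriptor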

mars/dataframe/reduction/tests/test_reduction_execution.py

Lines changed: 1 addition & 1 deletion
@@ -673,7 +673,7 @@ def test_nunique(setup, check_ref_counts):


 @pytest.mark.skipif(pa is None, reason="pyarrow not installed")
-def test_use_arrow_dtype_n_unique(setup, check_ref_counts):
+def test_use_arrow_dtype_nunique(setup, check_ref_counts):
     with option_context({"dataframe.use_arrow_dtype": True, "combine_size": 2}):
         rs = np.random.RandomState(0)
         data1 = pd.DataFrame(

mars/oscar/backends/message.pyx

Lines changed: 0 additions & 1 deletion
@@ -556,7 +556,6 @@ cpdef reset_random_seed():
     global _rnd_is_seed_set

     seed_bytes = getrandbits(64).to_bytes(8, "little")
-    # memcpy(&seed, <char *>seed_bytes, 8)
     _rnd_gen.seed((<uint_fast64_t *><char *>seed_bytes)[0])
     _rnd_is_seed_set = True
562561
