DoubleML · jer2ig · Jan 13, 2025 · Jan 27, 2025 · Feb 21, 2025 · Feb 27, 2025
diff --git a/doubleml/double_ml.py b/doubleml/double_ml.py
@@ -258,13 +258,6 @@ def learner(self):
         """
         return self._learner
 
-    @property
-    def predictions_names(self):
-        """
-        The names of predictions for the nuisance functions.
-        """
-        return list(self.params_names)
-
     @property
     def learner_names(self):
         """
@@ -1088,7 +1081,7 @@ def _check_fit(self, n_jobs_cv, store_predictions, external_predictions, store_m
             _check_external_predictions(
                 external_predictions=external_predictions,
                 valid_treatments=self._dml_data.d_cols,
-                valid_learners=self.predictions_names,
+                valid_learners=self.params_names,
                 n_obs=self.n_obs,
                 n_rep=self.n_rep,
             )
@@ -1111,7 +1104,7 @@ def _initalize_fit(self, store_predictions, store_models):
     def _fit_nuisance_and_score_elements(self, n_jobs_cv, store_predictions, external_predictions, store_models):
         ext_prediction_dict = _set_external_predictions(
             external_predictions,
-            learners=self.predictions_names,
+            learners=self.params_names,
             treatment=self._dml_data.d_cols[self._i_treat],
             i_rep=self._i_rep,
         )
@@ -1178,8 +1171,8 @@ def _initialize_arrays(self):
         self._all_se = np.full((n_thetas, n_rep), np.nan)
 
     def _initialize_predictions_and_targets(self):
-        self._predictions = {learner: np.full(self._score_dim, np.nan) for learner in self.predictions_names}
-        self._nuisance_targets = {learner: np.full(self._score_dim, np.nan) for learner in self.predictions_names}
+        self._predictions = {learner: np.full(self._score_dim, np.nan) for learner in self.params_names}
+        self._nuisance_targets = {learner: np.full(self._score_dim, np.nan) for learner in self.params_names}
 
     def _initialize_nuisance_loss(self):
         self._nuisance_loss = {learner: np.full((self.n_rep, self._dml_data.n_coefs), np.nan) for learner in self.params_names}
@@ -1190,7 +1183,7 @@ def _initialize_models(self):
         }
 
     def _store_predictions_and_targets(self, preds, targets):
-        for learner in self.predictions_names:
+        for learner in self.params_names:
             self._predictions[learner][:, self._i_rep, self._i_treat] = preds[learner]
             self._nuisance_targets[learner][:, self._i_rep, self._i_treat] = targets[learner]
 

diff --git a/doubleml/plm/lplr.py b/doubleml/plm/lplr.py
@@ -13,6 +13,7 @@
 from doubleml.utils._estimation import (
     _dml_cv_predict,
     _dml_tune,
+    _double_dml_cv_predict,
 )
 
 
@@ -104,10 +105,6 @@ def __init__(
 
         ml_m_is_classifier = self._check_learner(ml_m, "ml_m", regressor=True, classifier=True)
         self._learner = {"ml_m": ml_m, "ml_t": ml_t, "ml_M": ml_M}
-        # replace aggregated inner names with per-inner-fold names
-        inner_M_names = [f"ml_M_inner_{i}" for i in range(self.n_folds_inner)]
-        inner_a_names = [f"ml_a_inner_{i}" for i in range(self.n_folds_inner)]
-        self._predictions_names = ["ml_r", "ml_m", "ml_a", "ml_t", "ml_M"] + inner_M_names + inner_a_names
 
         if ml_a is not None:
             ml_a_is_classifier = self._check_learner(ml_a, "ml_a", regressor=True, classifier=True)
@@ -162,56 +159,15 @@ def __init__(
         self._sensitivity_implemented = False
 
     def _initialize_ml_nuisance_params(self):
-        self._params = {learner: {key: [None] * self.n_rep for key in self._dml_data.d_cols} for learner in self._learner}
+        inner_M_names = [f"ml_M_inner_{i}" for i in range(self.n_folds)]
+        inner_a_names = [f"ml_a_inner_{i}" for i in range(self.n_folds)]
+        params_names = ["ml_m", "ml_a", "ml_t", "ml_M"] + inner_M_names + inner_a_names
+        self._params = {learner: {key: [None] * self.n_rep for key in self._dml_data.d_cols} for learner in params_names}
 
     def _check_data(self, obj_dml_data):
         if not np.array_equal(np.unique(obj_dml_data.y), [0, 1]):
             raise TypeError("The outcome variable y must be binary with values 0 and 1.")
 
-    def _double_dml_cv_predict(
-        self,
-        estimator,
-        estimator_name,
-        x,
-        y,
-        smpls=None,
-        smpls_inner=None,
-        n_jobs=None,
-        est_params=None,
-        method="predict",
-        sample_weights=None,
-    ):
-        res = {}
-        res["preds"] = np.zeros(y.shape, dtype=float)
-        res["preds_inner"] = []
-        res["targets_inner"] = []
-        res["models"] = []
-        for smpls_single_split, smpls_double_split in zip(smpls, smpls_inner):
-            res_inner = _dml_cv_predict(
-                estimator,
-                x,
-                y,
-                smpls=smpls_double_split,
-                n_jobs=n_jobs,
-                est_params=est_params,
-                method=method,
-                return_models=True,
-                sample_weights=sample_weights,
-            )
-            _check_finite_predictions(res_inner["preds"], estimator, estimator_name, smpls_double_split)
-
-            res["preds_inner"].append(res_inner["preds"])
-            res["targets_inner"].append(res_inner["targets"])
-            for model in res_inner["models"]:
-                res["models"].append(model)
-                if method == "predict_proba":
-                    res["preds"][smpls_single_split[1]] += model.predict_proba(x[smpls_single_split[1]])[:, 1]
-                else:
-                    res["preds"][smpls_single_split[1]] += model.predict(x[smpls_single_split[1]])
-        res["preds"] /= len(smpls)
-        res["targets"] = np.copy(y)
-        return res
-
     def _nuisance_est(self, smpls, n_jobs_cv, external_predictions, return_models=False):
         x, y = check_X_y(self._dml_data.x, self._dml_data.y, force_all_finite=False)
         x, d = check_X_y(x, self._dml_data.d, force_all_finite=False)
@@ -234,9 +190,14 @@ def _nuisance_est(self, smpls, n_jobs_cv, external_predictions, return_models=Fa
                     f"have to be provided (missing: {', '.join([str(i) for i in missing])})."
                 )
             M_hat_inner = [external_predictions[f"ml_M_inner_{i}"] for i in range(self.n_folds_inner)]
-            M_hat = {"preds": external_predictions["ml_M"], "preds_inner": M_hat_inner, "targets": None, "models": None}
+            M_hat = {
+                "preds": external_predictions["ml_M"],
+                "preds_inner": M_hat_inner,
+                "targets": self._dml_data.y,
+                "models": None,
+            }
         else:
-            M_hat = self._double_dml_cv_predict(
+            M_hat = _double_dml_cv_predict(
                 self._learner["ml_M"],
                 "ml_M",
                 x_d_concat,
@@ -250,7 +211,7 @@ def _nuisance_est(self, smpls, n_jobs_cv, external_predictions, return_models=Fa
 
         # nuisance m
         if m_external:
-            m_hat = {"preds": external_predictions["ml_m"], "targets": None, "models": None}
+            m_hat = {"preds": external_predictions["ml_m"], "targets": self._dml_data.d, "models": None}
         else:
             if self.score == "instrument":
                 weights = M_hat["preds"] * (1 - M_hat["preds"])
@@ -303,9 +264,14 @@ def _nuisance_est(self, smpls, n_jobs_cv, external_predictions, return_models=Fa
                     f"have to be provided (missing: {', '.join([str(i) for i in missing])})."
                 )
             a_hat_inner = [external_predictions[f"ml_a_inner_{i}"] for i in range(self.n_folds_inner)]
-            a_hat = {"preds": external_predictions["ml_a"], "preds_inner": a_hat_inner, "targets": None, "models": None}
+            a_hat = {
+                "preds": external_predictions["ml_a"],
+                "preds_inner": a_hat_inner,
+                "targets": self._dml_data.d,
+                "models": None,
+            }
         else:
-            a_hat = self._double_dml_cv_predict(
+            a_hat = _double_dml_cv_predict(
                 self._learner["ml_a"],
                 "ml_a",
                 x,
@@ -404,13 +370,6 @@ def _nuisance_est(self, smpls, n_jobs_cv, external_predictions, return_models=Fa
 
         return psi_elements, preds
 
-    @property
-    def predictions_names(self):
-        """
-        The names of predictions for the nuisance functions.
-        """
-        return self._predictions_names
-
     def _score_elements(self, y, d, r_hat, m_hat):
         # compute residual
         d_tilde = d - m_hat
@@ -438,8 +397,6 @@ def _sensitivity_element_est(self, preds):
     def _nuisance_tuning(
         self, smpls, param_grids, scoring_methods, n_folds_tune, n_jobs_cv, search_mode, n_iter_randomized_search
     ):
-        if self._i_rep is None:
-            raise ValueError("tune_on_folds must be True as targets have to be created for ml_t on folds.")
         x, y = check_X_y(self._dml_data.x, self._dml_data.y, force_all_finite=False)
         x, d = check_X_y(x, self._dml_data.d, force_all_finite=False)
         x_d_concat = np.hstack((d.reshape(-1, 1), x))
@@ -500,34 +457,16 @@ def _nuisance_tuning(
         a_best_params = [xx.best_params_ for xx in a_tune_res]
 
         # Create targets for tuning ml_t
-        M_hat = self._double_dml_cv_predict(
-            self._learner["ml_M"],
-            "ml_M",
-            x_d_concat,
-            y,
-            smpls=smpls,
-            smpls_inner=self._DoubleML__smpls__inner,
-            n_jobs=n_jobs_cv,
-            est_params=M_best_params,
-            method=self._predict_method["ml_M"],
-        )
 
-        W_inner = []
-        for i, (train, _) in enumerate(smpls):
-            M_iteration = M_hat["preds_inner"][i][train]
-            M_iteration = np.clip(M_iteration, 1e-8, 1 - 1e-8)
-            w = scipy.special.logit(M_iteration)
-            W_inner.append(w)
+        M_hat = np.full_like(y, np.nan, dtype=float)  # float buffer: y may be an integer 0/1 array
+        for idx, (train_index, _) in enumerate(smpls):
+            M_hat[train_index] = M_tune_res[idx].predict_proba(x_d_concat[train_index, :])[:, 1]
 
-        # Reshape W_inner into full-length arrays per fold: fill train indices, others are NaN
-        W_targets = []
-        for i, train in enumerate(train_inds):
-            wt = np.full(x.shape[0], np.nan, dtype=float)
-            wt[train] = W_inner[i]
-            W_targets.append(wt)
+        M_hat = np.clip(M_hat, 1e-8, 1 - 1e-8)
+        W_hat = scipy.special.logit(M_hat)
 
         t_tune_res = _dml_tune(
-            W_inner,
+            W_hat,
             x,
             train_inds,
             self._learner["ml_t"],
@@ -537,7 +476,6 @@ def _nuisance_tuning(
             n_jobs_cv,
             search_mode,
             n_iter_randomized_search,
-            fold_specific_target=True,
         )
         t_best_params = [xx.best_params_ for xx in t_tune_res]
 

diff --git a/doubleml/plm/tests/test_lplr.py b/doubleml/plm/tests/test_lplr.py
@@ -7,22 +7,22 @@
 from doubleml.plm.datasets import make_lplr_LZZ2020
 
 
-@pytest.fixture(scope="module", params=[RandomForestClassifier(random_state=42)])
+@pytest.fixture(scope="module", params=[RandomForestClassifier(random_state=42, max_depth=2, n_estimators=10)])
 def learner_M(request):
     return request.param
 
 
-@pytest.fixture(scope="module", params=[RandomForestRegressor(random_state=42)])
+@pytest.fixture(scope="module", params=[RandomForestRegressor(random_state=42, max_depth=2, n_estimators=10)])
 def learner_t(request):
     return request.param
 
 
-@pytest.fixture(scope="module", params=[RandomForestRegressor(random_state=42)])
+@pytest.fixture(scope="module", params=[RandomForestRegressor(random_state=42, max_depth=2, n_estimators=10)])
 def learner_m(request):
     return request.param
 
 
-@pytest.fixture(scope="module", params=[RandomForestClassifier(random_state=42)])
+@pytest.fixture(scope="module", params=[RandomForestClassifier(random_state=42, max_depth=2, n_estimators=10)])
 def learner_m_classifier(request):
     return request.param
 
@@ -33,7 +33,6 @@ def score(request):
 
 
 @pytest.fixture(scope="module", params=["continuous", "binary", "binary_unbalanced"])
-# TODO: Error for continuous treatment?
 def treatment(request):
     return request.param
 

diff --git a/doubleml/plm/tests/test_lplr_exceptions.py b/doubleml/plm/tests/test_lplr_exceptions.py
@@ -14,9 +14,9 @@
 # create test data and basic learners
 dml_data = make_lplr_LZZ2020(alpha=0.5, n_obs=n, dim_x=20)
 dml_data_binary = make_lplr_LZZ2020(alpha=0.5, n_obs=n, treatment="binary", dim_x=20)
-ml_M = RandomForestClassifier()
-ml_t = RandomForestRegressor()
-ml_m = RandomForestRegressor()
+ml_M = RandomForestClassifier(max_depth=2, n_estimators=10)
+ml_t = RandomForestRegressor(max_depth=2, n_estimators=10)
+ml_m = RandomForestRegressor(max_depth=2, n_estimators=10)
 dml_lplr = DoubleMLLPLR(dml_data, ml_M, ml_t, ml_m)
 dml_lplr_instrument = DoubleMLLPLR(dml_data, ml_M, ml_t, ml_m, score="instrument")
 

diff --git a/doubleml/plm/tests/test_lplr_tune.py b/doubleml/plm/tests/test_lplr_tune.py
@@ -44,7 +44,7 @@ def dml_lplr_fixture(
     learner_m,
     learner_a,
     score,
-    tune_on_folds=True,
+    tune_on_folds=False,
 ):
     par_grid = {
         "ml_M": get_par_grid(),
@@ -94,28 +94,3 @@ def test_dml_selection_coef(dml_lplr_fixture):
     se = dml_lplr_fixture["se"]
     true_coef = dml_lplr_fixture["true_coef"]
     assert abs(coef - true_coef) <= 3.0 * np.sqrt(se)
-
-
-@pytest.mark.ci
-def test_lplr_exception_tuning(
-    learner_M,
-    learner_t,
-    learner_m,
-    learner_a,
-):
-    # LPLR valid scores are 'nuisance_space' and 'instrument'
-    obj_dml_data = make_lplr_LZZ2020(alpha=0.5)
-    ml_M = clone(learner_M)
-    ml_t = clone(learner_t)
-    ml_m = clone(learner_m)
-
-    dml_lplr_obj = dml.DoubleMLLPLR(obj_dml_data, ml_M, ml_t, ml_m)
-    par_grid = {
-        "ml_M": get_par_grid(),
-        "ml_t": get_par_grid(),
-        "ml_m": get_par_grid(),
-        "ml_a": get_par_grid(),
-    }
-    msg = "tune_on_folds must be True as targets have to be created for ml_t on folds."
-    with pytest.raises(ValueError, match=msg):
-        dml_lplr_obj.tune(par_grid, tune_on_folds=False)
diff --git a/doubleml/plm/tests/test_model_defaults.py b/doubleml/plm/tests/test_model_defaults.py
@@ -0,0 +1,51 @@
+import pytest
+from sklearn.linear_model import LinearRegression, LogisticRegression
+
+from doubleml import DoubleMLLPLR
+from doubleml.plm.datasets import make_lplr_LZZ2020
+from doubleml.utils._check_defaults import _check_basic_defaults_after_fit, _check_basic_defaults_before_fit, _fit_bootstrap
+
+dml_data_lplr = make_lplr_LZZ2020(n_obs=100)
+
+dml_lplr_obj = DoubleMLLPLR(dml_data_lplr, LogisticRegression(), LinearRegression(), LinearRegression())
+
+
+@pytest.mark.ci
+def test_lplr_defaults():
+    _check_basic_defaults_before_fit(dml_lplr_obj)
+    # NOTE(review): presumably fits the module-level object in place and bootstraps it - confirm in _check_defaults
+    _fit_bootstrap(dml_lplr_obj)
+    # same object is inspected again, now after the helper above has run
+    _check_basic_defaults_after_fit(dml_lplr_obj)
+
+
+@pytest.mark.ci
+def test_lplr_str():
+    # Use a fresh, unfitted object: the module-level one may already have been fitted by another test
+    dml_obj = DoubleMLLPLR(dml_data_lplr, LogisticRegression(), LinearRegression(), LinearRegression())
+    dml_str = str(dml_obj)
+    # Check that all important sections are present
+    assert "================== DoubleMLLPLR Object ==================" in dml_str
+    assert "------------------ Data Summary      ------------------" in dml_str
+    assert "------------------ Score & Algorithm ------------------" in dml_str
+    assert "------------------ Machine Learner   ------------------" in dml_str
+    assert "------------------ Resampling        ------------------" in dml_str
+    assert "------------------ Fit Summary       ------------------" in dml_str
+
+    # Check specific content before fitting
+    assert "No. folds: 5" in dml_str
+    assert "No. repeated sample splits: 1" in dml_str
+    assert "Learner ml_M:" in dml_str
+    assert "Learner ml_m:" in dml_str
+    assert "Learner ml_t:" in dml_str
+
+    # Fit the model
+    dml_obj_fit = dml_obj.fit()
+    dml_str_after_fit = str(dml_obj_fit)
+
+    # Check that additional information is present after fitting
+    assert "coef" in dml_str_after_fit
+    assert "std err" in dml_str_after_fit
+    assert "t" in dml_str_after_fit  # NOTE(review): trivially true for any string containing "t"
+    assert "P>|t|" in dml_str_after_fit
+    assert "Out-of-sample Performance:" in dml_str_after_fit