Skip to content
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
55 commits
Select commit Hold shift + click to select a range
df03887
Logistic regression implementation WIP
jer2ig Jan 13, 2025
f5521f1
First WIP of implementation
jer2ig Jan 27, 2025
bfa756c
Working implementation. Started on test set-up.
jer2ig Feb 21, 2025
d729d0a
Changed data type of arrays
jer2ig Feb 27, 2025
8fe7ca6
Fix variable name
jer2ig Feb 27, 2025
18bac23
Moved into plm folder, started testing setup
jer2ig Aug 27, 2025
c6e600d
Fixed bug in score computation
jer2ig Aug 27, 2025
6f556e0
Reverted from ensure_all_finite to force_all_finite
jer2ig Aug 27, 2025
3a332bf
Fixes to instrument score
jer2ig Aug 28, 2025
b41a773
Added option for exception on convergence failure
jer2ig Sep 3, 2025
c434667
Added unbalanced dataset option, bug fixes
jer2ig Sep 29, 2025
443d82d
Added binary treatment dataset, fixed bug for model check
jer2ig Oct 7, 2025
774c74d
Adjusted dataset balancing
jer2ig Oct 7, 2025
9695820
Renamed Logistic to LPLR
jer2ig Oct 27, 2025
dbfea73
Clean-up of branch
jer2ig Oct 27, 2025
29114ce
Ruff checks and formatting
jer2ig Oct 27, 2025
5d2d1ed
Unit tests work and bug fix in lplr
jer2ig Oct 28, 2025
2c626a0
Cleanup
jer2ig Oct 28, 2025
9819436
Tests updated
jer2ig Nov 6, 2025
5a7e279
Pre-commit checks
jer2ig Nov 6, 2025
fc03cc6
Pre-commit checks on all files
jer2ig Nov 6, 2025
5dae651
Changed function signature, test
jer2ig Nov 7, 2025
13fca2f
Argument fix
jer2ig Nov 7, 2025
ff4c75b
Updated tests for improved coverage
jer2ig Nov 7, 2025
8a181cd
Unused var removed
jer2ig Nov 7, 2025
f2ecea7
Fixed resampling
jer2ig Nov 7, 2025
a9a2959
External predictions
jer2ig Nov 8, 2025
cd6055b
Bugfix and additional text
jer2ig Nov 8, 2025
4a8be08
Change to ext predictions
jer2ig Nov 10, 2025
0472f1c
Change to targets data type
jer2ig Nov 10, 2025
2fc1f53
DoubleResampling integrated into mixin, small changes
jer2ig Nov 10, 2025
ecfe2c7
Added attribute to sample mixin
jer2ig Nov 10, 2025
a9c0deb
Smpls inner access adjusted
jer2ig Nov 10, 2025
6abff49
Docstring, complexity reduction
jer2ig Nov 11, 2025
0f08e37
Weights updated, seed corrected
jer2ig Nov 11, 2025
430f4a6
Fix
jer2ig Nov 11, 2025
5b92395
Renaming
jer2ig Nov 11, 2025
042aa26
Doctest
jer2ig Nov 11, 2025
3b6f3b7
Test updated and comments implemented
jer2ig Nov 12, 2025
883aa77
Merge branch 'main' into jh-logistic-model
jer2ig Nov 12, 2025
74b1caa
Sample splitting exceptions
jer2ig Nov 12, 2025
46b575b
Merge remote-tracking branch 'origin/jh-logistic-model' into jh-logis…
jer2ig Nov 12, 2025
72be054
Test coverage increase
jer2ig Nov 12, 2025
5d9e0eb
Exception fixed
jer2ig Nov 13, 2025
99e78bf
PR Review
jer2ig Nov 14, 2025
8f7125f
Exceptions fixed
jer2ig Nov 14, 2025
03fd191
Test fixed
jer2ig Nov 14, 2025
33a86d0
Skip doctests for summary outputs
SvenKlaassen Nov 14, 2025
96f33ae
Enhance learner evaluation checks and handle NaN targets in DoubleML …
SvenKlaassen Nov 14, 2025
3d362aa
removed unnecessary test
SvenKlaassen Nov 14, 2025
a3f7f82
change to ensure_all_finite
SvenKlaassen Nov 14, 2025
f9e61a7
Comment added
jer2ig Nov 14, 2025
127e0cb
Merge remote-tracking branch 'origin/jh-logistic-model' into jh-logis…
jer2ig Nov 14, 2025
1f954db
update docstrings
SvenKlaassen Nov 18, 2025
6999467
formatting
SvenKlaassen Nov 18, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 5 additions & 12 deletions doubleml/double_ml.py
Original file line number Diff line number Diff line change
Expand Up @@ -258,13 +258,6 @@ def learner(self):
"""
return self._learner

@property
def predictions_names(self):
"""
The names of predictions for the nuisance functions.
"""
return list(self.params_names)

@property
def learner_names(self):
"""
Expand Down Expand Up @@ -1088,7 +1081,7 @@ def _check_fit(self, n_jobs_cv, store_predictions, external_predictions, store_m
_check_external_predictions(
external_predictions=external_predictions,
valid_treatments=self._dml_data.d_cols,
valid_learners=self.predictions_names,
valid_learners=self.params_names,
n_obs=self.n_obs,
n_rep=self.n_rep,
)
Expand All @@ -1111,7 +1104,7 @@ def _initalize_fit(self, store_predictions, store_models):
def _fit_nuisance_and_score_elements(self, n_jobs_cv, store_predictions, external_predictions, store_models):
ext_prediction_dict = _set_external_predictions(
external_predictions,
learners=self.predictions_names,
learners=self.params_names,
treatment=self._dml_data.d_cols[self._i_treat],
i_rep=self._i_rep,
)
Expand Down Expand Up @@ -1178,8 +1171,8 @@ def _initialize_arrays(self):
self._all_se = np.full((n_thetas, n_rep), np.nan)

def _initialize_predictions_and_targets(self):
self._predictions = {learner: np.full(self._score_dim, np.nan) for learner in self.predictions_names}
self._nuisance_targets = {learner: np.full(self._score_dim, np.nan) for learner in self.predictions_names}
self._predictions = {learner: np.full(self._score_dim, np.nan) for learner in self.params_names}
self._nuisance_targets = {learner: np.full(self._score_dim, np.nan) for learner in self.params_names}

def _initialize_nuisance_loss(self):
self._nuisance_loss = {learner: np.full((self.n_rep, self._dml_data.n_coefs), np.nan) for learner in self.params_names}
Expand All @@ -1190,7 +1183,7 @@ def _initialize_models(self):
}

def _store_predictions_and_targets(self, preds, targets):
for learner in self.predictions_names:
for learner in self.params_names:
self._predictions[learner][:, self._i_rep, self._i_treat] = preds[learner]
self._nuisance_targets[learner][:, self._i_rep, self._i_treat] = targets[learner]

Expand Down
114 changes: 26 additions & 88 deletions doubleml/plm/lplr.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
from doubleml.utils._estimation import (
_dml_cv_predict,
_dml_tune,
_double_dml_cv_predict,
)


Expand Down Expand Up @@ -104,10 +105,6 @@ def __init__(

ml_m_is_classifier = self._check_learner(ml_m, "ml_m", regressor=True, classifier=True)
self._learner = {"ml_m": ml_m, "ml_t": ml_t, "ml_M": ml_M}
# replace aggregated inner names with per-inner-fold names
inner_M_names = [f"ml_M_inner_{i}" for i in range(self.n_folds_inner)]
inner_a_names = [f"ml_a_inner_{i}" for i in range(self.n_folds_inner)]
self._predictions_names = ["ml_r", "ml_m", "ml_a", "ml_t", "ml_M"] + inner_M_names + inner_a_names

if ml_a is not None:
ml_a_is_classifier = self._check_learner(ml_a, "ml_a", regressor=True, classifier=True)
Expand Down Expand Up @@ -162,56 +159,15 @@ def __init__(
self._sensitivity_implemented = False

def _initialize_ml_nuisance_params(self):
self._params = {learner: {key: [None] * self.n_rep for key in self._dml_data.d_cols} for learner in self._learner}
inner_M_names = [f"ml_M_inner_{i}" for i in range(self.n_folds)]
inner_a_names = [f"ml_a_inner_{i}" for i in range(self.n_folds)]
params_names = ["ml_m", "ml_a", "ml_t", "ml_M"] + inner_M_names + inner_a_names
self._params = {learner: {key: [None] * self.n_rep for key in self._dml_data.d_cols} for learner in params_names}

def _check_data(self, obj_dml_data):
if not np.array_equal(np.unique(obj_dml_data.y), [0, 1]):
raise TypeError("The outcome variable y must be binary with values 0 and 1.")

def _double_dml_cv_predict(
self,
estimator,
estimator_name,
x,
y,
smpls=None,
smpls_inner=None,
n_jobs=None,
est_params=None,
method="predict",
sample_weights=None,
):
res = {}
res["preds"] = np.zeros(y.shape, dtype=float)
res["preds_inner"] = []
res["targets_inner"] = []
res["models"] = []
for smpls_single_split, smpls_double_split in zip(smpls, smpls_inner):
res_inner = _dml_cv_predict(
estimator,
x,
y,
smpls=smpls_double_split,
n_jobs=n_jobs,
est_params=est_params,
method=method,
return_models=True,
sample_weights=sample_weights,
)
_check_finite_predictions(res_inner["preds"], estimator, estimator_name, smpls_double_split)

res["preds_inner"].append(res_inner["preds"])
res["targets_inner"].append(res_inner["targets"])
for model in res_inner["models"]:
res["models"].append(model)
if method == "predict_proba":
res["preds"][smpls_single_split[1]] += model.predict_proba(x[smpls_single_split[1]])[:, 1]
else:
res["preds"][smpls_single_split[1]] += model.predict(x[smpls_single_split[1]])
res["preds"] /= len(smpls)
res["targets"] = np.copy(y)
return res

def _nuisance_est(self, smpls, n_jobs_cv, external_predictions, return_models=False):
x, y = check_X_y(self._dml_data.x, self._dml_data.y, force_all_finite=False)
x, d = check_X_y(x, self._dml_data.d, force_all_finite=False)
Expand All @@ -234,9 +190,14 @@ def _nuisance_est(self, smpls, n_jobs_cv, external_predictions, return_models=Fa
f"have to be provided (missing: {', '.join([str(i) for i in missing])})."
)
M_hat_inner = [external_predictions[f"ml_M_inner_{i}"] for i in range(self.n_folds_inner)]
M_hat = {"preds": external_predictions["ml_M"], "preds_inner": M_hat_inner, "targets": None, "models": None}
M_hat = {
"preds": external_predictions["ml_M"],
"preds_inner": M_hat_inner,
"targets": self._dml_data.y,
"models": None,
}
else:
M_hat = self._double_dml_cv_predict(
M_hat = _double_dml_cv_predict(
self._learner["ml_M"],
"ml_M",
x_d_concat,
Expand All @@ -250,7 +211,7 @@ def _nuisance_est(self, smpls, n_jobs_cv, external_predictions, return_models=Fa

# nuisance m
if m_external:
m_hat = {"preds": external_predictions["ml_m"], "targets": None, "models": None}
m_hat = {"preds": external_predictions["ml_m"], "targets": self._dml_data.d, "models": None}
else:
if self.score == "instrument":
weights = M_hat["preds"] * (1 - M_hat["preds"])
Expand Down Expand Up @@ -303,9 +264,14 @@ def _nuisance_est(self, smpls, n_jobs_cv, external_predictions, return_models=Fa
f"have to be provided (missing: {', '.join([str(i) for i in missing])})."
)
a_hat_inner = [external_predictions[f"ml_a_inner_{i}"] for i in range(self.n_folds_inner)]
a_hat = {"preds": external_predictions["ml_a"], "preds_inner": a_hat_inner, "targets": None, "models": None}
a_hat = {
"preds": external_predictions["ml_a"],
"preds_inner": a_hat_inner,
"targets": self._dml_data.d,
"models": None,
}
else:
a_hat = self._double_dml_cv_predict(
a_hat = _double_dml_cv_predict(
self._learner["ml_a"],
"ml_a",
x,
Expand Down Expand Up @@ -404,13 +370,6 @@ def _nuisance_est(self, smpls, n_jobs_cv, external_predictions, return_models=Fa

return psi_elements, preds

@property
def predictions_names(self):
"""
The names of predictions for the nuisance functions.
"""
return self._predictions_names

def _score_elements(self, y, d, r_hat, m_hat):
# compute residual
d_tilde = d - m_hat
Expand Down Expand Up @@ -438,8 +397,6 @@ def _sensitivity_element_est(self, preds):
def _nuisance_tuning(
self, smpls, param_grids, scoring_methods, n_folds_tune, n_jobs_cv, search_mode, n_iter_randomized_search
):
if self._i_rep is None:
raise ValueError("tune_on_folds must be True as targets have to be created for ml_t on folds.")
x, y = check_X_y(self._dml_data.x, self._dml_data.y, force_all_finite=False)
x, d = check_X_y(x, self._dml_data.d, force_all_finite=False)
x_d_concat = np.hstack((d.reshape(-1, 1), x))
Expand Down Expand Up @@ -500,34 +457,16 @@ def _nuisance_tuning(
a_best_params = [xx.best_params_ for xx in a_tune_res]

# Create targets for tuning ml_t
M_hat = self._double_dml_cv_predict(
self._learner["ml_M"],
"ml_M",
x_d_concat,
y,
smpls=smpls,
smpls_inner=self._DoubleML__smpls__inner,
n_jobs=n_jobs_cv,
est_params=M_best_params,
method=self._predict_method["ml_M"],
)

W_inner = []
for i, (train, _) in enumerate(smpls):
M_iteration = M_hat["preds_inner"][i][train]
M_iteration = np.clip(M_iteration, 1e-8, 1 - 1e-8)
w = scipy.special.logit(M_iteration)
W_inner.append(w)
M_hat = np.full_like(y, np.nan)
for idx, (train_index, _) in enumerate(smpls):
M_hat[train_index] = M_tune_res[idx].predict_proba(x_d_concat[train_index, :])[:, 1]

# Reshape W_inner into full-length arrays per fold: fill train indices, others are NaN
W_targets = []
for i, train in enumerate(train_inds):
wt = np.full(x.shape[0], np.nan, dtype=float)
wt[train] = W_inner[i]
W_targets.append(wt)
M_hat = np.clip(M_hat, 1e-8, 1 - 1e-8)
W_hat = scipy.special.logit(M_hat)

t_tune_res = _dml_tune(
W_inner,
W_hat,
x,
train_inds,
self._learner["ml_t"],
Expand All @@ -537,7 +476,6 @@ def _nuisance_tuning(
n_jobs_cv,
search_mode,
n_iter_randomized_search,
fold_specific_target=True,
)
t_best_params = [xx.best_params_ for xx in t_tune_res]

Expand Down
9 changes: 4 additions & 5 deletions doubleml/plm/tests/test_lplr.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,22 +7,22 @@
from doubleml.plm.datasets import make_lplr_LZZ2020


@pytest.fixture(scope="module", params=[RandomForestClassifier(random_state=42)])
@pytest.fixture(scope="module", params=[RandomForestClassifier(random_state=42, max_depth=2, n_estimators=10)])
def learner_M(request):
return request.param


@pytest.fixture(scope="module", params=[RandomForestRegressor(random_state=42)])
@pytest.fixture(scope="module", params=[RandomForestRegressor(random_state=42, max_depth=2, n_estimators=10)])
def learner_t(request):
return request.param


@pytest.fixture(scope="module", params=[RandomForestRegressor(random_state=42)])
@pytest.fixture(scope="module", params=[RandomForestRegressor(random_state=42, max_depth=2, n_estimators=10)])
def learner_m(request):
return request.param


@pytest.fixture(scope="module", params=[RandomForestClassifier(random_state=42)])
@pytest.fixture(scope="module", params=[RandomForestClassifier(random_state=42, max_depth=2, n_estimators=10)])
def learner_m_classifier(request):
return request.param

Expand All @@ -33,7 +33,6 @@ def score(request):


@pytest.fixture(scope="module", params=["continuous", "binary", "binary_unbalanced"])
# TODO: Error for continuous treatment?
def treatment(request):
return request.param

Expand Down
6 changes: 3 additions & 3 deletions doubleml/plm/tests/test_lplr_exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,9 @@
# create test data and basic learners
dml_data = make_lplr_LZZ2020(alpha=0.5, n_obs=n, dim_x=20)
dml_data_binary = make_lplr_LZZ2020(alpha=0.5, n_obs=n, treatment="binary", dim_x=20)
ml_M = RandomForestClassifier()
ml_t = RandomForestRegressor()
ml_m = RandomForestRegressor()
ml_M = RandomForestClassifier(max_depth=2, n_estimators=10)
ml_t = RandomForestRegressor(max_depth=2, n_estimators=10)
ml_m = RandomForestRegressor(max_depth=2, n_estimators=10)
dml_lplr = DoubleMLLPLR(dml_data, ml_M, ml_t, ml_m)
dml_lplr_instrument = DoubleMLLPLR(dml_data, ml_M, ml_t, ml_m, score="instrument")

Expand Down
27 changes: 1 addition & 26 deletions doubleml/plm/tests/test_lplr_tune.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ def dml_lplr_fixture(
learner_m,
learner_a,
score,
tune_on_folds=True,
tune_on_folds=False,
):
par_grid = {
"ml_M": get_par_grid(),
Expand Down Expand Up @@ -94,28 +94,3 @@ def test_dml_selection_coef(dml_lplr_fixture):
se = dml_lplr_fixture["se"]
true_coef = dml_lplr_fixture["true_coef"]
assert abs(coef - true_coef) <= 3.0 * np.sqrt(se)


@pytest.mark.ci
def test_lplr_exception_tuning(
learner_M,
learner_t,
learner_m,
learner_a,
):
# LPLR valid scores are 'nuisance_space' and 'instrument'
obj_dml_data = make_lplr_LZZ2020(alpha=0.5)
ml_M = clone(learner_M)
ml_t = clone(learner_t)
ml_m = clone(learner_m)

dml_lplr_obj = dml.DoubleMLLPLR(obj_dml_data, ml_M, ml_t, ml_m)
par_grid = {
"ml_M": get_par_grid(),
"ml_t": get_par_grid(),
"ml_m": get_par_grid(),
"ml_a": get_par_grid(),
}
msg = "tune_on_folds must be True as targets have to be created for ml_t on folds."
with pytest.raises(ValueError, match=msg):
dml_lplr_obj.tune(par_grid, tune_on_folds=False)
51 changes: 51 additions & 0 deletions doubleml/plm/tests/test_model_defaults.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
import pytest
from sklearn.linear_model import LinearRegression, LogisticRegression

from doubleml import DoubleMLLPLR
from doubleml.plm.datasets import make_lplr_LZZ2020
from doubleml.utils._check_defaults import _check_basic_defaults_after_fit, _check_basic_defaults_before_fit, _fit_bootstrap

# Module-level objects built once at import time and reused by tests in this module.
# Data set returned by make_lplr_LZZ2020 for 100 observations.
dml_data_lplr = make_lplr_LZZ2020(n_obs=100)

# LPLR model with a logistic-regression learner followed by two linear-regression learners
# (presumably ml_M, ml_t, ml_m in positional order -- confirm against the DoubleMLLPLR signature).
dml_lplr_obj = DoubleMLLPLR(dml_data_lplr, LogisticRegression(), LinearRegression(), LinearRegression())


@pytest.mark.ci
def test_lplr_defaults():
    """Apply the basic default checks to the module-level LPLR model.

    The three helpers are called on ``dml_lplr_obj`` one after another:
    ``_check_basic_defaults_before_fit``, ``_fit_bootstrap`` and
    ``_check_basic_defaults_after_fit``.
    """
    ordered_steps = (
        _check_basic_defaults_before_fit,
        _fit_bootstrap,
        _check_basic_defaults_after_fit,
    )
    # The sequence is significant: the "before" helper has to run ahead of
    # ``_fit_bootstrap`` and the "after" helper behind it.
    for step in ordered_steps:
        step(dml_lplr_obj)


@pytest.mark.ci
def test_did_multi_str():
    """Check the string representation of a DoubleMLLPLR object before and after fitting.

    A fresh model is built inside the test instead of reusing the module-level
    ``dml_lplr_obj``: that shared object is also handed to ``_fit_bootstrap``
    in ``test_lplr_defaults``, so relying on it would make the "before fitting"
    part of this test depend on the order in which the tests are executed.

    NOTE(review): the function name looks carried over from the DiD tests;
    consider renaming it (e.g. ``test_lplr_str``) in a follow-up.
    """
    dml_obj = DoubleMLLPLR(dml_data_lplr, LogisticRegression(), LinearRegression(), LinearRegression())

    # Test the string representation before fitting.
    # Runs of whitespace are collapsed to single spaces so the checks below do
    # not depend on alignment padding in the printed output.
    dml_str = " ".join(str(dml_obj).split())

    # Check that all important sections are present
    expected_sections = (
        "================== DoubleMLLPLR Object ==================",
        "------------------ Data Summary ------------------",
        "------------------ Score & Algorithm ------------------",
        "------------------ Machine Learner ------------------",
        "------------------ Resampling ------------------",
        "------------------ Fit Summary ------------------",
    )
    for section in expected_sections:
        assert section in dml_str

    # Check specific content before fitting
    expected_content = (
        "No. folds: 5",
        "No. repeated sample splits: 1",
        "Learner ml_M:",
        "Learner ml_m:",
        "Learner ml_t:",
    )
    for content in expected_content:
        assert content in dml_str

    # Fit the model
    dml_obj_fit = dml_obj.fit()
    tokens_after_fit = str(dml_obj_fit).split()
    dml_str_after_fit = " ".join(tokens_after_fit)

    # Check that additional information is present after fitting
    assert "coef" in dml_str_after_fit
    assert "std err" in dml_str_after_fit
    # Require "t" as a standalone token: a plain substring test would be
    # satisfied by any word that merely contains the letter "t".
    assert "t" in tokens_after_fit
    assert "P>|t|" in dml_str_after_fit
    assert "Out-of-sample Performance:" in dml_str_after_fit
Loading
Loading