Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ hf = [
"huggingface_hub>=0.24",
]
policyengine = [
"microimpute==3.1.0; python_full_version >= '3.12' and python_full_version < '3.15'",
"microimpute==3.1.1; python_full_version >= '3.12' and python_full_version < '3.15'",
"policyengine-us==1.715.2; python_version >= '3.11' and python_version < '3.15'",
"spm-calculator>=0.3.1",
# Standalone tax-unit construction engine (the extraction of eCPS's
Expand Down
26 changes: 25 additions & 1 deletion src/microplex_us/pipelines/donor_imputers.py
Original file line number Diff line number Diff line change
Expand Up @@ -178,19 +178,43 @@ def __init__(
condition_vars: list[str],
target_vars: list[str],
n_estimators: int = 100,
max_train_samples: int | None = 50_000,
classifier_type: str = "hist_gb",
seed: int = 42,
) -> None:
self.condition_vars = list(condition_vars)
self.target_vars = list(target_vars)
self.n_estimators = int(n_estimators)
if max_train_samples is not None and int(max_train_samples) < 1:
raise ValueError("max_train_samples must be a positive integer")
self.max_train_samples = (
None if max_train_samples is None else int(max_train_samples)
)
self.classifier_type = str(classifier_type)
self.seed = int(seed)
self._fitted: dict[str, Any] = {}
self._fitted_columns: tuple[str, ...] = ()
self._predictor_columns: tuple[str, ...] = ()
self._regimes: dict[str, str] = {}

def _configured_qrf_class(self, qrf_class: type[Any]) -> type[Any]:
    """Return a subclass of ``qrf_class`` pre-loaded with this imputer's settings.

    The subclass only fills in keyword arguments the caller left out, so
    explicitly passed values always win:

    * at construction, ``max_train_samples`` defaults to this imputer's
      value, unless that value is ``None``, in which case nothing is added;
    * at ``fit`` time, ``n_estimators`` defaults to this imputer's value and
      ``n_jobs`` defaults to ``-1``.

    Args:
        qrf_class: Base class to derive from.

    Returns:
        A new class deriving from ``qrf_class`` whose ``__name__`` and
        ``__qualname__`` are both ``"ConfiguredRegimeAwareQRF"``.
    """
    # Copy the settings into locals so the generated class closes over plain
    # values rather than over ``self``.
    n_estimators = self.n_estimators
    max_train_samples = self.max_train_samples

    class ConfiguredQRF(qrf_class):
        def __init__(self, *args: Any, **kwargs: Any) -> None:
            if max_train_samples is not None:
                kwargs.setdefault("max_train_samples", max_train_samples)
            super().__init__(*args, **kwargs)

        def fit(self, *args: Any, **kwargs: Any) -> Any:
            kwargs.setdefault("n_estimators", n_estimators)
            kwargs.setdefault("n_jobs", -1)
            return super().fit(*args, **kwargs)

    # Rename both attributes: repr() is built from __qualname__, which would
    # otherwise keep the "<locals>.ConfiguredQRF" path of this factory.
    ConfiguredQRF.__name__ = "ConfiguredRegimeAwareQRF"
    ConfiguredQRF.__qualname__ = "ConfiguredRegimeAwareQRF"
    return ConfiguredQRF

def fit(
self,
data: pd.DataFrame,
Expand Down Expand Up @@ -234,7 +258,7 @@ def fit(
return self

wrapper = ZeroInflatedImputer(
base_imputer_class=QRF,
base_imputer_class=self._configured_qrf_class(QRF),
base_imputer_kwargs={},
classifier_type=self.classifier_type,
sequential=True,
Expand Down
2 changes: 2 additions & 0 deletions src/microplex_us/pipelines/us.py
Original file line number Diff line number Diff line change
Expand Up @@ -2186,6 +2186,7 @@ class USMicroplexBuildConfig:
donor_imputer_hidden_dim: int = 32
donor_imputer_backend: Literal["maf", "qrf", "zi_qrf", "regime_aware"] = "maf"
donor_imputer_qrf_n_estimators: int = 100
donor_imputer_qrf_max_train_samples: int | None = 50_000
donor_imputer_qrf_zero_threshold: float = 0.05
donor_imputer_condition_selection: Literal[
"all_shared",
Expand Down Expand Up @@ -6012,6 +6013,7 @@ def _build_donor_imputer(
condition_vars=condition_vars,
target_vars=list(target_vars),
n_estimators=self.config.donor_imputer_qrf_n_estimators,
max_train_samples=self.config.donor_imputer_qrf_max_train_samples,
seed=self.config.random_seed,
)
zero_inflated_vars = (
Expand Down
35 changes: 35 additions & 0 deletions tests/pipelines/test_regime_aware_donor_imputer.py
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,41 @@ def test_factory_dispatches_to_regime_aware(self) -> None:
class TestRegimeAwareFitGenerate:
"""Fit/generate contract and tripartite-specific guarantees."""

def test_qrf_budget_reaches_microimpute_base(self, monkeypatch) -> None:
    """Check that the configured QRF budget arrives at the patched QRF class.

    ``microimpute.models.qrf.QRF`` is replaced with a recording stand-in, the
    imputer is fitted on a small two-column frame, and the keyword arguments
    the stand-in saw at construction and at ``fit`` are compared with the
    values given to the imputer.
    """
    from microplex_us.pipelines.us import RegimeAwareDonorImputer

    captured: dict[str, object] = {}

    class FakeQRF:
        # Records every call instead of training anything.
        def __init__(self, *init_args, **init_kwargs):
            captured.update(init_args=init_args, init_kwargs=init_kwargs)

        def fit(self, *fit_args, **fit_kwargs):
            captured.update(fit_args=fit_args, fit_kwargs=fit_kwargs)
            return self

    monkeypatch.setattr("microimpute.models.qrf.QRF", FakeQRF)

    ages = [25.0, 35.0, 45.0, 55.0] * 10
    incomes = [100.0, 200.0, 300.0, 400.0] * 10
    train = pd.DataFrame({"age": ages, "income_leaf": incomes})

    RegimeAwareDonorImputer(
        condition_vars=["age"],
        target_vars=["income_leaf"],
        n_estimators=7,
        max_train_samples=17,
    ).fit(train)

    seen_init = captured["init_kwargs"]
    seen_fit = captured["fit_kwargs"]
    assert seen_init["max_train_samples"] == 17
    assert seen_fit["n_estimators"] == 7
    assert seen_fit["n_jobs"] == -1

def test_multi_target_fit_uses_one_chained_zero_inflated_imputer(self) -> None:
from microplex_us.pipelines.us import RegimeAwareDonorImputer

Expand Down
4 changes: 4 additions & 0 deletions tests/pipelines/test_us.py
Original file line number Diff line number Diff line change
Expand Up @@ -3600,6 +3600,8 @@ def __init__(self, **kwargs):
USMicroplexBuildConfig(
n_synthetic=4,
donor_imputer_backend="regime_aware",
donor_imputer_qrf_n_estimators=77,
donor_imputer_qrf_max_train_samples=1234,
)
)
regime_pipeline._build_donor_imputer(
Expand All @@ -3619,6 +3621,8 @@ def __init__(self, **kwargs):
)

assert "nonnegative_vars" not in captured["regime_aware"]
assert captured["regime_aware"]["n_estimators"] == 77
assert captured["regime_aware"]["max_train_samples"] == 1234
assert captured["zi_qrf"]["nonnegative_vars"] == set()
assert captured["zi_qrf"]["zero_inflated_vars"] == {
"partnership_s_corp_income",
Expand Down
8 changes: 4 additions & 4 deletions uv.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading