Skip to content

Commit cd4f5a6

Browse files
authored
Merge pull request #260 from igerber/survey-maturity-2
Add bootstrap lonely_psu adjust, CV on estimates, weight trimming, and pretrends+survey
2 parents 55a710b + 4fbb82d commit cd4f5a6

18 files changed

Lines changed: 1314 additions & 162 deletions

TODO.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,7 @@ Deferred items from PR reviews that were not addressed before merge.
5454
|-------|----------|----|----------|
5555
| CallawaySantAnna: consider materializing NaN entries for non-estimable (g,t) cells in group_time_effects dict (currently omitted with consolidated warning); would require updating downstream consumers (event study, balance_e, aggregation) | `staggered.py` | #256 | Low |
5656
| ImputationDiD dense `(A0'A0).toarray()` scales O((U+T+K)^2), OOM risk on large panels | `imputation.py` | #141 | Medium (deferred — only triggers when sparse solver fails) |
57+
| ImputationDiD survey pretrends: subpopulation approach implemented (full design with zero-padded scores). Resolved in #260. | `imputation.py` | #260 | Resolved |
5758
| Multi-absorb weighted demeaning needs iterative alternating projections for N > 1 absorbed FE with survey weights; unweighted multi-absorb also uses single-pass (pre-existing, exact only for balanced panels) | `estimators.py` | #218 | Medium |
5859
| Replicate-weight survey df — **Resolved**. `df_survey = rank(replicate_weights) - 1` matching R's `survey::degf()`. For IF paths, `n_valid - 1` when dropped replicates reduce effective count. | `survey.py` | #238 | Resolved |
5960
| CallawaySantAnna survey: strata/PSU/FPC — **Resolved**. Aggregated SEs (overall, event study, group) use `compute_survey_if_variance()`. Bootstrap uses PSU-level multiplier weights. | `staggered.py` | #237 | Resolved |

diff_diff/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -94,6 +94,7 @@
9494
make_treatment_indicator,
9595
rank_control_units,
9696
summarize_did_data,
97+
trim_weights,
9798
validate_did_data,
9899
wide_to_long,
99100
)
@@ -307,6 +308,7 @@
307308
"make_post_indicator",
308309
"wide_to_long",
309310
"balance_panel",
311+
"trim_weights",
310312
"validate_did_data",
311313
"summarize_did_data",
312314
"generate_did_data",

diff_diff/bootstrap_utils.py

Lines changed: 84 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -433,6 +433,10 @@ def generate_survey_multiplier_weights_batch(
433433
is present, weights are scaled by ``sqrt(1 - f_h)`` per stratum so
434434
the bootstrap variance matches the TSL variance.
435435
436+
For ``lonely_psu="adjust"``, singleton PSUs from different strata are
437+
pooled into a combined pseudo-stratum and weights are generated for
438+
the pooled group (no FPC scaling on pooled singletons).
439+
436440
Parameters
437441
----------
438442
n_bootstrap : int
@@ -454,11 +458,7 @@ def generate_survey_multiplier_weights_batch(
454458
psu = resolved_survey.psu
455459
strata = resolved_survey.strata
456460

457-
if resolved_survey.lonely_psu == "adjust":
458-
raise NotImplementedError(
459-
"lonely_psu='adjust' is not yet supported for survey-aware bootstrap. "
460-
"Use lonely_psu='remove' or 'certainty', or use analytical inference."
461-
)
461+
_lonely_psu = resolved_survey.lonely_psu
462462

463463
if psu is None:
464464
# Each observation is its own PSU
@@ -499,6 +499,7 @@ def generate_survey_multiplier_weights_batch(
499499
psu_to_col = {int(p): i for i, p in enumerate(psu_ids)}
500500

501501
unique_strata = np.unique(strata)
502+
_singleton_cols = [] # For lonely_psu="adjust" pooling
502503
for h in unique_strata:
503504
mask_h = strata == h
504505

@@ -511,8 +512,12 @@ def generate_survey_multiplier_weights_batch(
511512
cols = np.array([psu_to_col[int(p)] for p in psus_in_h])
512513

513514
if n_h < 2:
514-
# Lonely PSU — zero weight (matches remove/certainty behavior)
515-
weights[:, cols] = 0.0
515+
if _lonely_psu == "adjust":
516+
# Collect for pooled pseudo-stratum processing
517+
_singleton_cols.extend(cols.tolist())
518+
else:
519+
# remove / certainty — zero weight
520+
weights[:, cols] = 0.0
516521
continue
517522

518523
# Generate weights for this stratum
@@ -536,6 +541,31 @@ def generate_survey_multiplier_weights_batch(
536541

537542
weights[:, cols] = stratum_weights
538543

544+
# Pool singleton PSUs into a pseudo-stratum for "adjust"
545+
if _singleton_cols:
546+
n_pooled = len(_singleton_cols)
547+
if n_pooled >= 2:
548+
pooled_weights = generate_bootstrap_weights_batch_numpy(
549+
n_bootstrap, n_pooled, weight_type, rng
550+
)
551+
# No FPC scaling for pooled singletons (conservative)
552+
pooled_cols = np.array(_singleton_cols)
553+
weights[:, pooled_cols] = pooled_weights
554+
else:
555+
# Single singleton — cannot pool, zero weight (library-specific
556+
# fallback; bootstrap adjust with one singleton = remove).
557+
import warnings
558+
559+
warnings.warn(
560+
"lonely_psu='adjust' with only 1 singleton stratum in "
561+
"bootstrap: singleton PSU contributes zero variance "
562+
"(same as 'remove'). At least 2 singleton strata are "
563+
"needed for pooled pseudo-stratum bootstrap.",
564+
UserWarning,
565+
stacklevel=3,
566+
)
567+
weights[:, _singleton_cols[0]] = 0.0
568+
539569
return weights, psu_ids
540570

541571

@@ -553,6 +583,9 @@ def generate_rao_wu_weights(
553583
With FPC: ``m_h = max(1, round((1 - f_h) * (n_h - 1)))``
554584
(Rao, Wu & Yue 1992, Section 3).
555585
586+
For ``lonely_psu="adjust"``, singleton PSUs are pooled into a combined
587+
pseudo-stratum and resampled together (no FPC scaling on pooled group).
588+
556589
Parameters
557590
----------
558591
resolved_survey : ResolvedSurveyDesign
@@ -570,11 +603,7 @@ def generate_rao_wu_weights(
570603
psu = resolved_survey.psu
571604
strata = resolved_survey.strata
572605

573-
if resolved_survey.lonely_psu == "adjust":
574-
raise NotImplementedError(
575-
"lonely_psu='adjust' is not yet supported for survey-aware bootstrap. "
576-
"Use lonely_psu='remove' or 'certainty', or use analytical inference."
577-
)
606+
_lonely_psu_rw = resolved_survey.lonely_psu
578607

579608
rescaled = np.zeros(n_obs, dtype=np.float64)
580609

@@ -589,14 +618,20 @@ def generate_rao_wu_weights(
589618
unique_strata = np.unique(strata)
590619
strata_masks = [strata == h for h in unique_strata]
591620

621+
# Collect singleton PSUs for "adjust" pooling
622+
_singleton_info = [] # list of (mask_h, unique_psu_h) tuples
623+
592624
for mask_h in strata_masks:
593625
psu_h = obs_psu[mask_h]
594626
unique_psu_h = np.unique(psu_h)
595627
n_h = len(unique_psu_h)
596628

597629
if n_h < 2:
598-
# Census / lonely PSU — keep original weights (zero variance)
599-
rescaled[mask_h] = base_weights[mask_h]
630+
if _lonely_psu_rw == "adjust":
631+
_singleton_info.append((mask_h, unique_psu_h))
632+
else:
633+
# remove / certainty — keep original weights (zero variance)
634+
rescaled[mask_h] = base_weights[mask_h]
600635
continue
601636

602637
# Compute resample size
@@ -629,6 +664,41 @@ def generate_rao_wu_weights(
629664
local_indices = np.array([psu_to_local[int(obs_psu[idx])] for idx in obs_in_h])
630665
rescaled[obs_in_h] = base_weights[obs_in_h] * scale_per_psu[local_indices]
631666

667+
# Pool singleton PSUs into a pseudo-stratum for "adjust"
668+
if _singleton_info:
669+
# Combine all singleton PSUs into one group
670+
pooled_psus = np.concatenate([p for _, p in _singleton_info])
671+
n_pooled = len(pooled_psus)
672+
673+
if n_pooled >= 2:
674+
m_pooled = n_pooled - 1 # No FPC for pooled singletons
675+
drawn = rng.choice(n_pooled, size=m_pooled, replace=True)
676+
counts = np.bincount(drawn, minlength=n_pooled)
677+
scale_per_psu = (n_pooled / m_pooled) * counts.astype(np.float64)
678+
679+
# Build PSU → scale mapping and apply
680+
psu_scale_map = {int(pooled_psus[i]): scale_per_psu[i] for i in range(n_pooled)}
681+
for mask_h, _ in _singleton_info:
682+
obs_in_h = np.where(mask_h)[0]
683+
for idx in obs_in_h:
684+
p = int(obs_psu[idx])
685+
rescaled[idx] = base_weights[idx] * psu_scale_map.get(p, 1.0)
686+
else:
687+
# Single singleton — cannot pool, keep base weights (library-specific
688+
# fallback; bootstrap adjust with one singleton = remove).
689+
import warnings
690+
691+
warnings.warn(
692+
"lonely_psu='adjust' with only 1 singleton stratum in "
693+
"bootstrap: singleton PSU contributes zero variance "
694+
"(same as 'remove'). At least 2 singleton strata are "
695+
"needed for pooled pseudo-stratum bootstrap.",
696+
UserWarning,
697+
stacklevel=2,
698+
)
699+
for mask_h, _ in _singleton_info:
700+
rescaled[mask_h] = base_weights[mask_h]
701+
632702
return rescaled
633703

634704

diff_diff/continuous_did_results.py

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -154,6 +154,15 @@ def __repr__(self) -> str:
154154
f"n_periods={len(self.time_periods)})"
155155
)
156156

157+
@property
158+
def coef_var(self) -> float:
159+
"""Coefficient of variation: SE / |overall ATT|. NaN when ATT is 0 or SE non-finite."""
160+
if not (np.isfinite(self.overall_att_se) and self.overall_att_se >= 0):
161+
return np.nan
162+
if not np.isfinite(self.overall_att) or self.overall_att == 0:
163+
return np.nan
164+
return self.overall_att_se / abs(self.overall_att)
165+
157166
def summary(self, alpha: Optional[float] = None) -> str:
158167
"""Generate formatted summary."""
159168
alpha = alpha or self.alpha
@@ -223,10 +232,15 @@ def summary(self, alpha: Optional[float] = None) -> str:
223232
f"[{self.overall_att_conf_int[0]:.4f}, {self.overall_att_conf_int[1]:.4f}]",
224233
f"{conf_level}% CI for ACRT_glob: "
225234
f"[{self.overall_acrt_conf_int[0]:.4f}, {self.overall_acrt_conf_int[1]:.4f}]",
226-
"",
227235
]
228236
)
229237

238+
cv = self.coef_var
239+
if np.isfinite(cv):
240+
lines.append(f"{'CV (SE/|ATT|):':<25} {cv:>10.4f}")
241+
242+
lines.append("")
243+
230244
# Dose-response curve summary (first/mid/last points)
231245
if len(self.dose_grid) > 0:
232246
lines.extend(

diff_diff/efficient_did_results.py

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -172,6 +172,15 @@ def __repr__(self) -> str:
172172
f"n_periods={len(self.time_periods)})"
173173
)
174174

175+
@property
176+
def coef_var(self) -> float:
177+
"""Coefficient of variation: SE / |overall ATT|. NaN when ATT is 0 or SE non-finite."""
178+
if not (np.isfinite(self.overall_se) and self.overall_se >= 0):
179+
return np.nan
180+
if not np.isfinite(self.overall_att) or self.overall_att == 0:
181+
return np.nan
182+
return self.overall_se / abs(self.overall_att)
183+
175184
def summary(self, alpha: Optional[float] = None) -> str:
176185
"""Generate formatted summary of estimation results."""
177186
alpha = alpha or self.alpha
@@ -219,10 +228,15 @@ def summary(self, alpha: Optional[float] = None) -> str:
219228
"",
220229
f"{conf_level}% Confidence Interval: "
221230
f"[{self.overall_conf_int[0]:.4f}, {self.overall_conf_int[1]:.4f}]",
222-
"",
223231
]
224232
)
225233

234+
cv = self.coef_var
235+
if np.isfinite(cv):
236+
lines.append(f"{'CV (SE/|ATT|):':<25} {cv:>10.4f}")
237+
238+
lines.append("")
239+
226240
# Event study effects
227241
if self.event_study_effects:
228242
lines.extend(

0 commit comments

Comments
 (0)