From 0788f6f152c0f54562d0aba154e7e9a590320fc9 Mon Sep 17 00:00:00 2001
From: igerber <isaac.gerber@gmail.com>
Date: Mon, 22 Jun 2026 16:01:09 -0400
Subject: [PATCH] fix: clamp negative vcov diagonal in LinearRegression SE (no
 NaN SE)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

A high-leverage / degenerate coefficient (e.g. an absorbed-FE dummy
near-collinear with the treatment, whose Bell-McCaffrey Satterthwaite DOF
already hits the noise-floor guard) can have a CR2/HC variance of ~0
(≈1e-32) whose vcov diagonal lands just-below-zero under BLAS-dependent
float rounding. `np.sqrt` of the negative then produced a NaN SE
NONDETERMINISTICALLY — passing single-threaded but failing only under the
parallel pure-Python full-suite run (the Pure Python Fallback CI leg), on
test_methodology_wls_cr2.py::TestLinearRegressionFENanGuardEndToEnd.

- linalg.py: clamp the vcov diagonal at 0 before sqrt in both SE sites
  (get_se, get_inference). No-op for positive variances; the SE is now
  finite (0 for a genuinely-zero variance), deterministic, BLAS-independent.
- test: relax the guarded-coef assertion to finite & >= 0 (rounding-robust)
  + a deterministic regression test (a tiny-negative diagonal yields a
  finite 0 SE, not NaN).
- REGISTRY (Variance Estimation, Cluster-Robust SE) + CHANGELOG documented.

Pre-existing failure surfaced by PR #539's labeled full-suite run; main
does not run the Pure Python Fallback leg outside labeled PRs.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 CHANGELOG.md                      |  3 +++
 diff_diff/linalg.py               | 11 +++++++--
 docs/methodology/REGISTRY.md      |  7 ++++++
 tests/test_methodology_wls_cr2.py | 38 ++++++++++++++++++++++++++++++-
 4 files changed, 56 insertions(+), 3 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index b1f0e7cd4..699849364 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,6 +7,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+### Fixed
+- **`LinearRegression.get_se()` / `get_inference()` no longer return a `NaN` standard error from a tiny-negative variance artifact.** A high-leverage / degenerate coefficient (e.g. an absorbed-FE dummy near-collinear with the treatment, whose Bell-McCaffrey Satterthwaite DOF already hits the noise-floor guard) can have a CR2/HC variance of ~0 (≈1e-32) whose vcov diagonal lands just-below-zero under BLAS-dependent float rounding; `np.sqrt` of the negative then produced a `NaN` SE **nondeterministically** — passing single-threaded but failing under the parallel pure-Python full-suite run (`tests/test_methodology_wls_cr2.py::TestLinearRegressionFENanGuardEndToEnd::test_did_absorbed_fe_lr_inference_nan_for_guarded_coefs`). Both SE sites now clamp the vcov diagonal at 0, so the SE is finite (0 for a genuinely-zero variance), deterministic, and BLAS-independent. **No change for any positive variance** (the clamp is a no-op there); only the previously-`NaN` degenerate case is affected.
+
 ## [3.5.2] - 2026-06-08
 
 ### Added
diff --git a/diff_diff/linalg.py b/diff_diff/linalg.py
index 34c545e3a..876c39643 100644
--- a/diff_diff/linalg.py
+++ b/diff_diff/linalg.py
@@ -3865,7 +3865,13 @@ def get_se(self, index: int) -> float:
         """
         self._check_fitted()
         assert self.vcov_ is not None
-        return float(np.sqrt(self.vcov_[index, index]))
+        # Clamp a tiny-negative variance artifact at 0 before sqrt. A high-leverage
+        # / degenerate coefficient (e.g. an absorbed-FE dummy near-collinear with the
+        # treatment) can have a CR2/HC variance of ~0 that lands just below zero under
+        # BLAS-dependent float rounding; without the clamp `np.sqrt` returns NaN
+        # nondeterministically (passes single-threaded, fails under parallel test
+        # load). The SE is then finite — 0 for a genuinely-zero variance.
+        return float(np.sqrt(max(float(self.vcov_[index, index]), 0.0)))
 
     def get_inference(
         self,
@@ -3908,7 +3914,8 @@ def get_inference(
         assert self.vcov_ is not None
 
         coef = float(self.coefficients_[index])
-        se = float(np.sqrt(self.vcov_[index, index]))
+        # See get_se: clamp a tiny-negative variance artifact at 0 so SE is finite, not NaN.
+        se = float(np.sqrt(max(float(self.vcov_[index, index]), 0.0)))
 
         # Use instance alpha if not provided
         effective_alpha = alpha if alpha is not None else self.alpha
diff --git a/docs/methodology/REGISTRY.md b/docs/methodology/REGISTRY.md
index 9a9c65d2b..7c7c27407 100644
--- a/docs/methodology/REGISTRY.md
+++ b/docs/methodology/REGISTRY.md
@@ -186,6 +186,13 @@ where V is the VCV sub-matrix for post-treatment δ_e coefficients.
   DiD panel sizes (n ≤ few thousand); tracked in `TODO.md` under Performance for
   a follow-up that plumbs the contrast DOF through the existing CR2 vcov path or
   shares precomputes.
+- **Note:** `LinearRegression.get_se()` / `get_inference()` clamp the vcov diagonal at 0
+  before `sqrt`. A high-leverage / degenerate coefficient (an absorbed-FE dummy
+  near-collinear with the treatment, whose Satterthwaite DOF already hits the noise-floor
+  guard) has a CR2/HC variance of ~0 (≈1e-32) that can land just-below-zero under
+  BLAS-dependent rounding; the clamp keeps the SE finite (0 for a genuinely-zero variance)
+  and deterministic across BLAS implementations, never `NaN`. No effect on any positive
+  variance. Regression: `tests/test_methodology_wls_cr2.py::TestLinearRegressionFENanGuardEndToEnd`.
 - Optional: Wild cluster bootstrap (complex for multi-coefficient testing;
   requires joint bootstrap distribution)
 - Degrees of freedom adjusted for absorbed fixed effects
diff --git a/tests/test_methodology_wls_cr2.py b/tests/test_methodology_wls_cr2.py
index ab19bc3d5..29feaf02b 100644
--- a/tests/test_methodology_wls_cr2.py
+++ b/tests/test_methodology_wls_cr2.py
@@ -775,7 +775,13 @@ def test_did_absorbed_fe_lr_inference_nan_for_guarded_coefs(self, goldens):
                 np.isnan(b) for b in inf_i.conf_int
             ), f"NaN guard must produce NaN conf_int at index {i}; got {inf_i.conf_int}"
             # SE and coefficient remain valid (vcov matches at machine precision).
-            assert np.isfinite(inf_i.se) and inf_i.se > 0
+            # These guarded high-leverage FE dummies have a genuinely-~0 CR2
+            # variance (~1e-32); the SE is therefore ~0 and may clamp to exactly 0
+            # when the tiny diagonal lands just-negative under BLAS-dependent
+            # rounding (get_se/get_inference clamp at 0 so the SE is finite, never
+            # NaN — previously this produced a nondeterministic NaN that failed
+            # only under the parallel pure-Python full-suite run).
+            assert np.isfinite(inf_i.se) and inf_i.se >= 0
             assert np.isfinite(inf_i.coefficient)
 
         # Non-guarded coefficients still emit finite inference.
@@ -789,6 +795,36 @@ def test_did_absorbed_fe_lr_inference_nan_for_guarded_coefs(self, goldens):
             assert np.isfinite(inf_i.p_value)
             assert all(np.isfinite(b) for b in inf_i.conf_int)
 
+    def test_negative_variance_artifact_yields_finite_se_not_nan(self):
+        """A tiny-negative vcov diagonal (numerical artifact of a ~0 variance for a
+        degenerate/high-leverage coefficient) must clamp to a finite SE, never NaN.
+
+        Regression for the pure-Python full-suite flake: the guarded FE-dummy CR2
+        variances are ~1e-32 and tip just-below-zero under BLAS-dependent rounding,
+        so `np.sqrt(vcov[i,i])` returned NaN nondeterministically (passed single-
+        threaded, failed under the parallel `-n auto` run). get_se / get_inference
+        now clamp the diagonal at 0 so the SE is finite (0 for a genuinely-0 variance).
+        """
+        from diff_diff.linalg import LinearRegression
+
+        rng = np.random.default_rng(7)
+        n = 40
+        X = np.column_stack([np.ones(n), rng.normal(size=n)])
+        y = X @ np.array([1.0, 2.0]) + rng.normal(size=n)
+        lr = LinearRegression(include_intercept=False)
+        lr.fit(X, y)
+        assert lr.vcov_ is not None
+
+        # Sanity: the unperturbed SE is finite and positive.
+        assert np.isfinite(lr.get_se(1)) and lr.get_se(1) > 0
+
+        # Inject a tiny-negative diagonal artifact (as parallel-load rounding can).
+        lr.vcov_[1, 1] = -1e-30
+        se = lr.get_se(1)
+        assert np.isfinite(se) and se == 0.0, f"expected finite 0 SE, got {se}"
+        inf = lr.get_inference(1)
+        assert np.isfinite(inf.se) and inf.se == 0.0, f"expected finite 0 SE, got {inf.se}"
+
 
 class TestUnweightedRegressionStillBitEqual:
     """Regression safety: existing unweighted goldens must still match bit-equal."""