Commit d0fa6cc

Weight Processing Bugs for larger models (GPT OSS, Gemma3 Multimodal, Phi 3) (#1224)

* Fix small flaws in the updated position embeddings attention, and bugs introduced into HookedTransformer by transformers v5
* Gemma3 & base Gemma weight processing bug fixes
* Additional weight processing bug fixes for Mistral and Phi3

1 parent 2c41b6c commit d0fa6cc

4 files changed: 93 additions & 27 deletions

transformer_lens/benchmarks/granular_weight_processing.py (14 additions & 14 deletions)

```diff
@@ -370,20 +370,6 @@ def run_granular_weight_processing_benchmarks(
             for key, value in forward_hooks_result.details.items():
                 print(f"  {key}: {value}")
 
-            # Clean up
-            del bridge
-            del ht_ref
-            # Force garbage collection (multiple passes to break circular references)
-            import gc
-
-            for _ in range(3):
-                gc.collect()
-            if torch.cuda.is_available():
-                torch.cuda.empty_cache()
-            if hasattr(torch, "mps") and hasattr(torch.mps, "empty_cache"):
-                torch.mps.synchronize()
-                torch.mps.empty_cache()
-
         except Exception as e:
             # Record failure
             results.append(
@@ -395,6 +381,20 @@ def run_granular_weight_processing_benchmarks(
                     details={"error": str(e), "config": str(config)},
                 )
             )
+        finally:
+            # Always clean up models after each config (success or failure)
+            # to prevent memory leaks on large models
+            import gc
+
+            bridge = None  # type: ignore[assignment]
+            ht_ref = None  # type: ignore[assignment]
+            for _ in range(3):
+                gc.collect()
+            if torch.cuda.is_available():
+                torch.cuda.empty_cache()
+            if hasattr(torch, "mps") and hasattr(torch.mps, "empty_cache"):
+                torch.mps.synchronize()
+                torch.mps.empty_cache()
 
         # Store results
         all_results[config.name] = results
```
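The substance of this change: the old cleanup ran at the end of the `try` body, so a failing config skipped it entirely and leaked the models. Moving it into `finally` guarantees it runs on both paths, and rebinding the names to `None` (rather than `del`) avoids an `UnboundLocalError` when the failure happened before the models were ever assigned. A minimal, self-contained sketch of the pattern (`run_config` and its stand-in model are illustrative, not the actual benchmark code):

```python
import gc

import torch


def run_config(fail: bool) -> None:
    model = None  # pre-bind so the finally block is always safe
    try:
        model = torch.nn.Linear(1024, 1024)  # stand-in for a large model
        if fail:
            raise RuntimeError("benchmark failed")
    finally:
        # Runs on success *and* failure; dropping the last reference lets
        # the garbage collector reclaim the weights. `del model` here would
        # raise UnboundLocalError if the assignment above never executed.
        model = None
        for _ in range(3):  # multiple passes to break circular references
            gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()


try:
    run_config(fail=True)
except RuntimeError:
    pass  # cleanup in the finally block already ran before the error propagated
```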

transformer_lens/loading_from_pretrained.py (0 additions & 2 deletions)

```diff
@@ -71,7 +71,6 @@
     "Qwen/Qwen-",
     "Qwen/Qwen3-",
     "microsoft/phi-2",
-    "microsoft/Phi-3-mini-4k-instruct",
     "microsoft/phi-4",
     "apple/OpenELM",
     "openai/gpt-oss-",
@@ -861,7 +860,6 @@ def convert_hf_model_config(model_name: str, **kwargs: Any) -> dict[str, Any]:
         "initializer_range": hf_config.initializer_range,
         "normalization_type": "RMS",
         "positional_embedding_type": "rotary",
-        "trust_remote_code": True,
         "rotary_base": _get_rope_theta(hf_config),
         "use_attn_scale": True,
         "gated_mlp": True,
```
transformer_lens/model_bridge/supported_architectures/mistral.py (2 additions & 2 deletions)

```diff
@@ -11,8 +11,8 @@
     AttentionBridge,
     BlockBridge,
     EmbeddingBridge,
+    GatedMLPBridge,
     LinearBridge,
-    MLPBridge,
     RMSNormalizationBridge,
     RotaryEmbeddingBridge,
     UnembeddingBridge,
@@ -83,7 +83,7 @@ def __init__(self, cfg: Any) -> None:
                     "o": LinearBridge(name="o_proj"),
                 },
             ),
-            "mlp": MLPBridge(
+            "mlp": GatedMLPBridge(
                 name="mlp",
                 submodules={
                     "gate": LinearBridge(name="gate_proj"),
```

transformer_lens/model_bridge/supported_architectures/phi3.py (77 additions & 9 deletions)

```diff
@@ -13,10 +13,10 @@
 )
 from transformer_lens.model_bridge.architecture_adapter import ArchitectureAdapter
 from transformer_lens.model_bridge.generalized_components import (
-    AttentionBridge,
     BlockBridge,
     EmbeddingBridge,
     GatedMLPBridge,
+    JointQKVPositionEmbeddingsAttentionBridge,
     LinearBridge,
     RMSNormalizationBridge,
     UnembeddingBridge,
@@ -94,24 +94,19 @@ def __init__(self, cfg: Any) -> None:
             submodules={
                 "ln1": RMSNormalizationBridge(name="input_layernorm", config=self.cfg),
                 "ln2": RMSNormalizationBridge(name="post_attention_layernorm", config=self.cfg),
-                "attn": AttentionBridge(
+                "attn": JointQKVPositionEmbeddingsAttentionBridge(
                     name="self_attn",
                     config=self.cfg,
-                    requires_position_embeddings=True,
-                    requires_attention_mask=True,
+                    split_qkv_matrix=self._split_phi3_qkv,
                     submodules={
-                        # Phi-3 uses combined qkv_proj, but we still need submodules for hooks
-                        "q": LinearBridge(name="qkv_proj"),
-                        "k": LinearBridge(name="qkv_proj"),
-                        "v": LinearBridge(name="qkv_proj"),
+                        "qkv": LinearBridge(name="qkv_proj"),
                         "o": LinearBridge(name="o_proj"),
                     },
                 ),
                 "mlp": GatedMLPBridge(
                     name="mlp",
                     config=self.cfg,
                     submodules={
-                        # Phi-3 uses joint gate_up_proj, but we need submodules for hooks
                         "gate": LinearBridge(name="gate_up_proj"),
                         "in": LinearBridge(name="gate_up_proj"),
                         "out": LinearBridge(name="down_proj"),
@@ -123,6 +118,79 @@ def __init__(self, cfg: Any) -> None:
             "unembed": UnembeddingBridge(name="lm_head"),
         }
 
+    @staticmethod
+    def _split_gate_up(
+        original_mlp_component: Any,
+    ) -> tuple[torch.nn.Module, torch.nn.Module]:
+        """Split Phi-3's fused gate_up_proj into separate gate and up Linear modules."""
+        fused_weight = original_mlp_component.gate_up_proj.weight
+        gate_w, up_w = torch.tensor_split(fused_weight, 2, dim=0)
+        d_model = fused_weight.shape[1]
+        d_mlp = gate_w.shape[0]
+
+        has_bias = (
+            hasattr(original_mlp_component.gate_up_proj, "bias")
+            and original_mlp_component.gate_up_proj.bias is not None
+        )
+        gate_b: torch.Tensor | None
+        up_b: torch.Tensor | None
+        if has_bias:
+            gate_b, up_b = torch.tensor_split(original_mlp_component.gate_up_proj.bias, 2, dim=0)
+        else:
+            gate_b = up_b = None
+
+        gate_proj = torch.nn.Linear(d_model, d_mlp, bias=has_bias)
+        gate_proj.weight = torch.nn.Parameter(gate_w)
+        if gate_b is not None:
+            gate_proj.bias = torch.nn.Parameter(gate_b)
+
+        up_proj = torch.nn.Linear(d_model, d_mlp, bias=has_bias)
+        up_proj.weight = torch.nn.Parameter(up_w)
+        if up_b is not None:
+            up_proj.bias = torch.nn.Parameter(up_b)
+
+        return gate_proj, up_proj
+
+    def _split_phi3_qkv(
+        self, original_attention_component: Any
+    ) -> tuple[torch.nn.Module, torch.nn.Module, torch.nn.Module]:
+        """Split Phi-3's fused qkv_proj into separate Q, K, V linear modules."""
+        qkv_weight = original_attention_component.qkv_proj.weight
+        d_model = qkv_weight.shape[1]
+        # Phi-3 QKV is [3*n_heads*d_head, d_model], split into equal thirds
+        q_weight, k_weight, v_weight = torch.tensor_split(qkv_weight, 3, dim=0)
+
+        has_bias = (
+            hasattr(original_attention_component.qkv_proj, "bias")
+            and original_attention_component.qkv_proj.bias is not None
+        )
+        q_bias: torch.Tensor | None
+        k_bias: torch.Tensor | None
+        v_bias: torch.Tensor | None
+        if has_bias:
+            q_bias, k_bias, v_bias = torch.tensor_split(
+                original_attention_component.qkv_proj.bias, 3, dim=0
+            )
+        else:
+            q_bias = k_bias = v_bias = None
+
+        q_linear = torch.nn.Linear(d_model, q_weight.shape[0], bias=has_bias)
+        q_linear.weight = torch.nn.Parameter(q_weight)
+        if q_bias is not None:
+            q_linear.bias = torch.nn.Parameter(q_bias)
+
+        k_linear = torch.nn.Linear(d_model, k_weight.shape[0], bias=has_bias)
+        k_linear.weight = torch.nn.Parameter(k_weight)
+        if k_bias is not None:
+            k_linear.bias = torch.nn.Parameter(k_bias)
+
+        v_linear = torch.nn.Linear(d_model, v_weight.shape[0], bias=has_bias)
+        v_linear.weight = torch.nn.Parameter(v_weight)
+        if v_bias is not None:
+            v_linear.bias = torch.nn.Parameter(v_bias)
+
+        return q_linear, k_linear, v_linear
+
     def prepare_loading(self, model_name: str, model_kwargs: dict) -> None:
         """Fix compatibility issues for Phi-3 models with trust_remote_code=True.
```
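A standalone way to sanity-check this kind of split (not part of the commit): the rows of Phi-3's fused weight are laid out `[Q; K; V]` along the output dimension, so the split modules, concatenated, must reproduce the fused projection exactly. The same check applies to `_split_gate_up`, whose assumed fused layout is `[gate; up]`. A minimal sketch with arbitrary dimensions:

```python
import torch

d_model, n_heads, d_head = 16, 4, 4
fused = torch.nn.Linear(d_model, 3 * n_heads * d_head, bias=False)

# Mirror _split_phi3_qkv: split the fused weight into equal thirds
# along the output (row) dimension and wrap each third in its own Linear.
splits = []
for w in torch.tensor_split(fused.weight, 3, dim=0):
    lin = torch.nn.Linear(d_model, w.shape[0], bias=False)
    lin.weight = torch.nn.Parameter(w)
    splits.append(lin)
q, k, v = splits

x = torch.randn(2, d_model)
recombined = torch.cat([q(x), k(x), v(x)], dim=-1)
assert torch.allclose(fused(x), recombined)  # row-slice equivalence holds
```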
