add: support sampling and warmup instrumentation policies

Essoz · Essoz · commit 6406fc7563cd · 2026-02-11T19:00:07.000-05:00
diff --git a/traincheck/collect_trace.py b/traincheck/collect_trace.py
@@ -380,6 +380,20 @@ def main():
         help="Indicate wthether use torch.compile to speed the model, necessary to realize compatibility",
     )
 
+    ## instrumentation policy configs
+    parser.add_argument(
+        "--sampling-interval",
+        type=int,
+        default=None,
+        help="Interval of steps to instrument (e.g., 10 for every 10th step).",
+    )
+    parser.add_argument(
+        "--warm-up-steps",
+        type=int,
+        default=0,
+        help="Number of initial steps to always instrument.",
+    )
+
     args = parser.parse_args()
 
     # read the configuration file
@@ -508,6 +522,8 @@ def main():
             instr_descriptors=args.instr_descriptors,
             no_auto_var_instr=args.no_auto_var_instr,
             use_torch_compile=args.use_torch_compile,
+            sampling_interval=args.sampling_interval,
+            warm_up_steps=args.warm_up_steps,
         )
 
     if args.copy_all_files:
diff --git a/traincheck/config/config.py b/traincheck/config/config.py
@@ -95,6 +95,13 @@
 TYPE_ERR_THRESHOLD = 3
 RECURSION_ERR_THRESHOLD = 5
 
+INSTRUMENTATION_POLICY = {
+    "interval": 1,  # once warm-up is over, a step is instrumented when (step - warm_up) % interval == 0
+    "warm_up": 1,  # steps numbered below this value are always instrumented (i.e. interval=1); default to 1 to ensure the first step is always instrumented
+}
+
+DISABLE_WRAPPER = False  # runtime switch: while True, the proxy dump_trace / observe_proxy_var hooks return early without recording
+
 
 class InstrOpt:
     def __init__(
diff --git a/traincheck/developer/annotations.py b/traincheck/developer/annotations.py
@@ -1,3 +1,4 @@
+import traincheck.config.config as config
 import traincheck.instrumentor.tracer as tracer
 from traincheck.config.config import ALL_STAGE_NAMES
 from traincheck.instrumentor import META_VARS
@@ -16,8 +17,13 @@ def annotate_stage(stage_name: str):
         stage_name in ALL_STAGE_NAMES
     ), f"Invalid stage name: {stage_name}, valid ones are {ALL_STAGE_NAMES}"
 
+    old_stage = META_VARS.get("stage", None)
     META_VARS["stage"] = stage_name
 
+    # We always reset the wrapper when stage changes, and let the policy decide later if we should skip
+    if old_stage != stage_name:
+        config.DISABLE_WRAPPER = False
+
 
 def annotate_answer_start_token_ids(
     answer_start_token_id: int, include_start_token: bool = False
diff --git a/traincheck/instrumentor/control.py b/traincheck/instrumentor/control.py
@@ -0,0 +1,70 @@
+import logging
+
+from traincheck.config import config
+from traincheck.instrumentor.caches import META_VARS
+
+logger = logging.getLogger(__name__)
+
+
+def _apply_policy(current_step, label=""):
+    """Set config.DISABLE_WRAPPER for ``current_step`` per config.INSTRUMENTATION_POLICY.
+
+    Steps below ``warm_up`` are always instrumented. Afterwards a step is
+    instrumented only when ``current_step - warm_up`` is a multiple of
+    ``interval``. An empty policy leaves instrumentation enabled.
+
+    Args:
+        current_step: Counter value of the iteration that is starting.
+        label: Prefix for the debug log message (e.g. ``"Eval: "``).
+    """
+    policy = config.INSTRUMENTATION_POLICY
+    if not policy:
+        # No policy, always enable
+        config.DISABLE_WRAPPER = False
+        return
+
+    # A missing, None or 0 entry falls back to its neutral value instead of
+    # raising TypeError / ZeroDivisionError inside the traced program.
+    warm_up = policy.get("warm_up") or 0
+    interval = policy.get("interval") or 1
+
+    if current_step < warm_up:
+        kind, disable = "Warmup", False
+    elif (current_step - warm_up) % interval == 0:
+        kind, disable = "Interval", False
+    else:
+        kind, disable = "Skipping", True
+
+    # Runs once per loop iteration of the traced program, so report through
+    # the module logger rather than printing to the program's stdout.
+    logger.debug("%s%s step %s", label, kind, current_step)
+    config.DISABLE_WRAPPER = disable
+
+
+def start_step():
+    """
+    Called at the start of a training iteration to control instrumentation policy.
+
+    Increments META_VARS["step"] and sets config.DISABLE_WRAPPER based on
+    config.INSTRUMENTATION_POLICY. Returns without doing anything when
+    META_VARS["stage"] is set to a value other than "training", so that loops
+    running in another annotated stage neither advance the training step nor
+    apply the sampling policy.
+    """
+    stage = META_VARS.get("stage")
+    if stage and stage != "training":
+        return
+
+    META_VARS["step"] += 1
+    _apply_policy(META_VARS["step"])
+
+
+def start_eval_step():
+    """
+    Called at the start of an evaluation iteration.
+
+    Applies the same policy as start_step(), but counts iterations in a
+    separate META_VARS["eval_step"] counter that is created on first use.
+    """
+    META_VARS["eval_step"] = META_VARS.get("eval_step", 0) + 1
+    _apply_policy(META_VARS["eval_step"], label="Eval: ")
diff --git a/traincheck/instrumentor/proxy_wrapper/proxy.py b/traincheck/instrumentor/proxy_wrapper/proxy.py
@@ -8,6 +8,7 @@
 
 import torch
 
+import traincheck.config.config as config
 import traincheck.instrumentor.proxy_wrapper.proxy_config as proxy_config  # HACK: cannot directly import config variables as then they would be local variables
 import traincheck.instrumentor.proxy_wrapper.proxy_methods as proxy_methods
 from traincheck.config.config import should_disable_proxy_dumping
@@ -158,6 +159,9 @@ def __deepcopy__(self, memo):
         return new_copy
 
     def dump_trace(self, phase, dump_loc):
+        if config.DISABLE_WRAPPER:
+            return
+
         obj = self._obj
         var_name = self.__dict__["var_name"]
         assert var_name is not None  # '' is allowed as a var_name (root object)
diff --git a/traincheck/instrumentor/proxy_wrapper/proxy_observer.py b/traincheck/instrumentor/proxy_wrapper/proxy_observer.py
@@ -1,6 +1,7 @@
 import functools
 import typing
 
+import traincheck.config.config as config
 from traincheck.config.config import should_disable_proxy_dumping
 from traincheck.instrumentor.proxy_wrapper.subclass import ProxyParameter
 from traincheck.utils import typename
@@ -21,6 +22,8 @@ def observe_proxy_var(
     phase,
     observe_api_name: str,
 ):
+    if config.DISABLE_WRAPPER:
+        return
 
     # update the proxy object's timestamp
     var.update_timestamp()
diff --git a/traincheck/instrumentor/proxy_wrapper/subclass.py b/traincheck/instrumentor/proxy_wrapper/subclass.py
@@ -6,6 +6,7 @@
 import torch
 from torch import nn
 
+import traincheck.config.config as config
 from traincheck.config.config import should_disable_proxy_dumping
 from traincheck.instrumentor.dumper import dump_trace_VAR
 from traincheck.instrumentor.proxy_wrapper.dumper import dump_attributes, get_meta_vars
@@ -178,6 +179,9 @@ def register_object(self):
         )
 
     def dump_trace(self, phase, dump_loc):
+        if config.DISABLE_WRAPPER:
+            return
+
         # TODO
         var_name = self.__dict__["var_name"]
         # assert var_name is not None  # '' is allowed as a var_name (root object)
diff --git a/traincheck/instrumentor/source_file.py b/traincheck/instrumentor/source_file.py
@@ -33,6 +33,8 @@ def __init__(
         use_full_instr: bool,
         funcs_to_instr: list[str] | None,
         API_dump_stack_trace: bool,
+        sampling_interval: int,
+        warm_up_steps: int,
     ):
         super().__init__()
         if not modules_to_instr:
@@ -44,10 +46,27 @@ def __init__(
         self.use_full_instr = use_full_instr
         self.funcs_to_instr = funcs_to_instr
         self.API_dump_stack_trace = API_dump_stack_trace
+        self.sampling_interval = sampling_interval
+        self.warm_up_steps = warm_up_steps
+        self.current_function = None
+
+    def visit_FunctionDef(self, node):
+        """Expose node.name as self.current_function while the body is visited."""
+        enclosing, self.current_function = self.current_function, node.name
+        self.generic_visit(node)
+        self.current_function = enclosing
+        return node
+
+    def visit_AsyncFunctionDef(self, node):
+        """Expose node.name as self.current_function while the body is visited."""
+        enclosing, self.current_function = self.current_function, node.name
+        self.generic_visit(node)
+        self.current_function = enclosing
+        return node
 
     def get_instrument_node(self, module_name: str):
         return ast.parse(
-            f"from traincheck.instrumentor.tracer import Instrumentor; Instrumentor({module_name}, scan_proxy_in_args={self.scan_proxy_in_args}, use_full_instr={self.use_full_instr}, funcs_to_instr={str(self.funcs_to_instr)}, API_dump_stack_trace={self.API_dump_stack_trace}).instrument()"
+            f"from traincheck.instrumentor.tracer import Instrumentor; Instrumentor({module_name}, scan_proxy_in_args={self.scan_proxy_in_args}, use_full_instr={self.use_full_instr}, funcs_to_instr={str(self.funcs_to_instr)}, API_dump_stack_trace={self.API_dump_stack_trace}, sampling_interval={str(self.sampling_interval)}, warm_up_steps={str(self.warm_up_steps)}).instrument()"
         ).body
 
     def visit_Import(self, node):
@@ -65,8 +84,6 @@ def visit_Import(self, node):
                 instrument_nodes.append(self.get_instrument_node(n.asname))
             else:
                 instrument_nodes.append(self.get_instrument_node(n.name))
-        # let's see if there are aliases, if yes, use them
-        # if not, let's use the module name directly
         return [node] + instrument_nodes
 
     def visit_ImportFrom(self, node):
@@ -87,6 +104,105 @@ def visit_ImportFrom(self, node):
                 instrument_nodes.append(self.get_instrument_node(n.name))
         return [node] + instrument_nodes
 
+    def _get_loop_context(self, node):
+        """Classify a loop as "training", "eval" or None using name heuristics.
+
+        Any attribute call named ``step`` or ``backward`` anywhere under *node*
+        marks the loop as "training". Failing that, the loop is "eval" when the
+        enclosing function's name contains "test", "eval" or "valid".
+        """
+        calls_training_api = any(
+            isinstance(sub, ast.Call)
+            and isinstance(sub.func, ast.Attribute)
+            and sub.func.attr in ("step", "backward")
+            for sub in ast.walk(node)
+        )
+        if calls_training_api:
+            return "training"
+
+        enclosing = (self.current_function or "").lower()
+        if any(marker in enclosing for marker in ("test", "eval", "valid")):
+            return "eval"
+        return None
+
+    def _inject_call(self, node, func_name):
+        """Prepend an import of *func_name* from the control module, then a call to it."""
+        control_import = ast.ImportFrom(
+            module="traincheck.instrumentor.control",
+            names=[ast.alias(name=func_name, asname=None)],
+            level=0,
+        )
+        invoke = ast.Call(
+            func=ast.Name(id=func_name, ctx=ast.Load()),
+            args=[],
+            keywords=[],
+        )
+        node.body[:0] = [control_import, ast.Expr(value=invoke)]
+        return node
+
+    def visit_For(self, node):
+        """Visit children first, then inject the step hook matching the loop context."""
+        self.generic_visit(node)
+        hooks = {"training": "start_step", "eval": "start_eval_step"}
+        hook = hooks.get(self._get_loop_context(node))
+        if hook is None:
+            return node
+        return self._inject_call(node, hook)
+
+    def visit_While(self, node):
+        """Visit children first, then inject the step hook matching the loop context."""
+        self.generic_visit(node)
+        hooks = {"training": "start_step", "eval": "start_eval_step"}
+        hook = hooks.get(self._get_loop_context(node))
+        if hook is None:
+            return node
+        return self._inject_call(node, hook)
+
+    def _should_inject_control(self, node):
+        """Return True if *node* contains an attribute call named step or backward."""
+        # Heuristic: such calls are taken as the sign of a training loop.
+        return any(
+            isinstance(sub, ast.Call)
+            and isinstance(sub.func, ast.Attribute)
+            and sub.func.attr in ("step", "backward")
+            for sub in ast.walk(node)
+        )
+
+    def _inject_start_step(self, node):
+        """Prepend a ``start_step()`` call (and its import) to a loop body.
+
+        The statements inserted at the top of ``node.body`` are::
+
+            from traincheck.instrumentor.control import start_step
+            start_step()
+
+        Args:
+            node: AST node with a ``body`` list (e.g. ``ast.For``/``ast.While``).
+
+        Returns:
+            The same ``node`` object, mutated in place.
+        """
+        import_stmt = ast.ImportFrom(
+            module="traincheck.instrumentor.control",
+            names=[ast.alias(name="start_step", asname=None)],
+            level=0,
+        )
+        call_stmt = ast.Expr(
+            value=ast.Call(
+                func=ast.Name(id="start_step", ctx=ast.Load()), args=[], keywords=[]
+            )
+        )
+        # The import is emitted inside the loop body rather than at module level:
+        # no ``visit_Module`` hook is available here to add a top-level import.
+        # Import statements are valid anywhere, so this works, at the cost of
+        # re-executing the import statement on every iteration.
+        # NOTE(review): this duplicates ``_inject_call(node, "start_step")`` and
+        # nothing in the visible code calls this method -- confirm and remove.
+
+        node.body.insert(0, call_stmt)
+        node.body.insert(0, import_stmt)
+        return node
+
 
 def instrument_library(
     source: str,
@@ -95,6 +211,8 @@ def instrument_library(
     use_full_instr: bool,
     funcs_to_instr: list[str] | None,
     API_dump_stack_trace: bool,
+    sampling_interval: int,
+    warm_up_steps: int,
 ) -> str:
     """
     Instruments the given source code and returns the instrumented source code.
@@ -116,6 +234,8 @@ def instrument_library(
         use_full_instr,
         funcs_to_instr,
         API_dump_stack_trace,
+        sampling_interval,
+        warm_up_steps,
     )
     root = visitor.visit(root)
     source = ast.unparse(root)
@@ -811,6 +931,8 @@ def instrument_file(
     instr_descriptors: bool,
     no_auto_var_instr: bool,
     use_torch_compile: bool,
+    sampling_interval: int = 1,
+    warm_up_steps: int = 0,
 ) -> str:
     """
     Instruments the given file and returns the instrumented source code.
@@ -827,6 +949,8 @@ def instrument_file(
         use_full_instr,
         funcs_to_instr,
         API_dump_stack_trace,
+        sampling_interval,
+        warm_up_steps,
     )
     # annotate stages
     instrumented_source = annotate_stage(instrumented_source)
diff --git a/traincheck/instrumentor/tracer.py b/traincheck/instrumentor/tracer.py