remove unused enable_C_level_observer proxy config and its CLI flag; rename ML_DAIKON prefix to TRAINCHECK

Essoz · Essoz · commit 18b6a8be7e78 · 2026-01-05T15:11:15.000-05:00
diff --git a/docs/5-min-tutorial.md b/docs/5-min-tutorial.md
@@ -246,7 +246,7 @@ For example, the "`optimizer.zero_grad` did **not** reset `.grad` from non-zero
             "var_type": NaN,
             "mode": NaN,
             "dump_loc": NaN,
-            "attributes._ML_DAIKON_data_ID": NaN,
+            "attributes._TRAINCHECK_data_ID": NaN,
             "attributes.data": NaN,
             "attributes.dtype": NaN,
             "attributes.grad": NaN,
@@ -274,7 +274,7 @@ For example, the "`optimizer.zero_grad` did **not** reset `.grad` from non-zero
             "attributes.requires_grad": NaN,
             "attributes.retains_grad": NaN,
             "attributes.shape": NaN,
-            "attributes._ML_DAIKON_grad_ID": NaN,
+            "attributes._TRAINCHECK_grad_ID": NaN,
             "exception": NaN,
             "exception_msg": NaN,
             "proxy_obj_names": NaN
diff --git a/docs/ae-eval-s5.1-silent-issue-detection.md b/docs/ae-eval-s5.1-silent-issue-detection.md
@@ -145,9 +145,9 @@ diff --color -r checker_output/trace_pytorch-104336/failed.log reference_checker
 >             "process_id": 9591,
 >             "thread_id": 140324043503424,
 86c86
-<             "attributes._ML_DAIKON_data_ID": 140704882109040,
+<             "attributes._TRAINCHECK_data_ID": 140704882109040,
 ---
->             "attributes._ML_DAIKON_data_ID": 140317529048544,
+>             "attributes._TRAINCHECK_data_ID": 140317529048544,
 116,117c116,117
 <             "time": 2437523672783,
 <             "meta_vars._DATA_PARALLEL_RANK": 4.0,
@@ -161,9 +161,9 @@ diff --color -r checker_output/trace_pytorch-104336/failed.log reference_checker
 >             "process_id": 9747,
 >             "thread_id": 140028492969792,
 128c128
-<             "attributes._ML_DAIKON_data_ID": 140043703504144,
+<             "attributes._TRAINCHECK_data_ID": 140043703504144,
 ---
->             "attributes._ML_DAIKON_data_ID": 140021978318304,
+>             "attributes._TRAINCHECK_data_ID": 140021978318304,
 158,159c158,159
 <             "time": 2437502499438,
 <             "meta_vars._DATA_PARALLEL_RANK": 2.0,
@@ -182,9 +182,9 @@ diff --color -r checker_output/trace_pytorch-115607/failed.log reference_checker
 <             "exception_msg": NaN,
 <             "proxy_obj_names": NaN,
 113c110,113
-<             "attributes._ML_DAIKON_grad_ID": NaN
+<             "attributes._TRAINCHECK_grad_ID": NaN
 ---
->             "attributes._ML_DAIKON_grad_ID": NaN,
+>             "attributes._TRAINCHECK_grad_ID": NaN,
 >             "exception": NaN,
 >             "exception_msg": NaN,
 >             "proxy_obj_names": NaN
@@ -193,9 +193,9 @@ diff --color -r checker_output/trace_pytorch-115607/failed.log reference_checker
 <             "exception_msg": NaN,
 <             "proxy_obj_names": NaN,
 215c212,215
-<             "attributes._ML_DAIKON_grad_ID": NaN
+<             "attributes._TRAINCHECK_grad_ID": NaN
 ---
->             "attributes._ML_DAIKON_grad_ID": NaN,
+>             "attributes._TRAINCHECK_grad_ID": NaN,
 >             "exception": NaN,
 >             "exception_msg": NaN,
 >             "proxy_obj_names": NaN
@@ -210,9 +210,9 @@ diff --color -r checker_output/trace_pytorch-115607/failed.log reference_checker
 <             "exception_msg": NaN,
 <             "proxy_obj_names": NaN,
 331c328,331
-<             "attributes._ML_DAIKON_grad_ID": NaN
+<             "attributes._TRAINCHECK_grad_ID": NaN
 ---
->             "attributes._ML_DAIKON_grad_ID": NaN,
+>             "attributes._TRAINCHECK_grad_ID": NaN,
 >             "exception": NaN,
 >             "exception_msg": NaN,
 >             "proxy_obj_names": NaN
@@ -247,10 +247,10 @@ diff --color -r checker_output/trace_pytorch-51800/failed.log reference_checker_
 >             "time": 19876858668088743,
 >             "meta_vars.step": 0,
 89c70,89
-<             "attributes._ML_DAIKON_grad_ID": NaN
+<             "attributes._TRAINCHECK_grad_ID": NaN
 ---
 >             "type": "function_call (pre)",
->             "attributes._ML_DAIKON_grad_ID": NaN,
+>             "attributes._TRAINCHECK_grad_ID": NaN,
 >             "func_call_id": "b39a4a81b2c24473ba916ab1832fbf12_19876858668012869",
 >             "function": "torch.nn.modules.module.Module.eval",
 >             "is_bound_method": true,
@@ -290,9 +290,9 @@ diff --color -r checker_output/trace_x-jxmnop-ddp-out-of-sync/failed.log referen
 ---
 >             "meta_vars._DATA_PARALLEL_RANK": "1",
 87c87
-<             "attributes._ML_DAIKON_data_ID": 140656561409856,
+<             "attributes._TRAINCHECK_data_ID": 140656561409856,
 ---
->             "attributes._ML_DAIKON_data_ID": 140621279056480,
+>             "attributes._TRAINCHECK_data_ID": 140621279056480,
 117c117
 <             "time": 123297988837864,
 ---
@@ -308,9 +308,9 @@ diff --color -r checker_output/trace_x-jxmnop-ddp-out-of-sync/failed.log referen
 ---
 >             "meta_vars._DATA_PARALLEL_RANK": "0",
 129c129
-<             "attributes._ML_DAIKON_data_ID": 140621279058160,
+<             "attributes._TRAINCHECK_data_ID": 140621279058160,
 ---
->             "attributes._ML_DAIKON_data_ID": 140656561411776,
+>             "attributes._TRAINCHECK_data_ID": 140656561411776,
 159c159
 <             "time": 123299970638648,
 ---
diff --git a/docs/assets/code/mnist.py b/docs/assets/code/mnist.py
@@ -40,7 +40,7 @@ def forward(self, x):
 
 
 def train(args, model, device, train_loader, optimizer, epoch):
-    annotate_stage("training")  # ML_DAIKON: stage annotation
+    annotate_stage("training")  # TRAINCHECK: stage annotation
     model.train()
     for batch_idx, (data, target) in enumerate(train_loader):
         META_VARS["step"] += 1
@@ -63,13 +63,13 @@ def train(args, model, device, train_loader, optimizer, epoch):
             if args.dry_run:
                 break
 
-        # ML_DAIKON: break after 100 batches
+        # TRAINCHECK: break after 51 batches (batch_idx 0-50)
         if batch_idx == 50:
             break
 
 
 def test(model, device, test_loader):
-    annotate_stage("testing")  # ML_DAIKON: stage annotation
+    annotate_stage("testing")  # TRAINCHECK: stage annotation
     model.eval()
     test_loss = 0
     correct = 0
@@ -87,7 +87,7 @@ def test(model, device, test_loader):
             correct += pred.eq(target.view_as(pred)).sum().item()
 
             data_idx += 1
-            # ML_DAIKON: break after 10 batches
+            # TRAINCHECK: break after 10 batches
             if data_idx == 10:
                 break
 
@@ -174,7 +174,7 @@ def main():
     )
     args = parser.parse_args()
 
-    annotate_stage("init")  # ML_DAIKON: stage annotation
+    annotate_stage("init")  # TRAINCHECK: stage annotation
     use_cuda = not args.no_cuda and torch.cuda.is_available()
     use_mps = not args.no_mps and torch.backends.mps.is_available()
 
@@ -191,7 +191,7 @@ def main():
     test_kwargs = {"batch_size": args.test_batch_size}
     if use_cuda:
         cuda_kwargs = {"num_workers": 2, "pin_memory": True, "shuffle": True}
-        # ML_DAIKON: set num_workers to 0 to avoid dataloader related invariants
+        # TRAINCHECK: set num_workers to 0 to avoid dataloader related invariants
         # cuda_kwargs = {'num_workers': 0, 'pin_memory': True, 'shuffle': True}
         train_kwargs.update(cuda_kwargs)
         test_kwargs.update(cuda_kwargs)
@@ -212,11 +212,11 @@ def main():
         train(args, model, device, train_loader, optimizer, epoch)
         test(model, device, test_loader)
 
-        annotate_stage("training")  # ML_DAIKON: stage annotation
+        annotate_stage("training")  # TRAINCHECK: stage annotation
         scheduler.step()
 
     if args.save_model:
-        annotate_stage("checkpointing")  # ML_DAIKON: stage annotation
+        annotate_stage("checkpointing")  # TRAINCHECK: stage annotation
         torch.save(model.state_dict(), "mnist_cnn.pt")
 
 
diff --git a/docs/assets/examples/traincheck-collect/mnist-config/mnist.py b/docs/assets/examples/traincheck-collect/mnist-config/mnist.py
@@ -40,7 +40,7 @@ def forward(self, x):
 
 
 def train(args, model, device, train_loader, optimizer, epoch):
-    annotate_stage("training")  # ML_DAIKON: stage annotation
+    annotate_stage("training")  # TRAINCHECK: stage annotation
     model.train()
     for batch_idx, (data, target) in enumerate(train_loader):
         META_VARS["step"] += 1
@@ -63,13 +63,13 @@ def train(args, model, device, train_loader, optimizer, epoch):
             if args.dry_run:
                 break
 
-        # ML_DAIKON: break after 100 batches
+        # TRAINCHECK: break after 51 batches (batch_idx 0-50)
         if batch_idx == 50:
             break
 
 
 def test(model, device, test_loader):
-    annotate_stage("testing")  # ML_DAIKON: stage annotation
+    annotate_stage("testing")  # TRAINCHECK: stage annotation
     model.eval()
     test_loss = 0
     correct = 0
@@ -87,7 +87,7 @@ def test(model, device, test_loader):
             correct += pred.eq(target.view_as(pred)).sum().item()
 
             data_idx += 1
-            # ML_DAIKON: break after 10 batches
+            # TRAINCHECK: break after 10 batches
             if data_idx == 10:
                 break
 
@@ -174,7 +174,7 @@ def main():
     )
     args = parser.parse_args()
 
-    annotate_stage("init")  # ML_DAIKON: stage annotation
+    annotate_stage("init")  # TRAINCHECK: stage annotation
     use_cuda = not args.no_cuda and torch.cuda.is_available()
     use_mps = not args.no_mps and torch.backends.mps.is_available()
 
@@ -191,7 +191,7 @@ def main():
     test_kwargs = {"batch_size": args.test_batch_size}
     if use_cuda:
         cuda_kwargs = {"num_workers": 2, "pin_memory": True, "shuffle": True}
-        # ML_DAIKON: set num_workers to 0 to avoid dataloader related invariants
+        # TRAINCHECK: set num_workers to 0 to avoid dataloader related invariants
         # cuda_kwargs = {'num_workers': 0, 'pin_memory': True, 'shuffle': True}
         train_kwargs.update(cuda_kwargs)
         test_kwargs.update(cuda_kwargs)
@@ -212,11 +212,11 @@ def main():
         train(args, model, device, train_loader, optimizer, epoch)
         test(model, device, test_loader)
 
-        annotate_stage("training")  # ML_DAIKON: stage annotation
+        annotate_stage("training")  # TRAINCHECK: stage annotation
         scheduler.step()
 
     if args.save_model:
-        annotate_stage("checkpointing")  # ML_DAIKON: stage annotation
+        annotate_stage("checkpointing")  # TRAINCHECK: stage annotation
         torch.save(model.state_dict(), "mnist_cnn.pt")
 
 
diff --git a/traincheck/collect_trace.py b/traincheck/collect_trace.py
@@ -347,12 +347,6 @@ def main():
         default="hash",
         help="The format for dumping tensors. Choose from 'hash'(default), 'stats' or 'full'.",
     )
-    parser.add_argument(
-        "--enable-C-level-observer",
-        type=bool,
-        default=proxy_config.enable_C_level_observer,
-        help="Enable the observer at the C level",
-    )
     parser.add_argument(
         "--no-auto-var-instr",
         action="store_true",
@@ -386,7 +380,7 @@ def main():
     # set up logging
     if args.debug_mode:
         logging.basicConfig(level=logging.DEBUG)
-        os.environ["ML_DAIKON_DEBUG"] = "1"
+        os.environ["TRAINCHECK_DEBUG"] = "1"
     else:
         logging.basicConfig(level=logging.INFO)
 
@@ -406,7 +400,6 @@ def main():
     proxy_basic_config: dict[str, int | bool | str] = {}
     for configs in [
         "debug_mode",
-        "enable_C_level_observer",
     ]:
         if getattr(proxy_config, configs) != getattr(args, configs):
             proxy_basic_config[configs] = getattr(args, configs)
diff --git a/traincheck/instrumentor/dumper.py b/traincheck/instrumentor/dumper.py
@@ -27,7 +27,7 @@
 )
 from traincheck.utils import get_timestamp_ns, typename, typename_compile
 
-DEBUG = os.environ.get("ML_DAIKON_DEBUG", False)
+DEBUG = os.environ.get("TRAINCHECK_DEBUG", False)
 THREAD_DATA = threading.local()
 IS_CUDA_AVAILABLE = torch.cuda.is_available()
 
@@ -129,10 +129,10 @@ def get_trace_API_dumper_queue():
     pid = os.getpid()
     tid = threading.get_ident()
 
-    output_dir = os.getenv("ML_DAIKON_OUTPUT_DIR")
+    output_dir = os.getenv("TRAINCHECK_OUTPUT_DIR")
     assert (
         output_dir is not None
-    ), "ML_DAIKON_OUTPUT_DIR is not set, examine the instrumented code to see if os.environ['ML_DAIKON_OUTPUT_DIR'] is set in the main function"
+    ), "TRAINCHECK_OUTPUT_DIR is not set, examine the instrumented code to see if os.environ['TRAINCHECK_OUTPUT_DIR'] is set in the main function"
 
     trace_queue = Queue()
     trace_file_name = f"trace_API_{pid}_{tid}.log"
@@ -161,10 +161,10 @@ def get_trace_VAR_dumper_queue():
     pid = os.getpid()
     tid = threading.current_thread().ident
 
-    output_dir = os.getenv("ML_DAIKON_OUTPUT_DIR")
+    output_dir = os.getenv("TRAINCHECK_OUTPUT_DIR")
     assert (
         output_dir is not None
-    ), "ML_DAIKON_OUTPUT_DIR is not set, examine the instrumented code to see if os.environ['ML_DAIKON_OUTPUT_DIR'] is set in the main function"
+    ), "TRAINCHECK_OUTPUT_DIR is not set, examine the instrumented code to see if os.environ['TRAINCHECK_OUTPUT_DIR'] is set in the main function"
 
     trace_queue = Queue()
     trace_file_name = f"trace_VAR_{pid}_{tid}.log"
@@ -249,10 +249,10 @@ def dump_trace_VAR(trace: dict):
 
 def get_instrumentation_logger_for_process():
     pid = os.getpid()
-    output_dir = os.getenv("ML_DAIKON_OUTPUT_DIR")
+    output_dir = os.getenv("TRAINCHECK_OUTPUT_DIR")
     assert (
         output_dir is not None
-    ), "ML_DAIKON_OUTPUT_DIR is not set, examine the instrumented code to see if os.environ['ML_DAIKON_OUTPUT_DIR'] is set in the main function"
+    ), "TRAINCHECK_OUTPUT_DIR is not set, examine the instrumented code to see if os.environ['TRAINCHECK_OUTPUT_DIR'] is set in the main function"
 
     if pid in instrumentation_loggers:
         return instrumentation_loggers[pid]
@@ -369,7 +369,7 @@ def convert_var_to_dict(var, include_tensor_data=True, dump_config=None) -> dict
         if (
             isinstance(attr_name, str)
             and attr_name.startswith("_")
-            and not attr_name.startswith("_ML_DAIKON")
+            and not attr_name.startswith("_TRAINCHECK")
         ):
             continue
 
@@ -405,12 +405,12 @@ def convert_var_to_dict(var, include_tensor_data=True, dump_config=None) -> dict
             result[attr_name] = attr
 
         elif isinstance(attr, torch.Tensor):
-            result[f"_ML_DAIKON_{attr_name}_ID"] = id(attr)
+            result[f"_TRAINCHECK_{attr_name}_ID"] = id(attr)
             if include_tensor_data:
                 result[attr_name] = dump_tensor(attr)
 
         elif isinstance(attr, torch.nn.parameter.Parameter):
-            result[f"_ML_DAIKON_{attr_name}_ID"] = id(attr)
+            result[f"_TRAINCHECK_{attr_name}_ID"] = id(attr)
             if include_tensor_data:
                 result[attr_name] = dump_tensor(attr.data)
 
@@ -430,7 +430,7 @@ def convert_var_to_dict(var, include_tensor_data=True, dump_config=None) -> dict
             result[attr_name] = str(attr)
         elif isinstance(attr, torch.Size):
             result[attr_name] = tuple(attr)
-        elif "_ML_DAIKON" in attr_name:
+        elif "_TRAINCHECK" in attr_name:
             # should always be serializable, so blindly assign here.
             result[attr_name] = attr
 
diff --git a/traincheck/instrumentor/proxy_wrapper/proxy.py b/traincheck/instrumentor/proxy_wrapper/proxy.py
@@ -102,7 +102,7 @@ class Proxy:
     var_dict: Dict[str, ProxyObjInfo] = {}
     loglevel = logging.INFO
     jsondumper = dumper(
-        os.path.join(os.getenv("ML_DAIKON_OUTPUT_DIR", "."), "proxy_log.json")  # type: ignore
+        os.path.join(os.getenv("TRAINCHECK_OUTPUT_DIR", "."), "proxy_log.json")  # type: ignore
     )
 
     @staticmethod
diff --git a/traincheck/instrumentor/proxy_wrapper/proxy_config.py b/traincheck/instrumentor/proxy_wrapper/proxy_config.py
@@ -25,8 +25,6 @@
     "observe_then_unproxy": True,  # observe the function call and then unproxy the arguments
 }
 
-enable_C_level_observer = False  # enable the observer at the C level (This would potentially lead to a lot of overhead since we need to observe and dump all proxied object at the C level function call, try to use auto observer with proper depth could reduce the overhead)
-
 primitive_types = {
     types.NoneType,
     int,
diff --git a/traincheck/instrumentor/source_file.py b/traincheck/instrumentor/source_file.py
@@ -833,13 +833,13 @@ def instrument_file(
     # logging configs
     logging_start_code = f"""
 import os
-os.environ['ML_DAIKON_OUTPUT_DIR'] = "{output_dir}"
+os.environ['TRAINCHECK_OUTPUT_DIR'] = "{output_dir}"
 """
 
     debug_hook_code = """
 from traincheck.utils import register_custom_excepthook
-if os.environ.get("ML_DAIKON_DEBUG") == "1":
-    print("ML_DAIKON_DEBUG is set to 1, registering custom excepthook")
+if os.environ.get("TRAINCHECK_DEBUG") == "1":
+    print("TRAINCHECK_DEBUG is set to 1, registering custom excepthook")
     register_custom_excepthook(True)
 """
 
diff --git a/traincheck/instrumentor/tracer.py b/traincheck/instrumentor/tracer.py
diff --git a/traincheck/invariant/consistency_transient_vars.py b/traincheck/invariant/consistency_transient_vars.py

Original file line number	Diff line number	Diff line change
`@@ -102,7 +102,7 @@ class Proxy:`
`102`	`102`	`var_dict: Dict[str, ProxyObjInfo] = {}`
`103`	`103`	`loglevel = logging.INFO`
`104`	`104`	`jsondumper = dumper(`
`105`		`- os.path.join(os.getenv("ML_DAIKON_OUTPUT_DIR", "."), "proxy_log.json") # type: ignore`
	`105`	`+ os.path.join(os.getenv("TRAINCHECK_OUTPUT_DIR", "."), "proxy_log.json") # type: ignore`
`106`	`106`	`)`
`107`	`107`
`108`	`108`	`@staticmethod`
Original file line number	Diff line number	Diff line change
`@@ -25,8 +25,6 @@`
`25`	`25`	`"observe_then_unproxy": True, # observe the function call and then unproxy the arguments`
`26`	`26`	`}`
`27`	`27`
`28`		`-enable_C_level_observer = False # enable the observer at the C level (This would potentially lead to a lot of overhead since we need to observe and dump all proxied object at the C level function call, try to use auto observer with proper depth could reduce the overhead)`
`29`		`-`
`30`	`28`	`primitive_types = {`
`31`	`29`	`types.NoneType,`
`32`	`30`	`int,`