Skip to content

Commit 18b6a8b

Browse files
committed
remove unused proxy config; rename ML_DAIKON to TRAINCHECK
1 parent 0c0ebf8 commit 18b6a8b

11 files changed

Lines changed: 65 additions & 81 deletions

File tree

docs/5-min-tutorial.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -246,7 +246,7 @@ For example, the "`optimizer.zero_grad` did **not** reset `.grad` from non-zero
246246
"var_type": NaN,
247247
"mode": NaN,
248248
"dump_loc": NaN,
249-
"attributes._ML_DAIKON_data_ID": NaN,
249+
"attributes._TRAINCHECK_data_ID": NaN,
250250
"attributes.data": NaN,
251251
"attributes.dtype": NaN,
252252
"attributes.grad": NaN,
@@ -274,7 +274,7 @@ For example, the "`optimizer.zero_grad` did **not** reset `.grad` from non-zero
274274
"attributes.requires_grad": NaN,
275275
"attributes.retains_grad": NaN,
276276
"attributes.shape": NaN,
277-
"attributes._ML_DAIKON_grad_ID": NaN,
277+
"attributes._TRAINCHECK_grad_ID": NaN,
278278
"exception": NaN,
279279
"exception_msg": NaN,
280280
"proxy_obj_names": NaN

docs/ae-eval-s5.1-silent-issue-detection.md

Lines changed: 16 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -145,9 +145,9 @@ diff --color -r checker_output/trace_pytorch-104336/failed.log reference_checker
145145
> "process_id": 9591,
146146
> "thread_id": 140324043503424,
147147
86c86
148-
< "attributes._ML_DAIKON_data_ID": 140704882109040,
148+
< "attributes._TRAINCHECK_data_ID": 140704882109040,
149149
---
150-
> "attributes._ML_DAIKON_data_ID": 140317529048544,
150+
> "attributes._TRAINCHECK_data_ID": 140317529048544,
151151
116,117c116,117
152152
< "time": 2437523672783,
153153
< "meta_vars._DATA_PARALLEL_RANK": 4.0,
@@ -161,9 +161,9 @@ diff --color -r checker_output/trace_pytorch-104336/failed.log reference_checker
161161
> "process_id": 9747,
162162
> "thread_id": 140028492969792,
163163
128c128
164-
< "attributes._ML_DAIKON_data_ID": 140043703504144,
164+
< "attributes._TRAINCHECK_data_ID": 140043703504144,
165165
---
166-
> "attributes._ML_DAIKON_data_ID": 140021978318304,
166+
> "attributes._TRAINCHECK_data_ID": 140021978318304,
167167
158,159c158,159
168168
< "time": 2437502499438,
169169
< "meta_vars._DATA_PARALLEL_RANK": 2.0,
@@ -182,9 +182,9 @@ diff --color -r checker_output/trace_pytorch-115607/failed.log reference_checker
182182
< "exception_msg": NaN,
183183
< "proxy_obj_names": NaN,
184184
113c110,113
185-
< "attributes._ML_DAIKON_grad_ID": NaN
185+
< "attributes._TRAINCHECK_grad_ID": NaN
186186
---
187-
> "attributes._ML_DAIKON_grad_ID": NaN,
187+
> "attributes._TRAINCHECK_grad_ID": NaN,
188188
> "exception": NaN,
189189
> "exception_msg": NaN,
190190
> "proxy_obj_names": NaN
@@ -193,9 +193,9 @@ diff --color -r checker_output/trace_pytorch-115607/failed.log reference_checker
193193
< "exception_msg": NaN,
194194
< "proxy_obj_names": NaN,
195195
215c212,215
196-
< "attributes._ML_DAIKON_grad_ID": NaN
196+
< "attributes._TRAINCHECK_grad_ID": NaN
197197
---
198-
> "attributes._ML_DAIKON_grad_ID": NaN,
198+
> "attributes._TRAINCHECK_grad_ID": NaN,
199199
> "exception": NaN,
200200
> "exception_msg": NaN,
201201
> "proxy_obj_names": NaN
@@ -210,9 +210,9 @@ diff --color -r checker_output/trace_pytorch-115607/failed.log reference_checker
210210
< "exception_msg": NaN,
211211
< "proxy_obj_names": NaN,
212212
331c328,331
213-
< "attributes._ML_DAIKON_grad_ID": NaN
213+
< "attributes._TRAINCHECK_grad_ID": NaN
214214
---
215-
> "attributes._ML_DAIKON_grad_ID": NaN,
215+
> "attributes._TRAINCHECK_grad_ID": NaN,
216216
> "exception": NaN,
217217
> "exception_msg": NaN,
218218
> "proxy_obj_names": NaN
@@ -247,10 +247,10 @@ diff --color -r checker_output/trace_pytorch-51800/failed.log reference_checker_
247247
> "time": 19876858668088743,
248248
> "meta_vars.step": 0,
249249
89c70,89
250-
< "attributes._ML_DAIKON_grad_ID": NaN
250+
< "attributes._TRAINCHECK_grad_ID": NaN
251251
---
252252
> "type": "function_call (pre)",
253-
> "attributes._ML_DAIKON_grad_ID": NaN,
253+
> "attributes._TRAINCHECK_grad_ID": NaN,
254254
> "func_call_id": "b39a4a81b2c24473ba916ab1832fbf12_19876858668012869",
255255
> "function": "torch.nn.modules.module.Module.eval",
256256
> "is_bound_method": true,
@@ -290,9 +290,9 @@ diff --color -r checker_output/trace_x-jxmnop-ddp-out-of-sync/failed.log referen
290290
---
291291
> "meta_vars._DATA_PARALLEL_RANK": "1",
292292
87c87
293-
< "attributes._ML_DAIKON_data_ID": 140656561409856,
293+
< "attributes._TRAINCHECK_data_ID": 140656561409856,
294294
---
295-
> "attributes._ML_DAIKON_data_ID": 140621279056480,
295+
> "attributes._TRAINCHECK_data_ID": 140621279056480,
296296
117c117
297297
< "time": 123297988837864,
298298
---
@@ -308,9 +308,9 @@ diff --color -r checker_output/trace_x-jxmnop-ddp-out-of-sync/failed.log referen
308308
---
309309
> "meta_vars._DATA_PARALLEL_RANK": "0",
310310
129c129
311-
< "attributes._ML_DAIKON_data_ID": 140621279058160,
311+
< "attributes._TRAINCHECK_data_ID": 140621279058160,
312312
---
313-
> "attributes._ML_DAIKON_data_ID": 140656561411776,
313+
> "attributes._TRAINCHECK_data_ID": 140656561411776,
314314
159c159
315315
< "time": 123299970638648,
316316
---

docs/assets/code/mnist.py

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,7 @@ def forward(self, x):
4040

4141

4242
def train(args, model, device, train_loader, optimizer, epoch):
43-
annotate_stage("training") # ML_DAIKON: stage annotation
43+
annotate_stage("training") # TRAINCHECK: stage annotation
4444
model.train()
4545
for batch_idx, (data, target) in enumerate(train_loader):
4646
META_VARS["step"] += 1
@@ -63,13 +63,13 @@ def train(args, model, device, train_loader, optimizer, epoch):
6363
if args.dry_run:
6464
break
6565

66-
# ML_DAIKON: break after 100 batches
66+
# TRAINCHECK: stop early once batch_idx reaches 50 (i.e. after 51 batches)
6767
if batch_idx == 50:
6868
break
6969

7070

7171
def test(model, device, test_loader):
72-
annotate_stage("testing") # ML_DAIKON: stage annotation
72+
annotate_stage("testing") # TRAINCHECK: stage annotation
7373
model.eval()
7474
test_loss = 0
7575
correct = 0
@@ -87,7 +87,7 @@ def test(model, device, test_loader):
8787
correct += pred.eq(target.view_as(pred)).sum().item()
8888

8989
data_idx += 1
90-
# ML_DAIKON: break after 10 batches
90+
# TRAINCHECK: break after 10 batches
9191
if data_idx == 10:
9292
break
9393

@@ -174,7 +174,7 @@ def main():
174174
)
175175
args = parser.parse_args()
176176

177-
annotate_stage("init") # ML_DAIKON: stage annotation
177+
annotate_stage("init") # TRAINCHECK: stage annotation
178178
use_cuda = not args.no_cuda and torch.cuda.is_available()
179179
use_mps = not args.no_mps and torch.backends.mps.is_available()
180180

@@ -191,7 +191,7 @@ def main():
191191
test_kwargs = {"batch_size": args.test_batch_size}
192192
if use_cuda:
193193
cuda_kwargs = {"num_workers": 2, "pin_memory": True, "shuffle": True}
194-
# ML_DAIKON: set num_workers to 0 to avoid dataloader related invariants
194+
# TRAINCHECK: set num_workers to 0 to avoid dataloader related invariants
195195
# cuda_kwargs = {'num_workers': 0, 'pin_memory': True, 'shuffle': True}
196196
train_kwargs.update(cuda_kwargs)
197197
test_kwargs.update(cuda_kwargs)
@@ -212,11 +212,11 @@ def main():
212212
train(args, model, device, train_loader, optimizer, epoch)
213213
test(model, device, test_loader)
214214

215-
annotate_stage("training") # ML_DAIKON: stage annotation
215+
annotate_stage("training") # TRAINCHECK: stage annotation
216216
scheduler.step()
217217

218218
if args.save_model:
219-
annotate_stage("checkpointing") # ML_DAIKON: stage annotation
219+
annotate_stage("checkpointing") # TRAINCHECK: stage annotation
220220
torch.save(model.state_dict(), "mnist_cnn.pt")
221221

222222

docs/assets/examples/traincheck-collect/mnist-config/mnist.py

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,7 @@ def forward(self, x):
4040

4141

4242
def train(args, model, device, train_loader, optimizer, epoch):
43-
annotate_stage("training") # ML_DAIKON: stage annotation
43+
annotate_stage("training") # TRAINCHECK: stage annotation
4444
model.train()
4545
for batch_idx, (data, target) in enumerate(train_loader):
4646
META_VARS["step"] += 1
@@ -63,13 +63,13 @@ def train(args, model, device, train_loader, optimizer, epoch):
6363
if args.dry_run:
6464
break
6565

66-
# ML_DAIKON: break after 100 batches
66+
# TRAINCHECK: stop early once batch_idx reaches 50 (i.e. after 51 batches)
6767
if batch_idx == 50:
6868
break
6969

7070

7171
def test(model, device, test_loader):
72-
annotate_stage("testing") # ML_DAIKON: stage annotation
72+
annotate_stage("testing") # TRAINCHECK: stage annotation
7373
model.eval()
7474
test_loss = 0
7575
correct = 0
@@ -87,7 +87,7 @@ def test(model, device, test_loader):
8787
correct += pred.eq(target.view_as(pred)).sum().item()
8888

8989
data_idx += 1
90-
# ML_DAIKON: break after 10 batches
90+
# TRAINCHECK: break after 10 batches
9191
if data_idx == 10:
9292
break
9393

@@ -174,7 +174,7 @@ def main():
174174
)
175175
args = parser.parse_args()
176176

177-
annotate_stage("init") # ML_DAIKON: stage annotation
177+
annotate_stage("init") # TRAINCHECK: stage annotation
178178
use_cuda = not args.no_cuda and torch.cuda.is_available()
179179
use_mps = not args.no_mps and torch.backends.mps.is_available()
180180

@@ -191,7 +191,7 @@ def main():
191191
test_kwargs = {"batch_size": args.test_batch_size}
192192
if use_cuda:
193193
cuda_kwargs = {"num_workers": 2, "pin_memory": True, "shuffle": True}
194-
# ML_DAIKON: set num_workers to 0 to avoid dataloader related invariants
194+
# TRAINCHECK: set num_workers to 0 to avoid dataloader related invariants
195195
# cuda_kwargs = {'num_workers': 0, 'pin_memory': True, 'shuffle': True}
196196
train_kwargs.update(cuda_kwargs)
197197
test_kwargs.update(cuda_kwargs)
@@ -212,11 +212,11 @@ def main():
212212
train(args, model, device, train_loader, optimizer, epoch)
213213
test(model, device, test_loader)
214214

215-
annotate_stage("training") # ML_DAIKON: stage annotation
215+
annotate_stage("training") # TRAINCHECK: stage annotation
216216
scheduler.step()
217217

218218
if args.save_model:
219-
annotate_stage("checkpointing") # ML_DAIKON: stage annotation
219+
annotate_stage("checkpointing") # TRAINCHECK: stage annotation
220220
torch.save(model.state_dict(), "mnist_cnn.pt")
221221

222222

traincheck/collect_trace.py

Lines changed: 1 addition & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -347,12 +347,6 @@ def main():
347347
default="hash",
348348
help="The format for dumping tensors. Choose from 'hash'(default), 'stats' or 'full'.",
349349
)
350-
parser.add_argument(
351-
"--enable-C-level-observer",
352-
type=bool,
353-
default=proxy_config.enable_C_level_observer,
354-
help="Enable the observer at the C level",
355-
)
356350
parser.add_argument(
357351
"--no-auto-var-instr",
358352
action="store_true",
@@ -386,7 +380,7 @@ def main():
386380
# set up logging
387381
if args.debug_mode:
388382
logging.basicConfig(level=logging.DEBUG)
389-
os.environ["ML_DAIKON_DEBUG"] = "1"
383+
os.environ["TRAINCHECK_DEBUG"] = "1"
390384
else:
391385
logging.basicConfig(level=logging.INFO)
392386

@@ -406,7 +400,6 @@ def main():
406400
proxy_basic_config: dict[str, int | bool | str] = {}
407401
for configs in [
408402
"debug_mode",
409-
"enable_C_level_observer",
410403
]:
411404
if getattr(proxy_config, configs) != getattr(args, configs):
412405
proxy_basic_config[configs] = getattr(args, configs)

traincheck/instrumentor/dumper.py

Lines changed: 11 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@
2727
)
2828
from traincheck.utils import get_timestamp_ns, typename, typename_compile
2929

30-
DEBUG = os.environ.get("ML_DAIKON_DEBUG", False)
30+
DEBUG = os.environ.get("TRAINCHECK_DEBUG", False)
3131
THREAD_DATA = threading.local()
3232
IS_CUDA_AVAILABLE = torch.cuda.is_available()
3333

@@ -129,10 +129,10 @@ def get_trace_API_dumper_queue():
129129
pid = os.getpid()
130130
tid = threading.get_ident()
131131

132-
output_dir = os.getenv("ML_DAIKON_OUTPUT_DIR")
132+
output_dir = os.getenv("TRAINCHECK_OUTPUT_DIR")
133133
assert (
134134
output_dir is not None
135-
), "ML_DAIKON_OUTPUT_DIR is not set, examine the instrumented code to see if os.environ['ML_DAIKON_OUTPUT_DIR'] is set in the main function"
135+
), "TRAINCHECK_OUTPUT_DIR is not set, examine the instrumented code to see if os.environ['TRAINCHECK_OUTPUT_DIR'] is set in the main function"
136136

137137
trace_queue = Queue()
138138
trace_file_name = f"trace_API_{pid}_{tid}.log"
@@ -161,10 +161,10 @@ def get_trace_VAR_dumper_queue():
161161
pid = os.getpid()
162162
tid = threading.current_thread().ident
163163

164-
output_dir = os.getenv("ML_DAIKON_OUTPUT_DIR")
164+
output_dir = os.getenv("TRAINCHECK_OUTPUT_DIR")
165165
assert (
166166
output_dir is not None
167-
), "ML_DAIKON_OUTPUT_DIR is not set, examine the instrumented code to see if os.environ['ML_DAIKON_OUTPUT_DIR'] is set in the main function"
167+
), "TRAINCHECK_OUTPUT_DIR is not set, examine the instrumented code to see if os.environ['TRAINCHECK_OUTPUT_DIR'] is set in the main function"
168168

169169
trace_queue = Queue()
170170
trace_file_name = f"trace_VAR_{pid}_{tid}.log"
@@ -249,10 +249,10 @@ def dump_trace_VAR(trace: dict):
249249

250250
def get_instrumentation_logger_for_process():
251251
pid = os.getpid()
252-
output_dir = os.getenv("ML_DAIKON_OUTPUT_DIR")
252+
output_dir = os.getenv("TRAINCHECK_OUTPUT_DIR")
253253
assert (
254254
output_dir is not None
255-
), "ML_DAIKON_OUTPUT_DIR is not set, examine the instrumented code to see if os.environ['ML_DAIKON_OUTPUT_DIR'] is set in the main function"
255+
), "TRAINCHECK_OUTPUT_DIR is not set, examine the instrumented code to see if os.environ['TRAINCHECK_OUTPUT_DIR'] is set in the main function"
256256

257257
if pid in instrumentation_loggers:
258258
return instrumentation_loggers[pid]
@@ -369,7 +369,7 @@ def convert_var_to_dict(var, include_tensor_data=True, dump_config=None) -> dict
369369
if (
370370
isinstance(attr_name, str)
371371
and attr_name.startswith("_")
372-
and not attr_name.startswith("_ML_DAIKON")
372+
and not attr_name.startswith("_TRAINCHECK")
373373
):
374374
continue
375375

@@ -405,12 +405,12 @@ def convert_var_to_dict(var, include_tensor_data=True, dump_config=None) -> dict
405405
result[attr_name] = attr
406406

407407
elif isinstance(attr, torch.Tensor):
408-
result[f"_ML_DAIKON_{attr_name}_ID"] = id(attr)
408+
result[f"_TRAINCHECK_{attr_name}_ID"] = id(attr)
409409
if include_tensor_data:
410410
result[attr_name] = dump_tensor(attr)
411411

412412
elif isinstance(attr, torch.nn.parameter.Parameter):
413-
result[f"_ML_DAIKON_{attr_name}_ID"] = id(attr)
413+
result[f"_TRAINCHECK_{attr_name}_ID"] = id(attr)
414414
if include_tensor_data:
415415
result[attr_name] = dump_tensor(attr.data)
416416

@@ -430,7 +430,7 @@ def convert_var_to_dict(var, include_tensor_data=True, dump_config=None) -> dict
430430
result[attr_name] = str(attr)
431431
elif isinstance(attr, torch.Size):
432432
result[attr_name] = tuple(attr)
433-
elif "_ML_DAIKON" in attr_name:
433+
elif "_TRAINCHECK" in attr_name:
434434
# should always be serializable, so blindly assign here.
435435
result[attr_name] = attr
436436

traincheck/instrumentor/proxy_wrapper/proxy.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -102,7 +102,7 @@ class Proxy:
102102
var_dict: Dict[str, ProxyObjInfo] = {}
103103
loglevel = logging.INFO
104104
jsondumper = dumper(
105-
os.path.join(os.getenv("ML_DAIKON_OUTPUT_DIR", "."), "proxy_log.json") # type: ignore
105+
os.path.join(os.getenv("TRAINCHECK_OUTPUT_DIR", "."), "proxy_log.json") # type: ignore
106106
)
107107

108108
@staticmethod

traincheck/instrumentor/proxy_wrapper/proxy_config.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -25,8 +25,6 @@
2525
"observe_then_unproxy": True, # observe the function call and then unproxy the arguments
2626
}
2727

28-
enable_C_level_observer = False # enable the observer at the C level (This would potentially lead to a lot of overhead since we need to observe and dump all proxied object at the C level function call, try to use auto observer with proper depth could reduce the overhead)
29-
3028
primitive_types = {
3129
types.NoneType,
3230
int,

traincheck/instrumentor/source_file.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -833,13 +833,13 @@ def instrument_file(
833833
# logging configs
834834
logging_start_code = f"""
835835
import os
836-
os.environ['ML_DAIKON_OUTPUT_DIR'] = "{output_dir}"
836+
os.environ['TRAINCHECK_OUTPUT_DIR'] = "{output_dir}"
837837
"""
838838

839839
debug_hook_code = """
840840
from traincheck.utils import register_custom_excepthook
841-
if os.environ.get("ML_DAIKON_DEBUG") == "1":
842-
print("ML_DAIKON_DEBUG is set to 1, registering custom excepthook")
841+
if os.environ.get("TRAINCHECK_DEBUG") == "1":
842+
print("TRAINCHECK_DEBUG is set to 1, registering custom excepthook")
843843
register_custom_excepthook(True)
844844
"""
845845

0 commit comments

Comments
 (0)