File "/InSPyReNet/run/Train.py", line 175, in <module> [00:00<?, ?it/s]
Traceback (most recent call last):
File "/InSPyReNet/run/Train.py", line 175, in <module>
train(opt, args)
File "/InSPyReNet/run/Train.py", line 141, in train
scheduler.step()
File "/usr/local/lib/python3.10/dist-packages/torch/optim/lr_scheduler.py", line 154, in step
values = self.get_lr()
File "/InSPyReNet/lib/optim/scheduler.py", line 25, in get_lr
lrs = [min(self.warmup_lr(base_lr, alpha), self.poly_lr(base_lr, self.last_epoch)) for base_lr in
File "/InSPyReNet/lib/optim/scheduler.py", line 25, in <listcomp>
train(opt, args)
File "/InSPyReNet/run/Train.py", line 141, in train
lrs = [min(self.warmup_lr(base_lr, alpha), self.poly_lr(base_lr, self.last_epoch)) for base_lr in
Epoch 50%|████████████████████ | 60/120 [00:07<?, ?it/s]
TypeError: '<' not supported between instances of 'complex' and 'float'
scheduler.step()
File "/usr/local/lib/python3.10/dist-packages/torch/optim/lr_scheduler.py", line 154, in step
values = self.get_lr()
File "/InSPyReNet/lib/optim/scheduler.py", line 25, in get_lr
Traceback (most recent call last):
File "/InSPyReNet/run/Train.py", line 175, in <module>
lrs = [min(self.warmup_lr(base_lr, alpha), self.poly_lr(base_lr, self.last_epoch)) for base_lr in
File "/InSPyReNet/lib/optim/scheduler.py", line 25, in <listcomp>
lrs = [min(self.warmup_lr(base_lr, alpha), self.poly_lr(base_lr, self.last_epoch)) for base_lr in
TypeError: '<' not supported between instances of 'complex' and 'float'
Traceback (most recent call last):
File "/InSPyReNet/run/Train.py", line 175, in <module>
train(opt, args)
File "/InSPyReNet/run/Train.py", line 141, in train
Traceback (most recent call last):
File "/InSPyReNet/run/Train.py", line 175, in <module>
scheduler.step()
File "/usr/local/lib/python3.10/dist-packages/torch/optim/lr_scheduler.py", line 154, in step
train(opt, args)
File "/InSPyReNet/run/Train.py", line 141, in train
values = self.get_lr()
File "/InSPyReNet/lib/optim/scheduler.py", line 25, in get_lr
scheduler.step()
File "/usr/local/lib/python3.10/dist-packages/torch/optim/lr_scheduler.py", line 154, in step
lrs = [min(self.warmup_lr(base_lr, alpha), self.poly_lr(base_lr, self.last_epoch)) for base_lr in
File "/InSPyReNet/lib/optim/scheduler.py", line 25, in <listcomp>
train(opt, args)
File "/InSPyReNet/run/Train.py", line 141, in train
lrs = [min(self.warmup_lr(base_lr, alpha), self.poly_lr(base_lr, self.last_epoch)) for base_lr invalues = self.get_lr()
File "/InSPyReNet/lib/optim/scheduler.py", line 25, in get_lr
TypeError: '<' not supported between instances of 'complex' and 'float'
lrs = [min(self.warmup_lr(base_lr, alpha), self.poly_lr(base_lr, self.last_epoch)) for base_lr in
scheduler.step() File "/InSPyReNet/lib/optim/scheduler.py", line 25, in <listcomp>
File "/usr/local/lib/python3.10/dist-packages/torch/optim/lr_scheduler.py", line 154, in step
Traceback (most recent call last):
File "/InSPyReNet/run/Train.py", line 175, in <module>
lrs = [min(self.warmup_lr(base_lr, alpha), self.poly_lr(base_lr, self.last_epoch)) for base_lr in
TypeError: '<' not supported between instances of 'complex' and 'float'
values = self.get_lr()
File "/InSPyReNet/lib/optim/scheduler.py", line 25, in get_lr
lrs = [min(self.warmup_lr(base_lr, alpha), self.poly_lr(base_lr, self.last_epoch)) for base_lr in
File "/InSPyReNet/lib/optim/scheduler.py", line 25, in <listcomp>
lrs = [min(self.warmup_lr(base_lr, alpha), self.poly_lr(base_lr, self.last_epoch)) for base_lr in
TypeError: '<' not supported between instances of 'complex' and 'float'
Traceback (most recent call last):
File "/InSPyReNet/run/Train.py", line 175, in <module>
train(opt, args)
File "/InSPyReNet/run/Train.py", line 141, in train
scheduler.step()
File "/usr/local/lib/python3.10/dist-packages/torch/optim/lr_scheduler.py", line 154, in step
values = self.get_lr()
File "/InSPyReNet/lib/optim/scheduler.py", line 25, in get_lr
lrs = [min(self.warmup_lr(base_lr, alpha), self.poly_lr(base_lr, self.last_epoch)) for base_lr in
File "/InSPyReNet/lib/optim/scheduler.py", line 25, in <listcomp>
lrs = [min(self.warmup_lr(base_lr, alpha), self.poly_lr(base_lr, self.last_epoch)) for base_lr in
TypeError: '<' not supported between instances of 'complex' and 'float'
train(opt, args)
File "/InSPyReNet/run/Train.py", line 141, in train
scheduler.step()
File "/usr/local/lib/python3.10/dist-packages/torch/optim/lr_scheduler.py", line 154, in step
values = self.get_lr()
File "/InSPyReNet/lib/optim/scheduler.py", line 25, in get_lr
lrs = [min(self.warmup_lr(base_lr, alpha), self.poly_lr(base_lr, self.last_epoch)) for base_lr in
File "/InSPyReNet/lib/optim/scheduler.py", line 25, in <listcomp>
lrs = [min(self.warmup_lr(base_lr, alpha), self.poly_lr(base_lr, self.last_epoch)) for base_lr in
TypeError: '<' not supported between instances of 'complex' and 'float'
Traceback (most recent call last):
File "/InSPyReNet/run/Train.py", line 175, in <module>
train(opt, args)
File "/InSPyReNet/run/Train.py", line 141, in train
scheduler.step()
File "/usr/local/lib/python3.10/dist-packages/torch/optim/lr_scheduler.py", line 154, in step
values = self.get_lr()
File "/InSPyReNet/lib/optim/scheduler.py", line 25, in get_lr
lrs = [min(self.warmup_lr(base_lr, alpha), self.poly_lr(base_lr, self.last_epoch)) for base_lr in
File "/InSPyReNet/lib/optim/scheduler.py", line 25, in <listcomp>
lrs = [min(self.warmup_lr(base_lr, alpha), self.poly_lr(base_lr, self.last_epoch)) for base_lr in
TypeError: '<' not supported between instances of 'complex' and 'float'
Traceback (most recent call last):
File "/InSPyReNet/run/Train.py", line 175, in <module>
train(opt, args)
File "/InSPyReNet/run/Train.py", line 141, in train
scheduler.step()
File "/usr/local/lib/python3.10/dist-packages/torch/optim/lr_scheduler.py", line 154, in step
values = self.get_lr()
File "/InSPyReNet/lib/optim/scheduler.py", line 25, in get_lr
lrs = [min(self.warmup_lr(base_lr, alpha), self.poly_lr(base_lr, self.last_epoch)) for base_lr in
File "/InSPyReNet/lib/optim/scheduler.py", line 25, in <listcomp>
lrs = [min(self.warmup_lr(base_lr, alpha), self.poly_lr(base_lr, self.last_epoch)) for base_lr in
TypeError: '<' not supported between instances of 'complex' and 'float'
Traceback (most recent call last):
File "/InSPyReNet/run/Train.py", line 175, in <module>
train(opt, args)
File "/InSPyReNet/run/Train.py", line 141, in train
scheduler.step()
File "/usr/local/lib/python3.10/dist-packages/torch/optim/lr_scheduler.py", line 154, in step
values = self.get_lr()
File "/InSPyReNet/lib/optim/scheduler.py", line 25, in get_lr
lrs = [min(self.warmup_lr(base_lr, alpha), self.poly_lr(base_lr, self.last_epoch)) for base_lr in
File "/InSPyReNet/lib/optim/scheduler.py", line 25, in <listcomp>
lrs = [min(self.warmup_lr(base_lr, alpha), self.poly_lr(base_lr, self.last_epoch)) for base_lr in
TypeError: '<' not supported between instances of 'complex' and 'float'
[2024-10-25 04:18:38,492] torch.distributed.elastic.multiprocessing.api: [WARNING] Sending process 654567 closing signal SIGTERM
[2024-10-25 04:18:38,492] torch.distributed.elastic.multiprocessing.api: [WARNING] Sending process 654568 closing signal SIGTERM
[2024-10-25 04:18:38,492] torch.distributed.elastic.multiprocessing.api: [WARNING] Sending process 654571 closing signal SIGTERM
[2024-10-25 04:18:38,492] torch.distributed.elastic.multiprocessing.api: [WARNING] Sending process 654573 closing signal SIGTERM
[2024-10-25 04:18:38,492] torch.distributed.elastic.multiprocessing.api: [WARNING] Sending process 654574 closing signal SIGTERM
[2024-10-25 04:18:40,324] torch.distributed.elastic.multiprocessing.api: [ERROR] failed (exitcode: 1) local_rank: 0 (pid: 654565) of binary: /usr/bin/python
Traceback (most recent call last):
File "/usr/local/bin/torchrun", line 8, in <module>
sys.exit(main())
File "/usr/local/lib/python3.10/dist-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper
return f(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/torch/distributed/run.py", line 812, in main
run(args)
File "/usr/local/lib/python3.10/dist-packages/torch/distributed/run.py", line 803, in run
elastic_launch(
File "/usr/local/lib/python3.10/dist-packages/torch/distributed/launcher/api.py", line 135, in __call__
return launch_agent(self._config, self._entrypoint, list(args))
File "/usr/local/lib/python3.10/dist-packages/torch/distributed/launcher/api.py", line 268, in launch_agent
raise ChildFailedError(
torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
============================================================
run/Train.py FAILED
------------------------------------------------------------
Failures:
[1]:
time : 2024-10-25_04:18:38
host : ca38e8013903
rank : 1 (local_rank: 1)
exitcode : 1 (pid: 654566)
error_file: <N/A>
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
[2]:
time : 2024-10-25_04:18:38
host : ca38e8013903
rank : 4 (local_rank: 4)
exitcode : 1 (pid: 654569)
error_file: <N/A>
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
[3]:
time : 2024-10-25_04:18:38
host : ca38e8013903
rank : 5 (local_rank: 5)
exitcode : 1 (pid: 654570)
error_file: <N/A>
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
[4]:
time : 2024-10-25_04:18:38
host : ca38e8013903
rank : 7 (local_rank: 7)
exitcode : 1 (pid: 654572)
error_file: <N/A>
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
------------------------------------------------------------
Root Cause (first observed failure):
[0]:
time : 2024-10-25_04:18:38
host : ca38e8013903
rank : 0 (local_rank: 0)
exitcode : 1 (pid: 654565)
error_file: <N/A>
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
============================================================