Skip to content

Error prevents resuming training #54

@crapthings

Description

@crapthings
File "/InSPyReNet/run/Train.py", line 175, in <module> [00:00<?, ?it/s]
                                                                         Traceback (most recent call last):
  File "/InSPyReNet/run/Train.py", line 175, in <module>                 
    train(opt, args)
  File "/InSPyReNet/run/Train.py", line 141, in train
    scheduler.step()
  File "/usr/local/lib/python3.10/dist-packages/torch/optim/lr_scheduler.py", line 154, in step
    values = self.get_lr()
  File "/InSPyReNet/lib/optim/scheduler.py", line 25, in get_lr
    lrs = [min(self.warmup_lr(base_lr, alpha), self.poly_lr(base_lr, self.last_epoch)) for base_lr in
  File "/InSPyReNet/lib/optim/scheduler.py", line 25, in <listcomp>
    train(opt, args)
  File "/InSPyReNet/run/Train.py", line 141, in train
    lrs = [min(self.warmup_lr(base_lr, alpha), self.poly_lr(base_lr, self.last_epoch)) for base_lr in
Epoch 50%|████████████████████                    | 60/120 [00:07<?, ?it/s]
TypeError: '<' not supported between instances of 'complex' and 'float'
    scheduler.step()
  File "/usr/local/lib/python3.10/dist-packages/torch/optim/lr_scheduler.py", line 154, in step
    values = self.get_lr()
  File "/InSPyReNet/lib/optim/scheduler.py", line 25, in get_lr
Traceback (most recent call last):
      File "/InSPyReNet/run/Train.py", line 175, in <module>
lrs = [min(self.warmup_lr(base_lr, alpha), self.poly_lr(base_lr, self.last_epoch)) for base_lr in
  File "/InSPyReNet/lib/optim/scheduler.py", line 25, in <listcomp>
    lrs = [min(self.warmup_lr(base_lr, alpha), self.poly_lr(base_lr, self.last_epoch)) for base_lr in
TypeError: '<' not supported between instances of 'complex' and 'float'
Traceback (most recent call last):
  File "/InSPyReNet/run/Train.py", line 175, in <module>
    train(opt, args)
  File "/InSPyReNet/run/Train.py", line 141, in train
Traceback (most recent call last):
  File "/InSPyReNet/run/Train.py", line 175, in <module>
    scheduler.step()
      File "/usr/local/lib/python3.10/dist-packages/torch/optim/lr_scheduler.py", line 154, in step
train(opt, args)
  File "/InSPyReNet/run/Train.py", line 141, in train
    values = self.get_lr()
  File "/InSPyReNet/lib/optim/scheduler.py", line 25, in get_lr
    scheduler.step()
  File "/usr/local/lib/python3.10/dist-packages/torch/optim/lr_scheduler.py", line 154, in step
    lrs = [min(self.warmup_lr(base_lr, alpha), self.poly_lr(base_lr, self.last_epoch)) for base_lr in
  File "/InSPyReNet/lib/optim/scheduler.py", line 25, in <listcomp>
    train(opt, args)
      File "/InSPyReNet/run/Train.py", line 141, in train
    lrs = [min(self.warmup_lr(base_lr, alpha), self.poly_lr(base_lr, self.last_epoch)) for base_lr in
    values = self.get_lr()

  File "/InSPyReNet/lib/optim/scheduler.py", line 25, in get_lr
TypeError: '<' not supported between instances of 'complex' and 'float'
    lrs = [min(self.warmup_lr(base_lr, alpha), self.poly_lr(base_lr, self.last_epoch)) for base_lr in
    scheduler.step()
  File "/InSPyReNet/lib/optim/scheduler.py", line 25, in <listcomp>

  File "/usr/local/lib/python3.10/dist-packages/torch/optim/lr_scheduler.py", line 154, in step
Traceback (most recent call last):
  File "/InSPyReNet/run/Train.py", line 175, in <module>
    lrs = [min(self.warmup_lr(base_lr, alpha), self.poly_lr(base_lr, self.last_epoch)) for base_lr in
TypeError: '<' not supported between instances of 'complex' and 'float'
    values = self.get_lr()
  File "/InSPyReNet/lib/optim/scheduler.py", line 25, in get_lr
    lrs = [min(self.warmup_lr(base_lr, alpha), self.poly_lr(base_lr, self.last_epoch)) for base_lr in
  File "/InSPyReNet/lib/optim/scheduler.py", line 25, in <listcomp>
    lrs = [min(self.warmup_lr(base_lr, alpha), self.poly_lr(base_lr, self.last_epoch)) for base_lr in
TypeError: '<' not supported between instances of 'complex' and 'float'
Traceback (most recent call last):
  File "/InSPyReNet/run/Train.py", line 175, in <module>
    train(opt, args)
  File "/InSPyReNet/run/Train.py", line 141, in train
    scheduler.step()
  File "/usr/local/lib/python3.10/dist-packages/torch/optim/lr_scheduler.py", line 154, in step
    values = self.get_lr()
  File "/InSPyReNet/lib/optim/scheduler.py", line 25, in get_lr
    lrs = [min(self.warmup_lr(base_lr, alpha), self.poly_lr(base_lr, self.last_epoch)) for base_lr in
  File "/InSPyReNet/lib/optim/scheduler.py", line 25, in <listcomp>
    lrs = [min(self.warmup_lr(base_lr, alpha), self.poly_lr(base_lr, self.last_epoch)) for base_lr in
TypeError: '<' not supported between instances of 'complex' and 'float'
    train(opt, args)
  File "/InSPyReNet/run/Train.py", line 141, in train
    scheduler.step()
  File "/usr/local/lib/python3.10/dist-packages/torch/optim/lr_scheduler.py", line 154, in step
    values = self.get_lr()
  File "/InSPyReNet/lib/optim/scheduler.py", line 25, in get_lr
    lrs = [min(self.warmup_lr(base_lr, alpha), self.poly_lr(base_lr, self.last_epoch)) for base_lr in
  File "/InSPyReNet/lib/optim/scheduler.py", line 25, in <listcomp>
    lrs = [min(self.warmup_lr(base_lr, alpha), self.poly_lr(base_lr, self.last_epoch)) for base_lr in
TypeError: '<' not supported between instances of 'complex' and 'float'
Traceback (most recent call last):
  File "/InSPyReNet/run/Train.py", line 175, in <module>
    train(opt, args)
  File "/InSPyReNet/run/Train.py", line 141, in train
    scheduler.step()
  File "/usr/local/lib/python3.10/dist-packages/torch/optim/lr_scheduler.py", line 154, in step
    values = self.get_lr()
  File "/InSPyReNet/lib/optim/scheduler.py", line 25, in get_lr
    lrs = [min(self.warmup_lr(base_lr, alpha), self.poly_lr(base_lr, self.last_epoch)) for base_lr in
  File "/InSPyReNet/lib/optim/scheduler.py", line 25, in <listcomp>
    lrs = [min(self.warmup_lr(base_lr, alpha), self.poly_lr(base_lr, self.last_epoch)) for base_lr in
TypeError: '<' not supported between instances of 'complex' and 'float'
Traceback (most recent call last):
  File "/InSPyReNet/run/Train.py", line 175, in <module>
    train(opt, args)
  File "/InSPyReNet/run/Train.py", line 141, in train
    scheduler.step()
  File "/usr/local/lib/python3.10/dist-packages/torch/optim/lr_scheduler.py", line 154, in step
    values = self.get_lr()
  File "/InSPyReNet/lib/optim/scheduler.py", line 25, in get_lr
    lrs = [min(self.warmup_lr(base_lr, alpha), self.poly_lr(base_lr, self.last_epoch)) for base_lr in
  File "/InSPyReNet/lib/optim/scheduler.py", line 25, in <listcomp>
    lrs = [min(self.warmup_lr(base_lr, alpha), self.poly_lr(base_lr, self.last_epoch)) for base_lr in
TypeError: '<' not supported between instances of 'complex' and 'float'
Traceback (most recent call last):
  File "/InSPyReNet/run/Train.py", line 175, in <module>
    train(opt, args)
  File "/InSPyReNet/run/Train.py", line 141, in train
    scheduler.step()
  File "/usr/local/lib/python3.10/dist-packages/torch/optim/lr_scheduler.py", line 154, in step
    values = self.get_lr()
  File "/InSPyReNet/lib/optim/scheduler.py", line 25, in get_lr
    lrs = [min(self.warmup_lr(base_lr, alpha), self.poly_lr(base_lr, self.last_epoch)) for base_lr in
  File "/InSPyReNet/lib/optim/scheduler.py", line 25, in <listcomp>
    lrs = [min(self.warmup_lr(base_lr, alpha), self.poly_lr(base_lr, self.last_epoch)) for base_lr in
TypeError: '<' not supported between instances of 'complex' and 'float'
[2024-10-25 04:18:38,492] torch.distributed.elastic.multiprocessing.api: [WARNING] Sending process 654567 closing signal SIGTERM
[2024-10-25 04:18:38,492] torch.distributed.elastic.multiprocessing.api: [WARNING] Sending process 654568 closing signal SIGTERM
[2024-10-25 04:18:38,492] torch.distributed.elastic.multiprocessing.api: [WARNING] Sending process 654571 closing signal SIGTERM
[2024-10-25 04:18:38,492] torch.distributed.elastic.multiprocessing.api: [WARNING] Sending process 654573 closing signal SIGTERM
[2024-10-25 04:18:38,492] torch.distributed.elastic.multiprocessing.api: [WARNING] Sending process 654574 closing signal SIGTERM
[2024-10-25 04:18:40,324] torch.distributed.elastic.multiprocessing.api: [ERROR] failed (exitcode: 1) local_rank: 0 (pid: 654565) of binary: /usr/bin/python
Traceback (most recent call last):
  File "/usr/local/bin/torchrun", line 8, in <module>
    sys.exit(main())
  File "/usr/local/lib/python3.10/dist-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper
    return f(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/torch/distributed/run.py", line 812, in main
    run(args)
  File "/usr/local/lib/python3.10/dist-packages/torch/distributed/run.py", line 803, in run
    elastic_launch(
  File "/usr/local/lib/python3.10/dist-packages/torch/distributed/launcher/api.py", line 135, in __call__
    return launch_agent(self._config, self._entrypoint, list(args))
  File "/usr/local/lib/python3.10/dist-packages/torch/distributed/launcher/api.py", line 268, in launch_agent
    raise ChildFailedError(
torch.distributed.elastic.multiprocessing.errors.ChildFailedError: 
============================================================
run/Train.py FAILED
------------------------------------------------------------
Failures:
[1]:
  time      : 2024-10-25_04:18:38
  host      : ca38e8013903
  rank      : 1 (local_rank: 1)
  exitcode  : 1 (pid: 654566)
  error_file: <N/A>
  traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
[2]:
  time      : 2024-10-25_04:18:38
  host      : ca38e8013903
  rank      : 4 (local_rank: 4)
  exitcode  : 1 (pid: 654569)
  error_file: <N/A>
  traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
[3]:
  time      : 2024-10-25_04:18:38
  host      : ca38e8013903
  rank      : 5 (local_rank: 5)
  exitcode  : 1 (pid: 654570)
  error_file: <N/A>
  traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
[4]:
  time      : 2024-10-25_04:18:38
  host      : ca38e8013903
  rank      : 7 (local_rank: 7)
  exitcode  : 1 (pid: 654572)
  error_file: <N/A>
  traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
------------------------------------------------------------
Root Cause (first observed failure):
[0]:
  time      : 2024-10-25_04:18:38
  host      : ca38e8013903
  rank      : 0 (local_rank: 0)
  exitcode  : 1 (pid: 654565)
  error_file: <N/A>
  traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
============================================================

Metadata

Assignees

No one assigned

    Labels

    No labels

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions