You signed in with another tab or window. Reload to refresh your session. You signed out in another tab or window. Reload to refresh your session. You switched accounts on another tab or window. Reload to refresh your session. Dismiss alert
首先感谢您优秀的工作。我已在 NYU 数据集上训练成功,但在 ScanNet 数据集上一直失败。我可以确定,某些数据在计算损失时会出现 NaN;并且您的原始代码中也有类似 `# print(521, torch.any(torch.isnan(out_dict['depth_1_1'])))` 的被注释掉的调试语句。请问这是否是部分场景本身的数据错误导致的?我根据您代码中的指示删除了对应的场景,但在其他场景下仍然出现了类似的错误。请问能否提供正确的 train_subscenes.txt 与 val_subscenes.txt?非常感谢!以下是我在训练过程中遇到的错误:
Epoch 0: 2%| | 264/11422 [11:33<8:08:38, 2.63s/it, v_num=, train/loss_step=13
/opt/conda/conda-bld/pytorch_1678402374358/work/aten/src/ATen/native/cuda/Loss.cu:92: operator(): block: [0,0,0], thread: [0,0,0] Assertion `input_val >= zero && input_val <= one` failed.
Traceback (most recent call last):
File "/home/gwr/.conda/envs/iso/lib/python3.9/site-packages/pytorch_lightning/trainer/call.py", line 42, in _call_and_handle_interrupt
return trainer.strategy.launcher.launch(trainer_fn, *args, trainer=trainer, **kwargs)
File "/home/gwr/.conda/envs/iso/lib/python3.9/site-packages/pytorch_lightning/strategies/launchers/subprocess_script.py", line 92, in launch
return function(*args, **kwargs)
File "/home/gwr/.conda/envs/iso/lib/python3.9/site-packages/pytorch_lightning/trainer/trainer.py", line 559, in _fit_impl
self._run(model, ckpt_path=ckpt_path)
File "/home/gwr/.conda/envs/iso/lib/python3.9/site-packages/pytorch_lightning/trainer/trainer.py", line 935, in _run
results = self._run_stage()
File "/home/gwr/.conda/envs/iso/lib/python3.9/site-packages/pytorch_lightning/trainer/trainer.py", line 978, in _run_stage
self.fit_loop.run()
File "/home/gwr/.conda/envs/iso/lib/python3.9/site-packages/pytorch_lightning/loops/fit_loop.py", line 201, in run
self.advance()
File "/home/gwr/.conda/envs/iso/lib/python3.9/site-packages/pytorch_lightning/loops/fit_loop.py", line 354, in advance
self.epoch_loop.run(self._data_fetcher)
File "/home/gwr/.conda/envs/iso/lib/python3.9/site-packages/pytorch_lightning/loops/training_epoch_loop.py", line 133, in run
self.advance(data_fetcher)
File "/home/gwr/.conda/envs/iso/lib/python3.9/site-packages/pytorch_lightning/loops/training_epoch_loop.py", line 218, in advance
batch_output = self.automatic_optimization.run(trainer.optimizers[0], kwargs)
File "/home/gwr/.conda/envs/iso/lib/python3.9/site-packages/pytorch_lightning/loops/optimization/automatic.py", line 185, in run
self._optimizer_step(kwargs.get("batch_idx", 0), closure)
File "/home/gwr/.conda/envs/iso/lib/python3.9/site-packages/pytorch_lightning/loops/optimization/automatic.py", line 261, in _optimizer_step
call._call_lightning_module_hook(
File "/home/gwr/.conda/envs/iso/lib/python3.9/site-packages/pytorch_lightning/trainer/call.py", line 142, in _call_lightning_module_hook
output = fn(*args, **kwargs)
File "/home/gwr/.conda/envs/iso/lib/python3.9/site-packages/pytorch_lightning/core/module.py", line 1266, in optimizer_step
optimizer.step(closure=optimizer_closure)
File "/home/gwr/.conda/envs/iso/lib/python3.9/site-packages/pytorch_lightning/core/optimizer.py", line 158, in step
step_output = self._strategy.optimizer_step(self._optimizer, closure, **kwargs)
File "/home/gwr/.conda/envs/iso/lib/python3.9/site-packages/pytorch_lightning/strategies/ddp.py", line 257, in optimizer_step
optimizer_output = super().optimizer_step(optimizer, closure, model, **kwargs)
File "/home/gwr/.conda/envs/iso/lib/python3.9/site-packages/pytorch_lightning/strategies/strategy.py", line 224, in optimizer_step
return self.precision_plugin.optimizer_step(optimizer, model=model, closure=closure, **kwargs)
File "/home/gwr/.conda/envs/iso/lib/python3.9/site-packages/pytorch_lightning/plugins/precision/precision_plugin.py", line 114, in optimizer_step
return optimizer.step(closure=closure, **kwargs)
File "/home/gwr/.conda/envs/iso/lib/python3.9/site-packages/torch/optim/lr_scheduler.py", line 69, in wrapper
return wrapped(*args, **kwargs)
File "/home/gwr/.conda/envs/iso/lib/python3.9/site-packages/torch/optim/optimizer.py", line 280, in wrapper
out = func(*args, **kwargs)
File "/home/gwr/.conda/envs/iso/lib/python3.9/site-packages/torch/optim/optimizer.py", line 33, in _use_grad
ret = func(self, *args, **kwargs)
File "/home/gwr/.conda/envs/iso/lib/python3.9/site-packages/torch/optim/adamw.py", line 148, in step
loss = closure()
File "/home/gwr/.conda/envs/iso/lib/python3.9/site-packages/pytorch_lightning/plugins/precision/precision_plugin.py", line 101, in _wrap_closure
closure_result = closure()
File "/home/gwr/.conda/envs/iso/lib/python3.9/site-packages/pytorch_lightning/loops/optimization/automatic.py", line 140, in call
self._result = self.closure(*args, **kwargs)
File "/home/gwr/.conda/envs/iso/lib/python3.9/site-packages/pytorch_lightning/loops/optimization/automatic.py", line 126, in closure
step_output = self._step_fn()
File "/home/gwr/.conda/envs/iso/lib/python3.9/site-packages/pytorch_lightning/loops/optimization/automatic.py", line 308, in _training_step
training_step_output = call._call_strategy_hook(trainer, "training_step", *kwargs.values())
File "/home/gwr/.conda/envs/iso/lib/python3.9/site-packages/pytorch_lightning/trainer/call.py", line 288, in _call_strategy_hook
output = fn(*args, **kwargs)
File "/home/gwr/.conda/envs/iso/lib/python3.9/site-packages/pytorch_lightning/strategies/ddp.py", line 329, in training_step
return self.model(*args, **kwargs)
File "/home/gwr/.conda/envs/iso/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
return forward_call(*args, **kwargs)
File "/home/gwr/.conda/envs/iso/lib/python3.9/site-packages/torch/nn/parallel/distributed.py", line 1156, in forward
output = self._run_ddp_forward(*inputs, **kwargs)
File "/home/gwr/.conda/envs/iso/lib/python3.9/site-packages/torch/nn/parallel/distributed.py", line 1110, in _run_ddp_forward
return module_to_run(*inputs[0], **kwargs[0]) # type: ignore[index]
File "/home/gwr/.conda/envs/iso/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
return forward_call(*args, **kwargs)
File "/home/gwr/.conda/envs/iso/lib/python3.9/site-packages/pytorch_lightning/overrides/base.py", line 90, in forward
output = self._forward_module.training_step(*inputs, **kwargs)
File "/home/gwr/PycharmProjects/ISO/iso/models/iso.py", line 762, in training_step
return self.step(batch, "train", self.train_metrics)
File "/home/gwr/PycharmProjects/ISO/iso/models/iso.py", line 688, in step
loss_sem_scal = sem_scal_loss(batch['name'], ssc_pred, target)
File "/home/gwr/PycharmProjects/ISO/iso/loss/ssc_loss.py", line 69, in sem_scal_loss
loss_precision = F.binary_cross_entropy(
File "/home/gwr/.conda/envs/iso/lib/python3.9/site-packages/torch/nn/functional.py", line 3098, in binary_cross_entropy
return torch._C._nn.binary_cross_entropy(input, target, weight, reduction_enum)
RuntimeError: CUDA error: device-side assert triggered
Compile with TORCH_USE_CUDA_DSA to enable device-side assertions.
The text was updated successfully, but these errors were encountered:
首先感谢您优秀的工作。我已在 NYU 数据集上训练成功,但在 ScanNet 数据集上一直失败。我可以确定,某些数据在计算损失时会出现 NaN;并且您的原始代码中也有类似 `# print(521, torch.any(torch.isnan(out_dict['depth_1_1'])))` 的被注释掉的调试语句。请问这是否是部分场景本身的数据错误导致的?我根据您代码中的指示删除了对应的场景,但在其他场景下仍然出现了类似的错误。请问能否提供正确的 train_subscenes.txt 与 val_subscenes.txt?非常感谢!以下是我在训练过程中遇到的错误:
Epoch 0: 2%| | 264/11422 [11:33<8:08:38, 2.63s/it, v_num=, train/loss_step=13
/opt/conda/conda-bld/pytorch_1678402374358/work/aten/src/ATen/native/cuda/Loss.cu:92: operator(): block: [0,0,0], thread: [0,0,0] Assertion `input_val >= zero && input_val <= one` failed.
Traceback (most recent call last):
File "/home/gwr/.conda/envs/iso/lib/python3.9/site-packages/pytorch_lightning/trainer/call.py", line 42, in _call_and_handle_interrupt
return trainer.strategy.launcher.launch(trainer_fn, *args, trainer=trainer, **kwargs)
File "/home/gwr/.conda/envs/iso/lib/python3.9/site-packages/pytorch_lightning/strategies/launchers/subprocess_script.py", line 92, in launch
return function(*args, **kwargs)
File "/home/gwr/.conda/envs/iso/lib/python3.9/site-packages/pytorch_lightning/trainer/trainer.py", line 559, in _fit_impl
self._run(model, ckpt_path=ckpt_path)
File "/home/gwr/.conda/envs/iso/lib/python3.9/site-packages/pytorch_lightning/trainer/trainer.py", line 935, in _run
results = self._run_stage()
File "/home/gwr/.conda/envs/iso/lib/python3.9/site-packages/pytorch_lightning/trainer/trainer.py", line 978, in _run_stage
self.fit_loop.run()
File "/home/gwr/.conda/envs/iso/lib/python3.9/site-packages/pytorch_lightning/loops/fit_loop.py", line 201, in run
self.advance()
File "/home/gwr/.conda/envs/iso/lib/python3.9/site-packages/pytorch_lightning/loops/fit_loop.py", line 354, in advance
self.epoch_loop.run(self._data_fetcher)
File "/home/gwr/.conda/envs/iso/lib/python3.9/site-packages/pytorch_lightning/loops/training_epoch_loop.py", line 133, in run
self.advance(data_fetcher)
File "/home/gwr/.conda/envs/iso/lib/python3.9/site-packages/pytorch_lightning/loops/training_epoch_loop.py", line 218, in advance
batch_output = self.automatic_optimization.run(trainer.optimizers[0], kwargs)
File "/home/gwr/.conda/envs/iso/lib/python3.9/site-packages/pytorch_lightning/loops/optimization/automatic.py", line 185, in run
self._optimizer_step(kwargs.get("batch_idx", 0), closure)
File "/home/gwr/.conda/envs/iso/lib/python3.9/site-packages/pytorch_lightning/loops/optimization/automatic.py", line 261, in _optimizer_step
call._call_lightning_module_hook(
File "/home/gwr/.conda/envs/iso/lib/python3.9/site-packages/pytorch_lightning/trainer/call.py", line 142, in _call_lightning_module_hook
output = fn(*args, **kwargs)
File "/home/gwr/.conda/envs/iso/lib/python3.9/site-packages/pytorch_lightning/core/module.py", line 1266, in optimizer_step
optimizer.step(closure=optimizer_closure)
File "/home/gwr/.conda/envs/iso/lib/python3.9/site-packages/pytorch_lightning/core/optimizer.py", line 158, in step
step_output = self._strategy.optimizer_step(self._optimizer, closure, **kwargs)
File "/home/gwr/.conda/envs/iso/lib/python3.9/site-packages/pytorch_lightning/strategies/ddp.py", line 257, in optimizer_step
optimizer_output = super().optimizer_step(optimizer, closure, model, **kwargs)
File "/home/gwr/.conda/envs/iso/lib/python3.9/site-packages/pytorch_lightning/strategies/strategy.py", line 224, in optimizer_step
return self.precision_plugin.optimizer_step(optimizer, model=model, closure=closure, **kwargs)
File "/home/gwr/.conda/envs/iso/lib/python3.9/site-packages/pytorch_lightning/plugins/precision/precision_plugin.py", line 114, in optimizer_step
return optimizer.step(closure=closure, **kwargs)
File "/home/gwr/.conda/envs/iso/lib/python3.9/site-packages/torch/optim/lr_scheduler.py", line 69, in wrapper
return wrapped(*args, **kwargs)
File "/home/gwr/.conda/envs/iso/lib/python3.9/site-packages/torch/optim/optimizer.py", line 280, in wrapper
out = func(*args, **kwargs)
File "/home/gwr/.conda/envs/iso/lib/python3.9/site-packages/torch/optim/optimizer.py", line 33, in _use_grad
ret = func(self, *args, **kwargs)
File "/home/gwr/.conda/envs/iso/lib/python3.9/site-packages/torch/optim/adamw.py", line 148, in step
loss = closure()
File "/home/gwr/.conda/envs/iso/lib/python3.9/site-packages/pytorch_lightning/plugins/precision/precision_plugin.py", line 101, in _wrap_closure
closure_result = closure()
File "/home/gwr/.conda/envs/iso/lib/python3.9/site-packages/pytorch_lightning/loops/optimization/automatic.py", line 140, in call
self._result = self.closure(*args, **kwargs)
File "/home/gwr/.conda/envs/iso/lib/python3.9/site-packages/pytorch_lightning/loops/optimization/automatic.py", line 126, in closure
step_output = self._step_fn()
File "/home/gwr/.conda/envs/iso/lib/python3.9/site-packages/pytorch_lightning/loops/optimization/automatic.py", line 308, in _training_step
training_step_output = call._call_strategy_hook(trainer, "training_step", *kwargs.values())
File "/home/gwr/.conda/envs/iso/lib/python3.9/site-packages/pytorch_lightning/trainer/call.py", line 288, in _call_strategy_hook
output = fn(*args, **kwargs)
File "/home/gwr/.conda/envs/iso/lib/python3.9/site-packages/pytorch_lightning/strategies/ddp.py", line 329, in training_step
return self.model(*args, **kwargs)
File "/home/gwr/.conda/envs/iso/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
return forward_call(*args, **kwargs)
File "/home/gwr/.conda/envs/iso/lib/python3.9/site-packages/torch/nn/parallel/distributed.py", line 1156, in forward
output = self._run_ddp_forward(*inputs, **kwargs)
File "/home/gwr/.conda/envs/iso/lib/python3.9/site-packages/torch/nn/parallel/distributed.py", line 1110, in _run_ddp_forward
return module_to_run(*inputs[0], **kwargs[0]) # type: ignore[index]
File "/home/gwr/.conda/envs/iso/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
return forward_call(*args, **kwargs)
File "/home/gwr/.conda/envs/iso/lib/python3.9/site-packages/pytorch_lightning/overrides/base.py", line 90, in forward
output = self._forward_module.training_step(*inputs, **kwargs)
File "/home/gwr/PycharmProjects/ISO/iso/models/iso.py", line 762, in training_step
return self.step(batch, "train", self.train_metrics)
File "/home/gwr/PycharmProjects/ISO/iso/models/iso.py", line 688, in step
loss_sem_scal = sem_scal_loss(batch['name'], ssc_pred, target)
File "/home/gwr/PycharmProjects/ISO/iso/loss/ssc_loss.py", line 69, in sem_scal_loss
loss_precision = F.binary_cross_entropy(
File "/home/gwr/.conda/envs/iso/lib/python3.9/site-packages/torch/nn/functional.py", line 3098, in binary_cross_entropy
return torch._C._nn.binary_cross_entropy(input, target, weight, reduction_enum)
RuntimeError: CUDA error: device-side assert triggered
Compile with TORCH_USE_CUDA_DSA to enable device-side assertions.
The text was updated successfully, but these errors were encountered: