Unverified Commit 19d03a95 authored by Glenn Jocher's avatar Glenn Jocher Committed by GitHub
Browse files

Remove DDP process group timeout (#4422)

parent 4e65052f
@@ -493,7 +493,7 @@ def main(opt):
         assert not opt.sync_bn, '--sync-bn known training issue, see https://github.com/ultralytics/yolov5/issues/3998'
         torch.cuda.set_device(LOCAL_RANK)
         device = torch.device('cuda', LOCAL_RANK)
-        dist.init_process_group(backend="nccl" if dist.is_nccl_available() else "gloo", timeout=timedelta(seconds=60))
+        dist.init_process_group(backend="nccl" if dist.is_nccl_available() else "gloo")

     # Train
     if not opt.evolve:
...
@@ -35,10 +35,10 @@ def torch_distributed_zero_first(local_rank: int):
     Decorator to make all processes in distributed training wait for each local_master to do something.
     """
     if local_rank not in [-1, 0]:
-        dist.barrier()
+        dist.barrier(device_ids=[local_rank])
     yield
     if local_rank == 0:
-        dist.barrier()
+        dist.barrier(device_ids=[0])


def init_torch_seeds(seed=0):
...
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment