# Modified from https://github.com/open-mmlab/mmcv/blob/master/mmcv/runner/dist_utils.py # noqa: E501
import functools
import os
import pickle
import subprocess

import torch
import torch.distributed as dist
import torch.multiprocessing as mp


# ----------------------------------
# init
# ----------------------------------
def init_dist(launcher, backend='nccl', **kwargs):
    if mp.get_start_method(allow_none=True) is None:
        mp.set_start_method('spawn')
    if launcher == 'pytorch':
        _init_dist_pytorch(backend, **kwargs)
    elif launcher == 'slurm':
        _init_dist_slurm(backend, **kwargs)
    else:
        raise ValueError(f'Invalid launcher type: {launcher}')
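
# Usage sketch (illustrative, not part of the original module): with the
# 'pytorch' launcher each process is expected to be started by torchrun (or
# torch.distributed.launch), which exports the RANK environment variable read
# by _init_dist_pytorch, e.g.
#
#     # torchrun --nproc_per_node=4 train.py   (train.py is a hypothetical script)
#     init_dist('pytorch', backend='nccl')
#     rank, world_size = get_dist_info()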


def _init_dist_pytorch(backend, **kwargs):
    rank = int(os.environ['RANK'])
    num_gpus = torch.cuda.device_count()
    torch.cuda.set_device(rank % num_gpus)
    dist.init_process_group(backend=backend, **kwargs)


def _init_dist_slurm(backend, port=None):
    """Initialize slurm distributed training environment.

    If argument ``port`` is not specified, the master port is read from the
    system environment variable ``MASTER_PORT``. If ``MASTER_PORT`` is not set
    either, a default port of ``29500`` is used.

    Args:
        backend (str): Backend of torch.distributed.
        port (int, optional): Master port. Defaults to None.
    """
    proc_id = int(os.environ['SLURM_PROCID'])
    ntasks = int(os.environ['SLURM_NTASKS'])
    node_list = os.environ['SLURM_NODELIST']
    num_gpus = torch.cuda.device_count()
    torch.cuda.set_device(proc_id % num_gpus)
    addr = subprocess.getoutput(
        f'scontrol show hostname {node_list} | head -n1')
    # specify master port
    if port is not None:
        os.environ['MASTER_PORT'] = str(port)
    elif 'MASTER_PORT' in os.environ:
        pass  # use MASTER_PORT in the environment variable
    else:
        # 29500 is torch.distributed default port
        os.environ['MASTER_PORT'] = '29500'
    os.environ['MASTER_ADDR'] = addr
    os.environ['WORLD_SIZE'] = str(ntasks)
    os.environ['LOCAL_RANK'] = str(proc_id % num_gpus)
    os.environ['RANK'] = str(proc_id)
    dist.init_process_group(backend=backend)
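
# Usage sketch under SLURM (illustrative, not part of the original module):
# every task launched by srun reads SLURM_PROCID / SLURM_NTASKS / SLURM_NODELIST,
# and the first host in the allocation becomes the master address, e.g.
#
#     # srun -N2 --ntasks-per-node=4 python train.py   (hypothetical launch)
#     init_dist('slurm', backend='nccl', port=29510)   # port is optional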


# ----------------------------------
# get rank and world_size
# ----------------------------------
def get_dist_info():
    if dist.is_available():
        initialized = dist.is_initialized()
    else:
        initialized = False
    if initialized:
        rank = dist.get_rank()
        world_size = dist.get_world_size()
    else:
        rank = 0
        world_size = 1
    return rank, world_size


def get_rank():
    if not dist.is_available():
        return 0

    if not dist.is_initialized():
        return 0

    return dist.get_rank()


def get_world_size():
    if not dist.is_available():
        return 1

    if not dist.is_initialized():
        return 1

    return dist.get_world_size()


def master_only(func):

    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        rank, _ = get_dist_info()
        if rank == 0:
            return func(*args, **kwargs)

    return wrapper
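
# Usage sketch (illustrative, not part of the original module): master_only
# turns the decorated function into a no-op on every rank except rank 0, which
# is convenient for logging and checkpointing; the wrapper returns None on
# non-master ranks. Hypothetical example:
#
#     @master_only
#     def save_checkpoint(state, path):
#         torch.save(state, path)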


# ----------------------------------
# operation across ranks
# ----------------------------------
def reduce_sum(tensor):
    if not dist.is_available():
        return tensor

    if not dist.is_initialized():
        return tensor

    tensor = tensor.clone()
    dist.all_reduce(tensor, op=dist.ReduceOp.SUM)

    return tensor


def gather_grad(params):
    world_size = get_world_size()

    if world_size == 1:
        return

    for param in params:
        if param.grad is not None:
            dist.all_reduce(param.grad.data, op=dist.ReduceOp.SUM)
            param.grad.data.div_(world_size)
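
# Usage sketch (illustrative, not part of the original module): gather_grad
# averages gradients across ranks by hand (all-reduce sum, then divide by the
# world size), e.g. when the model is not wrapped in DistributedDataParallel:
#
#     loss.backward()
#     gather_grad(model.parameters())
#     optimizer.step()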


def all_gather(data):
    world_size = get_world_size()

    if world_size == 1:
        return [data]

    buffer = pickle.dumps(data)
    storage = torch.ByteStorage.from_buffer(buffer)
    tensor = torch.ByteTensor(storage).to('cuda')

    local_size = torch.IntTensor([tensor.numel()]).to('cuda')
    size_list = [torch.IntTensor([0]).to('cuda') for _ in range(world_size)]
    dist.all_gather(size_list, local_size)
    size_list = [int(size.item()) for size in size_list]
    max_size = max(size_list)

    tensor_list = []
    for _ in size_list:
        tensor_list.append(torch.ByteTensor(size=(max_size,)).to('cuda'))

    if local_size != max_size:
        padding = torch.ByteTensor(size=(max_size - local_size,)).to('cuda')
        tensor = torch.cat((tensor, padding), 0)

    dist.all_gather(tensor_list, tensor)

    data_list = []

    for size, tensor in zip(size_list, tensor_list):
        buffer = tensor.cpu().numpy().tobytes()[:size]
        data_list.append(pickle.loads(buffer))

    return data_list
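
# Usage sketch (illustrative, not part of the original module): all_gather
# pickles an arbitrary picklable object on each rank, pads the resulting byte
# tensors to a common length (dist.all_gather requires equal-sized tensors),
# and returns the unpickled objects from all ranks, e.g.
#
#     stats = {'rank': get_rank(), 'num_samples': 123}   # hypothetical payload
#     all_stats = all_gather(stats)  # list with world_size entries on every rank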


def reduce_loss_dict(loss_dict):
    world_size = get_world_size()

    if world_size < 2:
        return loss_dict

    with torch.no_grad():
        keys = []
        losses = []

        for k in sorted(loss_dict.keys()):
            keys.append(k)
            losses.append(loss_dict[k])

        losses = torch.stack(losses, 0)
        dist.reduce(losses, dst=0)

        if dist.get_rank() == 0:
            losses /= world_size

        reduced_losses = {k: v for k, v in zip(keys, losses)}

    return reduced_losses
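
# Usage sketch (illustrative, not part of the original module): reduce_loss_dict
# sums a dict of scalar loss tensors onto rank 0 (dist.reduce with dst=0) and
# averages them there, so the result is usually only logged on the master
# process, e.g. with hypothetical loss names:
#
#     reduced = reduce_loss_dict({'l_pix': l_pix, 'l_gan': l_gan})
#     if get_rank() == 0:
#         print({k: v.item() for k, v in reduced.items()})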