# YOLOv5 🚀 by Ultralytics, GPL-3.0 license
"""
Train a YOLOv5 model on a custom dataset

Usage:
    $ python path/to/train.py --data coco128.yaml --weights yolov5s.pt --img 640
"""

Glenn Jocher's avatar
Glenn Jocher committed
9
import argparse
10
import logging
11
import math
Jirka Borovec's avatar
Jirka Borovec committed
12
import os
Glenn Jocher's avatar
Glenn Jocher committed
13
import random
14
import sys
Jirka Borovec's avatar
Jirka Borovec committed
15
import time
16
from copy import deepcopy
Jirka Borovec's avatar
Jirka Borovec committed
17
18
19
from pathlib import Path

import numpy as np
Glenn Jocher's avatar
Glenn Jocher committed
20
import torch
Glenn Jocher's avatar
Glenn Jocher committed
21
import torch.distributed as dist
22
import torch.nn as nn
Jirka Borovec's avatar
Jirka Borovec committed
23
import yaml
24
from torch.cuda import amp
yzchen's avatar
yzchen committed
25
from torch.nn.parallel import DistributedDataParallel as DDP
Glenn Jocher's avatar
Glenn Jocher committed
26
from torch.optim import Adam, SGD, lr_scheduler
Jirka Borovec's avatar
Jirka Borovec committed
27
from tqdm import tqdm
Glenn Jocher's avatar
Glenn Jocher committed
28

29
30
31
FILE = Path(__file__).absolute()
sys.path.append(FILE.parents[0].as_posix())  # add yolov5/ to path

32
import val  # for end-of-epoch mAP
33
from models.experimental import attempt_load
Glenn Jocher's avatar
Glenn Jocher committed
34
from models.yolo import Model
Glenn Jocher's avatar
Glenn Jocher committed
35
from utils.autoanchor import check_anchors
Jirka Borovec's avatar
Jirka Borovec committed
36
from utils.datasets import create_dataloader
Glenn Jocher's avatar
Glenn Jocher committed
37
from utils.general import labels_to_class_weights, increment_path, labels_to_image_weights, init_seeds, \
38
    strip_optimizer, get_latest_run, check_dataset, check_file, check_git_status, check_img_size, \
39
    check_requirements, print_mutation, set_logging, one_cycle, colorstr, methods
40
from utils.downloads import attempt_download
Glenn Jocher's avatar
Glenn Jocher committed
41
from utils.loss import ComputeLoss
Glenn Jocher's avatar
Glenn Jocher committed
42
from utils.plots import plot_labels, plot_evolve
43
from utils.torch_utils import ModelEMA, select_device, intersect_dicts, torch_distributed_zero_first, de_parallel
44
from utils.loggers.wandb.wandb_utils import check_wandb_resume
45
from utils.metrics import fitness
46
from utils.loggers import Loggers
47
from utils.callbacks import Callbacks
Alex Stoken's avatar
Alex Stoken committed
48

Glenn Jocher's avatar
Glenn Jocher committed
49
# Module-level logger for all training output.
LOGGER = logging.getLogger(__name__)

# Distributed-training environment, set by torch.distributed.run (torchrun):
# https://pytorch.org/docs/stable/elastic/run.html
LOCAL_RANK = int(os.getenv('LOCAL_RANK', -1))  # GPU index on this node; -1 = not launched via DDP
RANK = int(os.getenv('RANK', -1))  # global process rank across all nodes; -1 = single-process run
WORLD_SIZE = int(os.getenv('WORLD_SIZE', 1))  # total number of DDP processes; 1 = single-process run
Glenn Jocher's avatar
Glenn Jocher committed
53

Glenn Jocher's avatar
Glenn Jocher committed
54

55
def train(hyp,  # path/to/hyp.yaml or hyp dictionary
          opt,
          device,
          callbacks=None
          ):
    """Train a YOLOv5 model.

    Args:
        hyp: path to a hyperparameter YAML file, or an already-loaded hyp dict.
        opt: argparse.Namespace of command-line options (see parse_opt()).
        device: torch.device to train on.
        callbacks: optional Callbacks instance for training-event hooks. A fresh
            instance is created per call when omitted; the previous
            ``callbacks=Callbacks()`` default was shared across calls, so repeated
            invocations (e.g. hyperparameter evolution) re-registered logger
            actions on one global object.

    Returns:
        results: tuple (P, R, mAP@.5, mAP@.5-.95, val_loss(box, obj, cls)) from
            the final validation pass.
    """
    if callbacks is None:
        callbacks = Callbacks()
    save_dir, epochs, batch_size, weights, single_cls, evolve, data, cfg, resume, noval, nosave, workers, freeze = \
        Path(opt.save_dir), opt.epochs, opt.batch_size, opt.weights, opt.single_cls, opt.evolve, opt.data, opt.cfg, \
        opt.resume, opt.noval, opt.nosave, opt.workers, opt.freeze

    # Directories
    w = save_dir / 'weights'  # weights dir
    w.mkdir(parents=True, exist_ok=True)  # make dir
    last, best = w / 'last.pt', w / 'best.pt'

    # Hyperparameters
    if isinstance(hyp, str):
        with open(hyp) as f:
            hyp = yaml.safe_load(f)  # load hyps dict
    LOGGER.info(colorstr('hyperparameters: ') + ', '.join(f'{k}={v}' for k, v in hyp.items()))

    # Save run settings
    with open(save_dir / 'hyp.yaml', 'w') as f:
        yaml.safe_dump(hyp, f, sort_keys=False)
    with open(save_dir / 'opt.yaml', 'w') as f:
        yaml.safe_dump(vars(opt), f, sort_keys=False)
    data_dict = None

    # Loggers (rank 0 / single-GPU process only)
    if RANK in [-1, 0]:
        loggers = Loggers(save_dir, weights, opt, hyp, LOGGER)  # loggers instance
        if loggers.wandb:
            data_dict = loggers.wandb.data_dict
            if resume:  # W&B may have restored a run; re-read these from opt
                weights, epochs, hyp = opt.weights, opt.epochs, opt.hyp

        # Register actions
        for k in methods(loggers):
            callbacks.register_action(k, callback=getattr(loggers, k))

    # Config
    plots = not evolve  # create plots
    cuda = device.type != 'cpu'
    init_seeds(1 + RANK)
    with torch_distributed_zero_first(RANK):
        data_dict = data_dict or check_dataset(data)  # check if None
    train_path, val_path = data_dict['train'], data_dict['val']
    nc = 1 if single_cls else int(data_dict['nc'])  # number of classes
    names = ['item'] if single_cls and len(data_dict['names']) != 1 else data_dict['names']  # class names
    assert len(names) == nc, f'{len(names)} names found for nc={nc} dataset in {data}'  # check
    is_coco = data.endswith('coco.yaml') and nc == 80  # COCO dataset

    # Model
    pretrained = weights.endswith('.pt')
    if pretrained:
        with torch_distributed_zero_first(RANK):  # rank 0 downloads first, others reuse the file
            weights = attempt_download(weights)  # download if not found locally
        ckpt = torch.load(weights, map_location=device)  # load checkpoint
        model = Model(cfg or ckpt['model'].yaml, ch=3, nc=nc, anchors=hyp.get('anchors')).to(device)  # create
        exclude = ['anchor'] if (cfg or hyp.get('anchors')) and not resume else []  # exclude keys
        csd = ckpt['model'].float().state_dict()  # checkpoint state_dict as FP32
        csd = intersect_dicts(csd, model.state_dict(), exclude=exclude)  # intersect
        model.load_state_dict(csd, strict=False)  # load
        LOGGER.info(f'Transferred {len(csd)}/{len(model.state_dict())} items from {weights}')  # report
    else:
        model = Model(cfg, ch=3, nc=nc, anchors=hyp.get('anchors')).to(device)  # create

    # Freeze first `freeze` layers by parameter-name prefix
    freeze = [f'model.{x}.' for x in range(freeze)]  # layers to freeze
    for k, v in model.named_parameters():
        v.requires_grad = True  # train all layers
        if any(x in k for x in freeze):
            LOGGER.info(f'freezing {k}')  # was print(); use LOGGER for consistency
            v.requires_grad = False

    # Optimizer
    nbs = 64  # nominal batch size
    accumulate = max(round(nbs / batch_size), 1)  # accumulate loss before optimizing
    hyp['weight_decay'] *= batch_size * accumulate / nbs  # scale weight_decay
    LOGGER.info(f"Scaled weight_decay = {hyp['weight_decay']}")

    # Parameter groups: g0 = BatchNorm weights (no decay), g1 = other weights (decay), g2 = biases (no decay)
    g0, g1, g2 = [], [], []  # optimizer parameter groups
    for v in model.modules():
        if hasattr(v, 'bias') and isinstance(v.bias, nn.Parameter):  # bias
            g2.append(v.bias)
        if isinstance(v, nn.BatchNorm2d):  # weight (no decay)
            g0.append(v.weight)
        elif hasattr(v, 'weight') and isinstance(v.weight, nn.Parameter):  # weight (with decay)
            g1.append(v.weight)

    if opt.adam:
        optimizer = Adam(g0, lr=hyp['lr0'], betas=(hyp['momentum'], 0.999))  # adjust beta1 to momentum
    else:
        optimizer = SGD(g0, lr=hyp['lr0'], momentum=hyp['momentum'], nesterov=True)

    optimizer.add_param_group({'params': g1, 'weight_decay': hyp['weight_decay']})  # add g1 with weight_decay
    optimizer.add_param_group({'params': g2})  # add g2 (biases)
    # NOTE: labels fixed — g0 is the no-decay group, g1 carries weight_decay (was reported swapped)
    LOGGER.info(f"{colorstr('optimizer:')} {type(optimizer).__name__} with parameter groups "
                f"{len(g0)} weight (no decay), {len(g1)} weight, {len(g2)} bias")
    del g0, g1, g2

    # Scheduler
    if opt.linear_lr:
        lf = lambda x: (1 - x / (epochs - 1)) * (1.0 - hyp['lrf']) + hyp['lrf']  # linear
    else:
        lf = one_cycle(1, hyp['lrf'], epochs)  # cosine 1->hyp['lrf']
    scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=lf)  # plot_lr_scheduler(optimizer, scheduler, epochs)

    # EMA (exponential moving average of model weights, master process only)
    ema = ModelEMA(model) if RANK in [-1, 0] else None

    # Resume
    start_epoch, best_fitness = 0, 0.0
    if pretrained:
        # Optimizer
        if ckpt['optimizer'] is not None:
            optimizer.load_state_dict(ckpt['optimizer'])
            best_fitness = ckpt['best_fitness']

        # EMA
        if ema and ckpt.get('ema'):
            ema.ema.load_state_dict(ckpt['ema'].float().state_dict())
            ema.updates = ckpt['updates']

        # Epochs
        start_epoch = ckpt['epoch'] + 1
        if resume:
            assert start_epoch > 0, f'{weights} training to {epochs} epochs is finished, nothing to resume.'
        if epochs < start_epoch:  # checkpoint already trained past requested epochs -> fine-tune further
            LOGGER.info(f"{weights} has been trained for {ckpt['epoch']} epochs. Fine-tuning for {epochs} more epochs.")
            epochs += ckpt['epoch']  # finetune additional epochs

        del ckpt, csd

    # Image sizes
    gs = max(int(model.stride.max()), 32)  # grid size (max stride)
    nl = model.model[-1].nl  # number of detection layers (used for scaling hyp['obj'])
    imgsz = check_img_size(opt.imgsz, gs, floor=gs * 2)  # verify imgsz is gs-multiple

    # DP mode
    if cuda and RANK == -1 and torch.cuda.device_count() > 1:
        logging.warning('DP not recommended, instead use torch.distributed.run for best DDP Multi-GPU results.\n'
                        'See Multi-GPU Tutorial at https://github.com/ultralytics/yolov5/issues/475 to get started.')
        model = torch.nn.DataParallel(model)

    # SyncBatchNorm
    if opt.sync_bn and cuda and RANK != -1:
        model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model).to(device)
        LOGGER.info('Using SyncBatchNorm()')

    # Trainloader
    train_loader, dataset = create_dataloader(train_path, imgsz, batch_size // WORLD_SIZE, gs, single_cls,
                                              hyp=hyp, augment=True, cache=opt.cache, rect=opt.rect, rank=RANK,
                                              workers=workers, image_weights=opt.image_weights, quad=opt.quad,
                                              prefix=colorstr('train: '))
    mlc = int(np.concatenate(dataset.labels, 0)[:, 0].max())  # max label class
    nb = len(train_loader)  # number of batches
    assert mlc < nc, f'Label class {mlc} exceeds nc={nc} in {data}. Possible class labels are 0-{nc - 1}'

    # Process 0
    if RANK in [-1, 0]:
        val_loader = create_dataloader(val_path, imgsz, batch_size // WORLD_SIZE * 2, gs, single_cls,
                                       hyp=hyp, cache=None if noval else opt.cache, rect=True, rank=-1,
                                       workers=workers, pad=0.5,
                                       prefix=colorstr('val: '))[0]

        if not resume:
            labels = np.concatenate(dataset.labels, 0)
            # c = torch.tensor(labels[:, 0])  # classes
            # cf = torch.bincount(c.long(), minlength=nc) + 1.  # frequency
            # model._initialize_biases(cf.to(device))
            if plots:
                plot_labels(labels, names, save_dir)

            # Anchors
            if not opt.noautoanchor:
                check_anchors(dataset, model=model, thr=hyp['anchor_t'], imgsz=imgsz)
            model.half().float()  # pre-reduce anchor precision

        callbacks.on_pretrain_routine_end()

    # DDP mode
    if cuda and RANK != -1:
        model = DDP(model, device_ids=[LOCAL_RANK], output_device=LOCAL_RANK)

    # Model parameters
    hyp['box'] *= 3. / nl  # scale to layers
    hyp['cls'] *= nc / 80. * 3. / nl  # scale to classes and layers
    hyp['obj'] *= (imgsz / 640) ** 2 * 3. / nl  # scale to image size and layers
    hyp['label_smoothing'] = opt.label_smoothing
    model.nc = nc  # attach number of classes to model
    model.hyp = hyp  # attach hyperparameters to model
    model.class_weights = labels_to_class_weights(dataset.labels, nc).to(device) * nc  # attach class weights
    model.names = names

    # Start training
    t0 = time.time()
    nw = max(round(hyp['warmup_epochs'] * nb), 1000)  # number of warmup iterations, max(3 epochs, 1k iterations)
    # nw = min(nw, (epochs - start_epoch) / 2 * nb)  # limit warmup to < 1/2 of training
    last_opt_step = -1
    maps = np.zeros(nc)  # mAP per class
    results = (0, 0, 0, 0, 0, 0, 0)  # P, R, mAP@.5, mAP@.5-.95, val_loss(box, obj, cls)
    scheduler.last_epoch = start_epoch - 1  # do not move
    scaler = amp.GradScaler(enabled=cuda)
    compute_loss = ComputeLoss(model)  # init loss class
    LOGGER.info(f'Image sizes {imgsz} train, {imgsz} val\n'
                f'Using {train_loader.num_workers} dataloader workers\n'
                f'Logging results to {save_dir}\n'
                f'Starting training for {epochs} epochs...')
    for epoch in range(start_epoch, epochs):  # epoch ------------------------------------------------------------------
        model.train()

        # Update image weights (optional)
        if opt.image_weights:
            # Generate indices
            if RANK in [-1, 0]:
                cw = model.class_weights.cpu().numpy() * (1 - maps) ** 2 / nc  # class weights
                iw = labels_to_image_weights(dataset.labels, nc=nc, class_weights=cw)  # image weights
                dataset.indices = random.choices(range(dataset.n), weights=iw, k=dataset.n)  # rand weighted idx
            # Broadcast if DDP
            if RANK != -1:
                indices = (torch.tensor(dataset.indices) if RANK == 0 else torch.zeros(dataset.n)).int()
                dist.broadcast(indices, 0)
                if RANK != 0:
                    dataset.indices = indices.cpu().numpy()

        # Update mosaic border
        # b = int(random.uniform(0.25 * imgsz, 0.75 * imgsz + gs) // gs * gs)
        # dataset.mosaic_border = [b - imgsz, -b]  # height, width borders

        mloss = torch.zeros(3, device=device)  # mean losses
        if RANK != -1:
            train_loader.sampler.set_epoch(epoch)  # reshuffle DistributedSampler each epoch
        pbar = enumerate(train_loader)
        LOGGER.info(('\n' + '%10s' * 7) % ('Epoch', 'gpu_mem', 'box', 'obj', 'cls', 'labels', 'img_size'))
        if RANK in [-1, 0]:
            pbar = tqdm(pbar, total=nb)  # progress bar
        optimizer.zero_grad()
        for i, (imgs, targets, paths, _) in pbar:  # batch -------------------------------------------------------------
            ni = i + nb * epoch  # number integrated batches (since train start)
            imgs = imgs.to(device, non_blocking=True).float() / 255.0  # uint8 to float32, 0-255 to 0.0-1.0

            # Warmup
            if ni <= nw:
                xi = [0, nw]  # x interp
                # compute_loss.gr = np.interp(ni, xi, [0.0, 1.0])  # iou loss ratio (obj_loss = 1.0 or iou)
                accumulate = max(1, np.interp(ni, xi, [1, nbs / batch_size]).round())
                for j, x in enumerate(optimizer.param_groups):
                    # bias lr falls from 0.1 to lr0, all other lrs rise from 0.0 to lr0
                    x['lr'] = np.interp(ni, xi, [hyp['warmup_bias_lr'] if j == 2 else 0.0, x['initial_lr'] * lf(epoch)])
                    if 'momentum' in x:
                        x['momentum'] = np.interp(ni, xi, [hyp['warmup_momentum'], hyp['momentum']])

            # Multi-scale
            if opt.multi_scale:
                # int() bounds: random.randrange rejects float arguments on Python >= 3.12
                sz = random.randrange(int(imgsz * 0.5), int(imgsz * 1.5 + gs)) // gs * gs  # size
                sf = sz / max(imgs.shape[2:])  # scale factor
                if sf != 1:
                    ns = [math.ceil(x * sf / gs) * gs for x in imgs.shape[2:]]  # new shape (stretched to gs-multiple)
                    imgs = nn.functional.interpolate(imgs, size=ns, mode='bilinear', align_corners=False)

            # Forward
            with amp.autocast(enabled=cuda):
                pred = model(imgs)  # forward
                loss, loss_items = compute_loss(pred, targets.to(device))  # loss scaled by batch_size
                if RANK != -1:
                    loss *= WORLD_SIZE  # gradient averaged between devices in DDP mode
                if opt.quad:
                    loss *= 4.

            # Backward
            scaler.scale(loss).backward()

            # Optimize (step only every `accumulate` batches to simulate nominal batch size)
            if ni - last_opt_step >= accumulate:
                scaler.step(optimizer)  # optimizer.step
                scaler.update()
                optimizer.zero_grad()
                if ema:
                    ema.update(model)
                last_opt_step = ni

            # Log
            if RANK in [-1, 0]:
                mloss = (mloss * i + loss_items) / (i + 1)  # update mean losses
                mem = f'{torch.cuda.memory_reserved() / 1E9 if torch.cuda.is_available() else 0:.3g}G'  # (GB)
                pbar.set_description(('%10s' * 2 + '%10.4g' * 5) % (
                    f'{epoch}/{epochs - 1}', mem, *mloss, targets.shape[0], imgs.shape[-1]))
                callbacks.on_train_batch_end(ni, model, imgs, targets, paths, plots)
            # end batch ------------------------------------------------------------------------------------------------

        # Scheduler
        lr = [x['lr'] for x in optimizer.param_groups]  # for loggers
        scheduler.step()

        if RANK in [-1, 0]:
            # mAP
            callbacks.on_train_epoch_end(epoch=epoch)
            ema.update_attr(model, include=['yaml', 'nc', 'hyp', 'names', 'stride', 'class_weights'])
            final_epoch = epoch + 1 == epochs
            if not noval or final_epoch:  # Calculate mAP
                results, maps, _ = val.run(data_dict,
                                           batch_size=batch_size // WORLD_SIZE * 2,
                                           imgsz=imgsz,
                                           model=ema.ema,
                                           single_cls=single_cls,
                                           dataloader=val_loader,
                                           save_dir=save_dir,
                                           save_json=is_coco and final_epoch,
                                           verbose=nc < 50 and final_epoch,
                                           plots=plots and final_epoch,
                                           callbacks=callbacks,
                                           compute_loss=compute_loss)

            # Update best mAP
            fi = fitness(np.array(results).reshape(1, -1))  # weighted combination of [P, R, mAP@.5, mAP@.5-.95]
            if fi > best_fitness:
                best_fitness = fi
            log_vals = list(mloss) + list(results) + lr
            callbacks.on_fit_epoch_end(log_vals, epoch, best_fitness, fi)

            # Save model
            if (not nosave) or (final_epoch and not evolve):  # if save
                ckpt = {'epoch': epoch,
                        'best_fitness': best_fitness,
                        'model': deepcopy(de_parallel(model)).half(),
                        'ema': deepcopy(ema.ema).half(),
                        'updates': ema.updates,
                        'optimizer': optimizer.state_dict(),
                        'wandb_id': loggers.wandb.wandb_run.id if loggers.wandb else None}

                # Save last, best and delete
                torch.save(ckpt, last)
                if best_fitness == fi:
                    torch.save(ckpt, best)
                del ckpt
                callbacks.on_model_save(last, epoch, final_epoch, best_fitness, fi)

        # end epoch ----------------------------------------------------------------------------------------------------
    # end training -----------------------------------------------------------------------------------------------------
    if RANK in [-1, 0]:
        LOGGER.info(f'\n{epoch - start_epoch + 1} epochs completed in {(time.time() - t0) / 3600:.3f} hours.')
        if not evolve:
            if is_coco:  # COCO dataset
                for m in [last, best] if best.exists() else [last]:  # speed, mAP tests
                    results, _, _ = val.run(data_dict,
                                            batch_size=batch_size // WORLD_SIZE * 2,
                                            imgsz=imgsz,
                                            model=attempt_load(m, device).half(),
                                            iou_thres=0.7,  # NMS IoU threshold for best pycocotools results
                                            single_cls=single_cls,
                                            dataloader=val_loader,
                                            save_dir=save_dir,
                                            save_json=True,
                                            plots=False)

            # Strip optimizers
            for f in last, best:
                if f.exists():
                    strip_optimizer(f)  # strip optimizers

        callbacks.on_train_end(last, best, plots, epoch)
        LOGGER.info(f"Results saved to {colorstr('bold', save_dir)}")

    torch.cuda.empty_cache()
    return results


420
def parse_opt(known=False):
    """Parse YOLOv5 training command-line options.

    Args:
        known: when True, use parse_known_args() and silently ignore
            unrecognized arguments (used by hyperparameter evolution and
            other programmatic callers); when False, unknown arguments
            are an error.

    Returns:
        argparse.Namespace with all training options.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--weights', type=str, default='yolov5s.pt', help='initial weights path')
    parser.add_argument('--cfg', type=str, default='', help='model.yaml path')
    parser.add_argument('--data', type=str, default='data/coco128.yaml', help='dataset.yaml path')
    parser.add_argument('--hyp', type=str, default='data/hyps/hyp.scratch.yaml', help='hyperparameters path')
    parser.add_argument('--epochs', type=int, default=300)
    parser.add_argument('--batch-size', type=int, default=16, help='total batch size for all GPUs')
    parser.add_argument('--imgsz', '--img', '--img-size', type=int, default=640, help='train, val image size (pixels)')
    parser.add_argument('--rect', action='store_true', help='rectangular training')
    parser.add_argument('--resume', nargs='?', const=True, default=False, help='resume most recent training')
    parser.add_argument('--nosave', action='store_true', help='only save final checkpoint')
    parser.add_argument('--noval', action='store_true', help='only validate final epoch')
    parser.add_argument('--noautoanchor', action='store_true', help='disable autoanchor check')
    parser.add_argument('--evolve', type=int, nargs='?', const=300, help='evolve hyperparameters for x generations')
    parser.add_argument('--bucket', type=str, default='', help='gsutil bucket')
    parser.add_argument('--cache', type=str, nargs='?', const='ram', help='--cache images in "ram" (default) or "disk"')
    parser.add_argument('--image-weights', action='store_true', help='use weighted image selection for training')
    parser.add_argument('--device', default='', help='cuda device, i.e. 0 or 0,1,2,3 or cpu')
    parser.add_argument('--multi-scale', action='store_true', help='vary img-size +/- 50%%')
    parser.add_argument('--single-cls', action='store_true', help='train multi-class data as single-class')
    parser.add_argument('--adam', action='store_true', help='use torch.optim.Adam() optimizer')
    parser.add_argument('--sync-bn', action='store_true', help='use SyncBatchNorm, only available in DDP mode')
    parser.add_argument('--workers', type=int, default=8, help='maximum number of dataloader workers')
    parser.add_argument('--project', default='runs/train', help='save to project/name')
    parser.add_argument('--entity', default=None, help='W&B entity')
    parser.add_argument('--name', default='exp', help='save to project/name')
    parser.add_argument('--exist-ok', action='store_true', help='existing project/name ok, do not increment')
    parser.add_argument('--quad', action='store_true', help='quad dataloader')
    parser.add_argument('--linear-lr', action='store_true', help='linear LR')
    parser.add_argument('--label-smoothing', type=float, default=0.0, help='Label smoothing epsilon')
    parser.add_argument('--upload_dataset', action='store_true', help='Upload dataset as W&B artifact table')
    parser.add_argument('--bbox_interval', type=int, default=-1, help='Set bounding-box image logging interval for W&B')
    parser.add_argument('--save_period', type=int, default=-1, help='Log model after every "save_period" epoch')
    parser.add_argument('--artifact_alias', type=str, default="latest", help='version of dataset artifact to be used')
    parser.add_argument('--local_rank', type=int, default=-1, help='DDP parameter, do not modify')
    parser.add_argument('--freeze', type=int, default=0, help='Number of layers to freeze. backbone=10, all=24')
    # parse_known_args()[0] drops unrecognized args instead of raising SystemExit
    opt = parser.parse_known_args()[0] if known else parser.parse_args()
    return opt


def main(opt):
    """Entry point after argument parsing: runs checks, handles --resume, sets up DDP,
    then either trains once or runs hyperparameter evolution for opt.evolve generations.

    Args:
        opt (argparse.Namespace): parsed command-line options from parse_opt().
    """
    # Checks (rank 0 / single-process only, to avoid duplicate output in DDP)
    set_logging(RANK)
    if RANK in [-1, 0]:
        print(colorstr('train: ') + ', '.join(f'{k}={v}' for k, v in vars(opt).items()))
        check_git_status()
        check_requirements(requirements=FILE.parent / 'requirements.txt', exclude=['thop'])

    # Resume an interrupted run: reload the opt.yaml saved next to the checkpoint
    if opt.resume and not check_wandb_resume(opt) and not opt.evolve:
        ckpt = opt.resume if isinstance(opt.resume, str) else get_latest_run()  # specified or most recent path
        assert os.path.isfile(ckpt), 'ERROR: --resume checkpoint does not exist'
        with open(Path(ckpt).parent.parent / 'opt.yaml') as f:
            opt = argparse.Namespace(**yaml.safe_load(f))  # replace opt entirely with the saved one
        opt.cfg, opt.weights, opt.resume = '', ckpt, True  # reinstate resume state
        LOGGER.info(f'Resuming training from {ckpt}')
    else:
        opt.data, opt.cfg, opt.hyp = check_file(opt.data), check_file(opt.cfg), check_file(opt.hyp)  # check files
        assert len(opt.cfg) or len(opt.weights), 'either --cfg or --weights must be specified'
        if opt.evolve:
            opt.project = 'runs/evolve'
            opt.exist_ok = opt.resume  # evolve resumes into the same save_dir
        opt.save_dir = str(increment_path(Path(opt.project) / opt.name, exist_ok=opt.exist_ok))

    # DDP mode: LOCAL_RANK != -1 means we were launched by torch.distributed
    device = select_device(opt.device, batch_size=opt.batch_size)
    if LOCAL_RANK != -1:
        assert torch.cuda.device_count() > LOCAL_RANK, 'insufficient CUDA devices for DDP command'
        assert opt.batch_size % WORLD_SIZE == 0, '--batch-size must be multiple of CUDA device count'
        assert not opt.image_weights, '--image-weights argument is not compatible with DDP training'
        assert not opt.evolve, '--evolve argument is not compatible with DDP training'
        assert not opt.sync_bn, '--sync-bn known training issue, see https://github.com/ultralytics/yolov5/issues/3998'
        torch.cuda.set_device(LOCAL_RANK)
        device = torch.device('cuda', LOCAL_RANK)
        dist.init_process_group(backend="nccl" if dist.is_nccl_available() else "gloo")

    # Train normally
    if not opt.evolve:
        train(opt.hyp, opt, device)
        if WORLD_SIZE > 1 and RANK == 0:
            _ = [print('Destroying process group... ', end=''), dist.destroy_process_group(), print('Done.')]

    # Evolve hyperparameters (optional)
    else:
        # Hyperparameter evolution metadata (mutation scale 0-1, lower_limit, upper_limit)
        meta = {'lr0': (1, 1e-5, 1e-1),  # initial learning rate (SGD=1E-2, Adam=1E-3)
                'lrf': (1, 0.01, 1.0),  # final OneCycleLR learning rate (lr0 * lrf)
                'momentum': (0.3, 0.6, 0.98),  # SGD momentum/Adam beta1
                'weight_decay': (1, 0.0, 0.001),  # optimizer weight decay
                'warmup_epochs': (1, 0.0, 5.0),  # warmup epochs (fractions ok)
                'warmup_momentum': (1, 0.0, 0.95),  # warmup initial momentum
                'warmup_bias_lr': (1, 0.0, 0.2),  # warmup initial bias lr
                'box': (1, 0.02, 0.2),  # box loss gain
                'cls': (1, 0.2, 4.0),  # cls loss gain
                'cls_pw': (1, 0.5, 2.0),  # cls BCELoss positive_weight
                'obj': (1, 0.2, 4.0),  # obj loss gain (scale with pixels)
                'obj_pw': (1, 0.5, 2.0),  # obj BCELoss positive_weight
                'iou_t': (0, 0.1, 0.7),  # IoU training threshold
                'anchor_t': (1, 2.0, 8.0),  # anchor-multiple threshold
                'anchors': (2, 2.0, 10.0),  # anchors per output grid (0 to ignore)
                'fl_gamma': (0, 0.0, 2.0),  # focal loss gamma (efficientDet default gamma=1.5)
                'hsv_h': (1, 0.0, 0.1),  # image HSV-Hue augmentation (fraction)
                'hsv_s': (1, 0.0, 0.9),  # image HSV-Saturation augmentation (fraction)
                'hsv_v': (1, 0.0, 0.9),  # image HSV-Value augmentation (fraction)
                'degrees': (1, 0.0, 45.0),  # image rotation (+/- deg)
                'translate': (1, 0.0, 0.9),  # image translation (+/- fraction)
                'scale': (1, 0.0, 0.9),  # image scale (+/- gain)
                'shear': (1, 0.0, 10.0),  # image shear (+/- deg)
                'perspective': (0, 0.0, 0.001),  # image perspective (+/- fraction), range 0-0.001
                'flipud': (1, 0.0, 1.0),  # image flip up-down (probability)
                'fliplr': (0, 0.0, 1.0),  # image flip left-right (probability)
                'mosaic': (1, 0.0, 1.0),  # image mosaic (probability)
                'mixup': (1, 0.0, 1.0),  # image mixup (probability)
                'copy_paste': (1, 0.0, 1.0)}  # segment copy-paste (probability)

        with open(opt.hyp) as f:
            hyp = yaml.safe_load(f)  # load hyps dict
            if 'anchors' not in hyp:  # anchors commented in hyp.yaml
                hyp['anchors'] = 3
        opt.noval, opt.nosave, save_dir = True, True, Path(opt.save_dir)  # only val/save final epoch
        # ei = [isinstance(x, (int, float)) for x in hyp.values()]  # evolvable indices
        evolve_yaml, evolve_csv = save_dir / 'hyp_evolve.yaml', save_dir / 'evolve.csv'
        if opt.bucket:
            os.system(f'gsutil cp gs://{opt.bucket}/evolve.csv {save_dir}')  # download evolve.csv if exists

        for _ in range(opt.evolve):  # generations to evolve
            if evolve_csv.exists():  # if evolve.csv exists: select best hyps and mutate
                # Select parent(s)
                parent = 'single'  # parent selection method: 'single' or 'weighted'
                x = np.loadtxt(evolve_csv, ndmin=2, delimiter=',', skiprows=1)
                n = min(5, len(x))  # number of previous results to consider
                x = x[np.argsort(-fitness(x))][:n]  # top n mutations
                w = fitness(x) - fitness(x).min() + 1E-6  # weights (sum > 0)
                if parent == 'single' or len(x) == 1:
                    # x = x[random.randint(0, n - 1)]  # random selection
                    x = x[random.choices(range(n), weights=w)[0]]  # weighted selection
                elif parent == 'weighted':
                    x = (x * w.reshape(n, 1)).sum(0) / w.sum()  # weighted combination

                # Mutate until at least one gain differs from 1 (prevents duplicate offspring)
                mp, s = 0.8, 0.2  # mutation probability, sigma
                npr = np.random
                npr.seed(int(time.time()))
                g = np.array([x[0] for x in meta.values()])  # gains 0-1
                ng = len(meta)
                v = np.ones(ng)
                while all(v == 1):  # mutate until a change occurs (prevent duplicates)
                    v = (g * (npr.random(ng) < mp) * npr.randn(ng) * npr.random() * s + 1).clip(0.3, 3.0)
                for i, k in enumerate(hyp.keys()):  # plt.hist(v.ravel(), 300)
                    hyp[k] = float(x[i + 7] * v[i])  # mutate (first 7 CSV columns are result metrics)

            # Constrain to limits
            for k, v in meta.items():
                hyp[k] = max(hyp[k], v[1])  # lower limit
                hyp[k] = min(hyp[k], v[2])  # upper limit
                hyp[k] = round(hyp[k], 5)  # significant digits

            # Train mutation
            results = train(hyp.copy(), opt, device)

            # Write mutation results
            print_mutation(results, hyp.copy(), save_dir, opt.bucket)

        # Plot results
        plot_evolve(evolve_csv)
        print(f'Hyperparameter evolution finished\n'
              f"Results saved to {colorstr('bold', save_dir)}\n"
              f'Use best hyperparameters example: $ python train.py --hyp {evolve_yaml}')
def run(**kwargs):
    """Programmatic entry point: parse default options, override with kwargs, then call main().

    Usage: import train; train.run(data='coco128.yaml', imgsz=320, weights='yolov5m.pt')
    """
    opt = parse_opt(True)  # known=True: ignore unrecognized CLI args when imported
    for k, v in kwargs.items():
        setattr(opt, k, v)  # kwargs take precedence over CLI defaults
    main(opt)
# CLI entry point
if __name__ == "__main__":
    opt = parse_opt()
    main(opt)