From 0c3fe173b60b119a4df050067c936564d05e4775 Mon Sep 17 00:00:00 2001
From: Yunhao Meng <yunhao.meng@outlook.com>
Date: Sat, 18 Oct 2025 22:03:55 +0800
Subject: [PATCH] Initial commit

---
 .gitignore         | 302 ++++++++++++++++++
 Dockerfile         |  27 ++
 LICENSE            |  18 ++
 README.md          |  79 +++++
 docker-compose.yml |  59 ++++
 main.py            | 331 +++++++++++++++++++
 main.sh            |   2 +
 nets/nn.py         | 357 +++++++++++++++++++++
 utils/args.yaml    | 100 ++++++
 utils/dataset.py   | 415 ++++++++++++++++++++++++
 utils/util.py      | 777 +++++++++++++++++++++++++++++++++++++++++++++
 11 files changed, 2467 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 Dockerfile
 create mode 100644 LICENSE
 create mode 100644 README.md
 create mode 100644 docker-compose.yml
 create mode 100755 main.py
 create mode 100755 main.sh
 create mode 100755 nets/nn.py
 create mode 100755 utils/args.yaml
 create mode 100644 utils/dataset.py
 create mode 100644 utils/util.py

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..76052ab
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,302 @@
+# ---> Python
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+.pybuilder/
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+#   For a library or package, you might want to ignore these files since the code is
+#   intended to run in multiple environments; otherwise, check them in:
+# .python-version
+
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+
+# UV
+#   Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
+#   This is especially recommended for binary packages to ensure reproducibility, and is more
+#   commonly ignored for libraries.
+#uv.lock
+
+# poetry
+#   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+#   This is especially recommended for binary packages to ensure reproducibility, and is more
+#   commonly ignored for libraries.
+#   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+
+# pdm
+#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#pdm.lock
+#   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+#   in version control.
+#   https://pdm.fming.dev/latest/usage/project/#working-with-version-control
+.pdm.toml
+.pdm-python
+.pdm-build/
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# pytype static type analyzer
+.pytype/
+
+# Cython debug symbols
+cython_debug/
+
+# PyCharm
+#  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+#  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+#  and can be added to the global gitignore or merged into this file.  For a more nuclear
+#  option (not recommended) you can uncomment the following to ignore the entire idea folder.
+#.idea/
+
+# Ruff stuff:
+.ruff_cache/
+
+# PyPI configuration file
+.pypirc
+
+# ---> JetBrains
+# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio, WebStorm and Rider
+# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839
+
+# User-specific stuff
+.idea/**/workspace.xml
+.idea/**/tasks.xml
+.idea/**/usage.statistics.xml
+.idea/**/dictionaries
+.idea/**/shelf
+
+# AWS User-specific
+.idea/**/aws.xml
+
+# Generated files
+.idea/**/contentModel.xml
+
+# Sensitive or high-churn files
+.idea/**/dataSources/
+.idea/**/dataSources.ids
+.idea/**/dataSources.local.xml
+.idea/**/sqlDataSources.xml
+.idea/**/dynamic.xml
+.idea/**/uiDesigner.xml
+.idea/**/dbnavigator.xml
+
+# Gradle
+.idea/**/gradle.xml
+.idea/**/libraries
+
+# Gradle and Maven with auto-import
+# When using Gradle or Maven with auto-import, you should exclude module files,
+# since they will be recreated, and may cause churn.  Uncomment if using
+# auto-import.
+# .idea/artifacts
+# .idea/compiler.xml
+# .idea/jarRepositories.xml
+# .idea/modules.xml
+# .idea/*.iml
+# .idea/modules
+# *.iml
+# *.ipr
+
+# CMake
+cmake-build-*/
+
+# Mongo Explorer plugin
+.idea/**/mongoSettings.xml
+
+# File-based project format
+*.iws
+
+# IntelliJ
+out/
+
+# mpeltonen/sbt-idea plugin
+.idea_modules/
+
+# JIRA plugin
+atlassian-ide-plugin.xml
+
+# Cursive Clojure plugin
+.idea/replstate.xml
+
+# SonarLint plugin
+.idea/sonarlint/
+
+# Crashlytics plugin (for Android Studio and IntelliJ)
+com_crashlytics_export_strings.xml
+crashlytics.properties
+crashlytics-build.properties
+fabric.properties
+
+# Editor-based Rest Client
+.idea/httpRequests
+
+# Android studio 3.1+ serialized cache file
+.idea/caches/build_file_checksums.ser
+
+# ---> VisualStudioCode
+.vscode/*
+!.vscode/settings.json
+!.vscode/tasks.json
+!.vscode/launch.json
+!.vscode/extensions.json
+!.vscode/*.code-snippets
+
+# Local History for Visual Studio Code
+.history/
+
+# Built Visual Studio Code Extensions
+*.vsix
+
+# ---> macOS
+# General
+.DS_Store
+.AppleDouble
+.LSOverride
+
+# Icon must end with two \r
+Icon
+
+
+# Thumbnails
+._*
+
+# Files that might appear in the root of a volume
+.DocumentRevisions-V100
+.fseventsd
+.Spotlight-V100
+.TemporaryItems
+.Trashes
+.VolumeIcon.icns
+.com.apple.timemachine.donotpresent
+
+# Directories potentially created on remote AFP share
+.AppleDB
+.AppleDesktop
+Network Trash Folder
+Temporary Items
+.apdisk
+
+# ---> Custom
+*.log
+weights/
+venv/
diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..d44cbae
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,27 @@
+ARG BASE_IMAGE
+FROM ${BASE_IMAGE}
+
+# Create a non-root dev user
+ARG USER=dev
+ARG UID=1000
+ARG GID=1000
+RUN groupadd -g ${GID} ${USER} \
+ && useradd -m -u ${UID} -g ${GID} -s /bin/bash ${USER}
+
+# System packages commonly needed (OpenCV runtime deps, build tools, etc.)
+RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
+      git wget curl ca-certificates \
+      build-essential pkg-config \
+      libgl1 libglib2.0-0 ffmpeg \
+    && rm -rf /var/lib/apt/lists/*
+
+# Basic Python tooling
+RUN python -m pip install --upgrade pip wheel setuptools
+
+# Workspace & permissions: pre-create /opt/venv (built by the compose startup command) and the pip cache, owned by dev
+WORKDIR /workspace
+RUN mkdir -p /opt/venv /home/${USER}/.cache/pip && chown -R ${UID}:${GID} /workspace /opt/venv /home/${USER}/.cache
+USER ${USER}
+
+# (Optional) expose Jupyter/TensorBoard if you use them
+# EXPOSE 8888 6006
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..44cf12b
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,18 @@
+MIT License
+
+Copyright (c) 2025 Anchor-x
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of this software and 
+associated documentation files (the "Software"), to deal in the Software without restriction, including 
+without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 
+copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the 
+following conditions:
+
+The above copyright notice and this permission notice shall be included in all copies or substantial 
+portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT 
+LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO 
+EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 
+IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE 
+USE OR OTHER DEALINGS IN THE SOFTWARE.
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..9c27934
--- /dev/null
+++ b/README.md
@@ -0,0 +1,79 @@
+YOLOv11 re-implementation using PyTorch
+
+### fix
+* Fixed a shape mismatch caused by label tensors of size `[0, 1]` (two dimensions) where a one-dimensional tensor is expected
+    * This occurs when an image contains no objects (i.e. its label is empty)
+    * Search the code for `XXX` to find the fix
+
+### Installation
+
+```
+conda create -n YOLO python=3.10.10
+conda activate YOLO
+conda install pytorch torchvision torchaudio pytorch-cuda=12.1 -c pytorch -c nvidia
+pip install opencv-python
+pip install PyYAML
+pip install tqdm thop
+```
+
+### Train
+
+* Configure your dataset path in `main.py` for training
+* Run `bash main.sh $ --train` for training, `$` is number of GPUs
+* Run `nohup bash main.sh 1 --train --epochs 300 > train.log 2>&1 &` for training in background
+
+### Test
+
+* Configure your dataset path in `main.py` for testing
+* Run `python main.py --test` for testing
+
+### Results
+
+| Version | Epochs | Box mAP |                                                                              Download |
+|:-------:|:------:|--------:|--------------------------------------------------------------------------------------:|
+|  v11_n  |  600   |    38.6 |       [Model](https://github.com/jahongir7174/YOLOv11-pt/blob/master/weights/best.pt) |
+| v11_n*  |   -    |    39.2 | [Model](https://github.com/jahongir7174/YOLOv11-pt/releases/download/v0.0.1/v11_n.pt) |
+| v11_s*  |   -    |    46.5 | [Model](https://github.com/jahongir7174/YOLOv11-pt/releases/download/v0.0.1/v11_s.pt) |
+| v11_m*  |   -    |    51.2 | [Model](https://github.com/jahongir7174/YOLOv11-pt/releases/download/v0.0.1/v11_m.pt) |
+| v11_l*  |   -    |    53.0 | [Model](https://github.com/jahongir7174/YOLOv11-pt/releases/download/v0.0.1/v11_l.pt) |
+| v11_x*  |   -    |    54.3 | [Model](https://github.com/jahongir7174/YOLOv11-pt/releases/download/v0.0.1/v11_x.pt) |
+
+```
+ Average Precision  (AP) @[ IoU=0.50:0.95 | area=   all | maxDets=100 ] = 0.386
+ Average Precision  (AP) @[ IoU=0.50      | area=   all | maxDets=100 ] = 0.551
+ Average Precision  (AP) @[ IoU=0.75      | area=   all | maxDets=100 ] = 0.415
+ Average Precision  (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.196
+ Average Precision  (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.420
+ Average Precision  (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.569
+ Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets=  1 ] = 0.321
+ Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets= 10 ] = 0.533
+ Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets=100 ] = 0.588
+ Average Recall     (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.361
+ Average Recall     (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.646
+ Average Recall     (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.777
+```
+
+* `*` means that it is from original repository, see reference
+* In the official YOLOv11 code, mask annotation information is used, which leads to higher performance
+
+### Dataset structure
+
+    ├── COCO 
+        ├── images
+            ├── train2017
+                ├── 1111.jpg
+                ├── 2222.jpg
+            ├── val2017
+                ├── 1111.jpg
+                ├── 2222.jpg
+        ├── labels
+            ├── train2017
+                ├── 1111.txt
+                ├── 2222.txt
+            ├── val2017
+                ├── 1111.txt
+                ├── 2222.txt
+
+#### Reference
+
+* [YOLOv11-pt](https://github.com/jahongir7174/YOLOv11-pt)
diff --git a/docker-compose.yml b/docker-compose.yml
new file mode 100644
index 0000000..846201f
--- /dev/null
+++ b/docker-compose.yml
@@ -0,0 +1,59 @@
+# version: "3.9"
+
+services:
+  dl:
+    build:
+      context: .
+      dockerfile: Dockerfile
+      args:
+        BASE_IMAGE: "pytorch/pytorch:2.9.0-cuda13.0-cudnn9-devel"
+        USER: "dev"
+        UID: "1000"
+        GID: "1000"
+    container_name: dl
+    # GPUs + large DataLoader shared memory
+    gpus: all
+    shm_size: "12g"
+    ipc: host
+
+    environment:
+      # GPU(s) visible inside the container: "0" exposes only the first GPU; use "all" or a list such as "0,1"
+      - NVIDIA_VISIBLE_DEVICES=0
+      - NVIDIA_DRIVER_CAPABILITIES=compute,utility,video
+      # Prefer NCCL for multi-GPU
+      - TORCH_DISTRIBUTED_DEBUG=INFO
+      - NCCL_P2P_DISABLE=0
+      - NCCL_ASYNC_ERROR_HANDLING=1
+      # Persisted virtualenv on PATH (lives in the `venv` named volume; same path the startup command uses)
+      - VIRTUAL_ENV=/opt/venv
+      - PATH=/opt/venv/bin:/usr/local/bin:/usr/bin:/bin
+      - PYTHONUNBUFFERED=1
+      - TZ=America/Los_Angeles
+
+    volumes:
+      # your code/data (NOTE: main.py hard-codes data_dir to the host path; point it at /data inside the container)
+      - .:/workspace
+      - /home/image1325/ssd1/dataset/coco:/data
+      # persisted venv: your pip installs live here and survive image/container removal (target must be absolute)
+      - venv:/opt/venv
+      # (optional) speed up installs
+      - pip-cache:/home/dev/.cache/pip
+
+    working_dir: /workspace
+    ulimits:
+      memlock: -1
+      stack: 67108864
+
+    # On first run, create the venv if it doesn't exist; then drop to a shell.
+    command: >
+      bash -lc "
+        if [ ! -d /opt/venv/bin ]; then
+          python -m venv /opt/venv;
+          /opt/venv/bin/python -m pip install --upgrade pip;
+        fi;
+        exec bash
+      "
+
+volumes:
+  venv:
+  pip-cache:
diff --git a/main.py b/main.py
new file mode 100755
index 0000000..2eb698e
--- /dev/null
+++ b/main.py
@@ -0,0 +1,331 @@
+import copy
+import csv
+import os
+import warnings
+from argparse import ArgumentParser
+from typing import cast
+
+import torch
+import tqdm
+import yaml
+from torch.utils import data
+from torch.amp.autocast_mode import autocast
+
+from nets import nn
+from utils import util
+from utils.dataset import Dataset
+
+warnings.filterwarnings("ignore")  # suppress every warning for the whole process
+
+data_dir = "/home/image1325/ssd1/dataset/coco"  # dataset root: holds train2017.txt, val2017.txt and images/<split>/
+
+
+def train(args, params):  # training loop: SGD + AMP + rank-0 EMA, per-epoch validation, checkpoints under ./weights
+    # Model (sized by the number of entries in params["names"])
+    model = nn.yolo_v11_n(len(params["names"]))
+    model.cuda()
+
+    # Optimizer: accumulate gradients toward a nominal batch of 64; params["weight_decay"] is rescaled in place
+    accumulate = max(round(64 / (args.batch_size * args.world_size)), 1)
+    params["weight_decay"] *= args.batch_size * args.world_size * accumulate / 64
+
+    optimizer = torch.optim.SGD(
+        util.set_params(model, params["weight_decay"]), params["min_lr"], params["momentum"], nesterov=True
+    )
+
+    # EMA (kept on rank 0 only)
+    ema = util.EMA(model) if args.local_rank == 0 else None
+
+    filenames = []
+    with open(f"{data_dir}/train2017.txt") as f:
+        for filename in f.readlines():
+            filename = os.path.basename(filename.rstrip())
+            filenames.append(f"{data_dir}/images/train2017/" + filename)
+
+    sampler = None
+    dataset = Dataset(filenames, args.input_size, params, augment=True)
+
+    if args.distributed:
+        sampler = data.DistributedSampler(dataset)
+
+    loader = data.DataLoader(
+        dataset,
+        args.batch_size,
+        sampler is None,  # shuffle only when no DistributedSampler is in use
+        sampler,
+        num_workers=8,
+        pin_memory=True,
+        collate_fn=Dataset.collate_fn,
+    )
+
+    # Scheduler (stepped once per iteration with the global step index)
+    num_steps = len(loader)
+    scheduler = util.LinearLR(args, params, num_steps)
+
+    if args.distributed:
+        # DDP mode
+        model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model)
+        model = torch.nn.parallel.DistributedDataParallel(
+            module=model, device_ids=[args.local_rank], output_device=args.local_rank
+        )
+
+    best = 0  # best mAP seen so far
+    amp_scale = torch.amp.grad_scaler.GradScaler()
+    criterion = util.ComputeLoss(model, params)
+
+    with open("weights/step.csv", "w") as log:  # opened by every rank; only rank 0 writes rows
+        if args.local_rank == 0:
+            logger = csv.DictWriter(
+                log, fieldnames=["epoch", "box", "cls", "dfl", "Recall", "Precision", "mAP@50", "mAP"]
+            )
+            logger.writeheader()
+
+        for epoch in range(args.epochs):
+            model.train()
+            if args.distributed and sampler:
+                sampler.set_epoch(epoch)
+            if args.epochs - epoch == 10:  # switch mosaic off for the final 10 epochs
+                ds = cast(Dataset, loader.dataset)
+                ds.mosaic = False
+
+            p_bar = enumerate(loader)
+
+            if args.local_rank == 0:
+                print(("\n" + "%10s" * 5) % ("epoch", "memory", "box", "cls", "dfl"))
+                p_bar = tqdm.tqdm(p_bar, total=num_steps, ascii=" >-")
+
+            optimizer.zero_grad()
+            avg_box_loss = util.AverageMeter()
+            avg_cls_loss = util.AverageMeter()
+            avg_dfl_loss = util.AverageMeter()
+            for i, (samples, targets) in p_bar:
+                step = i + num_steps * epoch  # global iteration index across epochs
+                scheduler.step(step, optimizer)
+
+                samples = samples.cuda().float() / 255
+
+                # Forward
+                with autocast("cuda"):
+                    outputs = model(samples)  # forward
+                    loss_box, loss_cls, loss_dfl = criterion(outputs, targets)
+
+                avg_box_loss.update(loss_box.item(), samples.size(0))
+                avg_cls_loss.update(loss_cls.item(), samples.size(0))
+                avg_dfl_loss.update(loss_dfl.item(), samples.size(0))
+
+                loss_box *= args.batch_size  # loss scaled by batch_size
+                loss_cls *= args.batch_size  # loss scaled by batch_size
+                loss_dfl *= args.batch_size  # loss scaled by batch_size
+                loss_box *= args.world_size  # gradient averaged between devices in DDP mode
+                loss_cls *= args.world_size  # gradient averaged between devices in DDP mode
+                loss_dfl *= args.world_size  # gradient averaged between devices in DDP mode
+
+                # Backward
+                amp_scale.scale(loss_box + loss_cls + loss_dfl).backward()
+
+                # Optimize (only on every `accumulate`-th global step; gradients accumulate in between)
+                if step % accumulate == 0:
+                    # amp_scale.unscale_(optimizer)  # unscale gradients
+                    # util.clip_gradients(model)  # clip gradients
+                    amp_scale.step(optimizer)  # optimizer.step
+                    amp_scale.update()
+                    optimizer.zero_grad()
+                    if ema:
+                        ema.update(model)
+
+                torch.cuda.synchronize()
+
+                # Log
+                if args.local_rank == 0:
+                    memory = f"{torch.cuda.memory_reserved() / 1e9:.4g}G"  # (GB)
+                    s = ("%10s" * 2 + "%10.3g" * 3) % (
+                        f"{epoch + 1}/{args.epochs}",
+                        memory,
+                        avg_box_loss.avg,
+                        avg_cls_loss.avg,
+                        avg_dfl_loss.avg,
+                    )
+                    p_bar = cast(tqdm.tqdm, p_bar)
+                    p_bar.set_description(s)
+
+            if args.local_rank == 0:
+                # mAP of the EMA weights; test() returns (mAP, mAP@50, recall, precision)
+                last = test(args, params, ema.ema if ema else None)
+
+                logger.writerow(
+                    {
+                        "epoch": str(epoch + 1).zfill(3),
+                        "box": str(f"{avg_box_loss.avg:.3f}"),
+                        "cls": str(f"{avg_cls_loss.avg:.3f}"),
+                        "dfl": str(f"{avg_dfl_loss.avg:.3f}"),
+                        "mAP": str(f"{last[0]:.3f}"),
+                        "mAP@50": str(f"{last[1]:.3f}"),
+                        "Recall": str(f"{last[2]:.3f}"),
+                        "Precision": str(f"{last[3]:.3f}"),
+                    }
+                )
+                log.flush()
+
+                # Update best mAP
+                if last[0] > best:
+                    best = last[0]
+
+                # Save model (a deep copy of the EMA weights plus the 1-based epoch number)
+                save = {"epoch": epoch + 1, "model": copy.deepcopy(ema.ema if ema else None)}
+
+                # Save last, best and delete
+                torch.save(save, f="./weights/last.pt")
+                if best == last[0]:
+                    torch.save(save, f="./weights/best.pt")
+                del save
+
+    if args.local_rank == 0:
+        util.strip_optimizer("./weights/best.pt")  # strip optimizers
+        util.strip_optimizer("./weights/last.pt")  # strip optimizers
+
+
+@torch.no_grad()
+def test(args, params, model=None):
+    """Evaluate on val2017 and return (mAP, mAP@50, recall, precision); loads ./weights/best.pt if model is None."""
+    filenames = []
+    with open(f"{data_dir}/val2017.txt") as f:
+        for filename in f.readlines():
+            filename = os.path.basename(filename.rstrip())
+            filenames.append(f"{data_dir}/images/val2017/" + filename)
+
+    dataset = Dataset(filenames, args.input_size, params, augment=False)
+    loader = data.DataLoader(
+        dataset, batch_size=4, shuffle=False, num_workers=4, pin_memory=True, collate_fn=Dataset.collate_fn
+    )
+
+    plot = False
+    if model is None:  # explicit sentinel check; module truthiness can fall back to __len__
+        plot = True
+        model = torch.load(f="./weights/best.pt", map_location="cuda", weights_only=False)
+        model = model["model"].float().fuse()
+
+    model.half()
+    model.eval()
+
+    # Configure
+    iou_v = torch.linspace(start=0.5, end=0.95, steps=10).cuda()  # iou vector for mAP@0.5:0.95
+    n_iou = iou_v.numel()
+
+    m_pre = 0
+    m_rec = 0
+    map50 = 0
+    mean_ap = 0
+    metrics = []
+    p_bar = tqdm.tqdm(loader, desc=("%10s" * 5) % ("", "precision", "recall", "mAP50", "mAP"), ascii=" >-")
+    for samples, targets in p_bar:
+        samples = samples.cuda()
+        samples = samples.half()  # uint8 to fp16/32
+        samples = samples / 255.0  # 0 - 255 to 0.0 - 1.0
+        _, _, h, w = samples.shape  # batch-size, channels, height, width
+        scale = torch.tensor((w, h, w, h)).cuda()
+        # Inference
+        outputs = model(samples)
+        # NMS
+        outputs = util.non_max_suppression(outputs)
+        # Metrics
+        for i, output in enumerate(outputs):
+            # Ensure idx is a 1D boolean mask (squeeze any trailing dimension) to match cls/box shapes
+            idx = targets["idx"]
+            if idx.dim() > 1:
+                idx = idx.squeeze(-1)
+            idx = idx == i
+            # XXX: initially, the code was like below, which caused shape mismatch when idx has extra dimension
+            # idx = targets["idx"] == i
+            cls = targets["cls"][idx]
+            box = targets["box"][idx]
+
+            cls = cls.cuda()
+            box = box.cuda()
+
+            metric = torch.zeros(output.shape[0], n_iou, dtype=torch.bool).cuda()
+
+            if output.shape[0] == 0:
+                if cls.shape[0]:
+                    metrics.append((metric, *torch.zeros((2, 0)).cuda(), cls.squeeze(-1)))
+                continue
+            # Evaluate
+            if cls.shape[0]:
+                target = torch.cat(tensors=(cls, util.wh2xy(box) * scale), dim=1)
+                metric = util.compute_metric(output[:, :6], target, iou_v)
+            # Append
+            metrics.append((metric, output[:, 4], output[:, 5], cls.squeeze(-1)))
+
+    # Compute metrics
+    metrics = [torch.cat(x, dim=0).cpu().numpy() for x in zip(*metrics)]  # to numpy
+    if len(metrics) and metrics[0].any():
+        tp, fp, m_pre, m_rec, map50, mean_ap = util.compute_ap(*metrics, plot=plot, names=params["names"])
+    # Print results
+    print(("%10s" + "%10.3g" * 4) % ("", m_pre, m_rec, map50, mean_ap))
+    # Return results
+    model.float()  # for training
+    return mean_ap, map50, m_rec, m_pre
+
+
+def profile(args, params):  # print parameter and FLOP counts of the fused model (output on rank 0 only)
+    import thop  # third-party profiler, imported lazily inside this function
+
+    shape = (1, 3, args.input_size, args.input_size)
+    model = nn.yolo_v11_n(len(params["names"])).fuse()
+
+    model.eval()
+    model(torch.zeros(shape))  # one dry-run forward pass before profiling
+    # Count with thop; NOTE(review): the x2 below presumably converts thop's MAC count to FLOPs — confirm
+    x = torch.empty(shape)
+    flops, num_params = thop.profile(model, inputs=[x], verbose=False)
+    flops, num_params = thop.clever_format(nums=[2 * flops, num_params], format="%.3f")
+
+    if args.local_rank == 0:
+        print(f"Number of parameters: {num_params}")
+        print(f"Number of FLOPs: {flops}")
+
+
+def main():
+    """Parse CLI options, initialise DDP when WORLD_SIZE > 1, then profile the model and run --train / --test."""
+    parser = ArgumentParser()
+    parser.add_argument("--input-size", default=640, type=int)
+    parser.add_argument("--batch-size", default=32, type=int)
+    parser.add_argument("--local-rank", default=0, type=int)
+    parser.add_argument("--local_rank", default=0, type=int)
+    parser.add_argument("--epochs", default=600, type=int)
+    parser.add_argument("--train", action="store_true")
+    parser.add_argument("--test", action="store_true")
+
+    args = parser.parse_args()
+    # The launcher environment is authoritative: it overrides whatever the local-rank flags parsed to.
+    args.local_rank = int(os.getenv("LOCAL_RANK", 0))
+    args.world_size = int(os.getenv("WORLD_SIZE", 1))
+    args.distributed = args.world_size > 1
+
+    if args.distributed:
+        torch.cuda.set_device(device=args.local_rank)
+        torch.distributed.init_process_group(backend="nccl", init_method="env://")
+
+    if args.local_rank == 0:
+        os.makedirs("weights", exist_ok=True)  # no exists()-then-create window
+
+    with open("utils/args.yaml", errors="ignore") as f:
+        params = yaml.safe_load(f)
+
+    util.setup_seed()
+    util.setup_multi_processes()
+
+    profile(args, params)
+
+    if args.train:
+        train(args, params)
+    if args.test:
+        test(args, params)
+
+    # Clean
+    if args.distributed:
+        torch.distributed.destroy_process_group()
+    torch.cuda.empty_cache()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/main.sh b/main.sh
new file mode 100755
index 0000000..acfdd6b
--- /dev/null
+++ b/main.sh
@@ -0,0 +1,2 @@
+GPUS=$1  # first argument: number of processes (GPUs) to launch on this node
+python3 -m torch.distributed.run --nproc_per_node="$GPUS" main.py "${@:2}"
\ No newline at end of file
diff --git a/nets/nn.py b/nets/nn.py
new file mode 100755
index 0000000..ffa5373
--- /dev/null
+++ b/nets/nn.py
@@ -0,0 +1,357 @@
+import math
+
+import torch
+
+from utils.util import make_anchors
+
+
+def fuse_conv(conv, norm):
+    """Fold BatchNorm2d ``norm`` into Conv2d ``conv`` and return a new biased Conv2d without gradients."""
+    fused_conv = torch.nn.Conv2d(
+        conv.in_channels,
+        conv.out_channels,
+        kernel_size=conv.kernel_size,
+        stride=conv.stride,
+        padding=conv.padding,
+        dilation=conv.dilation,  # keep dilation so dilated convolutions fuse correctly
+        groups=conv.groups,
+        bias=True,
+    )
+    fused_conv = fused_conv.requires_grad_(False).to(conv.weight.device)
+
+    # BN scale gamma / sqrt(running_var + eps) as a diagonal matrix, applied to the flattened kernel and the bias
+    w_conv = conv.weight.clone().view(conv.out_channels, -1)
+    w_norm = torch.diag(norm.weight.div(torch.sqrt(norm.eps + norm.running_var)))
+    fused_conv.weight.copy_(torch.mm(w_norm, w_conv).view(fused_conv.weight.size()))
+
+    b_conv = torch.zeros(conv.weight.size(0), device=conv.weight.device) if conv.bias is None else conv.bias
+    b_norm = norm.bias - norm.weight.mul(norm.running_mean).div(torch.sqrt(norm.running_var + norm.eps))
+    fused_conv.bias.copy_(torch.mm(w_norm, b_conv.reshape(-1, 1)).reshape(-1) + b_norm)
+
+    return fused_conv
+
+
+class Conv(torch.nn.Module):
+    def __init__(self, in_ch, out_ch, activation, k=1, s=1, p=0, g=1):
+        super().__init__()
+        self.conv = torch.nn.Conv2d(in_ch, out_ch, k, s, p, groups=g, bias=False)
+        self.norm = torch.nn.BatchNorm2d(out_ch, eps=0.001, momentum=0.03)
+        self.relu = activation
+
+    def forward(self, x):
+        return self.relu(self.norm(self.conv(x)))
+
+    def fuse_forward(self, x):
+        return self.relu(self.conv(x))
+
+
+class Residual(torch.nn.Module):
+    def __init__(self, ch, e=0.5):
+        super().__init__()
+        self.conv1 = Conv(ch, int(ch * e), torch.nn.SiLU(), k=3, p=1)
+        self.conv2 = Conv(int(ch * e), ch, torch.nn.SiLU(), k=3, p=1)
+
+    def forward(self, x):
+        return x + self.conv2(self.conv1(x))
+
+
+class CSPModule(torch.nn.Module):
+    def __init__(self, in_ch, out_ch):
+        super().__init__()
+        self.conv1 = Conv(in_ch, out_ch // 2, torch.nn.SiLU())
+        self.conv2 = Conv(in_ch, out_ch // 2, torch.nn.SiLU())
+        self.conv3 = Conv(2 * (out_ch // 2), out_ch, torch.nn.SiLU())
+        self.res_m = torch.nn.Sequential(Residual(out_ch // 2, e=1.0), Residual(out_ch // 2, e=1.0))
+
+    def forward(self, x):
+        y = self.res_m(self.conv1(x))
+        return self.conv3(torch.cat((y, self.conv2(x)), dim=1))
+
+
+class CSP(torch.nn.Module):
+    def __init__(self, in_ch, out_ch, n, csp, r):
+        super().__init__()
+        self.conv1 = Conv(in_ch, 2 * (out_ch // r), torch.nn.SiLU())
+        self.conv2 = Conv((2 + n) * (out_ch // r), out_ch, torch.nn.SiLU())
+
+        if not csp:
+            self.res_m = torch.nn.ModuleList(Residual(out_ch // r) for _ in range(n))
+        else:
+            self.res_m = torch.nn.ModuleList(CSPModule(out_ch // r, out_ch // r) for _ in range(n))
+
+    def forward(self, x):
+        y = list(self.conv1(x).chunk(2, 1))
+        y.extend(m(y[-1]) for m in self.res_m)
+        return self.conv2(torch.cat(y, dim=1))
+
+
+class SPP(torch.nn.Module):
+    def __init__(self, in_ch, out_ch, k=5):
+        super().__init__()
+        self.conv1 = Conv(in_ch, in_ch // 2, torch.nn.SiLU())
+        self.conv2 = Conv(in_ch * 2, out_ch, torch.nn.SiLU())
+        self.res_m = torch.nn.MaxPool2d(k, stride=1, padding=k // 2)
+
+    def forward(self, x):
+        x = self.conv1(x)
+        y1 = self.res_m(x)
+        y2 = self.res_m(y1)
+        return self.conv2(torch.cat(tensors=[x, y1, y2, self.res_m(y2)], dim=1))
+
+
+class Attention(torch.nn.Module):
+    def __init__(self, ch, num_head):
+        super().__init__()
+        self.num_head = num_head
+        self.dim_head = ch // num_head  # value channels per head
+        self.dim_key = self.dim_head // 2  # query/key width is half the value width
+        self.scale = self.dim_key**-0.5  # 1 / sqrt(dim_key) scaling of the attention logits
+
+        self.qkv = Conv(ch, ch + self.dim_key * num_head * 2, torch.nn.Identity())  # fused q/k/v conv
+
+        self.conv1 = Conv(ch, ch, torch.nn.Identity(), k=3, p=1, g=ch)  # depthwise 3x3 applied to v
+        self.conv2 = Conv(ch, ch, torch.nn.Identity())  # final 1x1 conv on the summed result
+
+    def forward(self, x):
+        b, c, h, w = x.shape
+
+        qkv = self.qkv(x)
+        qkv = qkv.view(b, self.num_head, self.dim_key * 2 + self.dim_head, h * w)  # (b, heads, q+k+v, h*w)
+
+        q, k, v = qkv.split([self.dim_key, self.dim_key, self.dim_head], dim=2)
+
+        attn = (q.transpose(-2, -1) @ k) * self.scale  # (b, heads, h*w, h*w)
+        attn = attn.softmax(dim=-1)
+
+        x = (v @ attn.transpose(-2, -1)).view(b, c, h, w) + self.conv1(v.reshape(b, c, h, w))
+        return self.conv2(x)
+
+
+class PSABlock(torch.nn.Module):
+    def __init__(self, ch, num_head):
+        super().__init__()
+        self.conv1 = Attention(ch, num_head)
+        self.conv2 = torch.nn.Sequential(Conv(ch, ch * 2, torch.nn.SiLU()), Conv(ch * 2, ch, torch.nn.Identity()))
+
+    def forward(self, x):
+        x = x + self.conv1(x)
+        return x + self.conv2(x)
+
+
+class PSA(torch.nn.Module):  # splits the channels in two halves and runs the attention blocks on one of them
+    def __init__(self, ch, n):  # n blocks; heads are clamped to >= 1 because ch // 128 is 0 for ch < 128
+        super().__init__()
+        self.conv1 = Conv(ch, 2 * (ch // 2), torch.nn.SiLU())
+        self.conv2 = Conv(2 * (ch // 2), ch, torch.nn.SiLU())
+        self.res_m = torch.nn.Sequential(*(PSABlock(ch // 2, max(1, ch // 128)) for _ in range(n)))
+
+    def forward(self, x):
+        x, y = self.conv1(x).chunk(2, 1)
+        return self.conv2(torch.cat(tensors=(x, self.res_m(y)), dim=1))
+
+
+class DarkNet(torch.nn.Module):
+    """Backbone of five stride-2 stages; ``forward`` returns the p3, p4 and p5 feature maps."""
+
+    def __init__(self, width, depth, csp):
+        # width: channel counts, depth: block repeats, csp: flags forwarded to the CSP stages
+        super().__init__()
+        # p1/2
+        self.p1 = torch.nn.Sequential(Conv(width[0], width[1], torch.nn.SiLU(), k=3, s=2, p=1))
+        # p2/4
+        self.p2 = torch.nn.Sequential(
+            Conv(width[1], width[2], torch.nn.SiLU(), k=3, s=2, p=1),
+            CSP(width[2], width[3], depth[0], csp[0], r=4),
+        )
+        # p3/8
+        self.p3 = torch.nn.Sequential(
+            Conv(width[3], width[3], torch.nn.SiLU(), k=3, s=2, p=1),
+            CSP(width[3], width[4], depth[1], csp[0], r=4),
+        )
+        # p4/16
+        self.p4 = torch.nn.Sequential(
+            Conv(width[4], width[4], torch.nn.SiLU(), k=3, s=2, p=1),
+            CSP(width[4], width[4], depth[2], csp[1], r=2),
+        )
+        # p5/32
+        self.p5 = torch.nn.Sequential(
+            Conv(width[4], width[5], torch.nn.SiLU(), k=3, s=2, p=1),
+            CSP(width[5], width[5], depth[3], csp[1], r=2),
+            SPP(width[5], width[5]),
+            PSA(width[5], depth[4]),
+        )
+
+    def forward(self, x):
+        """Run the stages in sequence and return the outputs of the last three."""
+        p1 = self.p1(x)
+        p2 = self.p2(p1)
+        p3 = self.p3(p2)
+        p4 = self.p4(p3)
+        p5 = self.p5(p4)
+        return p3, p4, p5
+
+
+class DarkFPN(torch.nn.Module):
+    def __init__(self, width, depth, csp):
+        super().__init__()
+        self.up = torch.nn.Upsample(scale_factor=2)
+        self.h1 = CSP(width[4] + width[5], width[4], depth[5], csp[0], r=2)
+        self.h2 = CSP(width[4] + width[4], width[3], depth[5], csp[0], r=2)
+        self.h3 = Conv(width[3], width[3], torch.nn.SiLU(), k=3, s=2, p=1)
+        self.h4 = CSP(width[3] + width[4], width[4], depth[5], csp[0], r=2)
+        self.h5 = Conv(width[4], width[4], torch.nn.SiLU(), k=3, s=2, p=1)
+        self.h6 = CSP(width[4] + width[5], width[5], depth[5], csp[1], r=2)
+
+    def forward(self, x):
+        p3, p4, p5 = x
+        p4 = self.h1(torch.cat(tensors=[self.up(p5), p4], dim=1))
+        p3 = self.h2(torch.cat(tensors=[self.up(p4), p3], dim=1))
+        p4 = self.h4(torch.cat(tensors=[self.h3(p3), p4], dim=1))
+        p5 = self.h6(torch.cat(tensors=[self.h5(p4), p5], dim=1))
+        return p3, p4, p5
+
+
+class DFL(torch.nn.Module):
+    # Integral over the softmaxed bins of each box side, as used with Generalized Focal Loss
+    # https://ieeexplore.ieee.org/document/9792391
+    def __init__(self, ch=16):
+        super().__init__()
+        self.ch = ch
+        # frozen 1x1 conv whose weights are the bin indices 0..ch-1
+        self.conv = torch.nn.Conv2d(ch, out_channels=1, kernel_size=1, bias=False).requires_grad_(False)
+        self.conv.weight.data.copy_(torch.arange(ch, dtype=torch.float).view(1, ch, 1, 1))
+
+    def forward(self, x):
+        batch, _, num_anchors = x.shape
+        bins = x.view(batch, 4, self.ch, num_anchors).transpose(2, 1).softmax(1)
+        return self.conv(bins).view(batch, 4, num_anchors)
+
+
+class Head(torch.nn.Module):
+    anchors = torch.empty(0)
+    strides = torch.empty(0)
+
+    def __init__(self, nc=80, filters=()):
+        super().__init__()
+        self.ch = 16  # DFL channels
+        self.nc = nc  # number of classes
+        self.nl = len(filters)  # number of detection layers
+        self.no = nc + self.ch * 4  # number of outputs per anchor
+        self.stride = torch.zeros(self.nl)  # strides computed during build
+
+        box = max(64, filters[0] // 4)
+        cls = max(80, filters[0], self.nc)
+
+        self.dfl = DFL(self.ch)
+        self.box = torch.nn.ModuleList(
+            torch.nn.Sequential(
+                Conv(x, box, torch.nn.SiLU(), k=3, p=1),
+                Conv(box, box, torch.nn.SiLU(), k=3, p=1),
+                torch.nn.Conv2d(box, out_channels=4 * self.ch, kernel_size=1),
+            )
+            for x in filters
+        )
+        self.cls = torch.nn.ModuleList(
+            torch.nn.Sequential(
+                Conv(x, x, torch.nn.SiLU(), k=3, p=1, g=x),
+                Conv(x, cls, torch.nn.SiLU()),
+                Conv(cls, cls, torch.nn.SiLU(), k=3, p=1, g=cls),
+                Conv(cls, cls, torch.nn.SiLU()),
+                torch.nn.Conv2d(cls, out_channels=self.nc, kernel_size=1),
+            )
+            for x in filters
+        )
+
+    def forward(self, x):
+        for i, (box, cls) in enumerate(zip(self.box, self.cls)):
+            x[i] = torch.cat(tensors=(box(x[i]), cls(x[i])), dim=1)
+        if self.training:
+            return x
+
+        self.anchors, self.strides = (i.transpose(0, 1) for i in make_anchors(x, self.stride))
+        x = torch.cat([i.view(x[0].shape[0], self.no, -1) for i in x], dim=2)
+        box, cls = x.split(split_size=(4 * self.ch, self.nc), dim=1)
+
+        a, b = self.dfl(box).chunk(2, 1)
+        a = self.anchors.unsqueeze(0) - a
+        b = self.anchors.unsqueeze(0) + b
+        box = torch.cat(tensors=((a + b) / 2, b - a), dim=1)
+
+        return torch.cat(tensors=(box * self.strides, cls.sigmoid()), dim=1)
+
+    def initialize_biases(self):
+        # Initialize biases
+        # WARNING: requires stride availability
+        for box, cls, s in zip(self.box, self.cls, self.stride):
+            # box
+            box[-1].bias.data[:] = 1.0
+            # cls (.01 objects, 80 classes, 640 image)
+            cls[-1].bias.data[: self.nc] = math.log(5 / self.nc / (640 / s) ** 2)
+
+
+class YOLO(torch.nn.Module):
+    def __init__(self, width, depth, csp, num_classes):
+        super().__init__()
+        self.net = DarkNet(width, depth, csp)
+        self.fpn = DarkFPN(width, depth, csp)
+
+        img_dummy = torch.zeros(1, width[0], 256, 256)
+        self.head = Head(num_classes, (width[3], width[4], width[5]))
+        self.head.stride = torch.tensor([256 / x.shape[-2] for x in self.forward(img_dummy)])
+        self.stride = self.head.stride
+        self.head.initialize_biases()
+
+    def forward(self, x):
+        x = self.net(x)
+        x = self.fpn(x)
+        return self.head(list(x))
+
+    def fuse(self):
+        for m in self.modules():
+            if type(m) is Conv and hasattr(m, "norm"):
+                m.conv = fuse_conv(m.conv, m.norm)
+                m.forward = m.fuse_forward
+                delattr(m, "norm")
+        return self
+
+
+def yolo_v11_n(num_classes: int = 80):
+    # Build the "n" configuration: channel widths up to 256, one repeat per stage
+    return YOLO(
+        width=[3, 16, 32, 64, 128, 256], depth=[1, 1, 1, 1, 1, 1], csp=[False, True], num_classes=num_classes
+    )
+
+
+def yolo_v11_t(num_classes: int = 80):
+    # Build the "t" configuration: channel widths up to 384, one repeat per stage
+    return YOLO(
+        width=[3, 24, 48, 96, 192, 384], depth=[1, 1, 1, 1, 1, 1], csp=[False, True], num_classes=num_classes
+    )
+
+
+def yolo_v11_s(num_classes: int = 80):
+    # Build the "s" configuration: channel widths up to 512, one repeat per stage
+    return YOLO(
+        width=[3, 32, 64, 128, 256, 512], depth=[1, 1, 1, 1, 1, 1], csp=[False, True], num_classes=num_classes
+    )
+
+
+def yolo_v11_m(num_classes: int = 80):
+    # Build the "m" configuration: channel widths up to 512, one repeat per stage, both csp flags on
+    return YOLO(
+        width=[3, 64, 128, 256, 512, 512], depth=[1, 1, 1, 1, 1, 1], csp=[True, True], num_classes=num_classes
+    )
+
+
+def yolo_v11_l(num_classes: int = 80):
+    # Build the "l" configuration: channel widths up to 512, two repeats per stage, both csp flags on
+    return YOLO(
+        width=[3, 64, 128, 256, 512, 512], depth=[2, 2, 2, 2, 2, 2], csp=[True, True], num_classes=num_classes
+    )
+
+
+def yolo_v11_x(num_classes: int = 80):
+    # Build the "x" configuration: channel widths up to 768, two repeats per stage, both csp flags on
+    return YOLO(
+        width=[3, 96, 192, 384, 768, 768], depth=[2, 2, 2, 2, 2, 2], csp=[True, True], num_classes=num_classes
+    )
diff --git a/utils/args.yaml b/utils/args.yaml
new file mode 100755
index 0000000..a63baea
--- /dev/null
+++ b/utils/args.yaml
@@ -0,0 +1,100 @@
+min_lr: 0.000100000000            # initial learning rate
+max_lr: 0.010000000000            # maximum learning rate
+momentum: 0.9370000000            # SGD momentum/Adam beta1
+weight_decay: 0.000500            # optimizer weight decay
+warmup_epochs: 3.00000            # warmup epochs
+box: 7.500000000000000            # box loss gain
+cls: 0.500000000000000            # cls loss gain
+dfl: 1.500000000000000            # dfl loss gain
+hsv_h: 0.0150000000000            # image HSV-Hue augmentation (fraction)
+hsv_s: 0.7000000000000            # image HSV-Saturation augmentation (fraction)
+hsv_v: 0.4000000000000            # image HSV-Value augmentation (fraction)
+degrees: 0.00000000000            # image rotation (+/- deg)
+translate: 0.100000000            # image translation (+/- fraction)
+scale: 0.5000000000000            # image scale (+/- gain)
+shear: 0.0000000000000            # image shear (+/- deg)
+flip_ud: 0.00000000000            # image flip up-down (probability)
+flip_lr: 0.50000000000            # image flip left-right (probability)
+mosaic: 1.000000000000            # image mosaic (probability)
+mix_up: 0.000000000000            # image mix-up (probability)
+names:
+  0: person
+  1: bicycle
+  2: car
+  3: motorcycle
+  4: airplane
+  5: bus
+  6: train
+  7: truck
+  8: boat
+  9: traffic light
+  10: fire hydrant
+  11: stop sign
+  12: parking meter
+  13: bench
+  14: bird
+  15: cat
+  16: dog
+  17: horse
+  18: sheep
+  19: cow
+  20: elephant
+  21: bear
+  22: zebra
+  23: giraffe
+  24: backpack
+  25: umbrella
+  26: handbag
+  27: tie
+  28: suitcase
+  29: frisbee
+  30: skis
+  31: snowboard
+  32: sports ball
+  33: kite
+  34: baseball bat
+  35: baseball glove
+  36: skateboard
+  37: surfboard
+  38: tennis racket
+  39: bottle
+  40: wine glass
+  41: cup
+  42: fork
+  43: knife
+  44: spoon
+  45: bowl
+  46: banana
+  47: apple
+  48: sandwich
+  49: orange
+  50: broccoli
+  51: carrot
+  52: hot dog
+  53: pizza
+  54: donut
+  55: cake
+  56: chair
+  57: couch
+  58: potted plant
+  59: bed
+  60: dining table
+  61: toilet
+  62: tv
+  63: laptop
+  64: mouse
+  65: remote
+  66: keyboard
+  67: cell phone
+  68: microwave
+  69: oven
+  70: toaster
+  71: sink
+  72: refrigerator
+  73: book
+  74: clock
+  75: vase
+  76: scissors
+  77: teddy bear
+  78: hair drier
+  79: toothbrush
diff --git a/utils/dataset.py b/utils/dataset.py
new file mode 100644
index 0000000..f6eaea2
--- /dev/null
+++ b/utils/dataset.py
@@ -0,0 +1,415 @@
+import math
+import os
+import random
+
+import cv2
+import numpy
+import torch
+from PIL import Image
+from torch.utils import data
+
+FORMATS = "bmp", "dng", "jpeg", "jpg", "mpo", "png", "tif", "tiff", "webp"
+
+
+class Dataset(data.Dataset):
+    def __init__(self, filenames, input_size, params, augment):
+        self.params = params
+        self.mosaic = augment
+        self.augment = augment
+        self.input_size = input_size
+
+        # Read labels
+        labels = self.load_label(filenames)
+        self.labels = list(labels.values())
+        self.filenames = list(labels.keys())  # update
+        self.n = len(self.filenames)  # number of samples
+        self.indices = range(self.n)
+        # Albumentations (optional, only used if package is installed)
+        self.albumentations = Albumentations()
+
+    def __getitem__(self, index):
+        index = self.indices[index]
+
+        if self.mosaic and random.random() < self.params["mosaic"]:
+            # Load MOSAIC
+            image, label = self.load_mosaic(index, self.params)
+            # MixUp augmentation
+            if random.random() < self.params["mix_up"]:
+                index = random.choice(self.indices)
+                mix_image1, mix_label1 = image, label
+                mix_image2, mix_label2 = self.load_mosaic(index, self.params)
+
+                image, label = mix_up(mix_image1, mix_label1, mix_image2, mix_label2)
+        else:
+            # Load image
+            image, shape = self.load_image(index)
+            h, w = image.shape[:2]
+
+            # Resize
+            image, ratio, pad = resize(image, self.input_size, self.augment)
+
+            label = self.labels[index].copy()
+            if label.size:
+                label[:, 1:] = wh2xy(label[:, 1:], ratio[0] * w, ratio[1] * h, pad[0], pad[1])
+            if self.augment:
+                image, label = random_perspective(image, label, self.params)
+
+        nl = len(label)  # number of labels
+        h, w = image.shape[:2]
+        cls = label[:, 0:1]
+        box = label[:, 1:5]
+        box = xy2wh(box, w, h)
+
+        if self.augment:
+            # Albumentations
+            image, box, cls = self.albumentations(image, box, cls)
+            nl = len(box)  # update after albumentations
+            # HSV color-space
+            augment_hsv(image, self.params)
+            # Flip up-down
+            if random.random() < self.params["flip_ud"]:
+                image = numpy.flipud(image)
+                if nl:
+                    box[:, 1] = 1 - box[:, 1]
+            # Flip left-right
+            if random.random() < self.params["flip_lr"]:
+                image = numpy.fliplr(image)
+                if nl:
+                    box[:, 0] = 1 - box[:, 0]
+
+        # XXX: when nl=0, torch.from_numpy(box) will error
+        if nl:
+            target_cls = torch.from_numpy(cls).view(-1, 1).float()  # always (N,1)
+            target_box = torch.from_numpy(box).reshape(-1, 4).float()  # always (N,4)
+        else:
+            target_cls = torch.zeros((0, 1), dtype=torch.float32)
+            target_box = torch.zeros((0, 4), dtype=torch.float32)
+        # target_cls = torch.zeros((nl, 1))
+        # target_box = torch.zeros((nl, 4))
+        # if nl:
+        #     target_cls = torch.from_numpy(cls)
+        #     target_box = torch.from_numpy(box)
+
+        # Convert HWC to CHW, BGR to RGB
+        sample = image.transpose((2, 0, 1))[::-1]
+        sample = numpy.ascontiguousarray(sample)
+
+        # return torch.from_numpy(sample), target_cls, target_box, torch.zeros(nl)
+        return torch.from_numpy(sample), target_cls, target_box, torch.zeros((nl, 1), dtype=torch.long)
+
+    def __len__(self):
+        return len(self.filenames)
+
+    def load_image(self, i):
+        image = cv2.imread(self.filenames[i])
+        if image is None:
+            raise FileNotFoundError(f"Image Not Found {self.filenames[i]}")
+        h, w = image.shape[:2]
+        r = self.input_size / max(h, w)
+        if r != 1:
+            image = cv2.resize(
+                image, dsize=(int(w * r), int(h * r)), interpolation=resample() if self.augment else cv2.INTER_LINEAR
+            )
+        return image, (h, w)
+
+    def load_mosaic(self, index, params):
+        """Compose a 2x-sized mosaic from this sample and three random ones, then apply random_perspective."""
+        label4 = []
+        border = [-self.input_size // 2, -self.input_size // 2]
+        image4 = numpy.full((self.input_size * 2, self.input_size * 2, 3), 0, dtype=numpy.uint8)
+
+        xc = int(random.uniform(-border[0], 2 * self.input_size + border[1]))
+        yc = int(random.uniform(-border[0], 2 * self.input_size + border[1]))
+
+        indices = [index] + random.choices(self.indices, k=3)
+        random.shuffle(indices)
+
+        for i, idx in enumerate(indices):
+            # Load image
+            image, _ = self.load_image(idx)
+            shape = image.shape
+            if i == 0:  # top left
+                x1a = max(xc - shape[1], 0)
+                y1a = max(yc - shape[0], 0)
+                x2a = xc
+                y2a = yc
+                x1b = shape[1] - (x2a - x1a)
+                y1b = shape[0] - (y2a - y1a)
+                x2b = shape[1]
+                y2b = shape[0]
+            elif i == 1:  # top right
+                x1a = xc
+                y1a = max(yc - shape[0], 0)
+                x2a = min(xc + shape[1], self.input_size * 2)
+                y2a = yc
+                x1b = 0
+                y1b = shape[0] - (y2a - y1a)
+                x2b = min(shape[1], x2a - x1a)
+                y2b = shape[0]
+            elif i == 2:  # bottom left
+                x1a = max(xc - shape[1], 0)
+                y1a = yc
+                x2a = xc
+                y2a = min(self.input_size * 2, yc + shape[0])
+                x1b = shape[1] - (x2a - x1a)
+                y1b = 0
+                x2b = shape[1]
+                y2b = min(y2a - y1a, shape[0])
+            else:  # bottom right
+                x1a = xc
+                y1a = yc
+                x2a = min(xc + shape[1], self.input_size * 2)
+                y2a = min(self.input_size * 2, yc + shape[0])
+                x1b = 0
+                y1b = 0
+                x2b = min(shape[1], x2a - x1a)
+                y2b = min(y2a - y1a, shape[0])
+
+            pad_w = x1a - x1b
+            pad_h = y1a - y1b
+            image4[y1a:y2a, x1a:x2a] = image[y1b:y2b, x1b:x2b]
+
+            # Labels
+            label = self.labels[idx].copy()
+            if len(label):
+                label[:, 1:] = wh2xy(label[:, 1:], shape[1], shape[0], pad_w, pad_h)
+            label4.append(label)
+
+        # Concat/clip labels
+        label4 = numpy.concatenate(label4, 0)
+        # clip every coordinate column to the mosaic canvas in one vectorized, in-place call
+        numpy.clip(label4[:, 1:], 0, 2 * self.input_size, out=label4[:, 1:])
+
+        # Augment
+        image4, label4 = random_perspective(image4, label4, params, border)
+
+        return image4, label4
+
+    @staticmethod
+    def collate_fn(batch):
+        """Merge per-sample tuples into a stacked image batch and a dict of concatenated targets."""
+        samples, cls, box, indices = zip(*batch)
+
+        cls = torch.cat(cls, dim=0)
+        box = torch.cat(box, dim=0)
+
+        # "idx" holds, for every target row, the position of its image inside the batch;
+        # built out of place so the tensors owned by the samples are not mutated
+        indices = torch.cat([idx + i for i, idx in enumerate(indices)], dim=0)
+
+        targets = {"cls": cls, "box": box, "idx": indices}
+        return torch.stack(samples, dim=0), targets
+
+    @staticmethod
+    def load_label(filenames):
+        """Verify images, parse their label .txt files and cache the {filename: (n, 5) array} mapping."""
+        path = f"{os.path.dirname(filenames[0])}.cache"
+        if os.path.exists(path):
+            # SECURITY: weights_only=False unpickles arbitrary objects; only load caches you created
+            return torch.load(path, weights_only=False)
+        a = f"{os.sep}images{os.sep}"
+        b = f"{os.sep}labels{os.sep}"
+        x = {}
+        for filename in filenames:
+            label = numpy.zeros((0, 5), dtype=numpy.float32)  # default: image without annotations
+            try:
+                # verify images
+                with open(filename, "rb") as f:
+                    image = Image.open(f)
+                    image.verify()  # PIL verify
+                shape = image.size  # image size
+                assert (shape[0] > 9) & (shape[1] > 9), f"image size {shape} <10 pixels"
+                assert image.format.lower() in FORMATS, f"invalid image format {image.format}"
+
+                # verify labels: .../images/<name>.<ext> maps to .../labels/<name>.txt
+                label_path = b.join(filename.rsplit(a, 1)).rsplit(".", 1)[0] + ".txt"
+                if os.path.isfile(label_path):
+                    with open(label_path) as f:
+                        rows = [line.split() for line in f.read().strip().splitlines() if len(line)]
+                    rows = numpy.array(rows, dtype=numpy.float32)
+                    nl = len(rows)
+                    if nl:
+                        assert (rows >= 0).all()
+                        assert rows.shape[1] == 5
+                        assert (rows[:, 1:] <= 1).all()
+                        _, i = numpy.unique(rows, axis=0, return_index=True)
+                        label = rows[i] if len(i) < nl else rows  # drop duplicate rows
+            except FileNotFoundError:
+                # NOTE(review): in practice raised by the image open above, so a missing image is still registered
+                label = numpy.zeros((0, 5), dtype=numpy.float32)
+            except AssertionError:
+                continue
+            x[filename] = label
+        torch.save(x, path)
+        return x
+
+
+def wh2xy(x, w=640, h=640, pad_w=0, pad_h=0):
+    # Map nx4 boxes from normalized [x, y, w, h] to corners [x1, y1, x2, y2]
+    # (top-left, bottom-right), scaled by (w, h) and shifted by (pad_w, pad_h)
+    corners = numpy.copy(x)
+    for col, (size, pad) in enumerate(((w, pad_w), (h, pad_h))):
+        half = x[:, col + 2] / 2
+        corners[:, col] = size * (x[:, col] - half) + pad  # top left x / y
+        corners[:, col + 2] = size * (x[:, col] + half) + pad  # bottom right x / y
+    return corners
+
+
+def xy2wh(x, w, h):
+    # NOTE: x is clipped in place to [0, w - 1e-3] x [0, h - 1e-3] before the conversion
+    x[:, [0, 2]] = x[:, [0, 2]].clip(0, w - 1e-3)  # x1, x2
+    x[:, [1, 3]] = x[:, [1, 3]].clip(0, h - 1e-3)  # y1, y2
+
+    # Convert nx4 boxes from corners [x1, y1, x2, y2] (top-left, bottom-right)
+    # to normalized [x, y, w, h] (centre and size)
+    x1, y1, x2, y2 = x[:, 0], x[:, 1], x[:, 2], x[:, 3]
+    out = numpy.copy(x)
+    out[:, 0] = (x1 + x2) / 2 / w  # x center
+    out[:, 1] = (y1 + y2) / 2 / h  # y center
+    out[:, 2], out[:, 3] = (x2 - x1) / w, (y2 - y1) / h  # width, height
+    return out
+
+
+def resample():
+    choices = (cv2.INTER_AREA, cv2.INTER_CUBIC, cv2.INTER_LINEAR, cv2.INTER_NEAREST, cv2.INTER_LANCZOS4)
+    return random.choice(seq=choices)
+
+
+def augment_hsv(image, params):
+    # HSV color-space augmentation
+    h = params["hsv_h"]
+    s = params["hsv_s"]
+    v = params["hsv_v"]
+
+    r = numpy.random.uniform(-1, 1, 3) * [h, s, v] + 1
+    h, s, v = cv2.split(cv2.cvtColor(image, cv2.COLOR_BGR2HSV))
+
+    x = numpy.arange(0, 256, dtype=r.dtype)
+    lut_h = ((x * r[0]) % 180).astype("uint8")
+    lut_s = numpy.clip(x * r[1], 0, 255).astype("uint8")
+    lut_v = numpy.clip(x * r[2], 0, 255).astype("uint8")
+
+    hsv = cv2.merge((cv2.LUT(h, lut_h), cv2.LUT(s, lut_s), cv2.LUT(v, lut_v)))
+    cv2.cvtColor(hsv, cv2.COLOR_HSV2BGR, dst=image)  # no return needed
+
+
+def resize(image, input_size, augment):
+    # Resize and pad image while meeting stride-multiple constraints
+    shape = image.shape[:2]  # current shape [height, width]
+
+    # Scale ratio (new / old)
+    r = min(input_size / shape[0], input_size / shape[1])
+    if not augment:  # only scale down, do not scale up (for better val mAP)
+        r = min(r, 1.0)
+
+    # Compute padding
+    pad = int(round(shape[1] * r)), int(round(shape[0] * r))
+    w = (input_size - pad[0]) / 2
+    h = (input_size - pad[1]) / 2
+
+    if shape[::-1] != pad:  # resize
+        image = cv2.resize(image, dsize=pad, interpolation=resample() if augment else cv2.INTER_LINEAR)
+    top, bottom = int(round(h - 0.1)), int(round(h + 0.1))
+    left, right = int(round(w - 0.1)), int(round(w + 0.1))
+    image = cv2.copyMakeBorder(image, top, bottom, left, right, cv2.BORDER_CONSTANT)  # add border
+    return image, (r, r), (w, h)
+
+
+def candidates(box1, box2):
+    # box1(4,n) reference boxes, box2(4,n) transformed boxes; returns a boolean keep-mask over the n boxes
+    w1, h1 = box1[2] - box1[0], box1[3] - box1[1]
+    w2, h2 = box2[2] - box2[0], box2[3] - box2[1]
+    aspect_ratio = numpy.maximum(w2 / (h2 + 1e-16), h2 / (w2 + 1e-16))  # aspect ratio
+    return (w2 > 2) & (h2 > 2) & (w2 * h2 / (w1 * h1 + 1e-16) > 0.1) & (aspect_ratio < 100)
+
+
+def random_perspective(image, label, params, border=(0, 0)):
+    h = image.shape[0] + border[0] * 2
+    w = image.shape[1] + border[1] * 2
+
+    # Center
+    center = numpy.eye(3)
+    center[0, 2] = -image.shape[1] / 2  # x translation (pixels)
+    center[1, 2] = -image.shape[0] / 2  # y translation (pixels)
+
+    # Perspective
+    perspective = numpy.eye(3)
+
+    # Rotation and Scale
+    rotate = numpy.eye(3)
+    a = random.uniform(-params["degrees"], params["degrees"])
+    s = random.uniform(1 - params["scale"], 1 + params["scale"])
+    rotate[:2] = cv2.getRotationMatrix2D(angle=a, center=(0, 0), scale=s)
+
+    # Shear
+    shear = numpy.eye(3)
+    shear[0, 1] = math.tan(random.uniform(-params["shear"], params["shear"]) * math.pi / 180)
+    shear[1, 0] = math.tan(random.uniform(-params["shear"], params["shear"]) * math.pi / 180)
+
+    # Translation
+    translate = numpy.eye(3)
+    translate[0, 2] = random.uniform(0.5 - params["translate"], 0.5 + params["translate"]) * w
+    translate[1, 2] = random.uniform(0.5 - params["translate"], 0.5 + params["translate"]) * h
+
+    # Combined rotation matrix, order of operations (right to left) is IMPORTANT
+    matrix = translate @ shear @ rotate @ perspective @ center
+    if (border[0] != 0) or (border[1] != 0) or (matrix != numpy.eye(3)).any():  # image changed
+        image = cv2.warpAffine(image, matrix[:2], dsize=(w, h), borderValue=(0, 0, 0))
+
+    # Transform label coordinates
+    n = len(label)
+    if n:
+        xy = numpy.ones((n * 4, 3))
+        xy[:, :2] = label[:, [1, 2, 3, 4, 1, 4, 3, 2]].reshape(n * 4, 2)  # x1y1, x2y2, x1y2, x2y1
+        xy = xy @ matrix.T  # transform
+        xy = xy[:, :2].reshape(n, 8)  # perspective rescale or affine
+
+        # create new boxes
+        x = xy[:, [0, 2, 4, 6]]
+        y = xy[:, [1, 3, 5, 7]]
+        box = numpy.concatenate((x.min(1), y.min(1), x.max(1), y.max(1))).reshape(4, n).T
+
+        # clip
+        box[:, [0, 2]] = box[:, [0, 2]].clip(0, w)
+        box[:, [1, 3]] = box[:, [1, 3]].clip(0, h)
+        # filter candidates
+        indices = candidates(box1=label[:, 1:5].T * s, box2=box.T)
+
+        label = label[indices]
+        label[:, 1:5] = box[indices]
+
+    return image, label
+
+
+def mix_up(image1, label1, image2, label2):
+    # MixUp augmentation (https://arxiv.org/pdf/1710.09412.pdf): blend both images, keep all labels
+    ratio = numpy.random.beta(a=32.0, b=32.0)  # weight of image1, drawn from Beta(32, 32)
+    blended = image1 * ratio + image2 * (1 - ratio)
+    merged = numpy.concatenate((label1, label2), 0)
+    return blended.astype(numpy.uint8), merged
+
+
+class Albumentations:
+    def __init__(self):
+        self.transform = None
+        try:
+            import albumentations
+
+            transforms = [
+                albumentations.Blur(p=0.01),
+                albumentations.CLAHE(p=0.01),
+                albumentations.ToGray(p=0.01),
+                albumentations.MedianBlur(p=0.01),
+            ]
+            self.transform = albumentations.Compose(transforms, albumentations.BboxParams("yolo", ["class_labels"]))
+
+        except ImportError:  # package not installed, skip
+            pass
+
+    def __call__(self, image, box, cls):
+        if self.transform:
+            x = self.transform(image=image, bboxes=box, class_labels=cls)
+            image = x["image"]
+            box = numpy.array(x["bboxes"])
+            cls = numpy.array(x["class_labels"])
+        return image, box, cls
diff --git a/utils/util.py b/utils/util.py
new file mode 100644
index 0000000..3449e05
--- /dev/null
+++ b/utils/util.py
@@ -0,0 +1,777 @@
+import copy
+import random
+from time import time
+
+import math
+import numpy
+import torch
+import torchvision
+from torch.nn.functional import cross_entropy
+
+
+def setup_seed():
+    """
+    Seed the Python, NumPy and PyTorch random generators with 0 and request deterministic cuDNN behaviour.
+    """
+    for seed_fn in (random.seed, numpy.random.seed, torch.manual_seed):
+        seed_fn(0)
+    # cuDNN: fixed algorithms instead of auto-tuned ones
+    torch.backends.cudnn.deterministic = True
+    torch.backends.cudnn.benchmark = False
+
+
+def setup_multi_processes():
+    """
+    Configure the process start method and thread counts for multi-process training.
+
+    Non-Windows hosts are forced to the `fork` start method, OpenCV threading is set to 0, and
+    OMP_NUM_THREADS / MKL_NUM_THREADS default to "1" unless the environment already defines them.
+    """
+    import os
+    import platform
+
+    import cv2
+
+    # set multiprocess start method as `fork` to speed up the training
+    if platform.system() != "Windows":
+        torch.multiprocessing.set_start_method("fork", force=True)
+
+    # disable opencv multithreading to avoid system being overloaded
+    cv2.setNumThreads(0)
+
+    # keep user-provided OMP / MKL thread settings, otherwise pin both to a single thread
+    for variable in ("OMP_NUM_THREADS", "MKL_NUM_THREADS"):
+        os.environ.setdefault(variable, "1")
+
+
+def export_onnx(args):
+    """Export ./weights/best.pt to ./weights/best.onnx (opset 12) and validate the file with the ONNX checker.
+
+    `args.input_size` sets the square resolution of the dummy input; axes 0 and 1 of "outputs" are exported dynamic.
+    """
+    import onnx  # noqa
+
+    path = "./weights/best.onnx"
+    # map_location="cpu": the export runs on the CPU, so a GPU-saved checkpoint must also load on hosts without CUDA
+    m = torch.load("./weights/best.pt", map_location="cpu", weights_only=False)["model"].float()
+    x = torch.zeros((1, 3, args.input_size, args.input_size))
+
+    torch.onnx.export(
+        m.cpu(),
+        (x.cpu(),),
+        f=path,
+        verbose=False,
+        opset_version=12,
+        # WARNING: DNN inference with torch>=1.12 may require do_constant_folding=False
+        do_constant_folding=True,
+        input_names=["images"],
+        output_names=["outputs"],
+        dynamic_axes={"outputs": {0: "batch", 1: "anchors"}},
+    )
+
+    model_onnx = onnx.load(path)  # load onnx model
+    onnx.checker.check_model(model_onnx)  # check onnx model
+    onnx.save(model_onnx, path)
+    # Inference example
+    # https://github.com/ultralytics/ultralytics/blob/main/ultralytics/nn/autobackend.py
+
+
+def wh2xy(x):
+    """Convert rows of (cx, cy, w, h, ...) to (x1, y1, x2, y2, ...) on a copy; accepts torch tensors and numpy arrays."""
+    out = x.clone() if isinstance(x, torch.Tensor) else numpy.copy(x)
+    half = x[:, 2:4] / 2  # half width, half height
+    out[:, 0:2] = x[:, 0:2] - half  # top left x, y
+    out[:, 2:4] = x[:, 0:2] + half  # bottom right x, y
+    return out
+
+
+def make_anchors(x, strides, offset=0.5):
+    """Return (anchor centres in grid units [sum(h*w), 2], per-anchor stride [sum(h*w), 1]) for the feature maps in x."""
+    assert x is not None
+    anchor_tensor, stride_tensor, dtype, device = [], [], x[0].dtype, x[0].device  # every level uses x[0]'s dtype/device
+    for i, stride in enumerate(strides):
+        _, _, h, w = x[i].shape
+        sx = torch.arange(end=w, device=device, dtype=dtype) + offset  # shift x
+        sy = torch.arange(end=h, device=device, dtype=dtype) + offset  # shift y
+        sy, sx = torch.meshgrid(sy, sx, indexing="ij")  # explicit "ij" keeps the row-major layout and avoids the warning
+        anchor_tensor.append(torch.stack((sx, sy), -1).view(-1, 2))
+        stride_tensor.append(torch.full((h * w, 1), stride, dtype=dtype, device=device))
+    return torch.cat(anchor_tensor), torch.cat(stride_tensor)
+
+
+def compute_metric(output, target, iou_v):
+    """Return a bool tensor [num detections, num thresholds]: True where a detection matches a same-class target."""
+    (a1, a2) = target[:, 1:].unsqueeze(1).chunk(2, 2)  # target columns after the class, split into two corner halves
+    (b1, b2) = output[:, :4].unsqueeze(0).chunk(2, 2)  # first four output columns, split into two corner halves
+    intersection = (torch.min(a2, b2) - torch.max(a1, b1)).clamp(0).prod(2)  # (N,M) = (rb - lt).clamp(0).prod(2)
+    # IoU = intersection / (area1 + area2 - intersection)
+    iou = intersection / ((a2 - a1).prod(2) + (b2 - b1).prod(2) - intersection + 1e-7)
+
+    correct = numpy.zeros((output.shape[0], iou_v.shape[0]))  # one row per detection, one column per threshold
+    correct = correct.astype(bool)
+    for i in range(len(iou_v)):
+        # IoU >= threshold and classes match (class: target column 0, output column 5)
+        x = torch.where((iou >= iou_v[i]) & (target[:, 0:1] == output[:, 5]))
+        if x[0].shape[0]:
+            matches = torch.cat((torch.stack(x, 1), iou[x[0], x[1]][:, None]), 1).cpu().numpy()  # [label, detect, iou]
+            if x[0].shape[0] > 1:
+                matches = matches[matches[:, 2].argsort()[::-1]]  # highest IoU first
+                matches = matches[numpy.unique(matches[:, 1], return_index=True)[1]]  # best-IoU row per detection
+                matches = matches[numpy.unique(matches[:, 0], return_index=True)[1]]  # then a single row per label
+            correct[matches[:, 1].astype(int), i] = True
+    return torch.tensor(correct, dtype=torch.bool, device=output.device)
+
+
+def non_max_suppression(outputs, confidence_threshold=0.001, iou_threshold=0.65):
+    """Class-aware NMS per image; returns a list of [n, 6] tensors with rows (x1, y1, x2, y2, confidence, class)."""
+    max_wh = 7680  # offset multiplier that separates classes during NMS (see "Batched NMS" below)
+    max_det = 300  # detections kept per image after NMS
+    max_nms = 30000  # highest-confidence boxes handed to NMS per image
+
+    bs = outputs.shape[0]  # batch size
+    nc = outputs.shape[1] - 4  # number of classes
+    xc = outputs[:, 4 : 4 + nc].amax(1) > confidence_threshold  # candidates
+
+    start = time()
+    limit = 0.5 + 0.05 * bs  # seconds to quit after
+    output = [torch.zeros((0, 6), device=outputs.device)] * bs  # default per image: no detections
+    for index, x in enumerate(outputs):  # image index, image inference
+        x = x.transpose(0, -1)[xc[index]]  # one row per prediction, candidates only
+
+        # If none remain process next image
+        if not x.shape[0]:
+            continue
+
+        # matrix nx6 (box, confidence, cls)
+        box, cls = x.split((4, nc), 1)
+        box = wh2xy(box)  # (cx, cy, w, h) to (x1, y1, x2, y2)
+        if nc > 1:  # multi-label: one output row per (prediction, class) pair above the threshold
+            i, j = (cls > confidence_threshold).nonzero(as_tuple=False).T
+            x = torch.cat((box[i], x[i, 4 + j, None], j[:, None].float()), 1)
+        else:  # best class only
+            conf, j = cls.max(1, keepdim=True)
+            x = torch.cat((box, conf, j.float()), 1)[conf.view(-1) > confidence_threshold]
+
+        # Check shape
+        n = x.shape[0]  # number of boxes
+        if not n:  # no boxes
+            continue
+        x = x[x[:, 4].argsort(descending=True)[:max_nms]]  # sort by confidence and remove excess boxes
+
+        # Batched NMS
+        c = x[:, 5:6] * max_wh  # class index scaled into a coordinate offset
+        boxes, scores = x[:, :4] + c, x[:, 4]  # boxes shifted per class (keeps classes apart below max_wh), scores
+        indices = torchvision.ops.nms(boxes, scores, iou_threshold)  # NMS
+        indices = indices[:max_det]  # limit detections
+
+        output[index] = x[indices]
+        if (time() - start) > limit:
+            break  # time limit exceeded; images not reached yet keep the empty default
+
+    return output
+
+
+def smooth(y, f=0.1):
+    """Box-filter `y` with a window of roughly `len(y) * f` samples, edge-padded so the output keeps the length of `y`."""
+    nf = (round(len(y) * f * 2) // 2 + 1) | 1  # filter size; `| 1` bumps an even size to odd so lengths match
+    p = numpy.ones(nf // 2)  # ones padding
+    yp = numpy.concatenate((p * y[0], y, p * y[-1]), 0)  # y padded
+    return numpy.convolve(yp, numpy.ones(nf) / nf, mode="valid")  # y-smoothed
+
+
+def plot_pr_curve(px, py, ap, names, save_dir):
+    """Save the precision-recall figure (per-class curves plus their mean) to the path `save_dir`."""
+    from matplotlib import pyplot
+
+    fig, ax = pyplot.subplots(1, 1, figsize=(9, 6), tight_layout=True)
+    curves = numpy.stack(py, axis=1)
+    if not 0 < len(names) < 21:  # no names or more than 20 classes: grey curves without legend entries
+        ax.plot(px, curves, linewidth=1, color="grey")  # plot(recall, precision)
+    else:
+        for index, curve in enumerate(curves.T):
+            ax.plot(px, curve, linewidth=1, label=f"{names[index]} {ap[index, 0]:.3f}")  # plot(recall, precision)
+
+    ax.plot(px, curves.mean(1), linewidth=3, color="blue", label="all classes %.3f mAP@0.5" % ap[:, 0].mean())
+    ax.set_xlabel("Recall")
+    ax.set_ylabel("Precision")
+    ax.set_xlim(0, 1)
+    ax.set_ylim(0, 1)
+    ax.legend(bbox_to_anchor=(1.04, 1), loc="upper left")
+    ax.set_title("Precision-Recall Curve")
+    fig.savefig(save_dir, dpi=250)
+    pyplot.close(fig)
+
+
+def plot_curve(px, py, names, save_dir, x_label="Confidence", y_label="Metric"):
+    """Save a metric-vs-confidence figure: per-class curves plus the smoothed class mean, whose peak is in the legend."""
+    from matplotlib import pyplot
+
+    figure, ax = pyplot.subplots(1, 1, figsize=(9, 6), tight_layout=True)
+    if not 0 < len(names) < 21:  # no names or more than 20 classes: grey curves without legend entries
+        ax.plot(px, py.T, linewidth=1, color="grey")  # plot(confidence, metric)
+    else:
+        for index, curve in enumerate(py):
+            ax.plot(px, curve, linewidth=1, label=f"{names[index]}")  # plot(confidence, metric)
+
+    avg = smooth(py.mean(0), f=0.05)
+    ax.plot(px, avg, linewidth=3, color="blue", label=f"all classes {avg.max():.3f} at {px[avg.argmax()]:.3f}")
+    ax.set_xlabel(x_label)
+    ax.set_ylabel(y_label)
+    ax.set_xlim(0, 1)
+    ax.set_ylim(0, 1)
+    ax.legend(bbox_to_anchor=(1.04, 1), loc="upper left")
+    ax.set_title(f"{y_label}-Confidence Curve")
+    figure.savefig(save_dir, dpi=250)
+    pyplot.close(figure)
+
+
+def compute_ap(tp, conf, output, target, plot=False, names=(), eps=1e-16):
+    """
+    Compute per-class precision/recall curves and average precision, summarised at the best-F1 confidence.
+    Source: https://github.com/rafaelpadilla/Object-Detection-Metrics.
+    # Arguments
+        tp:  True positives (nparray, nx1 or nx10).
+        conf:  Object-ness value from 0-1 (nparray).
+        output:  Predicted object classes (nparray).
+        target:  True object classes (nparray).
+    # Returns
+        (tp, fp, mean precision, mean recall, mAP@0.5, mAP over all IoU columns); tp and fp are per-class arrays.
+    """
+    # Sort by object-ness
+    i = numpy.argsort(-conf)
+    tp, conf, output = tp[i], conf[i], output[i]
+
+    # Find unique classes
+    unique_classes, nt = numpy.unique(target, return_counts=True)
+    nc = unique_classes.shape[0]  # number of classes that have labels
+
+    # Create Precision-Recall curve and compute AP for each class
+    p, r = numpy.zeros((nc, 1000)), numpy.zeros((nc, 1000))
+    ap = numpy.zeros((nc, tp.shape[1]))
+    px, py = numpy.linspace(start=0, stop=1, num=1000), []  # for plotting
+    integrate = getattr(numpy, "trapezoid", None) or numpy.trapz  # NumPy 2.0 renamed trapz to trapezoid
+    for ci, c in enumerate(unique_classes):
+        i = output == c
+        nl = nt[ci]  # number of labels
+        no = i.sum()  # number of outputs
+        if no == 0 or nl == 0:
+            continue
+
+        # Accumulate FPs and TPs
+        fpc = (1 - tp[i]).cumsum(0)
+        tpc = tp[i].cumsum(0)
+
+        # Recall
+        recall = tpc / (nl + eps)  # recall curve
+        # negative x, xp because xp decreases
+        r[ci] = numpy.interp(-px, -conf[i], recall[:, 0], left=0)
+
+        # Precision
+        precision = tpc / (tpc + fpc)  # precision curve
+        p[ci] = numpy.interp(-px, -conf[i], precision[:, 0], left=1)  # p at pr_score
+
+        # AP from recall-precision curve
+        for j in range(tp.shape[1]):
+            m_rec = numpy.concatenate(([0.0], recall[:, j], [1.0]))
+            m_pre = numpy.concatenate(([1.0], precision[:, j], [0.0]))
+
+            # Compute the precision envelope
+            m_pre = numpy.flip(numpy.maximum.accumulate(numpy.flip(m_pre)))
+
+            # Integrate area under curve
+            x = numpy.linspace(start=0, stop=1, num=101)  # 101-point interp (COCO)
+            ap[ci, j] = integrate(numpy.interp(x, m_rec, m_pre), x)  # integrate
+            if plot and j == 0:
+                py.append(numpy.interp(px, m_rec, m_pre))  # precision at mAP@0.5
+
+    # Compute F1 (harmonic mean of precision and recall)
+    f1 = 2 * p * r / (p + r + eps)
+    if plot:  # writes the PR, F1, P and R figures under ./weights/
+        names = dict(enumerate(names))  # to dict
+        names = [v for k, v in names.items() if k in unique_classes]  # list: only classes that have data
+        plot_pr_curve(px, py, ap, names, save_dir="./weights/PR_curve.png")
+        plot_curve(px, f1, names, save_dir="./weights/F1_curve.png", y_label="F1")
+        plot_curve(px, p, names, save_dir="./weights/P_curve.png", y_label="Precision")
+        plot_curve(px, r, names, save_dir="./weights/R_curve.png", y_label="Recall")
+    i = smooth(f1.mean(0), 0.1).argmax()  # max F1 index
+    p, r, f1 = p[:, i], r[:, i], f1[:, i]
+    tp = (r * nt).round()  # true positives
+    fp = (tp / (p + eps) - tp).round()  # false positives
+    ap50, ap = ap[:, 0], ap.mean(1)  # AP@0.5, AP@0.5:0.95
+    m_pre, m_rec = p.mean(), r.mean()
+    map50, mean_ap = ap50.mean(), ap.mean()
+    return tp, fp, m_pre, m_rec, map50, mean_ap
+
+
+def compute_iou(box1, box2, eps=1e-7):
+    """Return CIoU between box1 and box2 ((x1, y1, x2, y2) on the last axis); the result keeps a size-1 last axis."""
+
+    # Get the coordinates of bounding boxes
+    b1_x1, b1_y1, b1_x2, b1_y2 = box1.chunk(4, -1)
+    b2_x1, b2_y1, b2_x2, b2_y2 = box2.chunk(4, -1)
+    w1, h1 = b1_x2 - b1_x1, b1_y2 - b1_y1 + eps  # eps on the height guards the w / h ratios below
+    w2, h2 = b2_x2 - b2_x1, b2_y2 - b2_y1 + eps
+
+    # Intersection area
+    inter = (b1_x2.minimum(b2_x2) - b1_x1.maximum(b2_x1)).clamp(0) * (
+        b1_y2.minimum(b2_y2) - b1_y1.maximum(b2_y1)
+    ).clamp(0)
+
+    # Union Area
+    union = w1 * h1 + w2 * h2 - inter + eps
+
+    # IoU
+    iou = inter / union
+    cw = b1_x2.maximum(b2_x2) - b1_x1.minimum(b2_x1)  # convex (smallest enclosing box) width
+    ch = b1_y2.maximum(b2_y2) - b1_y1.minimum(b2_y1)  # convex height
+    c2 = cw**2 + ch**2 + eps  # convex diagonal squared
+    rho2 = ((b2_x1 + b2_x2 - b1_x1 - b1_x2) ** 2 + (b2_y1 + b2_y2 - b1_y1 - b1_y2) ** 2) / 4  # center dist ** 2
+    # https://github.com/Zzh-tju/DIoU-SSD-pytorch/blob/master/utils/box/box_utils.py#L47
+    v = (4 / math.pi**2) * (torch.atan(w2 / h2) - torch.atan(w1 / h1)).pow(2)  # aspect-ratio mismatch term
+    with torch.no_grad():  # alpha acts as a constant weight: no gradient flows through it
+        alpha = v / (v - iou + (1 + eps))
+    return iou - (rho2 / c2 + v * alpha)  # CIoU
+
+
+def strip_optimizer(filename):
+    """Rewrite the checkpoint at `filename` in place with its "model" cast to FP16 and every parameter frozen."""
+    checkpoint = torch.load(filename, map_location="cpu", weights_only=False)
+    for parameter in checkpoint["model"].half().parameters():  # Module.half() casts in place and returns the module
+        parameter.requires_grad_(False)
+    torch.save(checkpoint, f=filename)
+
+
+def clip_gradients(model, max_norm=10.0):
+    """Rescale the gradients of all parameters of `model` in place so that their total norm is at most `max_norm`."""
+    torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=max_norm)
+
+
+def load_weight(model, ckpt):
+    """Copy every tensor of the checkpoint's "model" whose name and shape match `model`; returns `model`."""
+    dst = model.state_dict()
+    # map_location="cpu": the weights are moved to the CPU anyway, and this lets GPU-saved files load without CUDA
+    src = torch.load(ckpt, map_location="cpu", weights_only=False)["model"].float().cpu()
+
+    matched = {k: v for k, v in src.state_dict().items() if k in dst and v.shape == dst[k].shape}
+
+    # strict=False: entries that did not match keep the values `model` already has
+    model.load_state_dict(state_dict=matched, strict=False)
+    return model
+
+
+def set_params(model, decay):
+    """Split trainable parameters into two optimizer groups: biases and norm-layer weights (no decay) and the rest."""
+    norm_types = tuple(layer for name, layer in vars(torch.nn).items() if "Norm" in name)
+    no_decay, with_decay = [], []
+    for module in model.modules():
+        # recurse=False: each parameter is visited exactly once, through the module that owns it
+        for name, param in module.named_parameters(recurse=False):
+            if param.requires_grad:
+                # biases and normalisation weights are exempt from weight decay
+                exempt = name == "bias" or (name == "weight" and isinstance(module, norm_types))
+                (no_decay if exempt else with_decay).append(param)
+    return [
+        {"params": no_decay, "weight_decay": 0.00},
+        {"params": with_decay, "weight_decay": decay},
+    ]
+
+
+def plot_lr(args, optimizer, scheduler, num_steps):
+    """Plot the learning rate of the first parameter group for every training step and save it to ./weights/lr.png."""
+    from matplotlib import pyplot
+    scheduler = copy.copy(scheduler)  # leave the caller's scheduler object untouched
+    saved, y = [group["lr"] for group in optimizer.param_groups], []
+    try:
+        for step in range(args.epochs * num_steps):
+            scheduler.step(step, optimizer)
+            y.append(optimizer.param_groups[0]["lr"])
+    finally:  # a shallow copy.copy(optimizer) would share param_groups, so the real rates are restored instead
+        for group, lr in zip(optimizer.param_groups, saved):
+            group["lr"] = lr
+    pyplot.plot(y, ".-", label="LR")
+    pyplot.xlabel("step")
+    pyplot.ylabel("LR")
+    pyplot.grid()
+    pyplot.xlim(0, args.epochs * num_steps)
+    pyplot.ylim(0)
+    pyplot.savefig("./weights/lr.png", dpi=200)
+    pyplot.close()
+
+
+class CosineLR:
+    """Per-step learning-rate table: linear warm-up from min_lr to max_lr, then cosine decay back to min_lr."""
+
+    def __init__(self, args, params, num_steps):
+        hi, lo = params["max_lr"], params["min_lr"]
+        # warm-up lasts at least 100 steps; the remaining steps of the run form the decay phase
+        warmup_steps = int(max(params["warmup_epochs"] * num_steps, 100))
+        decay_steps = int(args.epochs * num_steps - warmup_steps)
+
+        warmup_lr = numpy.linspace(lo, hi, warmup_steps)
+        decay_lr = [
+            lo + 0.5 * (hi - lo) * (1 + math.cos(math.pi * step / decay_steps))
+            for step in range(1, decay_steps + 1)
+        ]
+        self.total_lr = numpy.concatenate((warmup_lr, decay_lr))
+
+    def step(self, step, optimizer):
+        """Write the learning rate stored for `step` into every parameter group of `optimizer`."""
+        for param_group in optimizer.param_groups:
+            param_group["lr"] = self.total_lr[step]
+
+
+class LinearLR:
+    """Per-step learning-rate table: linear warm-up from min_lr to max_lr, then linear decay back to min_lr."""
+
+    def __init__(self, args, params, num_steps):
+        max_lr, min_lr = params["max_lr"], params["min_lr"]
+        # warm-up lasts at least 100 steps; shorter runs get no decay phase (linspace rejects a negative count)
+        warmup_steps = int(max(params["warmup_epochs"] * num_steps, 100))
+        decay_steps = max(int(args.epochs * num_steps - warmup_steps), 0)
+
+        warmup_lr = numpy.linspace(min_lr, max_lr, warmup_steps, endpoint=False)
+        decay_lr = numpy.linspace(max_lr, min_lr, decay_steps)
+        self.total_lr = numpy.concatenate((warmup_lr, decay_lr))
+
+    def step(self, step, optimizer):
+        for param_group in optimizer.param_groups:  # every group receives the rate stored for `step`
+            param_group["lr"] = self.total_lr[step]
+
+
+class EMA:
+    """
+    Exponential moving average of a model's floating-point state_dict entries (parameters and buffers), adapted from
+    https://github.com/rwightman/pytorch-image-models
+    For EMA details see https://www.tensorflow.org/api_docs/python/tf/train/ExponentialMovingAverage
+    """
+
+    def __init__(self, model, decay=0.9999, tau=2000, updates=0):
+        shadow = copy.deepcopy(model).eval()  # averaged copy, kept in eval mode
+        for weight in shadow.parameters():
+            weight.requires_grad_(False)
+        self.ema = shadow
+        self.updates = updates  # number of EMA updates
+        # effective decay ramps from 0 towards `decay` as updates grow (time constant `tau`), to help early epochs
+        self.decay = lambda x: decay * (1 - math.exp(-x / tau))
+
+    def update(self, model):
+        """Blend the current state of `model` (unwrapped from `.module` when present) into the average."""
+        source = model.module if hasattr(model, "module") else model
+        with torch.no_grad():
+            self.updates += 1
+            d = self.decay(self.updates)
+
+            current = source.state_dict()
+            for key, averaged in self.ema.state_dict().items():
+                if not averaged.dtype.is_floating_point:
+                    continue  # non-float entries are left as they are
+                averaged *= d
+                averaged += (1 - d) * current[key].detach()
+
+
+class AverageMeter:
+    """Running weighted mean that skips NaN samples."""
+
+    def __init__(self):
+        self.num = self.sum = self.avg = 0
+
+    def update(self, v, n):
+        if math.isnan(float(v)):
+            return  # NaN samples leave count, sum and mean untouched
+        self.num, self.sum = self.num + n, self.sum + v * n
+        self.avg = self.sum / self.num
+
+
+class Assigner(torch.nn.Module):
+    """Label assigner: each ground-truth box takes its top-k anchors ranked by score**alpha * IoU**beta."""
+
+    def __init__(self, nc=80, top_k=13, alpha=1.0, beta=6.0, eps=1e-9):
+        super().__init__()
+        self.top_k = top_k
+        self.nc = nc
+        self.alpha = alpha
+        self.beta = beta
+        self.eps = eps
+
+    @torch.no_grad()
+    def forward(self, pd_scores, pd_bboxes, anc_points, gt_labels, gt_bboxes, mask_gt):
+        """Return (target_bboxes, target_scores, fg_mask) per anchor; fg_mask marks anchors that received a target."""
+        batch_size = pd_scores.size(0)
+        num_max_boxes = gt_bboxes.size(1)
+
+        if num_max_boxes == 0:  # no labels in the whole batch: all-zero targets and mask
+            empty = (pd_bboxes, pd_scores, pd_scores[..., 0])
+            return tuple(torch.zeros_like(t).to(gt_bboxes.device) for t in empty)
+
+        num_anchors = anc_points.shape[0]
+        shape = gt_bboxes.shape
+        lt, rb = gt_bboxes.view(-1, 1, 4).chunk(2, 2)
+        mask_in_gts = torch.cat((anc_points[None] - lt, rb - anc_points[None]), dim=2)
+        mask_in_gts = mask_in_gts.view(shape[0], shape[1], num_anchors, -1).amin(3).gt_(self.eps)
+        na = pd_bboxes.shape[-2]
+        gt_mask = (mask_in_gts * mask_gt).bool()  # b, max_num_obj, h*w
+        overlaps = torch.zeros([batch_size, num_max_boxes, na], dtype=pd_bboxes.dtype, device=pd_bboxes.device)
+        bbox_scores = torch.zeros([batch_size, num_max_boxes, na], dtype=pd_scores.dtype, device=pd_scores.device)
+
+        # index tensors (2, b, max_num_obj) live on pd_scores' device so the lookup below needs no cross-device copy
+        ind = torch.zeros([2, batch_size, num_max_boxes], dtype=torch.long, device=pd_scores.device)
+        ind[0] = torch.arange(end=batch_size, device=ind.device).view(-1, 1).expand(-1, num_max_boxes)
+        ind[1] = gt_labels.squeeze(-1)  # b, max_num_obj
+        bbox_scores[gt_mask] = pd_scores[ind[0], :, ind[1]][gt_mask]  # b, max_num_obj, h*w
+
+        pd_boxes = pd_bboxes.unsqueeze(1).expand(-1, num_max_boxes, -1, -1)[gt_mask]
+        gt_boxes = gt_bboxes.unsqueeze(2).expand(-1, -1, na, -1)[gt_mask]
+        overlaps[gt_mask] = compute_iou(gt_boxes, pd_boxes).squeeze(-1).clamp_(0)
+
+        align_metric = bbox_scores.pow(self.alpha) * overlaps.pow(self.beta)
+
+        # rows masked out by mask_gt get index 0 for all k picks; mask_gt also zeroes them again in mask_pos
+        top_k_mask = mask_gt.expand(-1, -1, self.top_k).bool()
+        top_k_indices = torch.topk(align_metric, self.top_k, dim=-1, largest=True).indices
+        top_k_indices.masked_fill_(~top_k_mask, 0)
+
+        mask_top_k = torch.zeros(align_metric.shape, dtype=torch.int8, device=top_k_indices.device)
+        ones = torch.ones_like(top_k_indices[:, :, :1], dtype=torch.int8, device=top_k_indices.device)
+        for k in range(self.top_k):
+            mask_top_k.scatter_add_(-1, top_k_indices[:, :, k : k + 1], ones)
+        mask_top_k.masked_fill_(mask_top_k > 1, 0)  # counts above 1 only arise from the index-0 filler above
+        mask_top_k = mask_top_k.to(align_metric.dtype)
+        mask_pos = mask_top_k * mask_in_gts * mask_gt
+
+        fg_mask = mask_pos.sum(-2)
+        # an anchor claimed by several ground truths keeps only the one with the highest IoU
+        if fg_mask.max() > 1:
+            mask_multi_gts = (fg_mask.unsqueeze(1) > 1).expand(-1, num_max_boxes, -1)
+            max_overlaps_idx = overlaps.argmax(1)
+
+            is_max_overlaps = torch.zeros(mask_pos.shape, dtype=mask_pos.dtype, device=mask_pos.device)
+            is_max_overlaps.scatter_(1, max_overlaps_idx.unsqueeze(1), 1)
+
+            mask_pos = torch.where(mask_multi_gts, is_max_overlaps, mask_pos).float()
+            fg_mask = mask_pos.sum(-2)
+        target_gt_idx = mask_pos.argmax(-2)
+
+        # Assigned target
+        index = torch.arange(end=batch_size, dtype=torch.int64, device=gt_labels.device)[..., None]
+        target_index = target_gt_idx + index * num_max_boxes
+        target_labels = gt_labels.long().flatten()[target_index]
+
+        target_bboxes = gt_bboxes.view(-1, gt_bboxes.shape[-1])[target_index]
+
+        # Assigned target scores
+        target_labels.clamp_(0)
+
+        target_scores = torch.zeros(
+            (target_labels.shape[0], target_labels.shape[1], self.nc), dtype=torch.int64, device=target_labels.device
+        )
+        target_scores.scatter_(2, target_labels.unsqueeze(-1), 1)
+
+        fg_scores_mask = fg_mask[:, :, None].repeat(1, 1, self.nc)
+        target_scores = torch.where(fg_scores_mask > 0, target_scores, 0)
+
+        # Normalize
+        align_metric *= mask_pos
+        pos_align_metrics = align_metric.amax(dim=-1, keepdim=True)
+        pos_overlaps = (overlaps * mask_pos).amax(dim=-1, keepdim=True)
+        norm_align_metric = (align_metric * pos_overlaps / (pos_align_metrics + self.eps)).amax(-2).unsqueeze(-1)
+        target_scores = target_scores * norm_align_metric
+
+        return target_bboxes, target_scores, fg_mask.bool()
+
+
+class QFL(torch.nn.Module):
+    """Element-wise BCE-with-logits scaled by |target - sigmoid(output)| ** beta (no reduction)."""
+
+    def __init__(self, beta=2.0):
+        super().__init__()
+        self.beta, self.bce_loss = beta, torch.nn.BCEWithLogitsLoss(reduction="none")
+
+    def forward(self, outputs, targets):
+        return (targets - outputs.sigmoid()).abs().pow(self.beta) * self.bce_loss(outputs, targets)
+
+
+class VFL(torch.nn.Module):
+    """Element-wise BCE-with-logits multiplied by a per-element weight (no reduction).
+
+    Elements with target > 0 are weighted by the target itself (or by 1 when `iou_weighted` is False); all
+    other elements are weighted by alpha * |sigmoid(output) - target| ** gamma.
+    """
+
+    def __init__(self, alpha=0.75, gamma=2.00, iou_weighted=True):
+        super().__init__()
+        assert alpha >= 0.0
+        self.alpha = alpha
+        self.gamma = gamma
+        self.iou_weighted = iou_weighted
+        self.bce_loss = torch.nn.BCEWithLogitsLoss(reduction="none")
+
+    def forward(self, outputs, targets):
+        assert outputs.size() == targets.size()
+        targets = targets.type_as(outputs)
+        positive = (targets > 0.0).float()
+        negative = (targets <= 0.0).float()
+
+        negative_weight = self.alpha * (outputs.sigmoid() - targets).abs().pow(self.gamma) * negative
+        positive_weight = targets * positive if self.iou_weighted else positive
+
+        return self.bce_loss(outputs, targets) * (positive_weight + negative_weight)
+
+
+class FocalLoss(torch.nn.Module):
+    """Element-wise focal loss on logits (no reduction); alpha <= 0 or gamma <= 0 switches that factor off."""
+
+    def __init__(self, alpha=0.25, gamma=1.5):
+        super().__init__()
+        self.alpha = alpha
+        self.gamma = gamma
+        self.bce_loss = torch.nn.BCEWithLogitsLoss(reduction="none")
+
+    def forward(self, outputs, targets):
+        loss = self.bce_loss(outputs, targets)
+        background = 1 - targets
+        if self.alpha > 0:  # balance factor: alpha for targets, 1 - alpha for the complement
+            loss *= targets * self.alpha + background * (1 - self.alpha)
+
+        if self.gamma > 0:  # modulating factor (1 - p_t) ** gamma
+            probability = outputs.sigmoid()
+            p_t = targets * probability + background * (1 - probability)
+            loss *= (1.0 - p_t) ** self.gamma
+
+        return loss
+
+
+class BoxLoss(torch.nn.Module):
+    """Box regression loss (1 - compute_iou) plus distribution focal loss (DFL), both over foreground anchors."""
+
+    def __init__(self, dfl_ch):
+        super().__init__()
+        self.dfl_ch = dfl_ch
+
+    def forward(self, pred_dist, pred_bboxes, anchor_points, target_bboxes, target_scores, target_scores_sum, fg_mask):
+        # every foreground anchor is weighted by the sum of its target scores
+        weight = target_scores.sum(-1).masked_select(fg_mask).unsqueeze(-1)
+
+        # IoU loss
+        overlap = compute_iou(pred_bboxes[fg_mask], target_bboxes[fg_mask])
+        loss_box = ((1.0 - overlap) * weight).sum() / target_scores_sum
+
+        # DFL loss: targets are the anchor-to-side distances, clamped to [0, dfl_ch - 0.01]
+        first_corner, second_corner = target_bboxes.chunk(2, -1)
+        distances = torch.cat((anchor_points - first_corner, second_corner - anchor_points), -1)
+        distances = distances.clamp(0, self.dfl_ch - 0.01)
+        per_anchor = self.df_loss(pred_dist[fg_mask].view(-1, self.dfl_ch + 1), distances[fg_mask])
+        return loss_box, (per_anchor * weight).sum() / target_scores_sum
+
+    @staticmethod
+    def df_loss(pred_dist, target):
+        # Distribution Focal Loss (DFL)
+        # https://ieeexplore.ieee.org/document/9792391
+        lo = target.long()  # integer bin at or below the target
+        hi = lo + 1  # next bin up
+        lo_weight = hi - target  # the closer bin gets the larger weight
+        ce = [cross_entropy(pred_dist, b.view(-1), reduction="none").view(lo.shape) for b in (lo, hi)]
+        return (ce[0] * lo_weight + ce[1] * (1 - lo_weight)).mean(-1, keepdim=True)
+
+
+class ComputeLoss:
+    """Box, classification and DFL loss for the raw outputs of the model's detection head (`model.head`)."""
+
+    def __init__(self, model, params):
+        if hasattr(model, "module"):  # unwrap a wrapper that exposes the real model as `.module`
+            model = model.module
+        device = next(model.parameters()).device
+        m = model.head  # Head() module
+
+        self.params = params  # loss gains, read as params["box"], params["cls"], params["dfl"]
+        self.stride = m.stride
+        self.nc = m.nc
+        self.no = m.no
+        self.reg_max = m.ch  # distribution bins per box side (pred_distri carries reg_max * 4 channels)
+        self.device = device
+
+        self.box_loss = BoxLoss(m.ch - 1).to(device)  # BoxLoss takes the highest bin index
+        self.cls_loss = torch.nn.BCEWithLogitsLoss(reduction="none")
+        self.assigner = Assigner(nc=self.nc, top_k=10, alpha=0.5, beta=6.0)
+
+        self.project = torch.arange(m.ch, dtype=torch.float, device=device)  # bin indices 0..ch-1 for box_decode
+
+    def box_decode(self, anchor_points, pred_dist):
+        b, a, c = pred_dist.shape
+        pred_dist = pred_dist.view(b, a, 4, c // 4)  # one distribution per box side
+        pred_dist = pred_dist.softmax(3)
+        pred_dist = pred_dist.matmul(self.project.type(pred_dist.dtype))  # expected bin index = predicted distance
+        lt, rb = pred_dist.chunk(2, -1)
+        x1y1 = anchor_points - lt
+        x2y2 = anchor_points + rb
+        return torch.cat(tensors=(x1y1, x2y2), dim=-1)  # (x1, y1, x2, y2) in the units of anchor_points
+
+    def __call__(self, outputs, targets):
+        """Return (loss_box, loss_cls, loss_dfl) for head `outputs` and a `targets` dict with "idx", "cls", "box"."""
+        x = torch.cat([i.view(outputs[0].shape[0], self.no, -1) for i in outputs], dim=2)  # join levels along anchors
+        pred_distri, pred_scores = x.split(split_size=(self.reg_max * 4, self.nc), dim=1)
+        pred_scores = pred_scores.permute(0, 2, 1).contiguous()
+        pred_distri = pred_distri.permute(0, 2, 1).contiguous()
+
+        data_type = pred_scores.dtype
+        batch_size = pred_scores.shape[0]
+        input_size = torch.tensor(outputs[0].shape[2:], device=self.device, dtype=data_type) * self.stride[0]  # (h, w)
+        anchor_points, stride_tensor = make_anchors(outputs, self.stride, offset=0.5)
+
+        idx = targets["idx"].view(-1, 1)  # image index of every label within the batch
+        cls = targets["cls"].view(-1, 1)
+        box = targets["box"]  # presumably normalised (cx, cy, w, h), given the scaling below — TODO confirm
+
+        targets = torch.cat((idx, cls, box), dim=1).to(self.device)
+        if targets.shape[0] == 0:
+            gt = torch.zeros(batch_size, 0, 5, device=self.device)
+        else:
+            i = targets[:, 0]
+            _, counts = i.unique(return_counts=True)
+            counts = counts.to(dtype=torch.int32)
+            gt = torch.zeros(batch_size, counts.max(), 5, device=self.device)  # padded to the largest label count
+            for j in range(batch_size):
+                matches = i == j
+                n = matches.sum()
+                if n:
+                    gt[j, :n] = targets[matches, 1:]
+            x = gt[..., 1:5].mul_(input_size[[1, 0, 1, 0]])  # scale boxes by (w, h, w, h) in place
+            y = torch.empty_like(x)
+            dw = x[..., 2] / 2  # half-width
+            dh = x[..., 3] / 2  # half-height
+            y[..., 0] = x[..., 0] - dw  # top left x
+            y[..., 1] = x[..., 1] - dh  # top left y
+            y[..., 2] = x[..., 0] + dw  # bottom right x
+            y[..., 3] = x[..., 1] + dh  # bottom right y
+            gt[..., 1:5] = y
+        gt_labels, gt_bboxes = gt.split((1, 4), 2)
+        mask_gt = gt_bboxes.sum(2, keepdim=True).gt_(0)  # 1 for rows with a positive box sum, 0 for padding rows
+
+        pred_bboxes = self.box_decode(anchor_points, pred_distri)
+        assigned_targets = self.assigner(
+            pred_scores.detach().sigmoid(),
+            (pred_bboxes.detach() * stride_tensor).type(gt_bboxes.dtype),  # decoded boxes scaled by their stride
+            anchor_points * stride_tensor,
+            gt_labels,
+            gt_bboxes,
+            mask_gt,
+        )
+        target_bboxes, target_scores, fg_mask = assigned_targets
+
+        target_scores_sum = max(target_scores.sum(), 1)  # shared normaliser, floored at 1
+
+        loss_cls = self.cls_loss(pred_scores, target_scores.to(data_type)).sum() / target_scores_sum  # BCE
+
+        # Box loss
+        loss_box = torch.zeros(1, device=self.device)
+        loss_dfl = torch.zeros(1, device=self.device)
+        if fg_mask.sum():  # box and DFL terms stay zero when no anchor received a target
+            target_bboxes /= stride_tensor  # in place: back to the scale of pred_bboxes
+            loss_box, loss_dfl = self.box_loss(
+                pred_distri, pred_bboxes, anchor_points, target_bboxes, target_scores, target_scores_sum, fg_mask
+            )
+
+        loss_box *= self.params["box"]  # box gain
+        loss_cls *= self.params["cls"]  # cls gain
+        loss_dfl *= self.params["dfl"]  # dfl gain
+
+        return loss_box, loss_cls, loss_dfl