1. Python调用
模型预测:
def segment_predict():
    """Run the YOLO11 segmentation model on the sample image and save the results.

    Loads the weights from ``../weights/segment/yolo11n-seg.pt``, prints the
    model, runs it on ``../ultralytics/assets/bus.jpg`` and saves every result
    into that result's ``save_dir`` under the source image's file name.

    Paths are handled with ``os.path`` so the function works on any platform,
    not only where the path separator is a backslash.
    """
    model = YOLO('../weights/segment/yolo11n-seg.pt')
    print(model)
    results = model('../ultralytics/assets/bus.jpg')
    for result in results:
        # exist_ok avoids the check-then-create race and is a no-op once the
        # directory exists; doing it per result also tolerates an empty list.
        os.makedirs(result.save_dir, exist_ok=True)
        # basename() strips the directory part using the platform's own
        # separator rules instead of assuming "\\".
        filename = os.path.basename(result.path)
        result.save(os.path.join(result.save_dir, filename))
运行结果:

模型训练,同样基于coco8数据集(此处使用其分割版本coco8-seg)
# Build the segmentation network from its YAML definition, then transfer pretrained weights into it
model = YOLO("yolo11n-seg.yaml")
model = model.load("yolo11n.pt")

# Train on the dataset described by coco8-seg.yaml
train_args = {"data": "coco8-seg.yaml", "epochs": 100, "imgsz": 640, "device": "0"}
results = model.train(**train_args)

# Evaluate the trained model on the validation set
metrics = model.val()

# Run the trained model on a sample image and display the first result
results = model("../ultralytics/assets/bus.jpg")
results[0].show()

# Export the model to ONNX format; export() returns the path to the exported model
path = model.export(format="onnx")
2. 网络结构图
分割的网络结构图主体部分与检测部分相同,只是Detect部分增加了Proto与Pred_mask而已。

图2-1 yolo11-segment网络结构图
3. 损失函数
3.1 损失函数定位
与Detection部分相同,训练入口都是调用BaseTrainer中的train函数;不同之处在于,分割的训练类与模型类均继承自Detection中对应的类,且损失由v8SegmentationLoss类实现(损失函数以类的方式定义)。损失函数的调用走向如图3-1所示。

图3-1 yolo11-segment的损失函数定位
3.2 损失函数具体分析
(1) 前向推理与构造anchor
segment的前向推理包括detection的feats,segment的pred_mask和proto,具体代码如下所示。其中anchor的构建如目标检测中的图3-1所示,分成三个尺度,倍率分别是8、16、32。
# Fragment of the segmentation loss: unpack the head outputs and build the anchors.
# Shapes quoted in the comments are the article's example values (batch size 4, 640x640 input).
loss = torch.zeros(4, device=self.device) # four slots, scaled further below as: [0] box, [1] seg, [2] cls, [3] dfl
# feats: 4x144x80x80, 4x144x40x40, 4x144x20x20; pred_masks: 4x32x8400; proto: 4x32x160x160
feats, pred_masks, proto = preds if len(preds) == 3 else preds[1]
batch_size, _, mask_h, mask_w = proto.shape # batch size, number of masks, mask height, mask width
# Flatten every feature level to (B, self.no, -1), concatenate along the last axis,
# then split the channel axis into reg_max*4 distribution channels and nc class channels.
pred_distri, pred_scores = torch.cat([xi.view(feats[0].shape[0], self.no, -1) for xi in feats], 2).split(
(self.reg_max * 4, self.nc), 1) # pred_distri: 4x64x8400; pred_scores: 4x80x8400
# Reorder to (B, anchors, channels)
pred_scores = pred_scores.permute(0, 2, 1).contiguous() # 4x8400x80
pred_distri = pred_distri.permute(0, 2, 1).contiguous() # 4x8400x64
pred_masks = pred_masks.permute(0, 2, 1).contiguous() # 4x8400x32
dtype = pred_scores.dtype
# Image size is recovered from the first feature map's spatial size times its stride
imgsz = torch.tensor(feats[0].shape[2:], device=self.device, dtype=dtype) * self.stride[0] # image size (h,w)
anchor_points, stride_tensor = make_anchors(feats, self.stride, 0.5) # anchor_points: 8400x2; stride_tensor: 8400x1
(2) 框构建与任务分配器
与detection相同,框的构建包括GT框与预测框的构建。
# Targets
# Shapes quoted in the comments are the article's example values (batch size 4, 22 labels in the batch).
try:
batch_idx = batch["batch_idx"].view(-1, 1)
targets = torch.cat((batch_idx, batch["cls"].view(-1, 1), batch["bboxes"]), 1) # targets: 22x6
# After preprocess, targets: 4x7x5, where 7 is the largest number of boxes among the 4 images and 5 is cls + bbox
targets = self.preprocess(targets.to(self.device), batch_size, scale_tensor=imgsz[[1, 0, 1, 0]])
gt_labels, gt_bboxes = targets.split((1, 4), 2) # cls, xyxy; split targets into gt_labels: 4x7x1 and gt_bboxes: 4x7x4
mask_gt = gt_bboxes.sum(2, keepdim=True).gt_(0.0) # mask_gt: 4x7x1 binary matrix (1 where the box coordinates sum to > 0)
except RuntimeError as e:
raise TypeError("ERROR") from e
# Pboxes
# pred_bboxes: 4x8400x4, anchor_points: 8400x2, pred_distri: 4x8400x64; per the article, pred_distri encodes the
# distances from each anchor point to the box's top-left and bottom-right corners
pred_bboxes = self.bbox_decode(anchor_points, pred_distri) # xyxy, (b, h*w, 4)
# target_bboxes: 4x8400x4, target_scores: 4x8400x80, fg_mask: 4x8400, target_gt_idx: 4x8400
_, target_bboxes, target_scores, fg_mask, target_gt_idx = self.assigner(
pred_scores.detach().sigmoid(), # pred_scores: 4x8400x80
(pred_bboxes.detach() * stride_tensor).type(gt_bboxes.dtype), # pred_bboxes: 4x8400x4, scaled by stride back to full image size
anchor_points * stride_tensor, # anchor_points scaled by stride to full image size
gt_labels, # gt_labels: 4x7x1 class indices
gt_bboxes, # gt_bboxes: 4x7x4 ground-truth box coordinates
mask_gt, # mask_gt: 4x7x1 binary matrix marking slots that hold a real object
)
# Clamp the normalizer to at least 1 so later divisions never divide by zero
target_scores_sum = max(target_scores.sum(), 1)

其中assigner与目标检测的任务分配器一样,调用TaskAlignedAssigner类。
(3) 损失计算
ultralytics-yolo11分割的损失计算包括四部分:box loss、cls loss、dfl loss以及seg loss。其中box loss、cls loss和dfl loss与目标检测部分相同,具体可参考博文《ultralytics-YOLOv11的目标检测解析》;主要增加的部分是seg loss,损失计算如下:
# Loss computation fragment. Shapes quoted in the comments are the article's example values (batch size 4).
target_scores_sum = max(target_scores.sum(), 1) # target_scores: 4x8400x80; clamped to >= 1 to avoid division by zero
# Cls loss
# loss[1] = self.varifocal_loss(pred_scores, target_scores, target_labels) / target_scores_sum # VFL way
loss[2] = self.bce(pred_scores, target_scores.to(dtype)).sum() / target_scores_sum # BCE; per the article self.bce yields 4x8400x80 before .sum()
if fg_mask.sum(): # fg_mask: 4x8400; only compute box/mask losses when at least one anchor is foreground
# Bbox loss
loss[0], loss[3] = self.bbox_loss( # returns loss_iou and loss_dfl
pred_distri, # 4x8400x64
pred_bboxes, # 4x8400x4
anchor_points, # 8400x2
target_bboxes / stride_tensor, # target_bboxes: 4x8400x4, stride_tensor: 8400x1
target_scores, # 4x8400x80
target_scores_sum, # scalar normalizer
fg_mask, # 4x8400
)
# Masks loss
masks = batch["masks"].to(self.device).float() # masks: 4x160x160
if tuple(masks.shape[-2:]) != (mask_h, mask_w): # downsample
masks = F.interpolate(masks[None], (mask_h, mask_w), mode="nearest")[0]
loss[1] = self.calculate_segmentation_loss( # segmentation loss
fg_mask, masks, target_gt_idx, target_bboxes, batch_idx, proto, pred_masks, imgsz, self.overlap
)
# WARNING: lines below prevent Multi-GPU DDP 'unused gradient' PyTorch errors, do not remove
else:
loss[1] += (proto * 0).sum() + (pred_masks * 0).sum() # inf sums may lead to nan loss
loss[0] *= self.hyp.box # box gain (7.5 per the article)
loss[1] *= self.hyp.box # seg loss reuses the box gain (7.5 per the article)
loss[2] *= self.hyp.cls # cls gain (0.5 per the article)
loss[3] *= self.hyp.dfl # dfl gain (1.5 per the article)
return loss.sum() * batch_size, loss.detach() # loss(box, seg, cls, dfl)
相对于目标检测,分割新增的损失计算函数为calculate_segmentation_loss,该函数的输入说明如下:
def calculate_segmentation_loss(self, fg_mask, masks, target_gt_idx, target_bboxes, batch_idx, proto, pred_masks, imgsz, overlap):
    """Compute the instance-segmentation loss over a batch.

    Shapes in parentheses after "e.g." are the article's example values for a
    batch of 4 images at 640x640 input.

    Args:
        fg_mask (torch.Tensor): Binary tensor of shape (BS, N_anchors) marking
            the positive (foreground) anchors, e.g. 4x8400.
        masks (torch.Tensor): Ground-truth masks. With ``overlap`` true, each
            image's (H, W) mask is compared against ``mask_idx + 1``, i.e. it is
            read as an index map in which value ``k + 1`` marks object ``k``
            (e.g. 4x160x160). With ``overlap`` false, ``masks`` is selected with
            a boolean built from ``batch_idx``, i.e. it holds one mask per label.
        target_gt_idx (torch.Tensor): Index of the assigned ground-truth object
            for every anchor, shape (BS, N_anchors), e.g. 4x8400.
        target_bboxes (torch.Tensor): Assigned ground-truth box for every
            anchor, shape (BS, N_anchors, 4), e.g. 4x8400x4.
        batch_idx (torch.Tensor): Image index of every label in the batch,
            shape (N_labels_in_batch, 1), e.g. 22x1 with values in 0..3.
        proto (torch.Tensor): Prototype masks of shape (BS, 32, H, W),
            e.g. 4x32x160x160.
        pred_masks (torch.Tensor): Predicted mask coefficients for every
            anchor, shape (BS, N_anchors, 32), e.g. 4x8400x32.
        imgsz (torch.Tensor): Input image size as a tensor of shape (2), i.e.
            (H, W), e.g. 640x640.
        overlap (bool): Whether ``masks`` uses the overlapping (index-map)
            encoding described above.

    Returns:
        The per-image mask losses summed over the batch and divided by the
        total number of foreground anchors (``fg_mask.sum()``).
    """
    _, _, mask_h, mask_w = proto.shape

    # Boxes normalized to 0-1 by dividing by (w, h, w, h) of the input image
    boxes_norm = target_bboxes / imgsz[[1, 0, 1, 0]]
    # Width * height of every normalized box
    box_areas = xyxy2xywh(boxes_norm)[..., 2:].prod(2)
    # Normalized boxes scaled to the prototype-mask resolution
    mask_scale = torch.tensor([mask_w, mask_h, mask_w, mask_h], device=proto.device)
    boxes_on_mask = boxes_norm * mask_scale

    total = 0
    per_image = zip(fg_mask, target_gt_idx, pred_masks, proto, boxes_on_mask, box_areas, masks)
    for i, (fg_i, gt_idx_i, coeff_i, proto_i, box_i, area_i, gt_i) in enumerate(per_image):
        if not fg_i.any():
            # WARNING: this keeps proto/pred_masks in the graph to prevent Multi-GPU DDP
            # 'unused gradient' PyTorch errors, do not remove
            total += (proto * 0).sum() + (pred_masks * 0).sum()  # inf sums may lead to nan loss
            continue

        # Ground-truth object index for each foreground anchor of this image
        mask_idx = gt_idx_i[fg_i]
        if overlap:
            # (n, 1, 1) ids broadcast against the (H, W) index map -> (n, H, W) binary masks
            gt_mask = (gt_i == (mask_idx + 1).view(-1, 1, 1)).float()
        else:
            gt_mask = masks[batch_idx.view(-1) == i][mask_idx]

        # Mask loss of a single image, restricted to its foreground anchors
        total += self.single_mask_loss(gt_mask, coeff_i[fg_i], proto_i, box_i[fg_i], area_i[fg_i])

    return total / fg_mask.sum()
单幅图像mask loss计算参数说明:
def single_mask_loss(gt_mask, pred, proto, xyxy, area):
    """Compute the mask loss for the foreground anchors of one image.

    Shapes in parentheses after "e.g." are the article's example values, where
    40 is the number of foreground anchors (objects) in the image.

    Args:
        gt_mask (torch.Tensor): Ground-truth masks of shape (n, H, W),
            e.g. 40x160x160.
        pred (torch.Tensor): Predicted mask coefficients of shape (n, 32),
            e.g. 40x32.
        proto (torch.Tensor): Prototype masks of one image, shape (32, H, W),
            e.g. 32x160x160.
        xyxy (torch.Tensor): Ground-truth boxes in xyxy format, shape (n, 4).
            As passed by ``calculate_segmentation_loss`` these are the
            normalized boxes scaled to the prototype-mask resolution.
        area (torch.Tensor): Area of each ground-truth box, shape (n,). As
            passed by ``calculate_segmentation_loss`` this is the area of the
            normalized box.

    Returns:
        The sum over objects of the cropped per-pixel BCE loss averaged over
        (H, W) and divided by the object's box area.
    """
    # Linear combination of prototypes: (n, 32) @ (32, H, W) -> (n, H, W) mask logits
    mask_logits = torch.einsum("in,nhw->ihw", pred, proto)
    # Element-wise BCE-with-logits, kept unreduced: (n, H, W)
    pixel_loss = F.binary_cross_entropy_with_logits(mask_logits, gt_mask, reduction="none")
    # crop_mask restricts the loss using each object's box (presumably zeroing
    # pixels outside it - confirm against the helper); then average over H and W
    object_loss = crop_mask(pixel_loss, xyxy).mean(dim=(1, 2))
    return (object_loss / area).sum()
其中函数torch.einsum的示例如下:


1万+

被折叠的 条评论
为什么被折叠?



