yolov7损失函数源码解析（一句一句解析）

下面是函数 find_3_positive的详细解析。

本博客bs为2，即batch_size=2。采用的数据集不同于coco的80类（对应输出通道数80+5=85），我们采用10类（对应输出通道数10+5=15）。

函数find_3_positive的参数解析：       
参数p为模型输出的预测值，共3层，size=[(bs,3,80,80,15),(bs,3,40,40,15),(bs,3,20,20,15)]，最后一维15=4(bbox)+1(obj)+10(类别数)；targets的size=(nt,6)，即(标注的数量,6)，其中6列的含义为(image_index, cls_id, x, y, w, h)。

def find_3_positive(self, p, targets):
    """Select candidate positive grid cells for every ground-truth box, per layer.

    For each prediction layer, every target is paired with each anchor whose
    width/height ratio to the target is below ``self.hyp['anchor_t']``. Each
    surviving (target, anchor) pair is then assigned to the grid cell that
    contains the box centre plus the neighbouring cells (left/right and
    up/down) that the centre is closest to.

    Args:
        p: Sequence of per-layer prediction tensors. Only ``p[i].shape`` is
            read; dims 2 and 3 are taken as the grid height and width.
        targets: Tensor of shape ``(nt, 6)`` with columns
            ``(image_index, class_id, x, y, w, h)``; the box columns are
            scaled by the grid size, so they are expected to be normalised
            to 0-1.

    Returns:
        ``(indices, anch)``, two lists with one entry per layer:
        ``indices[i]`` is a tuple ``(b, a, gj, gi)`` of 1-D long tensors
        (image index, anchor index, grid row, grid column) and ``anch[i]``
        is the ``(n, 2)`` tensor of the anchors selected by ``a``.
    """
    na, nt = self.na, targets.shape[0]  # number of anchors, number of targets
    indices, anch = [], []

    # Per-column scale factors; columns 2..5 (x, y, w, h) are overwritten
    # with the grid size of each layer inside the loop.
    gain = torch.ones(7, device=targets.device).long()

    # Anchor index for every (anchor, target) pair, shape (na, nt):
    # [[0, 0, ...], [1, 1, ...], [2, 2, ...]] when na == 3.
    ai = torch.arange(na, device=targets.device).float().view(na, 1).repeat(1, nt)

    # Repeat the targets once per anchor and append the anchor index as a
    # 7th column -> shape (na, nt, 7).
    targets = torch.cat((targets.repeat(na, 1, 1), ai[:, :, None]), 2)

    g = 0.5  # a neighbour is used when the centre lies within g of its edge
    # Offsets for the centre cell and its 4 direct neighbours, shape (5, 2).
    off = torch.tensor([[0, 0],
                        [1, 0], [0, 1], [-1, 0], [0, -1],  # j, k, l, m
                        # [1, 1], [1, -1], [-1, 1], [-1, -1],  # jk, jm, lk, lm
                        ], device=targets.device).float() * g

    for i in range(self.nl):
        anchors = self.anchors[i]  # indexed below as (na, 2) anchor sizes
        # [W, H, W, H] of this layer's grid, e.g. [80, 80, 80, 80].
        gain[2:6] = torch.tensor(p[i].shape)[[3, 2, 3, 2]]

        # Scale the box columns to grid units; shape stays (na, nt, 7).
        t = targets * gain

        if nt:
            # Ratio between target wh and anchor wh, shape (na, nt, 2).
            r = t[:, :, 4:6] / anchors[:, None]
            # Keep a pair only if the worse of (r, 1/r) over w and h is
            # BELOW anchor_t, i.e. the shapes are close enough.
            j = torch.max(r, 1. / r).max(2)[0] < self.hyp['anchor_t']
            # j = wh_iou(anchors, t[:, 4:6]) > model.hyp['iou_t']
            # iou(3,n) = wh_iou(anchors(3,2), gwh(n,2))
            t = t[j]  # (n_kept, 7)

            gxy = t[:, 2:4]  # centre measured from the top-left of the grid
            gxi = gain[[2, 3]] - gxy  # centre measured from the bottom-right
            # j/k: centre is in the left/top half of its cell; l/m: right/
            # bottom half. The `> 1.` guard skips cells on the grid border,
            # which have no neighbour on that side.
            j, k = ((gxy % 1. < g) & (gxy > 1.)).T
            l, m = ((gxi % 1. < g) & (gxi > 1.)).T
            # Row 0 always selects the centre cell -> mask of shape (5, n_kept).
            j = torch.stack((torch.ones_like(j), j, k, l, m))
            # Normally at most one of j/l and one of k/m is set, so each
            # target yields at most 3 cells; border targets yield fewer,
            # which is why the total can be below 3 * n_kept.
            t = t.repeat((5, 1, 1))[j]
            # Offset that matches each selected (direction, target) entry.
            offsets = (torch.zeros_like(gxy)[None] + off[:, None])[j]
        else:
            t = targets[0]
            offsets = 0

        b, c = t[:, :2].long().T  # image index, class id (c is unused here)
        gxy = t[:, 2:4]  # box centre in grid units
        gwh = t[:, 4:6]  # box size in grid units (unused here)
        # Subtracting the offset and truncating moves the centre into the
        # chosen neighbouring cell.
        gij = (gxy - offsets).long()
        gi, gj = gij.T  # grid column (x) and grid row (y)

        a = t[:, 6].long()  # anchor index of every entry
        # Clamp in place so neighbour cells never fall outside the grid.
        indices.append((b, a, gj.clamp_(0, gain[3] - 1), gi.clamp_(0, gain[2] - 1)))
        anch.append(anchors[a])  # anchor size for every entry

    return indices, anch


def build_targets(self, p, targets, imgs):
    
    #indices, anch = self.find_positive(p, targets)    
    ###上述函数的返回值。indices为（image_index+anchor_index+yx,final_filter_nt）和anch（final_filter_nt,anchors），这里yx是grid网格的坐标。
    indices, anch = self.find_3_positive(p, targets)
    #indices, anch = self.find_4_positive(p, targets)
    #indices, anch = self.find_5_positive(p, targets)
    #indices, anch = self.find_9_positive(p, targets)

    matching_bs = [[] for pp in p]
    matching_as = [[] for pp in p]
    matching_gjs = [[] for pp in p]
    matching_gis = [[] for pp in p]
    matching_targets = [[] for pp in p]
    matching_anchs = [[] for pp in p]
    
    nl = len(p)    # nl（number layer）= 3，预测特征层的层数。

    ###这里batch_size=2，所以batch_idx的范围为0，1。
    for batch_idx in range(p[0].shape[0]):
        ###判断是否image_index是否为0,返回维度(gt,),例如 (156,)。
        b_idx = targets[:, 0]==batch_idx
        ###根据b_idx进行筛选，得到图片0(这里batch_size=2,就图片0和1)的目标框数量，图片0的维度是（也就是this_target），(gt,image_index+class+bbox)，这里是（93，6），那么后面图片1就是（63，6）。
        this_target = targets[b_idx]  #(93,6)
        ###判断该图片是不是没有目标框
        if this_target.shape[0] == 0:
            continue
        ###这里对gt框归一化的xywh乘以imgs[batch_idx].shape[1]（即输入图片的尺寸，例如640，而不是特征图尺寸80），得到输入图片尺度下gt框的xywh。
        txywh = this_target[:, 2:6] * imgs[batch_idx].shape[1]
        ###通过函数xywh2xyxy，得到gt框左上角和右下角的xy。
        txyxy = xywh2xyxy(txywh)
        pxyxys = []
        p_cls = []
        p_obj = []
        from_which_layer = []
        all_b = []
        all_a = []
        all_gj = []
        all_gi = []
        all_anch = []
        
        ###预测层p的维度为:(batch_size,3,80,80,15),(batch_size,3,40,40,15),(batch_size,3,20,20,15)。
        for i, pi in enumerate(p):
            ###将函数的返回值进行赋值b，a,gj,gi的维度(645,)
            b, a, gj, gi = indices[i]
            ###这里的b也就是image_index，如果是图片0就True，否则False。得到维度为(648,)。
            idx = (b == batch_idx)
            ###筛选得到图片0里面的b，a，gj，gi值，也就是image_index,anchor_index,和x，y他们四个维度，他们都是（273，）。
            b, a, gj, gi = b[idx], a[idx], gj[idx], gi[idx] 
            all_b.append(b)
            all_a.append(a)
            all_gj.append(gj)
            all_gi.append(gi)
            ###对anchor框进行筛选，得到（273，2）个ancho框。这也是图片0所对应的anchor数量。
            all_anch.append(anch[i][idx])
            from_which_layer.append(torch.ones(size=(len(b),)) * i)
            ###将find_3_positive函数粗筛选后的[b,a,gj,gi]与预测层结果对应的点进行匹配。原本有640个预测点，有273个点被索引出，需要注意的是这里是图片0的。
            fg_pred = pi[b, a, gj, gi]                
            p_obj.append(fg_pred[:, 4:5])  #得到obj的预测值
            p_cls.append(fg_pred[:, 5:])   #得到cls的预测值
            ###将gi和gj在第一维度进行堆叠，这里gi和gj分别是x和y，得到有gt框中心点分别落在那个位置，最后得到的维度(273,2)，这里也是图片0里面gt框的中心点。
            grid = torch.stack([gi, gj], dim=1)
            ###通过下面的公式得到预测层里的预测框的xy。
            pxy = (fg_pred[:, :2].sigmoid() * 2. - 0.5 + grid) * self.stride[i]
            #pxy = (fg_pred[:, :2].sigmoid() * 3. - 1. + grid) * self.stride[i]
            ###通过下面的公式得到预测层里的预测框的wh。这里维度为(273,2)
            pwh = (fg_pred[:, 2:4].sigmoid() * 2) ** 2 * anch[i][idx] * self.stride[i] #/ 8.  预测的wh值
            ####将xywh值进行堆叠，他的维度变化为(273,4)。
            pxywh = torch.cat([pxy, pwh], dim=-1)
            ####将xywh转换成预测框左上角和右下角坐标（xyxy）。
            pxyxy = xywh2xyxy(pxywh)
            pxyxys.append(pxyxy)

       ###将本张图片的所有正样本合并。shape=（P，4）    
       pxyxys = torch.cat(pxyxys, dim=0)
       if pxyxys.shape[0] == 0:
          continue
       p_obj = torch.cat(p_obj, dim=0)
       p_cls = torch.cat(p_cls, dim=0)
       from_which_layer = torch.cat(from_which_layer, dim=0)
       all_b = torch.cat(all_b, dim=0)
       all_a = torch.cat(all_a, dim=0)
       all_gj = torch.cat(all_gj, dim=0)
       all_gi = torch.cat(all_gi, dim=0)
       all_anch = torch.cat(all_anch, dim=0

文章出处登录后可见！

已经登录？立即刷新

yolov7损失函数源码解析（一句一句解析，）

相关推荐