# 这是一个用于SSD流派的groundtruth与prior boxes的匹配函数以及详细的讲解
# https://hellozhaozheng.github.io/z_post/PyTorch-SSD/#MultiBox
def match(threshold, truths, priors, variances, labels, loc_t, conf_t, idx):
    """Match each prior (default) box to a ground-truth box for one image.

    The encoded localization targets and the class targets are written in
    place into ``loc_t[idx]`` and ``conf_t[idx]``; nothing is returned.

    Args:
        threshold: (float) IoU threshold deciding whether a prior is matched.
        truths: (tensor [num_objs, 4]) ground-truth box coordinates.
        priors: (tensor [num_priors, 4], e.g. [8732, 4]) default boxes.
            These are the fixed default boxes, not the boxes predicted by
            the SSD network.
        variances: cfg['variance'], e.g. [0.1, 0.2]; forwarded to ``encode``
            to turn coordinates into a training-friendly form.
        labels: (tensor [num_objs]) class index of every ground-truth box.
        loc_t: (tensor [batch, num_priors, 4]) receives the encoded boxes.
        conf_t: (tensor [batch, num_priors]) receives the class targets.
        idx: index of the image currently processed within the batch.
    """
    # [num_objs, num_priors]: overlaps[i][j] is the IoU between gt box i and
    # prior j (priors are converted to corner form by point_form first).
    overlaps = jaccard(truths, point_form(priors))

    # Bipartite matching.
    # Reduce with the default keepdim=False so every result is already 1-D;
    # index_fill_ along dim 0 and the indexing below need shape [num_priors].
    # [num_objs]: for every gt box, the index of its best-overlapping prior.
    _, best_prior_idx = overlaps.max(1)
    # [num_priors]: for every prior, the best IoU and the gt box reaching it.
    best_truth_overlap, best_truth_idx = overlaps.max(0)

    # Give the best prior of every gt box an IoU of 2 (larger than any real
    # IoU) so it always survives the threshold test below. Otherwise a gt box
    # whose best IoU is under the threshold would end up with no prior.
    best_truth_overlap.index_fill_(0, best_prior_idx, 2)

    # Force the mutual match best_truth_idx[best_prior_idx[j]] = j.
    # Example: prior i overlaps gt k most (best_truth_idx[i] == k), but gt k
    # prefers another prior l, while gt j has prior i as its best prior.
    # Prior i must then be reassigned to gt j, or gt j could stay unmatched.
    # Only num_objs entries are rewritten; all other priors keep their
    # argmax match, so one gt box is usually matched by many priors.
    for j in range(best_prior_idx.size(0)):
        best_truth_idx[best_prior_idx[j]] = j

    # [num_priors, 4]: coordinates of the gt box matched to each prior.
    matches = truths[best_truth_idx]
    # [num_priors]: class of the matched gt box, shifted by one so that 0 can
    # stand for background.
    conf = labels[best_truth_idx] + 1
    # Priors whose best IoU is below the threshold are background.
    conf[best_truth_overlap < threshold] = 0
    # Encode the matched boxes relative to the priors.
    loc = encode(matches, priors, variances)
    loc_t[idx] = loc  # encoded gt boxes for image idx
    conf_t[idx] = conf  # class targets for image idx (0 = background)
# Visualization of bounding boxes from the ground-truth CSV file
import csv
import cv2
import matplotlib.pyplot as plt
from random import randint
import os
%matplotlib inline
def bboxes_visualization(bboxes_file, num_saves, save_path):
    """Draw ground-truth boxes on images and save the annotated images.

    Args:
        bboxes_file: CSV file listing all objects, one object per row, in
            the format [im_full_name, xmin, ymin, xmax, ymax, class_name].
            Rows belonging to the same image are expected to be adjacent.
            Check readme.md.
        num_saves: number of images to be saved.
        save_path: folder in which the annotated images are saved; it is
            created when it does not exist.

    Raises:
        Exception: if ``bboxes_file`` does not exist.
    """
    if not os.path.exists(bboxes_file):
        raise Exception('No specified bounding boxes files !')
    if not os.path.exists(save_path):
        print('Create the folder automatically .... \n')
        os.makedirs(save_path)

    with open(bboxes_file) as f:
        # Skip blank rows so that row[0] below is always present.
        info = [row for row in csv.reader(f) if row]

    color_set = [[255, 0, 0],
                 [0, 255, 0],
                 [0, 0, 255],
                 [125, 125, 0]]
    im_name = None
    im = None
    saved = 0
    for ind, row in enumerate(info):
        if saved >= num_saves:
            break
        # A new image name means a new image has to be loaded.
        if im_name != row[0]:
            im_name = row[0]
            im = cv2.imread(im_name)
            # OpenCV loads BGR; matplotlib expects RGB.
            im = cv2.cvtColor(im, cv2.COLOR_BGR2RGB)

        # Draw the rectangle and the class name on the image.
        coord = list(map(int, row[1:5]))
        color = color_set[randint(0, len(color_set) - 1)]
        cv2.rectangle(im, (coord[0], coord[1]), (coord[2], coord[3]), color, 3)
        cv2.putText(im, row[5], (coord[0] + 20, coord[1] + 20),
                    cv2.FONT_HERSHEY_COMPLEX, 0.9, color, 2)

        # The image is complete when the next row belongs to another image
        # or when this is the very last row of the file.
        is_last_box = ind == len(info) - 1 or info[ind + 1][0] != im_name
        if is_last_box:
            # Render the image first; savefig alone would write an empty figure.
            plt.figure()
            plt.imshow(im)
            plt.savefig(os.path.join(save_path, os.path.basename(im_name)))
            plt.close()
            saved += 1

    print("All images are saved in the " + save_path)
if __name__ == '__main__':
    # Script entry point: annotate images listed in the VOC07 test CSV.
    bboxes_visualization(
        bboxes_file='./refineDet/CSV/voc/07_test.csv',
        num_saves=30,
        save_path='saved_images',
    )
# compute iou of boxes. The type of boxes is tensor-pyTorch
def compute_iou(box1, box2):
    """Compute the pairwise intersection over union of two sets of boxes.

    Each box is [x1, y1, x2, y2].

    Args:
        box1: (tensor) bounding boxes, sized [N, 4].
        box2: (tensor) bounding boxes, sized [M, 4].

    Returns:
        (tensor) iou, sized [N, M]; entry [i, j] is the IoU of box1[i] and
        box2[j]. A small epsilon in the denominator avoids division by zero,
        so identical boxes give a value marginally below 1.
    """
    # Top-left / bottom-right corners of every pairwise intersection.
    # [N,1,2] against [1,M,2] broadcasts to [N,M,2].
    lt = torch.max(box1[:, None, :2], box2[None, :, :2])
    rb = torch.min(box1[:, None, 2:], box2[None, :, 2:])

    # Non-overlapping pairs give negative extents; clip them at 0.
    wh = (rb - lt).clamp(min=0)  # [N,M,2]
    inter = wh[:, :, 0] * wh[:, :, 1]  # [N,M]

    area1 = (box1[:, 2] - box1[:, 0]) * (box1[:, 3] - box1[:, 1])  # [N]
    area2 = (box2[:, 2] - box2[:, 0]) * (box2[:, 3] - box2[:, 1])  # [M]
    union = area1[:, None] + area2[None, :] - inter  # [N,M]

    iou = inter.float() / (union + 1e-5)
    return iou
# data augmentation for classification ---- mixup
# CODE:https://github.com/facebookresearch/mixup-cifar10/blob/master/train.py
# BLOG:https://blog.csdn.net/ly244855983/article/details/78938667
# details of mixup
def mixup_data(x, y, alpha=1.0, use_cuda=True):
    """Return mixed inputs, pairs of targets, and lambda.

    Args:
        x: a batch of images (batch on the first dimension).
        y: a batch of labels.
        alpha: parameter of the Beta(alpha, alpha) distribution from which
            the mixing weight is drawn; with ``alpha <= 0`` no mixing is
            done (lambda is 1).
        use_cuda: if True, the permutation index is moved to the GPU.

    Returns:
        Tuple ``(mixed_x, y_a, y_b, lam)`` where
        ``mixed_x = lam * x + (1 - lam) * x[index]`` for a random
        permutation ``index`` of the batch, ``y_a`` is ``y`` and ``y_b`` is
        ``y[index]``.
    """
    # Without the else branch the sampled weight would always be
    # overwritten by 1 and the batch would never be mixed.
    if alpha > 0:
        lam = np.random.beta(alpha, alpha)
    else:
        lam = 1

    batch_size = x.size()[0]
    index = torch.randperm(batch_size)
    if use_cuda:
        index = index.cuda()

    mixed_x = lam * x + (1 - lam) * x[index, :]
    y_a, y_b = y, y[index]
    return mixed_x, y_a, y_b, lam
# mixup in training step for loss computation
def mixup_criterion(criterion, pred, y_a, y_b, lam):
    """Blend the losses against both original label sets with weight lam."""
    loss_a = criterion(pred, y_a)
    loss_b = criterion(pred, y_b)
    return lam * loss_a + (1 - lam) * loss_b
# train
def train(epoch):
    """Run one epoch over ``trainloader`` with mixup and report statistics.

    Relies on names defined outside this function: ``trainloader``,
    ``use_cuda``, ``args``, ``net``, ``criterion``, ``Variable``,
    ``mixup_data``, ``mixup_criterion`` and ``progress_bar``.

    Args:
        epoch: epoch number, used only for the progress message.

    Returns:
        Tuple ``(mean train loss, mean reg loss, accuracy in percent)``
        averaged over the batches of this epoch.

    Raises:
        ValueError: if ``trainloader`` yields no batches.
    """
    print('\nEpoch: %d' % epoch)
    train_loss = 0
    reg_loss = 0
    correct = 0
    total = 0
    num_batches = 0
    for batch_idx, (inputs, targets) in enumerate(trainloader):
        if use_cuda:
            inputs, targets = inputs.cuda(), targets.cuda()

        # Build the mixed-up batch and both label sets.
        inputs, targets_a, targets_b, lam = mixup_data(inputs, targets,
                                                       args.alpha, use_cuda)
        inputs, targets_a, targets_b = map(Variable, (inputs,
                                                      targets_a, targets_b))
        outputs = net(inputs)  # predictions for the mixed-up data
        # Loss of the mixed-up predictions against both original label sets.
        loss = mixup_criterion(criterion, outputs, targets_a, targets_b, lam)
        # .item() reads the scalar loss; indexing a 0-dim tensor with [0]
        # is rejected by current PyTorch.
        train_loss += loss.item()
        _, predicted = torch.max(outputs.data, 1)
        total += targets.size(0)
        # Accuracy is weighted between the two label sets in the same way.
        correct += (lam * predicted.eq(targets_a.data).cpu().sum().float()
                    + (1 - lam) * predicted.eq(targets_b.data).cpu().sum().float())

        # NOTE(review): no backward pass / optimizer step is visible in this
        # excerpt - confirm whether it was omitted from these notes.

        num_batches = batch_idx + 1
        progress_bar(batch_idx, len(trainloader),
                     'Loss: %.3f | Reg: %.5f | Acc: %.3f%% (%d/%d)'
                     % (train_loss/(batch_idx+1), reg_loss/(batch_idx+1),
                        100.*correct/total, correct, total))

    if num_batches == 0:
        raise ValueError('trainloader yielded no batches')
    # Average over the number of batches (batch_idx is zero-based).
    return (train_loss/num_batches, reg_loss/num_batches, 100.*correct/total)
Object detection CSV file format (one object per row):
[im_full_name, xmin, ymin, xmax, ymax, class_name]
/root/voc/vocdevkit/07VOC/test/005167.jpg 242 169 270 215 bird
/root/voc/vocdevkit/07VOC/test/006505.jpg 5 51 494 277 sofa
/root/voc/vocdevkit/07VOC/test/006505.jpg 1 33 188 256 person
/root/voc/vocdevkit/07VOC/test/006505.jpg 132 10 321 277 person
/root/voc/vocdevkit/07VOC/test/006505.jpg 266 12 495 277 person
/root/voc/vocdevkit/07VOC/test/007977.jpg 82 28 500 375 person
/root/voc/vocdevkit/07VOC/test/003201.jpg 1 53 166 260 cow
/root/voc/vocdevkit/07VOC/test/003201.jpg 137 25 416 298 cow