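"""Convert a YOLOv5s ONNX model to RKNN, run it on one image through the RKNN
runtime, decode the three detection heads, apply NMS, and save the drawn result.
"""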
import time

import cv2
import numpy as np
import torch
from PIL import Image
from rknn.api import RKNN

NUM_CLS = 80
LISTSIZE = NUM_CLS + 5  # per-anchor vector: 4 box coords + 1 objectness + NUM_CLS class scores
SPAN = 3                # anchors per detection head

OBJ_THRESH = 0.2
NMS_THRESH = 0.5

CLASSES = ("person", "bicycle", "car", "motorbike", "aeroplane", "bus", "train", "truck", "boat", "traffic light",
           "fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat", "dog", "horse", "sheep", "cow",
           "elephant", "bear", "zebra", "giraffe", "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee",
           "skis", "snowboard", "sports ball", "kite", "baseball bat", "baseball glove", "skateboard", "surfboard",
           "tennis racket", "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana", "apple",
           "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "sofa",
           "pottedplant", "bed", "diningtable", "toilet", "tvmonitor", "laptop", "mouse", "remote", "keyboard",
           "cell phone", "microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase",
           "scissors", "teddy bear", "hair drier", "toothbrush")

# YOLOv5s: each mask selects the three anchors used by one detection head
# (strides 8, 16 and 32 on a 640x640 input).
masks = [[0, 1, 2], [3, 4, 5], [6, 7, 8]]
anchors = [[10, 13], [16, 30], [33, 23], [30, 61], [62, 45], [59, 119], [116, 90], [156, 198], [373, 326]]
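# Sanity check: three detection heads with three anchors each
assert len(anchors) == SPAN * len(masks)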

def letterbox_image(image, size):
    """Resize a PIL image to `size` with unchanged aspect ratio, padding with gray."""
    iw, ih = image.size
    w, h = size
    scale = min(w / iw, h / ih)
    nw = int(iw * scale)
    nh = int(ih * scale)
    image = np.array(image)
    image = cv2.resize(image, (nw, nh), interpolation=cv2.INTER_LINEAR)
    image = Image.fromarray(image)
    new_image = Image.new('RGB', size, (128, 128, 128))
    new_image.paste(image, ((w - nw) // 2, (h - nh) // 2))
    return new_image
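# Example: a hypothetical 1280x720 frame letterboxed to (640, 640) is scaled
# by min(640/1280, 640/720) = 0.5 to 640x360, then centered on a gray canvas.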

def w_bbox_iou(box1, box2, x1y1x2y2=True):
    """Compute the IoU between box1 and each box in box2."""
    if not x1y1x2y2:
        # Convert from (cx, cy, w, h) to corner coordinates
        b1_x1, b1_x2 = box1[:, 0] - box1[:, 2] / 2, box1[:, 0] + box1[:, 2] / 2
        b1_y1, b1_y2 = box1[:, 1] - box1[:, 3] / 2, box1[:, 1] + box1[:, 3] / 2
        b2_x1, b2_x2 = box2[:, 0] - box2[:, 2] / 2, box2[:, 0] + box2[:, 2] / 2
        b2_y1, b2_y2 = box2[:, 1] - box2[:, 3] / 2, box2[:, 1] + box2[:, 3] / 2
    else:
        b1_x1, b1_y1, b1_x2, b1_y2 = box1[:, 0], box1[:, 1], box1[:, 2], box1[:, 3]
        b2_x1, b2_y1, b2_x2, b2_y2 = box2[:, 0], box2[:, 1], box2[:, 2], box2[:, 3]

    # Intersection rectangle
    inter_rect_x1 = torch.max(b1_x1, b2_x1)
    inter_rect_y1 = torch.max(b1_y1, b2_y1)
    inter_rect_x2 = torch.min(b1_x2, b2_x2)
    inter_rect_y2 = torch.min(b1_y2, b2_y2)
    inter_area = torch.clamp(inter_rect_x2 - inter_rect_x1 + 1, min=0) * \
                 torch.clamp(inter_rect_y2 - inter_rect_y1 + 1, min=0)

    b1_area = (b1_x2 - b1_x1 + 1) * (b1_y2 - b1_y1 + 1)
    b2_area = (b2_x2 - b2_x1 + 1) * (b2_y2 - b2_y1 + 1)

    iou = inter_area / (b1_area + b2_area - inter_area + 1e-16)
    return iou
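# Quick check with hypothetical boxes (note the +1 pixel convention above):
# a = torch.tensor([[0., 0., 10., 10.]])
# b = torch.tensor([[0., 0., 10., 10.], [5., 5., 15., 15.]])
# w_bbox_iou(a, b)  # -> tensor([1.0000, 0.1748])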

def w_non_max_suppression(prediction, num_classes, conf_thres=0.1, nms_thres=0.4):
    # Convert (cx, cy, w, h) to corner coordinates (x1, y1, x2, y2)
    box_corner = torch.FloatTensor(prediction.shape)
    box_corner[:, :, 0] = prediction[:, :, 0] - prediction[:, :, 2] / 2
    box_corner[:, :, 1] = prediction[:, :, 1] - prediction[:, :, 3] / 2
    box_corner[:, :, 2] = prediction[:, :, 0] + prediction[:, :, 2] / 2
    box_corner[:, :, 3] = prediction[:, :, 1] + prediction[:, :, 3] / 2
    prediction[:, :, :4] = box_corner[:, :, :4]

    output = [None for _ in range(len(prediction))]
    for image_i, image_pred in enumerate(prediction):
        # First pass: keep only boxes above the objectness threshold
        conf_mask = (image_pred[:, 4] >= conf_thres).squeeze()
        image_pred = image_pred[conf_mask]
        if not image_pred.size(0):
            continue
        # Best class score and its index for each box
        class_conf, class_pred = torch.max(image_pred[:, 5:5 + num_classes], 1, keepdim=True)
        # detections: (x1, y1, x2, y2, obj_conf, class_conf, class_pred)
        detections = torch.cat((image_pred[:, :5], class_conf.float(), class_pred.float()), 1)
        # Classes present among the remaining detections
        unique_labels = detections[:, -1].cpu().unique()
        if prediction.is_cuda:
            unique_labels = unique_labels.cuda()
        for c in unique_labels:
            # All pre-filtered predictions for this class
            detections_class = detections[detections[:, -1] == c]
            # Sort by objectness confidence, highest first
            _, conf_sort_index = torch.sort(detections_class[:, 4], descending=True)
            detections_class = detections_class[conf_sort_index]
            # Non-maximum suppression
            max_detections = []
            while detections_class.size(0):
                # Keep the highest-confidence box, then drop every remaining
                # box whose IoU with it exceeds nms_thres
                max_detections.append(detections_class[0].unsqueeze(0))
                if len(detections_class) == 1:
                    break
                ious = w_bbox_iou(max_detections[-1], detections_class[1:])
                detections_class = detections_class[1:][ious < nms_thres]
            # Stack the survivors for this class
            max_detections = torch.cat(max_detections).data
            # Append to this image's output
            output[image_i] = max_detections if output[image_i] is None else torch.cat(
                (output[image_i], max_detections))
    return output
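# Expected shapes: `prediction` is (batch, num_boxes, 5 + num_classes); the
# return value is a list with one (n, 7) tensor per image, or None for images
# where no box survives the thresholds.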

def onnx_postprocess(outputs, img_size_w, img_size_h):
    boxs = []
    a = torch.tensor(anchors).float().view(3, -1, 2)
    anchor_grid = a.clone().view(3, 1, -1, 1, 1, 2)
    for index, out in enumerate(outputs):
        # out: (1, SPAN, feature_h, feature_w, LISTSIZE) raw head output
        out = torch.from_numpy(out)
        feature_h = out.shape[2]
        feature_w = out.shape[3]
        # Stride of this feature map relative to the input image
        stride_w = int(img_size_w / feature_w)
        stride_h = int(img_size_h / feature_h)
        grid_x, grid_y = np.meshgrid(np.arange(feature_w), np.arange(feature_h))
        grid_x, grid_y = torch.from_numpy(grid_x).float(), torch.from_numpy(grid_y).float()
        # Decode cx, cy, w, h with the YOLOv5 formulas
        pred_boxes = torch.FloatTensor(out[..., :4].shape)
        pred_boxes[..., 0] = (torch.sigmoid(out[..., 0]) * 2.0 - 0.5 + grid_x) * stride_w    # cx
        pred_boxes[..., 1] = (torch.sigmoid(out[..., 1]) * 2.0 - 0.5 + grid_y) * stride_h    # cy
        pred_boxes[..., 2:4] = (torch.sigmoid(out[..., 2:4]) * 2) ** 2 * anchor_grid[index]  # wh
        conf = torch.sigmoid(out[..., 4])
        pred_cls = torch.sigmoid(out[..., 5:])
        output = torch.cat((pred_boxes.view(1, -1, 4),
                            conf.view(1, -1, 1),
                            pred_cls.view(1, -1, NUM_CLS)),
                           -1)
        boxs.append(output)
    outputx = torch.cat(boxs, 1)
    # NMS across all three heads
    batch_detections = w_non_max_suppression(outputx, NUM_CLS, conf_thres=OBJ_THRESH, nms_thres=NMS_THRESH)
    return batch_detections
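# The .npy head outputs saved by the main block can be decoded offline the
# same way (sketch, assuming the three files exist in the working directory):
# outs = [np.load('./onnx_yolov5_{}.npy'.format(i)) for i in range(3)]
# dets = onnx_postprocess(outs, 640, 640)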

def clip_coords(boxes, img_shape):
    # Clip xyxy bounding boxes to the image shape (height, width)
    boxes[:, 0].clamp_(0, img_shape[1])  # x1
    boxes[:, 1].clamp_(0, img_shape[0])  # y1
    boxes[:, 2].clamp_(0, img_shape[1])  # x2
    boxes[:, 3].clamp_(0, img_shape[0])  # y2

def scale_coords(img1_shape, coords, img0_shape, ratio_pad=None):
    # Rescale coords (xyxy) from img1_shape back to img0_shape
    if ratio_pad is None:  # calculate from img0_shape
        gain = min(img1_shape[0] / img0_shape[0], img1_shape[1] / img0_shape[1])  # gain = old / new
        pad = (img1_shape[1] - img0_shape[1] * gain) / 2, (img1_shape[0] - img0_shape[0] * gain) / 2  # wh padding
    else:
        gain = ratio_pad[0][0]
        pad = ratio_pad[1]
    coords[:, [0, 2]] -= pad[0]  # x padding
    coords[:, [1, 3]] -= pad[1]  # y padding
    coords[:, :4] /= gain
    clip_coords(coords, img0_shape)
    return coords
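# Worked example: for a hypothetical 640x480 original letterboxed to 640x640,
# gain = min(640/480, 640/640) = 1.0 and pad = (0, 80), so a box edge at
# y1 = 180 in model space maps back to y1 = 100 in the original image.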

def display(detections=None, image_src=None, input_size=(640, 640), line_thickness=None, text_bg_alpha=0.0):
    labels = detections[..., -1]
    boxs = detections[..., :4]
    confs = detections[..., 4]
    h, w, c = image_src.shape
    # Map boxes from the letterboxed input back to the original image
    boxs[:, :] = scale_coords(input_size, boxs[:, :], (h, w)).round()
    tl = line_thickness or round(0.002 * (w + h) / 2) + 1
    for i, box in enumerate(boxs):
        x1, y1, x2, y2 = box
        x1, y1, x2, y2 = int(x1.numpy()), int(y1.numpy()), int(x2.numpy()), int(y2.numpy())
        # Seed on the class id so each class gets a stable color
        np.random.seed(int(labels[i].numpy()) + 2020)
        color = (np.random.randint(0, 255), 0, np.random.randint(0, 255))
        cv2.rectangle(image_src, (x1, y1), (x2, y2), color, max(int((w + h) / 600), 1), cv2.LINE_AA)
        label = '{0:.3f}'.format(confs[i])
        t_size = cv2.getTextSize(label, 0, fontScale=tl / 3, thickness=1)[0]
        c2 = x1 + t_size[0] + 3, y1 - t_size[1] - 5
        if text_bg_alpha == 0.0:
            cv2.rectangle(image_src, (x1 - 1, y1), c2, color, cv2.FILLED, cv2.LINE_AA)
        else:
            # Translucent text background (0 = opaque, 1 = fully transparent)
            alphaReserve = text_bg_alpha
            BChannel, GChannel, RChannel = color
            xMin, yMin = int(x1 - 1), int(y1 - t_size[1] - 3)
            xMax, yMax = int(x1 + t_size[0]), int(y1)
            image_src[yMin:yMax, xMin:xMax, 0] = image_src[yMin:yMax, xMin:xMax, 0] * alphaReserve + BChannel * (1 - alphaReserve)
            image_src[yMin:yMax, xMin:xMax, 1] = image_src[yMin:yMax, xMin:xMax, 1] * alphaReserve + GChannel * (1 - alphaReserve)
            image_src[yMin:yMax, xMin:xMax, 2] = image_src[yMin:yMax, xMin:xMax, 2] * alphaReserve + RChannel * (1 - alphaReserve)
        cv2.putText(image_src, label, (x1 + 3, y1 - 4), 0, tl / 3, [255, 255, 255],
                    thickness=1, lineType=cv2.LINE_AA)

if __name__ == '__main__':
    exp = 'yolov5s'
    Width = 640
    Height = 640
    MODEL_PATH = './yolov5s.onnx'
    im_file = './dog_bike_car_640x640.jpg'
    RKNN_MODEL_PATH = './{}.rknn'.format(exp + '-' + str(Width) + '-' + str(Height))
    DATASET = './dataset.txt'

    # Create RKNN object; normalize inputs from [0, 255] to [0, 1]
    rknn = RKNN(verbose=True)
    rknn.config(mean_values=[[0, 0, 0]], std_values=[[255, 255, 255]])

    # Load ONNX model
    print('--> Loading model')
    ret = rknn.load_onnx(MODEL_PATH)
    if ret != 0:
        print('load model failed!')
        exit(ret)
    print('done')

    # Build model with quantization against the calibration dataset
    print('--> Building model')
    ret = rknn.build(do_quantization=True, dataset=DATASET)
    if ret != 0:
        print('build model failed.')
        exit(ret)
    print('done')

    # Export RKNN model
    print('--> Export RKNN model')
    ret = rknn.export_rknn(RKNN_MODEL_PATH)
    if ret != 0:
        print('Export rknn model failed.')
        exit(ret)
    print('done')
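    # On later runs the conversion steps above could be skipped by loading the
    # exported model instead (sketch, assuming the .rknn file already exists):
    # ret = rknn.load_rknn(RKNN_MODEL_PATH)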
    # Set inputs: letterbox the test image to the model's input size
    image_src = Image.open(im_file)
    img = letterbox_image(image_src, (Width, Height))
    img = np.array(img)

    # Init runtime environment
    print('--> Init runtime environment')
    ret = rknn.init_runtime()
    if ret != 0:
        print('Init runtime environment failed')
        exit(ret)
    print('done')
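    # init_runtime() with no arguments runs on the host simulator; to run on a
    # connected board, a target could be passed instead (assumption about the
    # setup, e.g. rknn.init_runtime(target='rk3566')).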
    # Inference
    print('--> inference')
    start = time.time()
    outputs = rknn.inference(inputs=[img])
    end = time.time()
    print('inference time: ', end - start)
    print('done')

    # Save the raw head outputs for offline comparison
    np.save('./onnx_yolov5_0.npy', outputs[0])
    np.save('./onnx_yolov5_1.npy', outputs[1])
    np.save('./onnx_yolov5_2.npy', outputs[2])

    # Post-process: decode the heads, run NMS, draw the detections
    image_src = np.array(image_src)
    detections = onnx_postprocess(outputs, Width, Height)
    if detections[0] is not None:
        display(detections[0], image_src)
    image_src = cv2.cvtColor(image_src, cv2.COLOR_RGB2BGR)  # PIL gives RGB; OpenCV writes BGR
    cv2.imwrite("result.jpg", image_src)

    rknn.release()