学習済みファイルの作成とクラス名について（chainerを使った場合のyolo3の研究）

JUGEMテーマ：電子工作

■yoloの学習済みモデルのファイルの場所

　C:\Users\user\.chainer\dataset\pfnet\chainercv\models
　　yolo_v2_voc0712_converted_2018_05_03.npz
　　yolo_v3_voc0712_converted_2018_05_01.npz

■学習済みモデルのファイルの指定
　ソース中の
　　　　model = YOLOv3(
　　　　　　n_fg_class=len(voc_bbox_label_names),
　　　　　　pretrained_model=args.pretrained_model)
　　の所で
　　　　model = YOLOv3(
　　　　　　n_fg_class=20,
　　　　　　pretrained_model='C:\\Users\\user\\.chainer\\dataset\\pfnet\\chainercv\\models\\x.npz')
　　等とすればファイルを指定可能。拡張子はnpz
　　ファイルを指定する場合はn_fg_classを正しく指定する必要がある
　　
■学習済みファイルの作成
　darknetで作成した学習済みモデルをnpzファイルに変換する必要がある。
　変換コードはchainercv/examples/yolo/darknet2npz.py
　デフォルトでは入っていないので以下からダウンロード
　https://github.com/chainer/chainercv/blob/master/examples/yolo/darknet2npz.py
　yolo_v2_tinyとYOLOv2Tinyの記述のある行はエラーになるのでコメントアウト
　
　ダークネット学習済みモデルファイルは以下からダウンロード
　https://pjreddie.com/media/files/yolov3.weights
　80クラスに対応している模様です。
　
　変換実行
　　python darknet2npz.py --model yolo_v3 --n-fg-class 80 yolov3.weights yolov3.weights.npz

■変換した学習済みファイルを使ったyolo3対応画像認識プログラム

　起動方法
　　python yolo3.py --pretrained-model yolov3.weights.npz --class_num 80 --class_list yolov3.list 0
　　上記で作成した学習済みモデルファイル（yolov3.weights.npz）と、
　　それに対応したクラス名ファイル（yolov3.list）が必要です。
　　--pretrained-model
　　　上記darknet2npz.pyで変換した学習済みモデルyolov3.weights.npzを指定
　　--class_num
　　　上記学習済みモデルのクラス数（上記のyolov3.weights.npzは80）
　　--class_list
　　　上記学習済みモデルのクラス名一覧 1行に1クラス（下記）
　　--gpu
　　　GPUを使う場合は使用するGPUのID（例: 0）を指定
　　　CPUで実行する場合は省略（デフォルトは-1）
　　ビデオ
　　　WEBカメラの場合は0
　　　動画ファイルの場合はファイル名

■クラス名ファイル（yolov3.list）

・yolov3.list

person
bicycle
car
motorcycle
airplane
bus
train
truck
boat
traffic light
fire hydrant
stop sign
parking meter
bench
bird
cat
dog
horse
sheep
cow
elephant
bear
zebra
giraffe
backpack
umbrella
handbag
tie
suitcase
frisbee
skis
snowboard
sports ball
kite
baseball bat
baseball glove
skateboard
surfboard
tennis racket
bottle
wine glass
cup
fork
knife
spoon
bowl
banana
apple
sandwich
orange
broccoli
carrot
hot dog
pizza
donut
cake
chair
couch
potted plant
bed
dining table
toilet
tv
laptop
mouse
remote
keyboard
cell phone
microwave
oven
toaster
sink
refrigerator
book
clock
vase
scissors
teddy bear
hair drier
toothbrush

■ソース

・yolo3.py

import time
import argparse
import matplotlib.pyplot as plt
import cv2
import numpy as np
from timeit import default_timer as timer
import chainer
from chainercv.datasets import voc_bbox_label_names
from chainercv.datasets import coco_bbox_label_names
from chainercv.links import YOLOv3

# Color table indexed by class id: 151 entries, i.e. class ids 0-150.
# Each entry is a 3-tuple that main() passes unchanged to cv2.rectangle as the
# drawing color for that class.
label_colors = (
(120, 120, 120),
(180, 120, 120),
(6, 230, 230),
(80, 50, 50),
(4, 200, 3),
(120, 120, 80),
(140, 140, 140),
(204, 5, 255),
(230, 230, 230),
(4, 250, 7),
(224, 5, 255),
(235, 255, 7),
(150, 5, 61),
(120, 120, 70),
(8, 255, 51),
(255, 6, 82),
(143, 255, 140),
(204, 255, 4),
(255, 51, 7),
(204, 70, 3),
(0, 102, 200),
(61, 230, 250),
(255, 6, 51),
(11, 102, 255),
(255, 7, 71),
(255, 9, 224),
(9, 7, 230),
(220, 220, 220),
(255, 9, 92),
(112, 9, 255),
(8, 255, 214),
(7, 255, 224),
(255, 184, 6),
(10, 255, 71),
(255, 41, 10),
(7, 255, 255),
(224, 255, 8),
(102, 8, 255),
(255, 61, 6),
(255, 194, 7),
(255, 122, 8),
(0, 255, 20),
(255, 8, 41),
(255, 5, 153),
(6, 51, 255),
(235, 12, 255),
(160, 150, 20),
(0, 163, 255),
(140, 140, 140),
(250, 10, 15),
(20, 255, 0),
(31, 255, 0),
(255, 31, 0),
(255, 224, 0),
(153, 255, 0),
(0, 0, 255),
(255, 71, 0),
(0, 235, 255),
(0, 173, 255),
(31, 0, 255),
(11, 200, 200),
(255, 82, 0),
(0, 255, 245),
(0, 61, 255),
(0, 255, 112),
(0, 255, 133),
(255, 0, 0),
(255, 163, 0),
(255, 102, 0),
(194, 255, 0),
(0, 143, 255),
(51, 255, 0),
(0, 82, 255),
(0, 255, 41),
(0, 255, 173),
(10, 0, 255),
(173, 255, 0),
(0, 255, 153),
(255, 92, 0),
(255, 0, 255),
(255, 0, 245),
(255, 0, 102),
(255, 173, 0),
(255, 0, 20),
(255, 184, 184),
(0, 31, 255),
(0, 255, 61),
(0, 71, 255),
(255, 0, 204),
(0, 255, 194),
(0, 255, 82),
(0, 10, 255),
(0, 112, 255),
(51, 0, 255),
(0, 194, 255),
(0, 122, 255),
(0, 255, 163),
(255, 153, 0),
(0, 255, 10),
(255, 112, 0),
(143, 255, 0),
(82, 0, 255),
(163, 255, 0),
(255, 235, 0),
(8, 184, 170),
(133, 0, 255),
(0, 255, 92),
(184, 0, 255),
(255, 0, 31),
(0, 184, 255),
(0, 214, 255),
(255, 0, 112),
(92, 255, 0),
(0, 224, 255),
(112, 224, 255),
(70, 184, 160),
(163, 0, 255),
(153, 0, 255),
(71, 255, 0),
(255, 0, 163),
(255, 204, 0),
(255, 0, 143),
(0, 255, 235),
(133, 255, 0),
(255, 0, 235),
(245, 0, 255),
(255, 0, 122),
(255, 245, 0),
(10, 190, 212),
(214, 255, 0),
(0, 204, 255),
(20, 0, 255),
(255, 255, 0),
(0, 153, 255),
(0, 41, 255),
(0, 255, 204),
(41, 0, 255),
(41, 255, 0),
(173, 0, 255),
(0, 245, 255),
(71, 0, 255),
(122, 0, 255),
(0, 255, 184),
(0, 92, 255),
(184, 255, 0),
(0, 133, 255),
(255, 214, 0),
(25, 194, 194),
(102, 255, 0),
(92, 0, 255),
(0, 0, 0)
)

def main():
    """Run YOLOv3 detection on a webcam or video file and show annotated frames.

    Usage:
        python yolo3.py 0
            By default the 'voc0712' model is used (20 recognizable classes).

        python yolo3.py --pretrained-model yolov3.weights.npz \
                --class_num 80 --class_list yolov3.list 0

    Options:
        --pretrained-model  Trained model converted with darknet2npz.py
                            (e.g. yolov3.weights.npz), or 'voc0712'.
        --class_num         Number of classes of that trained model.
        --class_list        Text file with the class names, one per line.
                            When omitted, coco_bbox_label_names is used.
        --gpu               GPU id (e.g. 0); omit to run on the CPU.
        video               "0" for the webcam, otherwise a video file name.

    Press 'q' in the result window to stop.

    Raises:
        ImportError: if the webcam / video file cannot be opened.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--gpu', type=int, default=-1)
    parser.add_argument('--pretrained-model', default='voc0712')
    # type=int so that a non-numeric value is rejected by argparse itself.
    parser.add_argument('--class_num', type=int, default=20)
    # The default is the int 0 (not a string) and acts as "no list given".
    parser.add_argument('--class_list', default=0)
    parser.add_argument('video')
    args = parser.parse_args()

    if args.pretrained_model == 'voc0712':
        label_names = voc_bbox_label_names
        model = YOLOv3(20, 'voc0712')
    else:
        print(args.class_list)
        if args.class_list == 0:
            label_names = coco_bbox_label_names
        else:
            # Context manager: the file is closed even if reading fails.
            with open(args.class_list, "r") as f:
                label_names = [line.strip() for line in f]

        model = YOLOv3(n_fg_class=args.class_num,
                       pretrained_model=args.pretrained_model)

    # GPU support: a non-negative --gpu selects that device, otherwise CPU.
    if args.gpu >= 0:
        chainer.cuda.get_device_from_id(args.gpu).use()
        model.to_gpu()

    # Print the list of supported class names.
    for name in label_names:
        print(name)

    # Open the webcam ("0") or the given video file.
    if args.video == "0":
        vid = cv2.VideoCapture(0)
    else:
        vid = cv2.VideoCapture(args.video)
    if not vid.isOpened():
        # NOTE(review): ImportError is kept for backward compatibility;
        # OSError would describe this failure more accurately.
        raise ImportError("Couldn't open video file or webcam.")

    try:
        vidw = vid.get(cv2.CAP_PROP_FRAME_WIDTH)
        vidh = vid.get(cv2.CAP_PROP_FRAME_HEIGHT)
        print(vidw)
        print(vidh)

        accum_time = 0
        curr_fps = 0
        fps = "FPS: ??"
        prev_time = timer()

        frame_count = 1
        while True:
            ret, frame = vid.read()
            if not ret:
                # No more frames (end of file or read failure).
                time.sleep(5)
                print("Done!")
                return

            # BGR -> RGB for the detector; drawing happens on a BGR copy.
            rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            result = frame.copy()

            # (H, W, C) -> (C, H, W)
            img = np.asarray(rgb, dtype=np.float32).transpose((2, 0, 1))

            # Object detection on a single-image batch.
            bboxes, labels, scores = model.predict([img])
            bbox, label, score = bboxes[0], labels[0], scores[0]

            print("----------")
            n_person = 0
            n_bottle = 0
            for bb, lb, sc in zip(bbox, label, score):
                conf = sc.tolist()
                # Boxes are indexed as (ymin, xmin, ymax, xmax).
                ymin = int(bb[0])
                xmin = int(bb[1])
                ymax = int(bb[2])
                xmax = int(bb[3])

                class_id = int(lb)
                # Wrap around instead of raising IndexError when the model
                # has more classes than the color table has entries.
                color = label_colors[class_id % len(label_colors)]
                # Fall back to the numeric id when the class list is shorter
                # than the number of classes of the model.
                if class_id < len(label_names):
                    class_name = label_names[class_id]
                else:
                    class_name = str(class_id)

                cv2.rectangle(result, (xmin, ymin), (xmax, ymax), color, 2)

                text = class_name + " " + ('%.2f' % conf)
                print(text)
                if class_name == 'person':
                    n_person += 1
                if class_name == 'bottle':
                    n_bottle += 1

                text_top = (xmin, ymin - 10)
                text_bot = (xmin + 80, ymin + 5)
                text_pos = (xmin + 5, ymin)

                # Filled label background plus the label text.
                cv2.rectangle(result, text_top, text_bot, color, -1)
                cv2.putText(result, text, text_pos,
                            cv2.FONT_HERSHEY_SIMPLEX, 0.35, (0, 0, 0), 1)

            print("==========")
            print("Number of people : " + str(n_person))
            print("Number of bottle : " + str(n_bottle))

            # FPS: count frames and publish the count once per elapsed second.
            curr_time = timer()
            exec_time = curr_time - prev_time
            prev_time = curr_time
            accum_time = accum_time + exec_time
            curr_fps = curr_fps + 1
            if accum_time > 1:
                accum_time = accum_time - 1
                fps = "FPS:" + str(curr_fps)
                curr_fps = 0

            # Draw FPS in the top right corner, positioned from the actual
            # frame width (identical to the former fixed layout at 640 px).
            frame_w = result.shape[1]
            cv2.rectangle(result, (frame_w - 50, 0), (frame_w, 17),
                          (0, 0, 0), -1)
            cv2.putText(result, fps, (frame_w - 45, 10),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.35, (255, 255, 255), 1)

            # Draw the frame number in the top left corner.
            cv2.rectangle(result, (0, 0), (50, 17), (0, 0, 0), -1)
            cv2.putText(result, str(frame_count), (0, 10),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.35, (255, 255, 255), 1)

            cv2.imshow("Yolo Result", result)

            # Stop processing when 'q' is pressed.
            if cv2.waitKey(1) & 0xFF == ord('q'):
                break

            frame_count += 1
    finally:
        # Always free the capture device and close the preview window.
        vid.release()
        cv2.destroyAllWindows()


if __name__ == '__main__':
    main()

・darknet2npz.py

import argparse
import numpy as np

import chainer
from chainer.links import Convolution2D
from chainer import serializers

#from chainercv.experimental.links import YOLOv2Tiny
from chainercv.links import Conv2DBNActiv
from chainercv.links import YOLOv2
from chainercv.links import YOLOv3

def load_param(file, param):
    """Fill ``param`` in place with the next float32 values read from ``file``.

    Args:
        file: Open binary file positioned at the values to read; it is
            advanced by ``param.size`` float32 items.
        param: Destination array, or a ``chainer.Variable`` whose ``.array``
            is used as the destination.

    Raises:
        ValueError: if ``file`` holds fewer values than ``param`` needs.
    """
    if isinstance(param, chainer.Variable):
        param = param.array
    values = np.fromfile(file, dtype=np.float32, count=param.size)
    if values.size != param.size:
        # reshape() would also raise ValueError here, but with a message
        # that does not point at the truncated weights file.
        raise ValueError(
            'weights file ended early: expected {} values, got {}'.format(
                param.size, values.size))
    param[:] = values.reshape(param.shape)

def load_link(file, link):
    """Read the parameters of ``link`` from ``file`` via ``load_param``.

    The read order is fixed per link type:
      * Convolution2D: b, then W
      * Conv2DBNActiv: bn.beta, bn.gamma, bn.avg_mean, bn.avg_var, conv.W
      * chainer.ChainList: each child link in order, recursively
    Any other link type is ignored.
    """
    if isinstance(link, Convolution2D):
        for target in (link.b, link.W):
            load_param(file, target)
        return

    if isinstance(link, Conv2DBNActiv):
        bn = link.bn
        for target in (bn.beta, bn.gamma, bn.avg_mean, bn.avg_var,
                       link.conv.W):
            load_param(file, target)
        return

    if isinstance(link, chainer.ChainList):
        for child in link:
            load_link(file, child)

def reorder_loc(conv, n_fg_class):
    """Swap the location channels of ``conv`` in place (xy -> yx).

    The leading axis of ``conv.W.array`` and ``conv.b.array`` is treated as
    consecutive groups of ``4 + 1 + n_fg_class`` channels. Within every group
    the first four channels are swapped pairwise (0 <-> 1 and 2 <-> 3); the
    remaining channels are left untouched.

    Args:
        conv: Object exposing ``W.array`` and ``b.array``.
        n_fg_class: Number of foreground classes of the model.
    """
    group_size = 4 + 1 + n_fg_class
    for arr in (conv.W.array, conv.b.array):
        grouped = arr.reshape((-1, group_size) + arr.shape[1:])
        reordered = grouped.copy()
        # Fancy indexing yields a copy, so source and target never alias.
        reordered[:, :4] = grouped[:, [1, 0, 3, 2]]
        # Write back explicitly: reshape() may return a copy rather than a
        # view for non-contiguous arrays, in which case modifying the
        # reshaped array would silently leave ``arr`` unchanged.
        arr[...] = reordered.reshape(arr.shape)

def load_yolo_v2(file, model):
    """Load YOLOv2 weights from ``file`` into ``model``.

    Reads ``model.extractor`` first and ``model.subnet`` second, then swaps
    the location channels of the subnet with ``reorder_loc``.
    """
    for part in (model.extractor, model.subnet):
        load_link(file, part)

    reorder_loc(model.subnet, model.n_fg_class)

def load_yolo_v3(file, model):
    """Load YOLOv3 weights from ``file`` into ``model``.

    The extractor links are read in order. Right after extractor links 33,
    39 and 45, the subnets 0, 1 and 2 respectively are read from the same
    stream. Finally the location channels of the last link of every subnet
    are swapped with ``reorder_loc``.
    """
    # Extractor index -> index of the subnet stored directly after it.
    subnet_after = {33: 0, 39: 1, 45: 2}

    for index, link in enumerate(model.extractor):
        load_link(file, link)
        if index in subnet_after:
            load_link(file, model.subnet[subnet_after[index]])

    for head in model.subnet:
        reorder_loc(head[-1], model.n_fg_class)

def main():
    """Convert a Darknet weights file into an .npz file for ChainerCV.

    Usage:
        python darknet2npz.py --model yolo_v3 --n-fg-class 80 \
                yolov3.weights yolov3.weights.npz

    Options:
        --model        One of yolo_v2, yolo_v2_tiny, yolo_v3 (default yolo_v2).
        --n-fg-class   Number of foreground classes (default 80).
        darknetmodel   Input Darknet weights file.
        output         Output .npz file name.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--model', choices=('yolo_v2', 'yolo_v2_tiny', 'yolo_v3'),
        default='yolo_v2')
    parser.add_argument('--n-fg-class', type=int, default=80)
    parser.add_argument('darknetmodel')
    parser.add_argument('output')
    args = parser.parse_args()

    if args.model == 'yolo_v2':
        model = YOLOv2(n_fg_class=args.n_fg_class)
    elif args.model == 'yolo_v2_tiny':
        # YOLOv2Tiny is imported lazily: the module-level import is commented
        # out in this file because it fails on some installations. Without
        # this branch, choosing yolo_v2_tiny left ``model`` undefined.
        try:
            from chainercv.experimental.links import YOLOv2Tiny
        except ImportError:
            parser.error(
                'yolo_v2_tiny is not available: '
                'chainercv.experimental.links.YOLOv2Tiny cannot be imported')
        model = YOLOv2Tiny(n_fg_class=args.n_fg_class)
    elif args.model == 'yolo_v3':
        model = YOLOv3(n_fg_class=args.n_fg_class)

    # One forward pass on a dummy input in test mode before loading.
    # NOTE(review): presumably this makes sure every parameter array is
    # allocated so load_param() can fill it - TODO confirm.
    with chainer.using_config('train', False):
        model(np.empty((1, 3, model.insize, model.insize), dtype=np.float32))

    with open(args.darknetmodel, mode='rb') as f:
        # Header: major, minor, revision as int32.
        header = np.fromfile(f, dtype=np.int32, count=3)
        if header.size != 3:
            raise ValueError(
                'not a valid weights file: {}'.format(args.darknetmodel))
        major, minor = int(header[0]), int(header[1])

        # The "seen" field follows and is skipped; it is read as int64 when
        # the version test below holds and as int32 otherwise.
        if major * 10 + minor >= 2 and major < 1000 and minor < 1000:
            np.fromfile(f, dtype=np.int64, count=1)  # seen
        else:
            np.fromfile(f, dtype=np.int32, count=1)  # seen

        if args.model in ('yolo_v2', 'yolo_v2_tiny'):
            load_yolo_v2(f, model)
        elif args.model == 'yolo_v3':
            load_yolo_v3(f, model)

    serializers.save_npz(args.output, model)


if __name__ == '__main__':
    main()