forked from jkjung-avt/tensorrt_demos
-
Notifications
You must be signed in to change notification settings - Fork 4
/
trt_ssd_async.py
185 lines (155 loc) · 6.15 KB
/
trt_ssd_async.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
"""trt_ssd_async.py
This is the 'async' version of the trt_ssd.py implementation. It creates
1 dedicated child thread for fetching camera input and doing inference
with the TensorRT optimized SSD model/engine, while using the main
thread for drawing detection results and displaying video. Ideally,
the 2 threads work in a pipeline fashion so overall throughput (FPS)
would be improved compared to the non-async version.
"""
import time
import argparse
import threading
import cv2
import pycuda.driver as cuda
from utils.ssd_classes import get_cls_dict
from utils.ssd import TrtSSD
from utils.camera import add_camera_args, Camera
from utils.display import open_window, set_display, show_fps
from utils.visualization import BBoxVisualization
# Name of the OpenCV window that the main thread draws into.
WINDOW_NAME = 'TrtSsdDemoAsync'
# Maximum time, in seconds, that the main thread waits on the condition
# variable for a new frame from the child thread before exiting.
MAIN_THREAD_TIMEOUT = 20.0 # 20 seconds
# Passed to TrtSSD as the model input size.
# NOTE(review): presumably (height, width) -- confirm against utils.ssd.
INPUT_HW = (300, 300)
# Accepted values for the '-m/--model' command line option.
SUPPORTED_MODELS = [
'ssd_mobilenet_v1_coco',
'ssd_mobilenet_v1_egohands',
'ssd_mobilenet_v2_coco',
'ssd_mobilenet_v2_egohands',
'ssd_inception_v2_coco',
'ssdlite_mobilenet_v2_coco',
]
# These global variables are 'shared' between the main and child
# threads. The child thread writes each new frame and its detection
# result into these variables while holding the condition variable,
# and the main thread reads them after being notified.
s_img, s_boxes, s_confs, s_clss = None, None, None, None
def parse_args():
    """Build the command line parser and return the parsed arguments.

    The camera related options are added by add_camera_args(); on top
    of those, '-m/--model' selects one of SUPPORTED_MODELS.
    """
    description = (
        'Capture and display live camera video, '
        'while doing real-time object detection '
        'with TensorRT optimized SSD model on Jetson Nano'
    )
    arg_parser = add_camera_args(
        argparse.ArgumentParser(description=description))
    arg_parser.add_argument(
        '-m', '--model',
        type=str,
        default='ssd_mobilenet_v1_coco',
        choices=SUPPORTED_MODELS,
    )
    return arg_parser.parse_args()
class TrtThread(threading.Thread):
    """TrtThread

    This implements the child thread which continues to read images
    from cam (input) and to do TRT engine inferencing.  The child
    thread stores the input image and detection results into global
    variables and uses a condition variable to inform main thread.
    In other words, the TrtThread acts as the producer while the
    main thread is the consumer.
    """

    def __init__(self, condition, cam, model, conf_th):
        """__init__

        # Arguments
            condition: the condition variable used to notify main
                       thread about new frame and detection result
            cam: the camera object for reading input image frames
            model: a string, specifying the TRT SSD model
            conf_th: confidence threshold for detection
        """
        super(TrtThread, self).__init__()
        self.condition = condition
        self.cam = cam
        self.model = model
        self.conf_th = conf_th
        self.cuda_ctx = None  # to be created when run
        self.trt_ssd = None   # to be created when run
        self.running = False
        # Sticky stop request.  'running' alone is not enough because
        # run() sets it to True only after the (slow) engine load, which
        # would silently overwrite a stop() issued in the meantime.
        self._stop_event = threading.Event()

    def run(self):
        """Run until stop() is called or the camera returns no image.

        NOTE: CUDA context is created here, i.e. inside the thread
        which calls CUDA kernels.  In other words, creating CUDA
        context in __init__() doesn't work.

        The engine and the CUDA context are always released on the
        way out, including when loading the engine or running a
        detection raises.
        """
        global s_img, s_boxes, s_confs, s_clss

        print('TrtThread: loading the TRT SSD engine...')
        self.cuda_ctx = cuda.Device(0).make_context()  # GPU 0
        try:
            self.trt_ssd = TrtSSD(self.model, INPUT_HW)

            print('TrtThread: start running...')
            self.running = True
            while self.running and not self._stop_event.is_set():
                img = self.cam.read()
                if img is None:
                    break
                boxes, confs, clss = self.trt_ssd.detect(img, self.conf_th)
                with self.condition:
                    # Publish the frame and its detections, then wake
                    # up the main thread waiting on the condition.
                    s_img, s_boxes, s_confs, s_clss = img, boxes, confs, clss
                    self.condition.notify()
        finally:
            self.running = False
            # Drop the engine before popping the context it lives in.
            del self.trt_ssd
            self.cuda_ctx.pop()
            del self.cuda_ctx

        print('TrtThread: stopped...')

    def stop(self):
        """Ask the thread to finish and wait until it has exited.

        Safe to call before the thread was started or after it has
        already finished; in those cases there is nothing to join.
        """
        self._stop_event.set()
        self.running = False
        if self.is_alive():
            self.join()
def loop_and_display(condition, vis):
    """Show frames and detections produced by the child thread.

    Loops until the display window is closed or ESC is pressed.
    Pressing 'F' or 'f' toggles fullscreen display.

    # Arguments
        condition: the condition variable for synchronization with
                   the child thread.
        vis: for visualization.

    # Raises
        SystemExit: if no new frame arrives from the child thread
                    within MAIN_THREAD_TIMEOUT seconds.
    """
    global s_img, s_boxes, s_confs, s_clss

    fullscreen = False
    avg_fps = 0.0
    prev_time = time.time()
    # A negative window property means the window is gone.
    while not cv2.getWindowProperty(WINDOW_NAME, 0) < 0:
        with condition:
            # Block until the child thread signals a new frame, then
            # grab references to the frame and its detection result
            # while still holding the lock.
            notified = condition.wait(timeout=MAIN_THREAD_TIMEOUT)
            if not notified:
                raise SystemExit('ERROR: timeout waiting for img from child')
            frame, boxes, confs, clss = s_img, s_boxes, s_confs, s_clss

        frame = vis.draw_bboxes(frame, boxes, confs, clss)
        frame = show_fps(frame, avg_fps)
        cv2.imshow(WINDOW_NAME, frame)

        now = time.time()
        inst_fps = 1.0 / (now - prev_time)
        # Exponentially decaying average, seeded with the first sample.
        if avg_fps == 0.0:
            avg_fps = inst_fps
        else:
            avg_fps = avg_fps*0.95 + inst_fps*0.05
        prev_time = now

        key = cv2.waitKey(1)
        if key == 27:  # ESC key: quit program
            break
        if key in (ord('F'), ord('f')):  # Toggle fullscreen
            fullscreen = not fullscreen
            set_display(WINDOW_NAME, fullscreen)
def main():
    """Open the camera, start the inference thread and run the display loop.

    The child thread is stopped, and the camera and all OpenCV windows
    are released, even when the display loop exits with an exception
    (e.g. the SystemExit raised on a frame timeout).

    # Raises
        SystemExit: if the camera cannot be opened.
    """
    args = parse_args()
    cam = Camera(args)
    if not cam.isOpened():
        raise SystemExit('ERROR: failed to open camera!')

    try:
        cuda.init()  # init pycuda driver

        # The last '_'-separated token of the model name is used to
        # pick the class dictionary.
        cls_dict = get_cls_dict(args.model.split('_')[-1])

        open_window(
            WINDOW_NAME, 'Camera TensorRT SSD Demo',
            cam.img_width, cam.img_height)
        vis = BBoxVisualization(cls_dict)
        condition = threading.Condition()
        trt_thread = TrtThread(condition, cam, args.model, conf_th=0.3)
        trt_thread.start()  # start the child thread
        try:
            loop_and_display(condition, vis)
        finally:
            # Without this the child thread would keep running after
            # the main thread bails out of the display loop.
            trt_thread.stop()  # stop the child thread
    finally:
        cam.release()
        cv2.destroyAllWindows()


if __name__ == '__main__':
    main()