diff --git a/mmaction/datasets/pipelines/loading.py b/mmaction/datasets/pipelines/loading.py
index a381c5165e..28c5d1fd75 100644
--- a/mmaction/datasets/pipelines/loading.py
+++ b/mmaction/datasets/pipelines/loading.py
@@ -23,6 +23,8 @@ class SampleFrames(object):
         frame_interval (int): Temporal interval of adjacent sampled frames.
             Default: 1.
         num_clips (int): Number of clips to be sampled. Default: 1.
+        start_index (int): Specify a start index for frames in consideration of
+            different filename format. Default: 1.
         temporal_jitter (bool): Whether to apply temporal jittering.
             Default: False.
         twice_sample (bool): Whether to use twice sample when testing.
@@ -39,6 +41,7 @@ def __init__(self,
                  clip_len,
                  frame_interval=1,
                  num_clips=1,
+                 start_index=1,
                  temporal_jitter=False,
                  twice_sample=False,
                  out_of_bound_opt='loop',
@@ -47,6 +50,7 @@ def __init__(self,
         self.clip_len = clip_len
         self.frame_interval = frame_interval
         self.num_clips = num_clips
+        self.start_index = start_index
         self.temporal_jitter = temporal_jitter
         self.twice_sample = twice_sample
         self.out_of_bound_opt = out_of_bound_opt
@@ -144,7 +148,6 @@ def __call__(self, results):
         else:
             total_frames = results['total_frames']

-        # TODO: index in different mode may be different
         clip_offsets = self._sample_clips(total_frames)
         frame_inds = clip_offsets[:, None] + np.arange(
             self.clip_len)[None, :] * self.frame_interval
@@ -166,7 +169,7 @@ def __call__(self, results):
                 frame_inds = new_inds
             else:
                 raise ValueError('Illegal out_of_bound option.')
-        frame_inds = np.concatenate(frame_inds)
+        frame_inds = np.concatenate(frame_inds) + self.start_index
         results['frame_inds'] = frame_inds.astype(np.int)
         results['clip_len'] = self.clip_len
         results['frame_interval'] = self.frame_interval
@@ -186,6 +189,8 @@ class DenseSampleFrames(SampleFrames):
         frame_interval (int): Temporal interval of adjacent sampled frames.
             Default: 1.
         num_clips (int): Number of clips to be sampled. Default: 1.
+        start_index (int): Specify a start index for frames in consideration of
+            different filename format. Default: 1.
         sample_range (int): Total sample range for dense sample.
             Default: 64.
         num_sample_positions (int): Number of sample start positions, Which is
@@ -200,6 +205,7 @@ def __init__(self,
                  clip_len,
                  frame_interval=1,
                  num_clips=1,
+                 start_index=1,
                  sample_range=64,
                  num_sample_positions=10,
                  temporal_jitter=False,
@@ -209,6 +215,7 @@ def __init__(self,
             clip_len,
             frame_interval,
             num_clips,
+            start_index,
             temporal_jitter,
             out_of_bound_opt=out_of_bound_opt,
             test_mode=test_mode)
@@ -585,10 +592,6 @@ def __call__(self, results):
             results['frame_inds'] = np.squeeze(results['frame_inds'])

         for frame_idx in results['frame_inds']:
-            # temporary solution for frame index offset.
-            # TODO: add offset attributes in datasets.
-            if frame_idx == 0:
-                frame_idx += 1
             if modality == 'RGB':
                 filepath = osp.join(directory, filename_tmpl.format(frame_idx))
                 img_bytes = self.file_client.get(filepath)
diff --git a/tests/data/test_imgs/img_00006.jpg b/tests/data/test_imgs/img_00006.jpg
new file mode 100644
index 0000000000..7f0fa6ca5c
Binary files /dev/null and b/tests/data/test_imgs/img_00006.jpg differ
diff --git a/tests/data/test_imgs/img_00007.jpg b/tests/data/test_imgs/img_00007.jpg
new file mode 100644
index 0000000000..2ebc51fe1b
Binary files /dev/null and b/tests/data/test_imgs/img_00007.jpg differ
diff --git a/tests/data/test_imgs/img_00008.jpg b/tests/data/test_imgs/img_00008.jpg
new file mode 100644
index 0000000000..f9747042fb
Binary files /dev/null and b/tests/data/test_imgs/img_00008.jpg differ
diff --git a/tests/data/test_imgs/img_00009.jpg b/tests/data/test_imgs/img_00009.jpg
new file mode 100644
index 0000000000..b4a74ebb0d
Binary files /dev/null and b/tests/data/test_imgs/img_00009.jpg differ
diff --git a/tests/data/test_imgs/img_00010.jpg b/tests/data/test_imgs/img_00010.jpg
new file mode 100644
index 0000000000..9944e62089
Binary files /dev/null and b/tests/data/test_imgs/img_00010.jpg differ
diff --git a/tests/test_loading.py b/tests/test_loading.py
index 53b5accccb..3e03787167 100644
--- a/tests/test_loading.py
+++ b/tests/test_loading.py
@@ -183,6 +183,28 @@ def check_monotonous(arr):
         assert len(sample_frames_results['frame_inds']) == 8
         sample_frames_results = sample_frames(frame_result)
         assert len(sample_frames_results['frame_inds']) == 8
+        assert_array_equal(sample_frames_results['frame_inds'],
+                           np.array([1, 2, 2, 3, 4, 5, 5, 6]))
+
+        # Sample Frame with no temporal_jitter to get clip_offsets
+        # clip_len=1, frame_interval=1, num_clips=8, start_index=0
+        video_result = copy.deepcopy(self.video_results)
+        frame_result = copy.deepcopy(self.frame_results)
+        frame_result['total_frames'] = 6
+        config = dict(
+            clip_len=1,
+            frame_interval=1,
+            num_clips=8,
+            start_index=0,
+            temporal_jitter=False,
+            test_mode=True)
+        sample_frames = SampleFrames(**config)
+        sample_frames_results = sample_frames(video_result)
+        assert self.check_keys_contain(sample_frames_results.keys(),
+                                       target_keys)
+        assert len(sample_frames_results['frame_inds']) == 8
+        sample_frames_results = sample_frames(frame_result)
+        assert len(sample_frames_results['frame_inds']) == 8
         assert_array_equal(sample_frames_results['frame_inds'],
                            np.array([0, 1, 1, 2, 3, 4, 4, 5]))

@@ -205,7 +227,7 @@ def check_monotonous(arr):
         sample_frames_results = sample_frames(frame_result)
         assert len(sample_frames_results['frame_inds']) == 6
         assert_array_equal(sample_frames_results['frame_inds'],
-                           [0, 1, 2, 3, 4, 0])
+                           [1, 2, 3, 4, 5, 1])

         # Sample Frame with no temporal_jitter to get avg_interval <= 0
         # clip_len=12, frame_interval=1, num_clips=20
@@ -245,7 +267,7 @@ def check_monotonous(arr):
         sample_frames_results = sample_frames(frame_result)
         assert len(sample_frames_results['frame_inds']) == 8
         assert_array_equal(sample_frames_results['frame_inds'],
-                           np.array([0, 1, 2, 2, 3, 4, 4, 5]))
+                           np.array([1, 2, 3, 3, 4, 5, 5, 6]))

         # Sample Frame with no temporal_jitter to get clip_offsets zero
         # clip_len=12, frame_interval=1, num_clips=2
@@ -411,7 +433,7 @@ def test_pyav_init(self):
     def test_pyav_decode(self):
         target_keys = ['frame_inds', 'imgs', 'original_shape']

-        # test PyAV with 2 dim input
+        # test PyAV with 2 dim input and start_index = 0
         video_result = copy.deepcopy(self.video_results)
         video_result['frame_inds'] = np.arange(0, self.total_frames,
                                                2)[:, np.newaxis]
@@ -419,6 +441,49 @@ def test_pyav_decode(self):
         pyav_init_result = pyav_init(video_result)
         video_result['video_reader'] = pyav_init_result['video_reader']

+        pyav_decode = PyAVDecode()
+        pyav_decode_result = pyav_decode(video_result)
+        assert self.check_keys_contain(pyav_decode_result.keys(), target_keys)
+        assert pyav_decode_result['original_shape'] == (256, 340)
+        assert np.shape(pyav_decode_result['imgs']) == (len(
+            video_result['frame_inds']), 256, 340, 3)
+
+        # test PyAV with 1 dim input and start_index = 0
+        video_result = copy.deepcopy(self.video_results)
+        video_result['frame_inds'] = np.arange(0, self.total_frames, 5)
+        pyav_init = PyAVInit()
+        pyav_init_result = pyav_init(video_result)
+        video_result['video_reader'] = pyav_init_result['video_reader']
+
+        pyav_decode = PyAVDecode()
+        pyav_decode_result = pyav_decode(video_result)
+        assert self.check_keys_contain(pyav_decode_result.keys(), target_keys)
+        assert pyav_decode_result['original_shape'] == (256, 340)
+        assert np.shape(pyav_decode_result['imgs']) == (len(
+            video_result['frame_inds']), 256, 340, 3)
+
+        # PyAV with multi thread and start_index = 0
+        video_result = copy.deepcopy(self.video_results)
+        video_result['frame_inds'] = np.arange(0, self.total_frames, 5)
+        pyav_init = PyAVInit()
+        pyav_init_result = pyav_init(video_result)
+        video_result['video_reader'] = pyav_init_result['video_reader']
+
+        pyav_decode = PyAVDecode(multi_thread=True)
+        pyav_decode_result = pyav_decode(video_result)
+        assert self.check_keys_contain(pyav_decode_result.keys(), target_keys)
+        assert pyav_decode_result['original_shape'] == (256, 340)
+        assert np.shape(pyav_decode_result['imgs']) == (len(
+            video_result['frame_inds']), 256, 340, 3)
+
+        # test PyAV with 2 dim input
+        video_result = copy.deepcopy(self.video_results)
+        video_result['frame_inds'] = np.arange(1, self.total_frames,
+                                               2)[:, np.newaxis]
+        pyav_init = PyAVInit()
+        pyav_init_result = pyav_init(video_result)
+        video_result['video_reader'] = pyav_init_result['video_reader']
+
         pyav_decode = PyAVDecode()
         pyav_decode_result = pyav_decode(video_result)
         assert self.check_keys_contain(pyav_decode_result.keys(), target_keys)
@@ -469,9 +534,40 @@ def test_decord_init(self):
     def test_decord_decode(self):
         target_keys = ['frame_inds', 'imgs', 'original_shape']
-        # test Decord with 2 dim input
+        # test Decord with 2 dim input and start_index = 0
         video_result = copy.deepcopy(self.video_results)
-        video_result['frame_inds'] = np.arange(1, self.total_frames,
+        video_result['frame_inds'] = np.arange(0, self.total_frames,
+                                               3)[:, np.newaxis]
+        decord_init = DecordInit()
+        decord_init_result = decord_init(video_result)
+        video_result['video_reader'] = decord_init_result['video_reader']
+
+        decord_decode = DecordDecode()
+        decord_decode_result = decord_decode(video_result)
+        assert self.check_keys_contain(decord_decode_result.keys(),
+                                       target_keys)
+        assert decord_decode_result['original_shape'] == (256, 340)
+        assert np.shape(decord_decode_result['imgs']) == (len(
+            video_result['frame_inds']), 256, 340, 3)
+
+        # test Decord with 1 dim input and start_index = 0
+        video_result = copy.deepcopy(self.video_results)
+        video_result['frame_inds'] = np.arange(0, self.total_frames, 3)
+        decord_init = DecordInit()
+        decord_init_result = decord_init(video_result)
+        video_result['video_reader'] = decord_init_result['video_reader']
+
+        decord_decode = DecordDecode()
+        decord_decode_result = decord_decode(video_result)
+        assert self.check_keys_contain(decord_decode_result.keys(),
+                                       target_keys)
+        assert decord_decode_result['original_shape'] == (256, 340)
+        assert np.shape(decord_decode_result['imgs']) == (len(
+            video_result['frame_inds']), 256, 340, 3)
+
+        # test Decord with 2 dim input and start_index = 0
+        video_result = copy.deepcopy(self.video_results)
+        video_result['frame_inds'] = np.arange(0, self.total_frames,
                                                3)[:, np.newaxis]
         decord_init = DecordInit()
         decord_init_result = decord_init(video_result)
         video_result['video_reader'] = decord_init_result['video_reader']
@@ -512,7 +608,7 @@ def test_opencv_init(self):
     def test_opencv_decode(self):
         target_keys = ['frame_inds', 'imgs', 'original_shape']

-        # test OpenCV with 2 dim input
+        # test OpenCV with 2 dim input when start_index = 0
         video_result = copy.deepcopy(self.video_results)
         video_result['frame_inds'] = np.arange(0, self.total_frames,
                                                2)[:, np.newaxis]
@@ -528,6 +624,29 @@
         assert np.shape(opencv_decode_result['imgs']) == (len(
             video_result['frame_inds']), 256, 340, 3)

+        # test OpenCV with 2 dim input
+        video_result = copy.deepcopy(self.video_results)
+        video_result['frame_inds'] = np.arange(1, self.total_frames,
+                                               2)[:, np.newaxis]
+        opencv_init = OpenCVInit()
+        opencv_init_result = opencv_init(video_result)
+        video_result['video_reader'] = opencv_init_result['video_reader']
+
+        opencv_decode = OpenCVDecode()
+        opencv_decode_result = opencv_decode(video_result)
+        assert self.check_keys_contain(opencv_decode_result.keys(),
+                                       target_keys)
+        assert opencv_decode_result['original_shape'] == (256, 340)
+        assert np.shape(opencv_decode_result['imgs']) == (len(
+            video_result['frame_inds']), 256, 340, 3)
+
+        # test OpenCV with 1 dim input when start_index = 0
+        video_result = copy.deepcopy(self.video_results)
+        video_result['frame_inds'] = np.arange(0, self.total_frames, 3)
+        opencv_init = OpenCVInit()
+        opencv_init_result = opencv_init(video_result)
+        video_result['video_reader'] = opencv_init_result['video_reader']
+
         # test OpenCV with 1 dim input
         video_result = copy.deepcopy(self.video_results)
         video_result['frame_inds'] = np.arange(1, self.total_frames, 3)
@@ -546,10 +665,37 @@
     def test_frame_selector(self):
         target_keys = ['frame_inds', 'imgs', 'original_shape', 'modality']

-        # test frame selector with 2 dim input
+        # test frame selector with 2 dim input when start_index = 0
         inputs = copy.deepcopy(self.frame_results)
         inputs['frame_inds'] = np.arange(0, self.total_frames, 2)[:,
                                                                   np.newaxis]
+        # since the test images start with index 1, we plus 1 to frame_inds
+        # in order to pass the CI
+        inputs['frame_inds'] = inputs['frame_inds'] + 1
+        frame_selector = FrameSelector(io_backend='disk')
+        results = frame_selector(inputs)
+        assert self.check_keys_contain(results.keys(), target_keys)
+        assert np.shape(results['imgs']) == (len(inputs['frame_inds']), 240,
+                                             320, 3)
+        assert results['original_shape'] == (240, 320)
+
+        # test frame selector with 2 dim input
+        inputs = copy.deepcopy(self.frame_results)
+        inputs['frame_inds'] = np.arange(1, self.total_frames, 2)[:,
+                                                                  np.newaxis]
+        frame_selector = FrameSelector(io_backend='disk')
+        results = frame_selector(inputs)
+        assert self.check_keys_contain(results.keys(), target_keys)
+        assert np.shape(results['imgs']) == (len(inputs['frame_inds']), 240,
+                                             320, 3)
+        assert results['original_shape'] == (240, 320)
+
+        # test frame selector with 1 dim input when start_index = 0
+        inputs = copy.deepcopy(self.frame_results)
+        inputs['frame_inds'] = np.arange(0, self.total_frames, 5)
+        # since the test images start with index 1, we plus 1 to frame_inds
+        # in order to pass the CI
+        inputs['frame_inds'] = inputs['frame_inds'] + 1
         frame_selector = FrameSelector(io_backend='disk')
         results = frame_selector(inputs)
         assert self.check_keys_contain(results.keys(), target_keys)
@@ -567,6 +713,19 @@
                                              320, 3)
         assert results['original_shape'] == (240, 320)

+        # test frame selector with 1 dim input when start_index = 0
+        inputs = copy.deepcopy(self.frame_results)
+        inputs['frame_inds'] = np.arange(0, self.total_frames, 2)
+        # since the test images start with index 1, we plus 1 to frame_inds
+        # in order to pass the CI
+        inputs['frame_inds'] = inputs['frame_inds'] + 1
+        frame_selector = FrameSelector(io_backend='disk')
+        results = frame_selector(inputs)
+        assert self.check_keys_contain(results.keys(), target_keys)
+        assert np.shape(results['imgs']) == (len(inputs['frame_inds']), 240,
+                                             320, 3)
+        assert results['original_shape'] == (240, 320)
+
         # test frame selector with 1 dim input
         inputs = copy.deepcopy(self.frame_results)
         inputs['frame_inds'] = np.arange(1, self.total_frames, 2)
@@ -577,6 +736,20 @@
                                              320, 3)
         assert results['original_shape'] == (240, 320)

+        # test frame selector with 1 dim input for flow images
+        # when start_index = 0
+        inputs = copy.deepcopy(self.flow_frame_results)
+        inputs['frame_inds'] = np.arange(0, self.total_frames, 2)
+        # since the test images start with index 1, we plus 1 to frame_inds
+        # in order to pass the CI
+        inputs['frame_inds'] = inputs['frame_inds'] + 1
+        frame_selector = FrameSelector(io_backend='disk')
+        results = frame_selector(inputs)
+        assert self.check_keys_contain(results.keys(), target_keys)
+        assert np.shape(results['imgs']) == (len(inputs['frame_inds']) * 2,
+                                             240, 320)
+        assert results['original_shape'] == (240, 320)
+
         # test frame selector with 1 dim input for flow images
         inputs = copy.deepcopy(self.flow_frame_results)
         inputs['frame_inds'] = np.arange(1, self.total_frames, 2)
@@ -587,6 +760,21 @@
                                              240, 320)
         assert results['original_shape'] == (240, 320)

+        # test frame selector in turbojpeg decording backend
+        # when start_index = 0
+        inputs = copy.deepcopy(self.frame_results)
+        inputs['frame_inds'] = np.arange(0, self.total_frames, 5)
+        # since the test images start with index 1, we plus 1 to frame_inds
+        # in order to pass the CI
+        inputs['frame_inds'] = inputs['frame_inds'] + 1
+        frame_selector = FrameSelector(
+            io_backend='disk', decoding_backend='turbojpeg')
+        results = frame_selector(inputs)
+        assert self.check_keys_contain(results.keys(), target_keys)
+        assert np.shape(results['imgs']) == (len(inputs['frame_inds']), 240,
+                                             320, 3)
+        assert results['original_shape'] == (240, 320)
+
         # test frame selector in turbojpeg decording backend
         inputs = copy.deepcopy(self.frame_results)
         inputs['frame_inds'] = np.arange(1, self.total_frames, 5)
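
Usage note (a minimal sketch, not part of the patch): with the new `start_index` argument, datasets whose extracted frames are numbered from 0 (e.g. `img_00000.jpg`) can set `start_index=0` in the `SampleFrames` step instead of relying on the removed `frame_idx == 0` workaround. The pipeline below is only illustrative; the `clip_len`/`frame_interval`/`num_clips` values and the `FrameSelector` settings are placeholder choices, not values taken from this diff.

    # Hypothetical pipeline config sketch: override the default
    # start_index=1 when frame filenames are zero-indexed.
    train_pipeline = [
        dict(
            type='SampleFrames',
            clip_len=8,
            frame_interval=2,
            num_clips=1,
            start_index=0),  # argument introduced by this change
        dict(type='FrameSelector', io_backend='disk'),
    ]

Datasets with the default 1-based filename template (`img_00001.jpg`, ...) need no config change, since `start_index` defaults to 1.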