diff --git a/mmaction/datasets/pipelines/loading.py b/mmaction/datasets/pipelines/loading.py
index a381c5165e..28c5d1fd75 100644
--- a/mmaction/datasets/pipelines/loading.py
+++ b/mmaction/datasets/pipelines/loading.py
@@ -23,6 +23,8 @@ class SampleFrames(object):
         frame_interval (int): Temporal interval of adjacent sampled frames.
             Default: 1.
         num_clips (int): Number of clips to be sampled. Default: 1.
+        start_index (int): Specify a start index for frames in consideration of
+            different filename format. Default: 1.
         temporal_jitter (bool): Whether to apply temporal jittering.
             Default: False.
         twice_sample (bool): Whether to use twice sample when testing.
@@ -39,6 +41,7 @@ def __init__(self,
                  clip_len,
                  frame_interval=1,
                  num_clips=1,
+                 start_index=1,
                  temporal_jitter=False,
                  twice_sample=False,
                  out_of_bound_opt='loop',
@@ -47,6 +50,7 @@ def __init__(self,
         self.clip_len = clip_len
         self.frame_interval = frame_interval
         self.num_clips = num_clips
+        self.start_index = start_index
         self.temporal_jitter = temporal_jitter
         self.twice_sample = twice_sample
         self.out_of_bound_opt = out_of_bound_opt
@@ -144,7 +148,6 @@ def __call__(self, results):
         else:
             total_frames = results['total_frames']

-        # TODO: index in different mode may be different
         clip_offsets = self._sample_clips(total_frames)
         frame_inds = clip_offsets[:, None] + np.arange(
             self.clip_len)[None, :] * self.frame_interval
@@ -166,7 +169,7 @@ def __call__(self, results):
                 frame_inds = new_inds
             else:
                 raise ValueError('Illegal out_of_bound option.')
-        frame_inds = np.concatenate(frame_inds)
+        frame_inds = np.concatenate(frame_inds) + self.start_index
         results['frame_inds'] = frame_inds.astype(np.int)
         results['clip_len'] = self.clip_len
         results['frame_interval'] = self.frame_interval
@@ -186,6 +189,8 @@ class DenseSampleFrames(SampleFrames):
         frame_interval (int): Temporal interval of adjacent sampled frames.
             Default: 1.
         num_clips (int): Number of clips to be sampled. Default: 1.
+        start_index (int): Specify a start index for frames in consideration of
+            different filename format. Default: 1.
         sample_range (int): Total sample range for dense sample.
             Default: 64.
         num_sample_positions (int): Number of sample start positions, Which is
@@ -200,6 +205,7 @@ def __init__(self,
                  clip_len,
                  frame_interval=1,
                  num_clips=1,
+                 start_index=1,
                  sample_range=64,
                  num_sample_positions=10,
                  temporal_jitter=False,
@@ -209,6 +215,7 @@ def __init__(self,
             clip_len,
             frame_interval,
             num_clips,
+            start_index,
             temporal_jitter,
             out_of_bound_opt=out_of_bound_opt,
             test_mode=test_mode)
@@ -585,10 +592,6 @@ def __call__(self, results):
             results['frame_inds'] = np.squeeze(results['frame_inds'])

         for frame_idx in results['frame_inds']:
-            # temporary solution for frame index offset.
-            # TODO: add offset attributes in datasets.
-            if frame_idx == 0:
-                frame_idx += 1
             if modality == 'RGB':
                 filepath = osp.join(directory, filename_tmpl.format(frame_idx))
                 img_bytes = self.file_client.get(filepath)
diff --git a/tests/data/test_imgs/img_00006.jpg b/tests/data/test_imgs/img_00006.jpg
new file mode 100644
index 0000000000..7f0fa6ca5c
Binary files /dev/null and b/tests/data/test_imgs/img_00006.jpg differ
diff --git a/tests/data/test_imgs/img_00007.jpg b/tests/data/test_imgs/img_00007.jpg
new file mode 100644
index 0000000000..2ebc51fe1b
Binary files /dev/null and b/tests/data/test_imgs/img_00007.jpg differ
diff --git a/tests/data/test_imgs/img_00008.jpg b/tests/data/test_imgs/img_00008.jpg
new file mode 100644
index 0000000000..f9747042fb
Binary files /dev/null and b/tests/data/test_imgs/img_00008.jpg differ
diff --git a/tests/data/test_imgs/img_00009.jpg b/tests/data/test_imgs/img_00009.jpg
new file mode 100644
index 0000000000..b4a74ebb0d
Binary files /dev/null and b/tests/data/test_imgs/img_00009.jpg differ
diff --git a/tests/data/test_imgs/img_00010.jpg b/tests/data/test_imgs/img_00010.jpg
new file mode 100644
index 0000000000..9944e62089
Binary files /dev/null and b/tests/data/test_imgs/img_00010.jpg differ
diff --git a/tests/test_loading.py b/tests/test_loading.py
index 53b5accccb..3e03787167 100644
--- a/tests/test_loading.py
+++ b/tests/test_loading.py
@@ -183,6 +183,28 @@ def check_monotonous(arr):
         assert len(sample_frames_results['frame_inds']) == 8
         sample_frames_results = sample_frames(frame_result)
         assert len(sample_frames_results['frame_inds']) == 8
+        assert_array_equal(sample_frames_results['frame_inds'],
+                           np.array([1, 2, 2, 3, 4, 5, 5, 6]))
+
+        # Sample Frame with no temporal_jitter to get clip_offsets
+        # clip_len=1, frame_interval=1, num_clips=8, start_index=0
+        video_result = copy.deepcopy(self.video_results)
+        frame_result = copy.deepcopy(self.frame_results)
+        frame_result['total_frames'] = 6
+        config = dict(
+            clip_len=1,
+            frame_interval=1,
+            num_clips=8,
+            start_index=0,
+            temporal_jitter=False,
+            test_mode=True)
+        sample_frames = SampleFrames(**config)
+        sample_frames_results = sample_frames(video_result)
+        assert self.check_keys_contain(sample_frames_results.keys(),
+                                       target_keys)
+        assert len(sample_frames_results['frame_inds']) == 8
+        sample_frames_results = sample_frames(frame_result)
+        assert len(sample_frames_results['frame_inds']) == 8
         assert_array_equal(sample_frames_results['frame_inds'],
                            np.array([0, 1, 1, 2, 3, 4, 4, 5]))

@@ -205,7 +227,7 @@ def check_monotonous(arr):
         sample_frames_results = sample_frames(frame_result)
         assert len(sample_frames_results['frame_inds']) == 6
         assert_array_equal(sample_frames_results['frame_inds'],
-                           [0, 1, 2, 3, 4, 0])
+                           [1, 2, 3, 4, 5, 1])

         # Sample Frame with no temporal_jitter to get avg_interval <= 0
         # clip_len=12, frame_interval=1, num_clips=20
@@ -245,7 +267,7 @@ def check_monotonous(arr):
         sample_frames_results = sample_frames(frame_result)
         assert len(sample_frames_results['frame_inds']) == 8
         assert_array_equal(sample_frames_results['frame_inds'],
-                           np.array([0, 1, 2, 2, 3, 4, 4, 5]))
+                           np.array([1, 2, 3, 3, 4, 5, 5, 6]))

         # Sample Frame with no temporal_jitter to get clip_offsets zero
         # clip_len=12, frame_interval=1, num_clips=2
@@ -411,7 +433,7 @@ def test_pyav_init(self):
     def test_pyav_decode(self):
         target_keys = ['frame_inds', 'imgs', 'original_shape']

-        # test PyAV with 2 dim input
+        # test PyAV with 2 dim input and start_index = 0
         video_result = copy.deepcopy(self.video_results)
         video_result['frame_inds'] = np.arange(0, self.total_frames,
                                                2)[:, np.newaxis]
@@ -419,6 +441,49 @@ def test_pyav_decode(self):
         pyav_init_result = pyav_init(video_result)
         video_result['video_reader'] = pyav_init_result['video_reader']

+        pyav_decode = PyAVDecode()
+        pyav_decode_result = pyav_decode(video_result)
+        assert self.check_keys_contain(pyav_decode_result.keys(), target_keys)
+        assert pyav_decode_result['original_shape'] == (256, 340)
+        assert np.shape(pyav_decode_result['imgs']) == (len(
+            video_result['frame_inds']), 256, 340, 3)
+
+        # test PyAV with 1 dim input and start_index = 0
+        video_result = copy.deepcopy(self.video_results)
+        video_result['frame_inds'] = np.arange(0, self.total_frames, 5)
+        pyav_init = PyAVInit()
+        pyav_init_result = pyav_init(video_result)
+        video_result['video_reader'] = pyav_init_result['video_reader']
+
+        pyav_decode = PyAVDecode()
+        pyav_decode_result = pyav_decode(video_result)
+        assert self.check_keys_contain(pyav_decode_result.keys(), target_keys)
+        assert pyav_decode_result['original_shape'] == (256, 340)
+        assert np.shape(pyav_decode_result['imgs']) == (len(
+            video_result['frame_inds']), 256, 340, 3)
+
+        # PyAV with multi thread and start_index = 0
+        video_result = copy.deepcopy(self.video_results)
+        video_result['frame_inds'] = np.arange(0, self.total_frames, 5)
+        pyav_init = PyAVInit()
+        pyav_init_result = pyav_init(video_result)
+        video_result['video_reader'] = pyav_init_result['video_reader']
+
+        pyav_decode = PyAVDecode(multi_thread=True)
+        pyav_decode_result = pyav_decode(video_result)
+        assert self.check_keys_contain(pyav_decode_result.keys(), target_keys)
+        assert pyav_decode_result['original_shape'] == (256, 340)
+        assert np.shape(pyav_decode_result['imgs']) == (len(
+            video_result['frame_inds']), 256, 340, 3)
+
+        # test PyAV with 2 dim input
+        video_result = copy.deepcopy(self.video_results)
+        video_result['frame_inds'] = np.arange(1, self.total_frames,
+                                               2)[:, np.newaxis]
+        pyav_init = PyAVInit()
+        pyav_init_result = pyav_init(video_result)
+        video_result['video_reader'] = pyav_init_result['video_reader']
+
         pyav_decode = PyAVDecode()
         pyav_decode_result = pyav_decode(video_result)
         assert self.check_keys_contain(pyav_decode_result.keys(), target_keys)
@@ -469,9 +534,40 @@ def test_decord_init(self):
     def test_decord_decode(self):
         target_keys = ['frame_inds', 'imgs', 'original_shape']
-        # test Decord with 2 dim input
+        # test Decord with 2 dim input and start_index = 0
         video_result = copy.deepcopy(self.video_results)
-        video_result['frame_inds'] = np.arange(1, self.total_frames,
+        video_result['frame_inds'] = np.arange(0, self.total_frames,
+                                               3)[:, np.newaxis]
+        decord_init = DecordInit()
+        decord_init_result = decord_init(video_result)
+        video_result['video_reader'] = decord_init_result['video_reader']
+
+        decord_decode = DecordDecode()
+        decord_decode_result = decord_decode(video_result)
+        assert self.check_keys_contain(decord_decode_result.keys(),
+                                       target_keys)
+        assert decord_decode_result['original_shape'] == (256, 340)
+        assert np.shape(decord_decode_result['imgs']) == (len(
+            video_result['frame_inds']), 256, 340, 3)
+
+        # test Decord with 1 dim input and start_index = 0
+        video_result = copy.deepcopy(self.video_results)
+        video_result['frame_inds'] = np.arange(0, self.total_frames, 3)
+        decord_init = DecordInit()
+        decord_init_result = decord_init(video_result)
+        video_result['video_reader'] = decord_init_result['video_reader']
+
+        decord_decode = DecordDecode()
+        decord_decode_result = decord_decode(video_result)
+        assert self.check_keys_contain(decord_decode_result.keys(),
+                                       target_keys)
+        assert decord_decode_result['original_shape'] == (256, 340)
+        assert np.shape(decord_decode_result['imgs']) == (len(
+            video_result['frame_inds']), 256, 340, 3)
+
+        # test Decord with 2 dim input and start_index = 0
+        video_result = copy.deepcopy(self.video_results)
+        video_result['frame_inds'] = np.arange(0, self.total_frames,
                                                3)[:, np.newaxis]
         decord_init = DecordInit()
         decord_init_result = decord_init(video_result)
         video_result['video_reader'] = decord_init_result['video_reader']
@@ -512,7 +608,7 @@ def test_opencv_init(self):
     def test_opencv_decode(self):
         target_keys = ['frame_inds', 'imgs', 'original_shape']

-        # test OpenCV with 2 dim input
+        # test OpenCV with 2 dim input when start_index = 0
         video_result = copy.deepcopy(self.video_results)
         video_result['frame_inds'] = np.arange(0, self.total_frames,
                                                2)[:, np.newaxis]
@@ -528,6 +624,29 @@
         assert np.shape(opencv_decode_result['imgs']) == (len(
             video_result['frame_inds']), 256, 340, 3)

+        # test OpenCV with 2 dim input
+        video_result = copy.deepcopy(self.video_results)
+        video_result['frame_inds'] = np.arange(1, self.total_frames,
+                                               2)[:, np.newaxis]
+        opencv_init = OpenCVInit()
+        opencv_init_result = opencv_init(video_result)
+        video_result['video_reader'] = opencv_init_result['video_reader']
+
+        opencv_decode = OpenCVDecode()
+        opencv_decode_result = opencv_decode(video_result)
+        assert self.check_keys_contain(opencv_decode_result.keys(),
+                                       target_keys)
+        assert opencv_decode_result['original_shape'] == (256, 340)
+        assert np.shape(opencv_decode_result['imgs']) == (len(
+            video_result['frame_inds']), 256, 340, 3)
+
+        # test OpenCV with 1 dim input when start_index = 0
+        video_result = copy.deepcopy(self.video_results)
+        video_result['frame_inds'] = np.arange(0, self.total_frames, 3)
+        opencv_init = OpenCVInit()
+        opencv_init_result = opencv_init(video_result)
+        video_result['video_reader'] = opencv_init_result['video_reader']
+
         # test OpenCV with 1 dim input
         video_result = copy.deepcopy(self.video_results)
         video_result['frame_inds'] = np.arange(1, self.total_frames, 3)
@@ -546,10 +665,37 @@
     def test_frame_selector(self):
         target_keys = ['frame_inds', 'imgs', 'original_shape', 'modality']

-        # test frame selector with 2 dim input
+        # test frame selector with 2 dim input when start_index = 0
         inputs = copy.deepcopy(self.frame_results)
         inputs['frame_inds'] = np.arange(0, self.total_frames, 2)[:,
                                                                   np.newaxis]
+        # since the test images start with index 1, we plus 1 to frame_inds
+        # in order to pass the CI
+        inputs['frame_inds'] = inputs['frame_inds'] + 1
+        frame_selector = FrameSelector(io_backend='disk')
+        results = frame_selector(inputs)
+        assert self.check_keys_contain(results.keys(), target_keys)
+        assert np.shape(results['imgs']) == (len(inputs['frame_inds']), 240,
+                                             320, 3)
+        assert results['original_shape'] == (240, 320)
+
+        # test frame selector with 2 dim input
+        inputs = copy.deepcopy(self.frame_results)
+        inputs['frame_inds'] = np.arange(1, self.total_frames, 2)[:,
+                                                                  np.newaxis]
+        frame_selector = FrameSelector(io_backend='disk')
+        results = frame_selector(inputs)
+        assert self.check_keys_contain(results.keys(), target_keys)
+        assert np.shape(results['imgs']) == (len(inputs['frame_inds']), 240,
+                                             320, 3)
+        assert results['original_shape'] == (240, 320)
+
+        # test frame selector with 1 dim input when start_index = 0
+        inputs = copy.deepcopy(self.frame_results)
+        inputs['frame_inds'] = np.arange(0, self.total_frames, 5)
+        # since the test images start with index 1, we plus 1 to frame_inds
+        # in order to pass the CI
+        inputs['frame_inds'] = inputs['frame_inds'] + 1
         frame_selector = FrameSelector(io_backend='disk')
         results = frame_selector(inputs)
         assert self.check_keys_contain(results.keys(), target_keys)
@@ -567,6 +713,19 @@
                                              320, 3)
         assert results['original_shape'] == (240, 320)

+        # test frame selector with 1 dim input when start_index = 0
+        inputs = copy.deepcopy(self.frame_results)
+        inputs['frame_inds'] = np.arange(0, self.total_frames, 2)
+        # since the test images start with index 1, we plus 1 to frame_inds
+        # in order to pass the CI
+        inputs['frame_inds'] = inputs['frame_inds'] + 1
+        frame_selector = FrameSelector(io_backend='disk')
+        results = frame_selector(inputs)
+        assert self.check_keys_contain(results.keys(), target_keys)
+        assert np.shape(results['imgs']) == (len(inputs['frame_inds']), 240,
+                                             320, 3)
+        assert results['original_shape'] == (240, 320)
+
         # test frame selector with 1 dim input
         inputs = copy.deepcopy(self.frame_results)
         inputs['frame_inds'] = np.arange(1, self.total_frames, 2)
@@ -577,6 +736,20 @@
                                              320, 3)
         assert results['original_shape'] == (240, 320)

+        # test frame selector with 1 dim input for flow images
+        # when start_index = 0
+        inputs = copy.deepcopy(self.flow_frame_results)
+        inputs['frame_inds'] = np.arange(0, self.total_frames, 2)
+        # since the test images start with index 1, we plus 1 to frame_inds
+        # in order to pass the CI
+        inputs['frame_inds'] = inputs['frame_inds'] + 1
+        frame_selector = FrameSelector(io_backend='disk')
+        results = frame_selector(inputs)
+        assert self.check_keys_contain(results.keys(), target_keys)
+        assert np.shape(results['imgs']) == (len(inputs['frame_inds']) * 2,
+                                             240, 320)
+        assert results['original_shape'] == (240, 320)
+
         # test frame selector with 1 dim input for flow images
         inputs = copy.deepcopy(self.flow_frame_results)
         inputs['frame_inds'] = np.arange(1, self.total_frames, 2)
@@ -587,6 +760,21 @@
                                              240, 320)
         assert results['original_shape'] == (240, 320)

+        # test frame selector in turbojpeg decording backend
+        # when start_index = 0
+        inputs = copy.deepcopy(self.frame_results)
+        inputs['frame_inds'] = np.arange(0, self.total_frames, 5)
+        # since the test images start with index 1, we plus 1 to frame_inds
+        # in order to pass the CI
+        inputs['frame_inds'] = inputs['frame_inds'] + 1
+        frame_selector = FrameSelector(
+            io_backend='disk', decoding_backend='turbojpeg')
+        results = frame_selector(inputs)
+        assert self.check_keys_contain(results.keys(), target_keys)
+        assert np.shape(results['imgs']) == (len(inputs['frame_inds']), 240,
+                                             320, 3)
+        assert results['original_shape'] == (240, 320)
+
         # test frame selector in turbojpeg decording backend
         inputs = copy.deepcopy(self.frame_results)
         inputs['frame_inds'] = np.arange(1, self.total_frames, 5)
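
Usage note (a minimal sketch, not part of the patch): with the new `start_index` argument, datasets whose extracted frames are numbered from 0 (e.g. `img_00000.jpg`) can set `start_index=0` in the `SampleFrames` step instead of relying on the removed `frame_idx == 0` workaround. The pipeline below is only illustrative; the `clip_len`/`frame_interval`/`num_clips` values and the `FrameSelector` settings are placeholder choices, not values taken from this diff.

    # Hypothetical pipeline config sketch: override the default
    # start_index=1 when frame filenames are zero-indexed.
    train_pipeline = [
        dict(
            type='SampleFrames',
            clip_len=8,
            frame_interval=2,
            num_clips=1,
            start_index=0),  # argument introduced by this change
        dict(type='FrameSelector', io_backend='disk'),
    ]

Datasets with the default 1-based filename template (`img_00001.jpg`, ...) need no config change, since `start_index` defaults to 1.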