diff --git a/colossalai/kernel/triton/__init__.py b/colossalai/kernel/triton/__init__.py
index 87ea9cf6536e..09f7a5592253 100644
--- a/colossalai/kernel/triton/__init__.py
+++ b/colossalai/kernel/triton/__init__.py
@@ -7,6 +7,7 @@
     from .copy_kv_cache_dest import copy_kv_cache_to_dest
     from .fused_layernorm import layer_norm
     from .gptq_triton import gptq_fused_linear_triton
+    from .int8_rotary_embedding_kernel import int8_rotary_embedding_fwd
     from .rms_norm import rmsnorm_forward
     from .rotary_embedding_kernel import rotary_embedding_fwd
     from .softmax import softmax
@@ -22,6 +23,7 @@
         "rotary_embedding_fwd",
         "token_attention_fwd",
         "gptq_fused_linear_triton",
+        "int8_rotary_embedding_fwd",
     ]

 except ImportError:
diff --git a/colossalai/kernel/triton/int8_rotary_embedding_kernel.py b/colossalai/kernel/triton/int8_rotary_embedding_kernel.py
new file mode 100644
index 000000000000..1e2c5c427954
--- /dev/null
+++ b/colossalai/kernel/triton/int8_rotary_embedding_kernel.py
@@ -0,0 +1,119 @@
+# Adapted from ModelTC https://github.com/ModelTC/lightllm
+import torch
+import triton
+import triton.language as tl
+
+
+@triton.jit
+def _rotary_kernel(
+    q,
+    input_scale,
+    output_scale,
+    Cos,
+    Sin,
+    q_bs_stride,
+    q_h_stride,
+    q_d_stride,
+    cos_bs_stride,
+    cos_d_stride,
+    total_len,
+    HEAD_NUM: tl.constexpr,
+    BLOCK_HEAD: tl.constexpr,
+    BLOCK_SEQ: tl.constexpr,
+    HEAD_DIM: tl.constexpr,
+):
+    current_head_index = tl.program_id(0)
+    current_seq_index = tl.program_id(1)
+
+    dim_range0 = tl.arange(0, HEAD_DIM // 2)
+    dim_range1 = tl.arange(HEAD_DIM // 2, HEAD_DIM)
+
+    current_head_range = current_head_index * BLOCK_HEAD + tl.arange(0, BLOCK_HEAD)
+    current_seq_range = current_seq_index * BLOCK_SEQ + tl.arange(0, BLOCK_SEQ)
+
+    off_q0 = (
+        current_seq_range[:, None, None] * q_bs_stride
+        + current_head_range[None, :, None] * q_h_stride
+        + dim_range0[None, None, :] * q_d_stride
+    )
+    off_q1 = (
+        current_seq_range[:, None, None] * q_bs_stride
+        + current_head_range[None, :, None] * q_h_stride
+        + dim_range1[None, None, :] * q_d_stride
+    )
+
+    off_dimcos_sin = current_seq_range[:, None, None] * cos_bs_stride + dim_range0[None, None, :] * cos_d_stride
+
+    q0 = tl.load(
+        q + off_q0,
+        mask=(current_seq_range[:, None, None] < total_len) & (current_head_range[None, :, None] < HEAD_NUM),
+        other=0.0,
+    )
+    q1 = tl.load(
+        q + off_q1,
+        mask=(current_seq_range[:, None, None] < total_len) & (current_head_range[None, :, None] < HEAD_NUM),
+        other=0.0,
+    )
+
+    cos = tl.load(Cos + off_dimcos_sin, mask=current_seq_range[:, None, None] < total_len, other=0.0)
+    sin = tl.load(Sin + off_dimcos_sin, mask=current_seq_range[:, None, None] < total_len, other=0.0)
+    in_scale = tl.load(input_scale)
+    o_scale = tl.load(output_scale)
+
+    q0 = q0.to(tl.float32) * in_scale
+    q1 = q1.to(tl.float32) * in_scale
+
+    out0 = (q0 * cos - q1 * sin) / o_scale
+    out1 = (q0 * sin + q1 * cos) / o_scale
+
+    # out0 = out0.to(tl.int8)
+    # out1 = out1.to(tl.int8)
+
+    tl.store(
+        q + off_q0,
+        out0,
+        mask=(current_seq_range[:, None, None] < total_len) & (current_head_range[None, :, None] < HEAD_NUM),
+    )
+    tl.store(
+        q + off_q1,
+        out1,
+        mask=(current_seq_range[:, None, None] < total_len) & (current_head_range[None, :, None] < HEAD_NUM),
+    )
+
+    return
+
+
+@torch.no_grad()
+def int8_rotary_embedding_fwd(q, cos, sin, input_scale, output_scale):
+    total_len = q.shape[0]
+    head_num = q.shape[1]
+    head_dim = q.shape[2]
+    assert q.shape[0] == cos.shape[0] and q.shape[0] == sin.shape[0], f"q shape {q.shape} cos shape {cos.shape}"
+    BLOCK_HEAD = 4
+    BLOCK_SEQ = 32
+    grid = (triton.cdiv(head_num, BLOCK_HEAD), triton.cdiv(total_len, BLOCK_SEQ))
+    if head_dim >= 128:
+        num_warps = 8
+    else:
+        num_warps = 4
+
+    _rotary_kernel[grid](
+        q,
+        input_scale,
+        output_scale,
+        cos,
+        sin,
+        q.stride(0),
+        q.stride(1),
+        q.stride(2),
+        cos.stride(0),
+        cos.stride(1),
+        total_len,
+        HEAD_NUM=head_num,
+        BLOCK_HEAD=BLOCK_HEAD,
+        BLOCK_SEQ=BLOCK_SEQ,
+        HEAD_DIM=head_dim,
+        num_warps=num_warps,
+        num_stages=1,
+    )
+    return
diff --git a/tests/test_smoothquant/test_rotary_embedding.py b/tests/test_smoothquant/test_rotary_embedding.py
new file mode 100644
index 000000000000..ee030065d66e
--- /dev/null
+++ b/tests/test_smoothquant/test_rotary_embedding.py
@@ -0,0 +1,59 @@
+# Adapted from ModelTC https://github.com/ModelTC/lightllm
+
+
+import pytest
+import torch
+from packaging import version
+
+try:
+    from colossalai.kernel.triton import int8_rotary_embedding_fwd
+
+    HAS_TRITON = True
+except ImportError:
+    HAS_TRITON = False
+    print("please install triton from https://github.com/openai/triton")
+
+TRITON_CUDA_SUPPORT = version.parse(torch.version.cuda) > version.parse("11.4")
+
+
+def torch_rotary_emb(x, cos, sin):
+    seq_len, h, dim = x.shape
+    x0 = x[:, :, 0 : dim // 2]
+    x1 = x[:, :, dim // 2 : dim]
+    cos = cos.view((seq_len, 1, dim // 2))
+    sin = sin.view((seq_len, 1, dim // 2))
+    o0 = x0 * cos - x1 * sin
+    o1 = x0 * sin + x1 * cos
+    return torch.cat((o0, o1), dim=-1)
+
+
+@pytest.mark.skipif(
+    not TRITON_CUDA_SUPPORT or not HAS_TRITON, reason="triton requires cuda version to be higher than 11.4"
+)
+def test_rotary_emb():
+    SEQ_LEN = 1
+    HEAD_NUM = 32
+    HEAD_DIM = 128
+    dtype = torch.float
+    # create data
+    x_shape = (SEQ_LEN, HEAD_NUM, HEAD_DIM)
+    x = -2.3 + 0.5 * torch.randn(x_shape, dtype=dtype, device="cuda")
+    cos_shape = (SEQ_LEN, HEAD_DIM // 2)
+    cos = -1.2 + 0.5 * torch.randn(cos_shape, dtype=dtype, device="cuda")
+    sin = -2.0 + 0.5 * torch.randn(cos_shape, dtype=dtype, device="cuda")
+    # forward pass
+    y_torch = torch_rotary_emb(x, cos, sin)
+
+    input_scale = torch.max(torch.abs(x)) / 127
+    output_scale = torch.max(torch.abs(y_torch)) / 127
+
+    x = x / input_scale
+    x = x.to(torch.int8)
+
+    int8_rotary_embedding_fwd(x, cos, sin, input_scale, output_scale)
+    y_triton = x.to(torch.float) * output_scale
+    assert torch.allclose(y_triton, y_torch, atol=2e-1, rtol=1e-2, equal_nan=True)
+
+
+if __name__ == "__main__":
+    test_rotary_emb()
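For reference, a minimal PyTorch sketch of what int8_rotary_embedding_fwd computes, assuming the shapes used in the test above (q: (total_len, head_num, head_dim) int8, cos/sin: (total_len, head_dim // 2) float, scalar scales). The helper name reference_int8_rotary and the explicit final cast are illustrative assumptions, not part of the change; the Triton kernel instead writes the rescaled values back into the int8 q buffer in place via tl.store's implicit cast.

# Illustrative sketch only, mirroring the dequantize -> rotate -> requantize math of _rotary_kernel.
import torch

def reference_int8_rotary(q_int8, cos, sin, input_scale, output_scale):
    # dequantize the int8 input with the per-tensor input scale
    q = q_int8.float() * input_scale
    d = q.shape[-1]
    q0, q1 = q[..., : d // 2], q[..., d // 2 :]
    cos = cos.view(cos.shape[0], 1, d // 2)
    sin = sin.view(sin.shape[0], 1, d // 2)
    # rotary transform in fp32, rescaled into the output quantization scale
    out0 = (q0 * cos - q1 * sin) / output_scale
    out1 = (q0 * sin + q1 * cos) / output_scale
    out = torch.cat((out0, out1), dim=-1)
    # the kernel relies on tl.store's float -> int8 cast; the rounding mode of
    # .to(torch.int8) here may differ slightly from that implicit cast
    return out.to(torch.int8)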