diff --git a/perception/lidar_centerpoint/CMakeLists.txt b/perception/lidar_centerpoint/CMakeLists.txt
index 6831d7a57017b..d7fce8647b7b1 100644
--- a/perception/lidar_centerpoint/CMakeLists.txt
+++ b/perception/lidar_centerpoint/CMakeLists.txt
@@ -52,23 +52,27 @@ else()
   set(TRT_AVAIL OFF)
 endif()
 
-option(TORCH_AVAIL "Torch available" OFF)
-if(CUDA_FOUND)
-  set(Torch_DIR /usr/local/libtorch/share/cmake/Torch)
-  find_package(Torch)
-  if(TORCH_FOUND)
-    if(CUDA_VERBOSE)
-      message(STATUS "TORCH_INCLUDE_DIRS: ${TORCH_INCLUDE_DIRS}")
-      message(STATUS "TORCH_LIBRARIES: ${TORCH_LIBRARIES}")
-    endif()
-    set(TORCH_AVAIL ON)
-  else()
-  message("Torch NOT FOUND")
-  set(TORCH_AVAIL OFF)
+# Detect cuDNN: CUDNN_AVAIL is switched ON only when find_library() locates the library.
+option(CUDNN_AVAIL "CUDNN available" OFF)
+file(TO_CMAKE_PATH "$ENV{LD_LIBRARY_PATH}" CUDNN_ENV_PATHS)  # native ':' list -> CMake ';' list
+find_library(CUDNN_LIBRARY
+  NAMES libcudnn.so${__cudnn_ver_suffix} libcudnn${__cudnn_ver_suffix}.dylib ${__cudnn_lib_win_name}
+  PATHS ${CUDNN_ENV_PATHS} ${__libpath_cudart} ${CUDNN_ROOT_DIR} ${PC_CUDNN_LIBRARY_DIRS} ${CMAKE_INSTALL_PREFIX}
+  PATH_SUFFIXES lib lib64 bin
+  DOC "CUDNN library."
+)
+if(CUDNN_LIBRARY)
+  if(CUDA_VERBOSE)
+    message(STATUS "CUDNN is available!")
+    message(STATUS "CUDNN_LIBRARY: ${CUDNN_LIBRARY}")
   endif()
+  set(CUDNN_AVAIL ON)
+else()
+  message("CUDNN is NOT Available")
+  set(CUDNN_AVAIL OFF)
 endif()
 
-if(TRT_AVAIL AND CUDA_AVAIL AND TORCH_AVAIL)
+if(TRT_AVAIL AND CUDA_AVAIL AND CUDNN_AVAIL)
 # Download trained models
   find_program(GDOWN_AVAIL "gdown")
   if(NOT GDOWN_AVAIL)
@@ -84,27 +88,29 @@ if(TRT_AVAIL AND CUDA_AVAIL AND TORCH_AVAIL)
     set(FILE_PATH ${DATA_PATH}/${FILE_NAME})
     if(EXISTS ${FILE_PATH})
       file(MD5 ${FILE_PATH} EXISTING_FILE_HASH)
-      if(NOT ${FILE_HASH} EQUAL ${EXISTING_FILE_HASH})
-        message(STATUS "... file hash changes. Downloading now ...")
-        execute_process(COMMAND gdown --quiet https://drive.google.com/uc?id=${GFILE_ID} -O ${FILE_PATH})
+      if("${FILE_HASH}" STREQUAL "${EXISTING_FILE_HASH}")  # MD5 digests are hex text, not numbers
+        message(STATUS "File already exists.")
+      else()
+        message(STATUS "File hash changes. Downloading now ...")
+        execute_process(COMMAND gdown --quiet https://drive.google.com/uc?id=${GFILE_ID} -O ${FILE_PATH})
+        # MD5 of the downloaded file is not recomputed (file(MD5) is disabled to pass ci).
+        message(STATUS "Downloaded file: ${FILE_PATH} (expected hash: ${FILE_HASH})")
       endif()
     else()
-      message(STATUS "... file doesn't exists. Downloading now ...")
-      execute_process(COMMAND gdown --quiet https://drive.google.com/uc?id=${GFILE_ID} -O ${FILE_PATH})
+      message(STATUS "File doesn't exists. Downloading now ...")
+      execute_process(COMMAND gdown --quiet https://drive.google.com/uc?id=${GFILE_ID} -O ${FILE_PATH})
+      # MD5 of the downloaded file is not recomputed (file(MD5) is disabled to pass ci).
+      message(STATUS "Downloaded file: ${FILE_PATH} (expected hash: ${FILE_HASH})")
     endif()
   endfunction()
 
   # default model
-  download(pts_voxel_encoder_default.onnx 1_8OCQmrPm_R4ZVh70QsS9HZo6uGrlbgz 01b860612e497591c4375d90dff61ef7)
-  download(pts_voxel_encoder_default.pt 1RZ7cuDnI-RBrDiWe-2vEs16mR_z0e9Uo 33136caa97e3bcef2cf3e04bbc93d1e4)
-  download(pts_backbone_neck_head_default.onnx 1UxDyt8T-TMJS7Ujx-1vbbqGRfDbMUZg2 e23a8ad4ea440f923e44dbe072b070da)
-  download(pts_backbone_neck_head_default.pt 1toAhmOriX8bwVI-ohuas9_2EBZnltoXh eb0df29b30acf9c1082ac4490af0bbc5)
+  download(pts_voxel_encoder_default.onnx 1KFhmA4oFT6CtZx5806QeMzn5H2tKa3oD 410f730c537968cb27fbd70c941849a8)
+  download(pts_backbone_neck_head_default.onnx 1iyk5VoQ4uNBGPZwypVZIMjSuSYAI1RxP e97c165c7877222c0e27e44409a07517)
 
   # aip_x2 model
-  download(pts_voxel_encoder_aip_x2.onnx 1x-NAHQ3W0lbLmjJlrL6Nhvdq8yz6Ux0n 65eeb95c5e48ebfe6894146cdb48c160)
-  download(pts_voxel_encoder_aip_x2.pt 1jzKopAhXWjnEgo_v8rtYy0hQIayUE-oL 4db81ce8edc6571aa0afb1ae43ee72e9)
-  download(pts_backbone_neck_head_aip_x2.onnx 1l2fdIQcBWr3-6stVoNkudnL4OZaPqmNT a33c8910fd9c9c910b10904d3cd96717)
-  download(pts_backbone_neck_head_aip_x2.pt 18iOAlRsjvcWoUG9KiL1PlD7OY5mi9BSw 274fdf1580dd899e36c050c1366f1883)
+  download(pts_voxel_encoder_aip_x2.onnx 13aYPRHx17Ge4BqxzW9drAUSWTppjtUV5 3ae5e9efd7b2ed12115e6f0b28cac58d)
+  download(pts_backbone_neck_head_aip_x2.onnx 14PJ_L3Jpz6Yi8GzoctVOEbGWcaCLArGp 6a406a19e05660677c162486ab332de8)
 
   find_package(ament_cmake_auto REQUIRED)
   ament_auto_find_build_dependencies()
@@ -112,25 +118,33 @@ if(TRT_AVAIL AND CUDA_AVAIL AND TORCH_AVAIL)
   include_directories(
     lib/include
     ${CUDA_INCLUDE_DIRS}
-    ${TORCH_INCLUDE_DIRS}
   )
 
   ### centerpoint ###
   ament_auto_add_library(centerpoint SHARED
+    lib/src/centerpoint_trt.cpp
     lib/src/pointcloud_densification.cpp
     lib/src/voxel_generator.cpp
-    lib/src/centerpoint_trt.cpp
     lib/src/tensorrt_wrapper.cpp
     lib/src/network_trt.cpp
+    lib/src/utils.cpp
+  )
+
+  cuda_add_library(centerpoint_cuda_libraries SHARED
+    lib/src/circle_nms_kernel.cu
+    lib/src/postprocess_kernel.cu
+    lib/src/preprocess_kernel.cu
+    lib/src/scatter_kernel.cu
   )
 
   target_link_libraries(centerpoint
     ${NVINFER}
     ${NVONNXPARSER}
-    ${NVINFER_PLUGIN}
     ${CUDA_LIBRARIES}
     ${CUBLAS_LIBRARIES}
-    ${TORCH_LIBRARIES}
+    ${CUDA_curand_LIBRARY}
+    ${CUDNN_LIBRARY}
+    centerpoint_cuda_libraries
   )
 
   ## node ##
diff --git a/perception/lidar_centerpoint/README.md b/perception/lidar_centerpoint/README.md
index 99acf6399d118..193ffa801659a 100644
--- a/perception/lidar_centerpoint/README.md
+++ b/perception/lidar_centerpoint/README.md
@@ -20,10 +20,9 @@ We trained the models using <https://github.com/open-mmlab/mmdetection3d>.
 
 ### Output
 
-| Name                               | Type                                                  | Description              |
-| ---------------------------------- | ----------------------------------------------------- | ------------------------ |
-| `~/output/objects`                 | `autoware_auto_perception_msgs::msg::DetectedObjects` | detected objects         |
-| `~/debug/pointcloud_densification` | `sensor_msgs::msg::PointCloud2`                       | densification pointcloud |
+| Name               | Type                                                  | Description      |
+| ------------------ | ----------------------------------------------------- | ---------------- |
+| `~/output/objects` | `autoware_auto_perception_msgs::msg::DetectedObjects` | detected objects |
 
 ## Parameters
 
@@ -34,30 +33,16 @@ We trained the models using <https://github.com/open-mmlab/mmdetection3d>.
 | `score_threshold`               | float  | `0.4`         | detected objects with score less than threshold are ignored |
 | `densification_world_frame_id`  | string | `map`         | the world frame id to fuse multi-frame pointcloud           |
 | `densification_num_past_frames` | int    | `1`           | the number of past frames to fuse with the current frame    |
-| `use_encoder_trt`               | bool   | `false`       | use TensorRT VoxelFeatureEncoder                            |
-| `use_head_trt`                  | bool   | `true`        | use TensorRT DetectionHead                                  |
 | `trt_precision`                 | string | `fp16`        | TensorRT inference precision: `fp32` or `fp16`              |
 | `encoder_onnx_path`             | string | `""`          | path to VoxelFeatureEncoder ONNX file                       |
 | `encoder_engine_path`           | string | `""`          | path to VoxelFeatureEncoder TensorRT Engine file            |
-| `encoder_pt_path`               | string | `""`          | path to VoxelFeatureEncoder TorchScript file                |
 | `head_onnx_path`                | string | `""`          | path to DetectionHead ONNX file                             |
 | `head_engine_path`              | string | `""`          | path to DetectionHead TensorRT Engine file                  |
-| `head_pt_path`                  | string | `""`          | path to DetectionHead TorchScript file                      |
 
 ## Assumptions / Known limits
 
 - The `object.existence_probability` is stored the value of classification confidence of a DNN, not probability.
 
-- If you have an error like `'GOMP_4.5' not found`, replace the OpenMP library in libtorch.
-
-  ```bash
-  sudo apt install libgomp1 -y
-  sudo rm /usr/local/libtorch/lib/libgomp-75eea7e8.so.1
-  sudo ln -s /usr/lib/x86_64-linux-gnu/libgomp.so.1 /usr/local/libtorch/lib/libgomp-75eea7e8.so.1
-  ```
-
-- if `use_encoder_trt` is set `use_encoder_trt`, more GPU memory is allocated.
-
 ## (Optional) Error detection and handling
 
 <!-- Write how to detect errors and how to recover from them.
@@ -92,15 +77,9 @@ Example:
 
 [5] <https://github.com/open-mmlab/OpenPCDet>
 
-[6] <https://github.com/poodarchu/Det3D>
-
-[7] <https://github.com/xingyizhou/CenterNet>
-
-[8] <https://github.com/lzccccc/SMOKE>
-
-[9] <https://github.com/yukkysaito/autoware_perception>
+[6] <https://github.com/yukkysaito/autoware_perception>
 
-[10] <https://github.com/pytorch/pytorch>
+[7] <https://github.com/NVIDIA-AI-IOT/CUDA-PointPillars>
 
 ## (Optional) Future extensions / Unimplemented parts
 
diff --git a/perception/lidar_centerpoint/include/lidar_centerpoint/node.hpp b/perception/lidar_centerpoint/include/lidar_centerpoint/node.hpp
index bf7a93cd555e9..686ba7c224567 100644
--- a/perception/lidar_centerpoint/include/lidar_centerpoint/node.hpp
+++ b/perception/lidar_centerpoint/include/lidar_centerpoint/node.hpp
@@ -1,4 +1,4 @@
-// Copyright 2021 Tier IV, Inc.
+// Copyright 2021 TIER IV, Inc.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -40,7 +40,8 @@ class LidarCenterPointNode : public rclcpp::Node
 
 private:
   void pointCloudCallback(const sensor_msgs::msg::PointCloud2::ConstSharedPtr input_pointcloud_msg);
-
+  void box3DToDetectedObject(
+    const Box3D & box3d, autoware_auto_perception_msgs::msg::DetectedObject & obj);
   static uint8_t getSemanticType(const std::string & class_name);
   static bool isCarLikeVehicleLabel(const uint8_t label);
 
@@ -49,20 +50,8 @@ class LidarCenterPointNode : public rclcpp::Node
 
   rclcpp::Subscription<sensor_msgs::msg::PointCloud2>::SharedPtr pointcloud_sub_;
   rclcpp::Publisher<autoware_auto_perception_msgs::msg::DetectedObjects>::SharedPtr objects_pub_;
-  rclcpp::Publisher<sensor_msgs::msg::PointCloud2>::SharedPtr pointcloud_pub_;
 
   float score_threshold_{0.0};
-  bool use_encoder_trt_{false};
-  bool use_head_trt_{false};
-  std::string trt_precision_;
-
-  std::string encoder_onnx_path_;
-  std::string encoder_engine_path_;
-  std::string encoder_pt_path_;
-  std::string head_onnx_path_;
-  std::string head_engine_path_;
-  std::string head_pt_path_;
-
   std::vector<std::string> class_names_;
   bool rename_car_to_truck_and_bus_{false};
 
diff --git a/perception/lidar_centerpoint/launch/lidar_centerpoint.launch.xml b/perception/lidar_centerpoint/launch/lidar_centerpoint.launch.xml
index d6fce7ef2d2fd..d049c8b00b2df 100644
--- a/perception/lidar_centerpoint/launch/lidar_centerpoint.launch.xml
+++ b/perception/lidar_centerpoint/launch/lidar_centerpoint.launch.xml
@@ -13,9 +13,9 @@
       <param name="score_threshold" value="0.45"/>
       <param name="densification_world_frame_id" value="map"/>
       <param name="densification_num_past_frames" value="1"/>
-      <param name="use_head_trt" value="true"/>
       <param name="trt_precision" value="fp16"/>
-      <param name="encoder_pt_path" value="$(var model_path)/pts_voxel_encoder_$(var model_name).pt"/>
+      <param name="encoder_onnx_path" value="$(var model_path)/pts_voxel_encoder_$(var model_name).onnx"/>
+      <param name="encoder_engine_path" value="$(var model_path)/pts_voxel_encoder_$(var model_name).engine"/>
       <param name="head_onnx_path" value="$(var model_path)/pts_backbone_neck_head_$(var model_name).onnx"/>
       <param name="head_engine_path" value="$(var model_path)/pts_backbone_neck_head_$(var model_name).engine"/>
       <param from="$(var model_param_path)"/>
diff --git a/perception/lidar_centerpoint/lib/include/centerpoint_trt.hpp b/perception/lidar_centerpoint/lib/include/centerpoint_trt.hpp
index c99c7356b8ae7..6225da6b3771e 100644
--- a/perception/lidar_centerpoint/lib/include/centerpoint_trt.hpp
+++ b/perception/lidar_centerpoint/lib/include/centerpoint_trt.hpp
@@ -1,4 +1,4 @@
-// Copyright 2021 Tier IV, Inc.
+// Copyright 2021 TIER IV, Inc.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -18,14 +18,13 @@
 #include <config.hpp>
 #include <cuda_utils.hpp>
 #include <network_trt.hpp>
-#include <tier4_autoware_utils/math/constants.hpp>
+#include <postprocess_kernel.hpp>
 #include <voxel_generator.hpp>
 
 #include <sensor_msgs/msg/point_cloud2.hpp>
 
 #include <pcl/point_cloud.h>
 #include <pcl/point_types.h>
-#include <torch/script.h>
 
 #include <memory>
 #include <string>
@@ -37,75 +36,72 @@ namespace centerpoint
 class NetworkParam
 {
 public:
-  NetworkParam(
-    std::string onnx_path, std::string engine_path, std::string pt_path, std::string trt_precision,
-    const bool use_trt)
+  NetworkParam(std::string onnx_path, std::string engine_path, std::string trt_precision)
   : onnx_path_(std::move(onnx_path)),
     engine_path_(std::move(engine_path)),
-    pt_path_(std::move(pt_path)),
-    trt_precision_(std::move(trt_precision)),
-    use_trt_(use_trt)
+    trt_precision_(std::move(trt_precision))
   {
   }
 
   std::string onnx_path() const { return onnx_path_; }
   std::string engine_path() const { return engine_path_; }
-  std::string pt_path() const { return pt_path_; }
   std::string trt_precision() const { return trt_precision_; }
-  bool use_trt() const { return use_trt_; }
 
 private:
   std::string onnx_path_;
   std::string engine_path_;
-  std::string pt_path_;
   std::string trt_precision_;
-  bool use_trt_;
 };
 
 class CenterPointTRT
 {
 public:
   explicit CenterPointTRT(
-    const int num_class, const NetworkParam & encoder_param, const NetworkParam & head_param,
-    const DensificationParam & densification_param);
+    const std::size_t num_class, const float score_threshold, const NetworkParam & encoder_param,
+    const NetworkParam & head_param, const DensificationParam & densification_param);
 
   ~CenterPointTRT();
 
-  std::vector<float> detect(
-    const sensor_msgs::msg::PointCloud2 &, const tf2_ros::Buffer & tf_buffer);
+  bool detect(
+    const sensor_msgs::msg::PointCloud2 & input_pointcloud_msg, const tf2_ros::Buffer & tf_buffer,
+    std::vector<Box3D> & det_boxes3d);
 
 private:
-  bool initPtr(bool use_encoder_trt, bool use_head_trt);
-
-  bool loadTorchScript(torch::jit::script::Module & module, const std::string & model_path);
-
-  static at::Tensor createInputFeatures(
-    const at::Tensor & voxels, const at::Tensor & coords, const at::Tensor & voxel_num_points);
-
-  static at::Tensor scatterPillarFeatures(
-    const at::Tensor & pillar_features, const at::Tensor & coordinates);
-
-  at::Tensor generatePredictedBoxes();
-
-  std::unique_ptr<VoxelGeneratorTemplate> vg_ptr_ = nullptr;
-  torch::jit::script::Module encoder_pt_;
-  torch::jit::script::Module head_pt_;
-  std::unique_ptr<VoxelEncoderTRT> encoder_trt_ptr_ = nullptr;
-  std::unique_ptr<HeadTRT> head_trt_ptr_ = nullptr;
-  c10::Device device_ = torch::kCUDA;
-  cudaStream_t stream_ = nullptr;
-
-  int num_class_{0};
-  at::Tensor voxels_t_;
-  at::Tensor coordinates_t_;
-  at::Tensor num_points_per_voxel_t_;
-  at::Tensor output_pillar_feature_t_;
-  at::Tensor output_heatmap_t_;
-  at::Tensor output_offset_t_;
-  at::Tensor output_z_t_;
-  at::Tensor output_dim_t_;
-  at::Tensor output_rot_t_;
-  at::Tensor output_vel_t_;
+  void initPtr();
+
+  bool preprocess(
+    const sensor_msgs::msg::PointCloud2 & input_pointcloud_msg, const tf2_ros::Buffer & tf_buffer);
+
+  void inference();
+
+  void postProcess(std::vector<Box3D> & det_boxes3d);
+
+  std::unique_ptr<VoxelGeneratorTemplate> vg_ptr_{nullptr};
+  std::unique_ptr<VoxelEncoderTRT> encoder_trt_ptr_{nullptr};
+  std::unique_ptr<HeadTRT> head_trt_ptr_{nullptr};
+  std::unique_ptr<PostProcessCUDA> post_proc_ptr_{nullptr};
+  cudaStream_t stream_{nullptr};
+
+  bool verbose_{false};
+  std::size_t num_class_{0};
+  std::size_t num_voxels_{0};
+  std::size_t encoder_in_feature_size_{0};
+  std::size_t spatial_features_size_{0};
+  std::vector<float> voxels_;
+  std::vector<int> coordinates_;
+  std::vector<float> num_points_per_voxel_;
+  cuda::unique_ptr<float[]> voxels_d_{nullptr};
+  cuda::unique_ptr<int[]> coordinates_d_{nullptr};
+  cuda::unique_ptr<float[]> num_points_per_voxel_d_{nullptr};
+  cuda::unique_ptr<float[]> encoder_in_features_d_{nullptr};
+  cuda::unique_ptr<float[]> pillar_features_d_{nullptr};
+  cuda::unique_ptr<float[]> spatial_features_d_{nullptr};
+  cuda::unique_ptr<float[]> head_out_heatmap_d_{nullptr};
+  cuda::unique_ptr<float[]> head_out_offset_d_{nullptr};
+  cuda::unique_ptr<float[]> head_out_z_d_{nullptr};
+  cuda::unique_ptr<float[]> head_out_dim_d_{nullptr};
+  cuda::unique_ptr<float[]> head_out_rot_d_{nullptr};
+  cuda::unique_ptr<float[]> head_out_vel_d_{nullptr};
 };
 
 }  // namespace centerpoint
diff --git a/perception/lidar_centerpoint/lib/include/circle_nms_kernel.hpp b/perception/lidar_centerpoint/lib/include/circle_nms_kernel.hpp
new file mode 100644
index 0000000000000..85886da8f4c23
--- /dev/null
+++ b/perception/lidar_centerpoint/lib/include/circle_nms_kernel.hpp
@@ -0,0 +1,32 @@
+// Copyright 2022 TIER IV, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef CIRCLE_NMS_KERNEL_HPP_
+#define CIRCLE_NMS_KERNEL_HPP_
+
+#include <utils.hpp>
+
+#include <thrust/device_vector.h>
+
+namespace centerpoint
+{
+// Non-maximum suppression (NMS) uses the distance on the xy plane instead of
+// intersection over union (IoU) to suppress overlapped objects.
+std::size_t circleNMS(
+  thrust::device_vector<Box3D> & boxes3d, const float distance_threshold,
+  thrust::device_vector<bool> & keep_mask, cudaStream_t stream);
+
+}  // namespace centerpoint
+
+#endif  // CIRCLE_NMS_KERNEL_HPP_
diff --git a/perception/lidar_centerpoint/lib/include/config.hpp b/perception/lidar_centerpoint/lib/include/config.hpp
index 24e443eb76175..c8f799527e697 100644
--- a/perception/lidar_centerpoint/lib/include/config.hpp
+++ b/perception/lidar_centerpoint/lib/include/config.hpp
@@ -1,4 +1,4 @@
-// Copyright 2021 Tier IV, Inc.
+// Copyright 2021 TIER IV, Inc.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -15,50 +15,50 @@
 #ifndef CONFIG_HPP_
 #define CONFIG_HPP_
 
+#include <cstddef>
+
 namespace centerpoint
 {
 class Config
 {
 public:
   // input params
-  constexpr static int num_point_dims = 3;      // x, y and z
-  constexpr static int num_point_features = 4;  // x, y, z and timelag
-  constexpr static int max_num_points_per_voxel = 32;
-  constexpr static int max_num_voxels = 40000;
-  constexpr static float pointcloud_range_xmin = -89.6f;
-  constexpr static float pointcloud_range_ymin = -89.6f;
-  constexpr static float pointcloud_range_zmin = -3.0f;
-  constexpr static float pointcloud_range_xmax = 89.6f;
-  constexpr static float pointcloud_range_ymax = 89.6f;
-  constexpr static float pointcloud_range_zmax = 5.0f;
+  constexpr static std::size_t point_dim_size = 3;      // x, y and z
+  constexpr static std::size_t point_feature_size = 4;  // x, y, z and timelag
+  constexpr static std::size_t box_feature_size = 9;    // x, y, z, l, w, h, rot, vel_x, vel_y
+  constexpr static std::size_t max_num_points_per_voxel = 32;
+  constexpr static std::size_t max_num_voxels = 40000;
+  constexpr static float range_min_x = -89.6f;
+  constexpr static float range_min_y = -89.6f;
+  constexpr static float range_min_z = -3.0f;
+  constexpr static float range_max_x = 89.6f;
+  constexpr static float range_max_y = 89.6f;
+  constexpr static float range_max_z = 5.0f;
   constexpr static float voxel_size_x = 0.32f;
   constexpr static float voxel_size_y = 0.32f;
   constexpr static float voxel_size_z = 8.0f;
-  // = (pointcloud_range_xmax - pointcloud_range_xmin) / voxel_size_x
-  constexpr static int grid_size_x = 560;
-  // = (pointcloud_range_ymax - pointcloud_range_ymin) / voxel_size_y
-  constexpr static int grid_size_y = 560;
-  // = (pointcloud_range_zmax - pointcloud_range_zmin) / voxel_size_z
-  constexpr static int grid_size_z = 1;
-  constexpr static float offset_x = -89.44;  // = pointcloud_range_xmin + voxel_size_x / 2
-  constexpr static float offset_y = -89.44;  // = pointcloud_range_ymin + voxel_size_y / 2
-  constexpr static float offset_z = 1.0f;    // = pointcloud_range_zmin + voxel_size_z / 2
-
-  // output params
-  constexpr static int num_box_features = 11;  // score, class, x, y, z, w, l, h, yaw, vel_x, vel_y
-  constexpr static int max_num_output_objects = 500;
 
   // network params
-  constexpr static int batch_size = 1;
-  constexpr static int downsample_factor = 2;
-  constexpr static int num_encoder_input_features = 8;
-  constexpr static int num_encoder_output_features = 32;
-  constexpr static int num_output_features = 6;
-  constexpr static int num_output_offset_features = 2;
-  constexpr static int num_output_z_features = 1;
-  constexpr static int num_output_dim_features = 3;
-  constexpr static int num_output_rot_features = 2;
-  constexpr static int num_output_vel_features = 2;
+  constexpr static std::size_t batch_size = 1;
+  constexpr static std::size_t downsample_factor = 2;
+  constexpr static std::size_t encoder_in_feature_size = 9;
+  constexpr static std::size_t encoder_out_feature_size = 32;
+  constexpr static std::size_t head_out_size = 6;
+  constexpr static std::size_t head_out_offset_size = 2;
+  constexpr static std::size_t head_out_z_size = 1;
+  constexpr static std::size_t head_out_dim_size = 3;
+  constexpr static std::size_t head_out_rot_size = 2;
+  constexpr static std::size_t head_out_vel_size = 2;
+
+  // calculated params
+  constexpr static std::size_t grid_size_x = (range_max_x - range_min_x) / voxel_size_x;
+  constexpr static std::size_t grid_size_y = (range_max_y - range_min_y) / voxel_size_y;
+  constexpr static std::size_t grid_size_z = (range_max_z - range_min_z) / voxel_size_z;
+  constexpr static float offset_x = range_min_x + voxel_size_x / 2;
+  constexpr static float offset_y = range_min_y + voxel_size_y / 2;
+  constexpr static float offset_z = range_min_z + voxel_size_z / 2;
+  constexpr static std::size_t down_grid_size_x = grid_size_x / downsample_factor;
+  constexpr static std::size_t down_grid_size_y = grid_size_y / downsample_factor;
 };
 
 }  // namespace centerpoint
diff --git a/perception/lidar_centerpoint/lib/include/cuda_utils.hpp b/perception/lidar_centerpoint/lib/include/cuda_utils.hpp
index eaaf68fc1db1b..df9a19ca3b2ef 100644
--- a/perception/lidar_centerpoint/lib/include/cuda_utils.hpp
+++ b/perception/lidar_centerpoint/lib/include/cuda_utils.hpp
@@ -1,4 +1,4 @@
-// Copyright 2020 Tier IV, Inc.
+// Copyright 2020 TIER IV, Inc.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
diff --git a/perception/lidar_centerpoint/lib/include/heatmap_utils.hpp b/perception/lidar_centerpoint/lib/include/heatmap_utils.hpp
deleted file mode 100644
index 3e9f2fd19784d..0000000000000
--- a/perception/lidar_centerpoint/lib/include/heatmap_utils.hpp
+++ /dev/null
@@ -1,113 +0,0 @@
-// Copyright 2021 Tier IV, Inc.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef HEATMAP_UTILS_HPP_
-#define HEATMAP_UTILS_HPP_
-
-#include <torch/script.h>
-
-#include <tuple>
-
-namespace centerpoint
-{
-at::Tensor sigmoid_hm(const at::Tensor & heatmap)
-{
-  // heatmap (float): (batch_size, num_class, H, W)
-
-  return torch::clamp(torch::sigmoid(heatmap), /*min=*/1e-6, /*max=*/1 - 1e-6);
-}
-
-at::Tensor nms_hm(const at::Tensor & heatmap, const int kernel_size = 3)
-{
-  // heatmap (float): (B, C, H, W)
-
-  at::Tensor heatmap_max = torch::max_pool2d(
-    heatmap, {kernel_size, kernel_size},
-    /*stride=*/{1}, /*padding=*/{(kernel_size - 1) / 2});
-  at::Tensor mask = heatmap_max == heatmap;
-  return heatmap * mask.to(heatmap.dtype());
-}
-
-at::Tensor gather_feature(const at::Tensor & feature, const at::Tensor & index)
-{
-  // feature (float): (batch_size, topk * num_class, 1)
-  // feature (int): (batch_size, topk)
-
-  int channel = feature.sizes()[2];
-  auto index_size = index.sizes();
-  at::Tensor _index = index.unsqueeze(-1).expand({index_size[0], index_size[1], channel});
-  at::Tensor _feature = feature.gather(1, _index);
-  return _feature;
-}
-
-std::tuple<at::Tensor, at::Tensor, at::Tensor, at::Tensor, at::Tensor> select_topk(
-  const at::Tensor & heatmap_pred, const int k)
-{
-  // heatmap_pred: (batch_size, num_class, H, W)
-
-  const auto dtype = heatmap_pred.dtype();
-  const int batch_size = heatmap_pred.sizes()[0];
-  const int cls = heatmap_pred.sizes()[1];
-  const int width = heatmap_pred.sizes()[3];
-
-  // first select topk scores in all classes and batches
-  // [B, C, H, W] -> [B, C, H*W]
-  at::Tensor _heatmap_pred = heatmap_pred.view({batch_size, cls, -1});
-
-  // both in [B, C, K]
-  auto topk_all_tuple = _heatmap_pred.topk(k);
-  at::Tensor topk_scores_all = std::get<0>(topk_all_tuple);
-  at::Tensor topk_inds_all = std::get<1>(topk_all_tuple);
-
-  at::Tensor topk_ys = topk_inds_all.to(dtype).floor_divide(width);
-  at::Tensor topk_xs = topk_inds_all.to(dtype).fmod(width);
-
-  // select topk examples across channels
-  // [B, C, K] -> [B, C*K]
-  topk_scores_all = topk_scores_all.view({batch_size, -1});
-
-  // Both in [N, K]
-  auto topk_tuple = topk_scores_all.topk(k);
-  at::Tensor topk_scores = std::get<0>(topk_tuple);
-  at::Tensor topk_inds = std::get<1>(topk_tuple);
-  at::Tensor topk_clses = topk_inds.to(dtype).floor_divide(k);
-
-  topk_inds_all = topk_inds_all.view({batch_size, -1, 1});
-  topk_ys = topk_ys.view({batch_size, -1, 1});
-  topk_xs = topk_xs.view({batch_size, -1, 1});
-
-  topk_inds_all = gather_feature(topk_inds_all, topk_inds).view({batch_size, k});
-  topk_ys = gather_feature(topk_ys, topk_inds).view({batch_size, k});
-  topk_xs = gather_feature(topk_xs, topk_inds).view({batch_size, k});
-
-  return std::make_tuple(topk_scores, topk_inds_all, topk_clses, topk_ys, topk_xs);
-}
-
-at::Tensor select_point_of_interest(const at::Tensor & index, const at::Tensor & feature_map)
-{
-  // index: (batch_size, N)
-  // feature_map: (batch_size, num_features, H, W)
-
-  const int batch_size = feature_map.sizes()[0];
-  const int channel = feature_map.sizes()[1];
-  at::Tensor _index = index.view({batch_size, -1, 1}).repeat({1, 1, channel});
-  at::Tensor _feature_map = feature_map.permute({0, 2, 3, 1}).contiguous();
-  _feature_map = _feature_map.view({batch_size, -1, channel});
-  _feature_map = _feature_map.gather(1, _index);
-  return _feature_map;
-}
-
-}  // namespace centerpoint
-
-#endif  // HEATMAP_UTILS_HPP_
diff --git a/perception/lidar_centerpoint/lib/include/network_trt.hpp b/perception/lidar_centerpoint/lib/include/network_trt.hpp
index 2a03c6f134409..369b0c40ae5de 100644
--- a/perception/lidar_centerpoint/lib/include/network_trt.hpp
+++ b/perception/lidar_centerpoint/lib/include/network_trt.hpp
@@ -1,4 +1,4 @@
-// Copyright 2021 Tier IV, Inc.
+// Copyright 2021 TIER IV, Inc.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -35,14 +35,14 @@ class HeadTRT : public TensorRTWrapper
 public:
   using TensorRTWrapper::TensorRTWrapper;
 
-  HeadTRT(const int num_class, const bool verbose);
+  HeadTRT(const std::size_t num_class, const bool verbose);
 
 protected:
   bool setProfile(
     nvinfer1::IBuilder & builder, nvinfer1::INetworkDefinition & network,
     nvinfer1::IBuilderConfig & config) override;
 
-  int num_class_{0};
+  std::size_t num_class_{0};
 };
 
 }  // namespace centerpoint
diff --git a/perception/lidar_centerpoint/lib/include/pointcloud_densification.hpp b/perception/lidar_centerpoint/lib/include/pointcloud_densification.hpp
index 885f16160bf98..21f5179076fc3 100644
--- a/perception/lidar_centerpoint/lib/include/pointcloud_densification.hpp
+++ b/perception/lidar_centerpoint/lib/include/pointcloud_densification.hpp
@@ -1,4 +1,4 @@
-// Copyright 2021 Tier IV, Inc.
+// Copyright 2021 TIER IV, Inc.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -53,7 +53,7 @@ class PointCloudDensification
 public:
   explicit PointCloudDensification(const DensificationParam & param);
 
-  void enqueuePointCloud(
+  bool enqueuePointCloud(
     const sensor_msgs::msg::PointCloud2 & input_pointcloud_msg, const tf2_ros::Buffer & tf_buffer);
 
   double getCurrentTimestamp() const { return current_timestamp_; }
diff --git a/perception/lidar_centerpoint/lib/include/postprocess_kernel.hpp b/perception/lidar_centerpoint/lib/include/postprocess_kernel.hpp
new file mode 100644
index 0000000000000..b4008fa12529b
--- /dev/null
+++ b/perception/lidar_centerpoint/lib/include/postprocess_kernel.hpp
@@ -0,0 +1,52 @@
+// Copyright 2022 TIER IV, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef POSTPROCESS_KERNEL_HPP_
+#define POSTPROCESS_KERNEL_HPP_
+
+#include <config.hpp>
+#include <utils.hpp>
+
+#include <cuda.h>
+#include <cuda_runtime_api.h>
+#include <thrust/device_vector.h>
+
+#include <vector>
+
+namespace centerpoint
+{
+class PostProcessCUDA
+{
+public:
+  explicit PostProcessCUDA(const std::size_t num_class, const float score_threshold);
+
+  cudaError_t generateDetectedBoxes3D_launch(
+    const float * out_heatmap, const float * out_offset, const float * out_z, const float * out_dim,
+    const float * out_rot, const float * out_vel, std::vector<Box3D> & det_boxes3d,
+    cudaStream_t stream);
+
+private:
+  cudaError_t generateBoxes3D_launch(
+    const float * out_heatmap, const float * out_offset, const float * out_z, const float * out_dim,
+    const float * out_rot, const float * out_vel, Box3D * det_boxes3d, cudaStream_t stream);
+
+  std::size_t num_class_{0};
+  float score_threshold_{0.0f};
+  float dist_threshold_{1.5f};  // TODO(yukke42): temporary value
+  thrust::device_vector<Box3D> boxes3d_d_;
+};
+
+}  // namespace centerpoint
+
+#endif  // POSTPROCESS_KERNEL_HPP_
diff --git a/perception/lidar_centerpoint/lib/include/preprocess_kernel.hpp b/perception/lidar_centerpoint/lib/include/preprocess_kernel.hpp
new file mode 100644
index 0000000000000..58a23c7cf712f
--- /dev/null
+++ b/perception/lidar_centerpoint/lib/include/preprocess_kernel.hpp
@@ -0,0 +1,29 @@
+// Copyright 2022 TIER IV, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef PREPROCESS_KERNEL_HPP_
+#define PREPROCESS_KERNEL_HPP_
+
+#include <cuda.h>
+#include <cuda_runtime_api.h>
+
+namespace centerpoint
+{
+// Generates the encoder input `features` from the voxel buffers on `stream`; returns CUDA status.
+cudaError_t generateFeatures_launch(
+  const float * voxel_features, const float * voxel_num_points, const int * coords,
+  const std::size_t num_voxels, float * features, cudaStream_t stream);
+}  // namespace centerpoint
+
+#endif  // PREPROCESS_KERNEL_HPP_
diff --git a/perception/lidar_centerpoint/lib/include/scatter_kernel.hpp b/perception/lidar_centerpoint/lib/include/scatter_kernel.hpp
new file mode 100644
index 0000000000000..c84c7044a2c77
--- /dev/null
+++ b/perception/lidar_centerpoint/lib/include/scatter_kernel.hpp
@@ -0,0 +1,29 @@
+// Copyright 2022 TIER IV, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef SCATTER_KERNEL_HPP_
+#define SCATTER_KERNEL_HPP_
+
+#include <cuda.h>
+#include <cuda_runtime_api.h>
+
+namespace centerpoint
+{
+// Scatters num_pillars pillar features into `scattered_features` using `coords`, on `stream`.
+cudaError_t scatterFeatures_launch(
+  const float * pillar_features, const int * coords, const std::size_t num_pillars,
+  float * scattered_features, cudaStream_t stream);
+}  // namespace centerpoint
+
+#endif  // SCATTER_KERNEL_HPP_
diff --git a/perception/lidar_centerpoint/lib/include/tensorrt_wrapper.hpp b/perception/lidar_centerpoint/lib/include/tensorrt_wrapper.hpp
index 9efa222d92388..dd49723a5265c 100644
--- a/perception/lidar_centerpoint/lib/include/tensorrt_wrapper.hpp
+++ b/perception/lidar_centerpoint/lib/include/tensorrt_wrapper.hpp
@@ -1,4 +1,4 @@
-// Copyright 2021 Tier IV, Inc.
+// Copyright 2021 TIER IV, Inc.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
diff --git a/perception/lidar_centerpoint/lib/include/utils.hpp b/perception/lidar_centerpoint/lib/include/utils.hpp
new file mode 100644
index 0000000000000..8aa35ff33cdc6
--- /dev/null
+++ b/perception/lidar_centerpoint/lib/include/utils.hpp
@@ -0,0 +1,41 @@
+// Copyright 2022 TIER IV, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef UTILS_HPP_
+#define UTILS_HPP_
+
+#include <cstddef>
+
+namespace centerpoint
+{
+struct Box3D
+{
+  // no default member initializers: Box3D is stored in __shared__ arrays, which do not allow them
+  int label;
+  float score;
+  float x;
+  float y;
+  float z;
+  float length;
+  float width;
+  float height;
+  float yaw;
+  float vel_x;
+  float vel_y;
+};
+// NOTE(review): presumably ceiling division of a by b (block counts) - confirm in the definition
+std::size_t divup(const std::size_t a, const std::size_t b);
+}  // namespace centerpoint
+
+#endif  // UTILS_HPP_
diff --git a/perception/lidar_centerpoint/lib/include/voxel_generator.hpp b/perception/lidar_centerpoint/lib/include/voxel_generator.hpp
index f6062bbb9a1e0..9b69411ce0611 100644
--- a/perception/lidar_centerpoint/lib/include/voxel_generator.hpp
+++ b/perception/lidar_centerpoint/lib/include/voxel_generator.hpp
@@ -1,4 +1,4 @@
-// Copyright 2021 Tier IV, Inc.
+// Copyright 2021 TIER IV, Inc.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -20,9 +20,8 @@
 
 #include <sensor_msgs/msg/point_cloud2.hpp>
 
-#include <torch/torch.h>
-
 #include <memory>
+#include <vector>
 
 namespace centerpoint
 {
@@ -31,15 +30,18 @@ class VoxelGeneratorTemplate
 public:
   explicit VoxelGeneratorTemplate(const DensificationParam & param);
 
-  virtual int pointsToVoxels(
-    at::Tensor & voxels, at::Tensor & coordinates, at::Tensor & num_points_per_voxel) = 0;
+  virtual std::size_t pointsToVoxels(
+    std::vector<float> & voxels, std::vector<int> & coordinates,
+    std::vector<float> & num_points_per_voxel) = 0;
 
-  std::unique_ptr<PointCloudDensification> pd_ptr_{nullptr};
+  bool enqueuePointCloud(
+    const sensor_msgs::msg::PointCloud2 & input_pointcloud_msg, const tf2_ros::Buffer & tf_buffer);
 
 protected:
-  std::array<float, 6> pointcloud_range_{
-    Config::pointcloud_range_xmin, Config::pointcloud_range_ymin, Config::pointcloud_range_zmin,
-    Config::pointcloud_range_xmax, Config::pointcloud_range_ymax, Config::pointcloud_range_zmax};
+  std::unique_ptr<PointCloudDensification> pd_ptr_{nullptr};
+
+  std::array<float, 6> range_{Config::range_min_x, Config::range_min_y, Config::range_min_z,
+                              Config::range_max_x, Config::range_max_y, Config::range_max_z};
   std::array<float, 3> recip_voxel_size_{
     1 / Config::voxel_size_x, 1 / Config::voxel_size_y, 1 / Config::voxel_size_z};
   std::array<int, 3> grid_size_{Config::grid_size_x, Config::grid_size_y, Config::grid_size_z};
@@ -50,10 +52,9 @@ class VoxelGenerator : public VoxelGeneratorTemplate
 public:
   using VoxelGeneratorTemplate::VoxelGeneratorTemplate;
 
-  // explicit VoxelGenerator(const DensificationParam & param);
-
-  int pointsToVoxels(
-    at::Tensor & voxels, at::Tensor & coordinates, at::Tensor & num_points_per_voxel) override;
+  std::size_t pointsToVoxels(
+    std::vector<float> & voxels, std::vector<int> & coordinates,
+    std::vector<float> & num_points_per_voxel) override;
 };
 
 }  // namespace centerpoint
diff --git a/perception/lidar_centerpoint/lib/src/centerpoint_trt.cpp b/perception/lidar_centerpoint/lib/src/centerpoint_trt.cpp
index 5e52cd62e5d28..898866acd8888 100644
--- a/perception/lidar_centerpoint/lib/src/centerpoint_trt.cpp
+++ b/perception/lidar_centerpoint/lib/src/centerpoint_trt.cpp
@@ -1,4 +1,4 @@
-// Copyright 2021 Tier IV, Inc.
+// Copyright 2021 TIER IV, Inc.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -13,14 +13,10 @@
 // limitations under the License.
 
 #include <centerpoint_trt.hpp>
-#include <heatmap_utils.hpp>
+#include <preprocess_kernel.hpp>
+#include <scatter_kernel.hpp>
 #include <tier4_autoware_utils/math/constants.hpp>
 
-#include <ATen/cuda/CUDAContext.h>
-#include <NvOnnxParser.h>
-#include <c10/cuda/CUDAStream.h>
-#include <torch/script.h>
-
 #include <iostream>
 #include <memory>
 #include <string>
@@ -28,38 +24,32 @@
 
 namespace centerpoint
 {
-using torch::indexing::Slice;
-
 CenterPointTRT::CenterPointTRT(
-  const int num_class, const NetworkParam & encoder_param, const NetworkParam & head_param,
-  const DensificationParam & densification_param)
+  const std::size_t num_class, const float score_threshold, const NetworkParam & encoder_param,
+  const NetworkParam & head_param, const DensificationParam & densification_param)
 : num_class_(num_class)
 {
   vg_ptr_ = std::make_unique<VoxelGenerator>(densification_param);
-  bool verbose = false;
-
-  if (encoder_param.use_trt()) {
-    encoder_trt_ptr_ = std::make_unique<VoxelEncoderTRT>(verbose);
-    encoder_trt_ptr_->init(
-      encoder_param.onnx_path(), encoder_param.engine_path(), encoder_param.trt_precision());
-  } else {
-    loadTorchScript(encoder_pt_, encoder_param.pt_path());
-  }
-
-  if (head_param.use_trt()) {
-    head_trt_ptr_ = std::make_unique<HeadTRT>(num_class, verbose);
-    head_trt_ptr_->init(
-      head_param.onnx_path(), head_param.engine_path(), head_param.trt_precision());
-    head_trt_ptr_->context_->setBindingDimensions(
-      0, nvinfer1::Dims4(
-           1, Config::num_encoder_output_features, Config::grid_size_y, Config::grid_size_x));
-  } else {
-    loadTorchScript(head_pt_, head_param.pt_path());
-  }
-
-  initPtr(encoder_param.use_trt(), head_param.use_trt());
-
-  torch::set_num_threads(1);  // disable CPU parallelization
+  post_proc_ptr_ = std::make_unique<PostProcessCUDA>(num_class, score_threshold);
+
+  // encoder
+  encoder_trt_ptr_ = std::make_unique<VoxelEncoderTRT>(verbose_);
+  encoder_trt_ptr_->init(
+    encoder_param.onnx_path(), encoder_param.engine_path(), encoder_param.trt_precision());
+  encoder_trt_ptr_->context_->setBindingDimensions(
+    0,
+    nvinfer1::Dims3(
+      Config::max_num_voxels, Config::max_num_points_per_voxel, Config::encoder_in_feature_size));
+
+  // head
+  head_trt_ptr_ = std::make_unique<HeadTRT>(num_class, verbose_);
+  head_trt_ptr_->init(head_param.onnx_path(), head_param.engine_path(), head_param.trt_precision());
+  head_trt_ptr_->context_->setBindingDimensions(
+    0, nvinfer1::Dims4(
+         Config::batch_size, Config::encoder_out_feature_size, Config::grid_size_y,
+         Config::grid_size_x));
+
+  initPtr();
 
   cudaStreamCreate(&stream_);
 }
@@ -72,235 +62,127 @@ CenterPointTRT::~CenterPointTRT()
   }
 }
 
-bool CenterPointTRT::initPtr(const bool use_encoder_trt, const bool use_head_trt)
+// Sizes the fixed-capacity host and device buffers from Config; called from the constructor.
+void CenterPointTRT::initPtr()
 {
-  if (use_encoder_trt) {
-    output_pillar_feature_t_ = torch::zeros(
-      {Config::max_num_voxels, Config::num_encoder_output_features},
-      torch::TensorOptions().device(device_).dtype(torch::kFloat));
-  }
-
-  if (use_head_trt) {
-    const int downsample_grid_x =
-      static_cast<int>(static_cast<float>(Config::grid_size_x) / Config::downsample_factor);
-    const int downsample_grid_y =
-      static_cast<int>(static_cast<float>(Config::grid_size_y) / Config::downsample_factor);
-    auto torch_options = torch::TensorOptions().device(device_).dtype(torch::kFloat);
-    output_heatmap_t_ = torch::zeros(
-      {Config::batch_size, num_class_, downsample_grid_y, downsample_grid_x}, torch_options);
-    output_offset_t_ = torch::zeros(
-      {Config::batch_size, Config::num_output_offset_features, downsample_grid_y,
-       downsample_grid_x},
-      torch_options);
-    output_z_t_ = torch::zeros(
-      {Config::batch_size, Config::num_output_z_features, downsample_grid_y, downsample_grid_x},
-      torch_options);
-    output_dim_t_ = torch::zeros(
-      {Config::batch_size, Config::num_output_dim_features, downsample_grid_y, downsample_grid_x},
-      torch_options);
-    output_rot_t_ = torch::zeros(
-      {Config::batch_size, Config::num_output_rot_features, downsample_grid_y, downsample_grid_x},
-      torch_options);
-    output_vel_t_ = torch::zeros(
-      {Config::batch_size, Config::num_output_vel_features, downsample_grid_y, downsample_grid_x},
-      torch_options);
-  }
-
-  return true;
+  const auto voxels_size =
+    Config::max_num_voxels * Config::max_num_points_per_voxel * Config::point_feature_size;
+  const auto coordinates_size = Config::max_num_voxels * Config::point_dim_size;
+  encoder_in_feature_size_ =
+    Config::max_num_voxels * Config::max_num_points_per_voxel * Config::encoder_in_feature_size;
+  const auto pillar_features_size = Config::max_num_voxels * Config::encoder_out_feature_size;
+  spatial_features_size_ =
+    Config::grid_size_x * Config::grid_size_y * Config::encoder_out_feature_size;
+  const auto grid_xy = Config::down_grid_size_x * Config::down_grid_size_y;
+  // host staging buffers, filled by the voxel generator in preprocess()
+  voxels_.resize(voxels_size);
+  coordinates_.resize(coordinates_size);
+  num_points_per_voxel_.resize(Config::max_num_voxels);
+
+  // device buffers; NOTE(review): head outputs omit Config::batch_size - fine only if it is 1
+  voxels_d_ = cuda::make_unique<float[]>(voxels_size);
+  coordinates_d_ = cuda::make_unique<int[]>(coordinates_size);
+  num_points_per_voxel_d_ = cuda::make_unique<float[]>(Config::max_num_voxels);
+  encoder_in_features_d_ = cuda::make_unique<float[]>(encoder_in_feature_size_);
+  pillar_features_d_ = cuda::make_unique<float[]>(pillar_features_size);
+  spatial_features_d_ = cuda::make_unique<float[]>(spatial_features_size_);
+  head_out_heatmap_d_ = cuda::make_unique<float[]>(grid_xy * num_class_);
+  head_out_offset_d_ = cuda::make_unique<float[]>(grid_xy * Config::head_out_offset_size);
+  head_out_z_d_ = cuda::make_unique<float[]>(grid_xy * Config::head_out_z_size);
+  head_out_dim_d_ = cuda::make_unique<float[]>(grid_xy * Config::head_out_dim_size);
+  head_out_rot_d_ = cuda::make_unique<float[]>(grid_xy * Config::head_out_rot_size);
+  head_out_vel_d_ = cuda::make_unique<float[]>(grid_xy * Config::head_out_vel_size);
 }
 
-std::vector<float> CenterPointTRT::detect(
-  const sensor_msgs::msg::PointCloud2 & input_pointcloud_msg, const tf2_ros::Buffer & tf_buffer)
+bool CenterPointTRT::detect(
+  const sensor_msgs::msg::PointCloud2 & input_pointcloud_msg, const tf2_ros::Buffer & tf_buffer,
+  std::vector<Box3D> & det_boxes3d)  // out: detections; returns false when preprocess() fails
 {
-  voxels_t_ = torch::zeros(
-    {Config::max_num_voxels, Config::max_num_points_per_voxel, Config::num_point_features},
-    torch::TensorOptions().device(torch::kCPU).dtype(torch::kFloat));
-  coordinates_t_ = torch::zeros(
-    {Config::max_num_voxels, Config::num_point_dims},
-    torch::TensorOptions().device(torch::kCPU).dtype(torch::kInt));
-  num_points_per_voxel_t_ = torch::zeros(
-    {Config::max_num_voxels}, torch::TensorOptions().device(torch::kCPU).dtype(torch::kInt));
-
-  vg_ptr_->pd_ptr_->enqueuePointCloud(input_pointcloud_msg, tf_buffer);
-  int num_voxels = vg_ptr_->pointsToVoxels(voxels_t_, coordinates_t_, num_points_per_voxel_t_);
-  // Note: unlike python implementation, no slicing by num_voxels
-  //       .s.t voxels_t_ = voxels_t_[:num_voxels].
-  //       w/ slicing more GPU memories are allocated
-
-  voxels_t_ = voxels_t_.to(device_);
-  coordinates_t_ = coordinates_t_.to(device_);
-  num_points_per_voxel_t_ = num_points_per_voxel_t_.to(device_);
-  at::Tensor input_features =
-    createInputFeatures(voxels_t_, coordinates_t_, num_points_per_voxel_t_);
-
-  // Note: num_voxels <= max_num_voxels, so input_features[num_voxels:] are invalid features.
-  input_features.index_put_({Slice(num_voxels)}, 0);
-
-  if (encoder_trt_ptr_ && encoder_trt_ptr_->context_) {
-    std::vector<void *> encoder_buffers{
-      input_features.data_ptr(), output_pillar_feature_t_.data_ptr()};
-    encoder_trt_ptr_->context_->setBindingDimensions(
-      0, nvinfer1::Dims3(
-           Config::max_num_voxels, Config::max_num_points_per_voxel,
-           Config::num_encoder_input_features));
-    encoder_trt_ptr_->context_->enqueueV2(encoder_buffers.data(), stream_, nullptr);
-  } else {
-    std::vector<torch::jit::IValue> batch_input_features;
-    batch_input_features.emplace_back(input_features);
-    batch_input_features.emplace_back(num_points_per_voxel_t_);
-    batch_input_features.emplace_back(coordinates_t_);
-    {
-      torch::NoGradGuard no_grad;
-      output_pillar_feature_t_ = encoder_pt_.forward(batch_input_features).toTensor();
-    }
+  std::fill(voxels_.begin(), voxels_.end(), 0);  // reset the host buffers reused every call
+  std::fill(coordinates_.begin(), coordinates_.end(), -1);
+  std::fill(num_points_per_voxel_.begin(), num_points_per_voxel_.end(), 0);
+  CHECK_CUDA_ERROR(cudaMemsetAsync(  // zero the device buffers reused every call
+    encoder_in_features_d_.get(), 0, encoder_in_feature_size_ * sizeof(float), stream_));
+  CHECK_CUDA_ERROR(
+    cudaMemsetAsync(spatial_features_d_.get(), 0, spatial_features_size_ * sizeof(float), stream_));
+
+  if (!preprocess(input_pointcloud_msg, tf_buffer)) {  // enqueue failed or no voxel produced
+    RCLCPP_WARN_STREAM(
+      rclcpp::get_logger("lidar_centerpoint"), "Fail to preprocess and skip to detect.");
+    return false;
   }
 
-  at::Tensor spatial_features =
-    scatterPillarFeatures(output_pillar_feature_t_, coordinates_t_.to(torch::kLong));
-
-  if (head_trt_ptr_ && head_trt_ptr_->context_) {
-    std::vector<void *> head_buffers = {spatial_features.data_ptr(), output_heatmap_t_.data_ptr(),
-                                        output_offset_t_.data_ptr(), output_z_t_.data_ptr(),
-                                        output_dim_t_.data_ptr(),    output_rot_t_.data_ptr(),
-                                        output_vel_t_.data_ptr()};
-    head_trt_ptr_->context_->enqueueV2(head_buffers.data(), stream_, nullptr);
-  } else {
-    std::vector<torch::jit::IValue> batch_spatial_features;
-    batch_spatial_features.emplace_back(spatial_features);
+  inference();  // encoder -> scatter -> head, enqueued on stream_
 
-    {
-      torch::NoGradGuard no_grad;
-      auto pred_arr = head_pt_.forward(batch_spatial_features).toTuple()->elements();
-      output_heatmap_t_ = pred_arr[0].toTensor();
-      output_offset_t_ = pred_arr[1].toTensor();
-      output_z_t_ = pred_arr[2].toTensor();
-      output_dim_t_ = pred_arr[3].toTensor();
-      output_rot_t_ = pred_arr[4].toTensor();
-      output_vel_t_ = pred_arr[5].toTensor();
-    }
-  }
+  postProcess(det_boxes3d);  // fills det_boxes3d from the head outputs
 
-  at::Tensor boxes3d = generatePredictedBoxes();
-  std::vector<float> boxes3d_vec =
-    std::vector<float>(boxes3d.data_ptr<float>(), boxes3d.data_ptr<float>() + boxes3d.numel());
-
-  return boxes3d_vec;
-}
-
-at::Tensor CenterPointTRT::createInputFeatures(
-  const at::Tensor & voxels, const at::Tensor & coords, const at::Tensor & voxel_num_points)
-{
-  // voxels (float): (num_pillars, num_max_points, num_point_features)
-  // coordinates (int): (num_pillars, num_point_dims)
-  // voxel_num_points (int): (num_pillars,)
-
-  at::Tensor coords_f = coords.to(torch::kFloat);
-  at::Tensor voxel_num_points_f = voxel_num_points.to(torch::kFloat);
-
-  at::Tensor points_mean =
-    voxels.slice(/*dim=*/2, /*start=*/0, /*end=*/3).sum({1}, /*keepdim=*/true) /
-    voxel_num_points_f.view({-1, 1, 1});
-  at::Tensor cluster = voxels.slice(2, 0, 3) - points_mean;
-
-  // Note: unlike python implementation, batch_index isn't used in coords,
-  at::Tensor center_x =
-    voxels.slice(2, 0, 1) -
-    (coords_f.slice(1, 2, 3).unsqueeze(2) * Config::voxel_size_x + Config::offset_x);
-  at::Tensor center_y =
-    voxels.slice(2, 1, 2) -
-    (coords_f.slice(1, 1, 2).unsqueeze(2) * Config::voxel_size_y + Config::offset_y);
-  at::Tensor input_features = torch::cat({voxels, cluster, center_x, center_y}, /*dim=*/2);
-
-  // paddings_indicator
-  const size_t axis = 0;
-  const int voxel_cnt = input_features.sizes()[1];
-  at::Tensor actual_num = voxel_num_points.unsqueeze(axis + 1);
-  at::Tensor max_num =
-    torch::arange(
-      voxel_cnt, torch::TensorOptions().dtype(torch::kInt32).device(actual_num.device()))
-      .view({1, -1});
-  at::Tensor mask = actual_num.to(torch::kInt32) > max_num;
-  mask = mask.unsqueeze(-1).to(torch::kFloat);
-  input_features *= mask;
-
-  return input_features;  // (num_pillars, num_max_points, num_voxel_features)
+  return true;  // also returned when no box is detected (postProcess only warns)
 }
 
-at::Tensor CenterPointTRT::scatterPillarFeatures(
-  const at::Tensor & pillar_features, const at::Tensor & coordinates)
+// Enqueues the cloud, voxelizes it on the host, uploads the filled voxels and launches feature
+// generation on stream_. Returns false if the enqueue fails or no voxel was produced.
+bool CenterPointTRT::preprocess(
+  const sensor_msgs::msg::PointCloud2 & input_pointcloud_msg, const tf2_ros::Buffer & tf_buffer)
 {
-  // pillar_features (float): (num_pillars, num_encoder_output_features)
-  // coordinates (float): (num_pillars, num_point_dims)
+  if (!vg_ptr_->enqueuePointCloud(input_pointcloud_msg, tf_buffer)) {
+    return false;
+  }
+  num_voxels_ = vg_ptr_->pointsToVoxels(voxels_, coordinates_, num_points_per_voxel_);
+  if (num_voxels_ == 0) {
+    return false;
+  }
 
-  at::Tensor spatial_feature = torch::zeros(
-    {Config::num_encoder_output_features, Config::grid_size_y * Config::grid_size_x},
-    torch::TensorOptions().dtype(pillar_features.dtype()).device(pillar_features.device()));
-  auto index = coordinates.select(1, 1) * Config::grid_size_x + coordinates.select(1, 2);
-  spatial_feature.index_put_({"...", index}, pillar_features.t());
+  const auto voxels_size =
+    num_voxels_ * Config::max_num_points_per_voxel * Config::point_feature_size;
+  const auto coordinates_size = num_voxels_ * Config::point_dim_size;
+  // copy only the num_voxels_ filled entries; issued on stream_ so the sync below covers them
+  CHECK_CUDA_ERROR(cudaMemcpyAsync(
+    voxels_d_.get(), voxels_.data(), voxels_size * sizeof(float), cudaMemcpyHostToDevice, stream_));
+  CHECK_CUDA_ERROR(cudaMemcpyAsync(
+    coordinates_d_.get(), coordinates_.data(), coordinates_size * sizeof(int),
+    cudaMemcpyHostToDevice, stream_));
+  CHECK_CUDA_ERROR(cudaMemcpyAsync(
+    num_points_per_voxel_d_.get(), num_points_per_voxel_.data(), num_voxels_ * sizeof(float),
+    cudaMemcpyHostToDevice, stream_));
+  CHECK_CUDA_ERROR(cudaStreamSynchronize(stream_));
+  CHECK_CUDA_ERROR(generateFeatures_launch(
+    voxels_d_.get(), num_points_per_voxel_d_.get(), coordinates_d_.get(), num_voxels_,
+    encoder_in_features_d_.get(), stream_));
 
-  return spatial_feature.view({1 /*batch size*/, -1, Config::grid_size_y, Config::grid_size_x})
-    .contiguous();
+  return true;
 }
 
-at::Tensor CenterPointTRT::generatePredictedBoxes()
+// Enqueues encoder, scatter and head on stream_; throws if either TensorRT context is null.
+void CenterPointTRT::inference()
 {
-  // output_heatmap (float): (batch_size, num_class, H, W)
-  // output_offset (float): (batch_size, num_offset_features, H, W)
-  // output_z (float): (batch_size, num_z_features, H, W)
-  // output_dim (float): (batch_size, num_dim_features, H, W)
-  // output_rot (float): (batch_size, num_rot_features, H, W)
-  // output_vel (float): (batch_size, num_vel_features, H, W)
-
-  at::Tensor heatmap_pred = output_heatmap_t_.clone();
-  heatmap_pred = sigmoid_hm(heatmap_pred);
-  heatmap_pred = nms_hm(heatmap_pred);
-
-  auto topk_tuple = select_topk(heatmap_pred, Config::max_num_output_objects);
-  at::Tensor scores = std::get<0>(topk_tuple);
-  at::Tensor index = std::get<1>(topk_tuple);
-  at::Tensor classes = std::get<2>(topk_tuple);
-  at::Tensor ys = std::get<3>(topk_tuple);
-  at::Tensor xs = std::get<4>(topk_tuple);
-
-  at::Tensor offset_poi = select_point_of_interest(index, output_offset_t_);
-  at::Tensor z_poi = select_point_of_interest(index, output_z_t_);
-  at::Tensor dim_poi = select_point_of_interest(index, output_dim_t_);
-  at::Tensor rot_poi = select_point_of_interest(index, output_rot_t_);
-  at::Tensor vel_poi = select_point_of_interest(index, output_vel_t_);
-
-  at::Tensor x = Config::voxel_size_x * Config::downsample_factor *
-                   (xs.view({1, -1, 1}) + offset_poi.slice(2, 0, 1)) +
-                 Config::pointcloud_range_xmin;
-  at::Tensor y = Config::voxel_size_y * Config::downsample_factor *
-                   (ys.view({1, -1, 1}) + offset_poi.slice(2, 1, 2)) +
-                 Config::pointcloud_range_ymin;
-  dim_poi = torch::exp(dim_poi);
-  at::Tensor rot = torch::atan2(rot_poi.slice(2, 0, 1), rot_poi.slice(2, 1, 2));
-  rot = -rot - tier4_autoware_utils::pi / 2;
-
-  at::Tensor boxes3d =
-    torch::cat(
-      {scores.view({1, -1, 1}), classes.view({1, -1, 1}), x, y, z_poi, dim_poi, rot, vel_poi},
-      /*dim=*/2)
-      .contiguous()
-      .to(torch::kCPU)
-      .to(torch::kFloat);
+  if (!encoder_trt_ptr_->context_ || !head_trt_ptr_->context_) {
+    throw std::runtime_error("Failed to create tensorrt context.");
+  }
 
-  return boxes3d;
+  // pillar encoder network: encoder_in_features_d_ -> pillar_features_d_
+  std::vector<void *> encoder_buffers{encoder_in_features_d_.get(), pillar_features_d_.get()};
+  encoder_trt_ptr_->context_->enqueueV2(encoder_buffers.data(), stream_, nullptr);
+  // scatter the num_voxels_ pillar features into spatial_features_d_ using their coordinates
+  CHECK_CUDA_ERROR(scatterFeatures_launch(
+    pillar_features_d_.get(), coordinates_d_.get(), num_voxels_, spatial_features_d_.get(),
+    stream_));
+
+  // head network (NOTE(review): the enqueueV2 return values here and above are not checked)
+  std::vector<void *> head_buffers = {spatial_features_d_.get(), head_out_heatmap_d_.get(),
+                                      head_out_offset_d_.get(),  head_out_z_d_.get(),
+                                      head_out_dim_d_.get(),     head_out_rot_d_.get(),
+                                      head_out_vel_d_.get()};
+  head_trt_ptr_->context_->enqueueV2(head_buffers.data(), stream_, nullptr);
 }
 
-bool CenterPointTRT::loadTorchScript(
-  torch::jit::script::Module & module, const std::string & model_path)
+void CenterPointTRT::postProcess(std::vector<Box3D> & det_boxes3d)  // out: detected boxes
 {
-  try {
-    module = torch::jit::load(model_path, device_);
-    module.eval();
-  } catch (const c10::Error & e) {
-    std::cout << "LOADING ERROR: " << e.msg() << std::endl;
-    return false;
+  CHECK_CUDA_ERROR(post_proc_ptr_->generateDetectedBoxes3D_launch(
+    head_out_heatmap_d_.get(), head_out_offset_d_.get(), head_out_z_d_.get(), head_out_dim_d_.get(),
+    head_out_rot_d_.get(), head_out_vel_d_.get(), det_boxes3d, stream_));
+  if (det_boxes3d.size() == 0) {  // an empty result is only logged, not treated as an error
+    RCLCPP_WARN_STREAM(rclcpp::get_logger("lidar_centerpoint"), "No detected boxes.");
   }
-  std::cout << "Loading from " << model_path << std::endl;
-  return true;
 }
 
 }  // namespace centerpoint
diff --git a/perception/lidar_centerpoint/lib/src/circle_nms_kernel.cu b/perception/lidar_centerpoint/lib/src/circle_nms_kernel.cu
new file mode 100644
index 0000000000000..1e0d5846d7563
--- /dev/null
+++ b/perception/lidar_centerpoint/lib/src/circle_nms_kernel.cu
@@ -0,0 +1,142 @@
+// Copyright 2022 TIER IV, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Modified from
+// https://github.com/open-mmlab/OpenPCDet/blob/master/pcdet/ops/iou3d_nms/src/iou3d_nms_kernel.cu
+
+/*
+3D IoU Calculation and Rotated NMS(modified from 2D NMS written by others)
+Written by Shaoshuai Shi
+All Rights Reserved 2019-2020.
+*/
+
+#include <circle_nms_kernel.hpp>
+#include <cuda_utils.hpp>
+
+#include <thrust/host_vector.h>
+
+namespace
+{
+const std::size_t THREADS_PER_BLOCK_NMS = 16;
+}  // namespace
+
+namespace centerpoint
+{
+// Squared distance between the (x, y) positions of two boxes.
+__device__ inline float dist2dPow(const Box3D * a, const Box3D * b)
+{
+  return powf(a->x - b->x, 2) + powf(a->y - b->y, 2);
+}
+
+// One thread block handles the tile pair (row tile blockIdx.y, column tile blockIdx.x), each
+// tile being THREADS_PER_BLOCK_NMS consecutive boxes. Tiles below the diagonal are skipped.
+// boxes: (N,) input. mask: (N, col_blocks) output; bit k of mask[r * col_blocks + c] is set
+// when box r is closer than the threshold to the k-th box of column tile c.
+__global__ void circleNMS_Kernel(
+  const Box3D * boxes, const std::size_t num_boxes3d, const std::size_t col_blocks,
+  const float dist2d_pow_threshold, std::uint64_t * mask)
+{
+  const auto row_tile = blockIdx.y;
+  const auto col_tile = blockIdx.x;
+  if (row_tile > col_tile) return;
+
+  // number of valid boxes in each tile (the last tile may be partially filled)
+  const std::size_t row_rest = num_boxes3d - row_tile * THREADS_PER_BLOCK_NMS;
+  const std::size_t col_rest = num_boxes3d - col_tile * THREADS_PER_BLOCK_NMS;
+  const std::size_t row_size = row_rest < THREADS_PER_BLOCK_NMS ? row_rest : THREADS_PER_BLOCK_NMS;
+  const std::size_t col_size = col_rest < THREADS_PER_BLOCK_NMS ? col_rest : THREADS_PER_BLOCK_NMS;
+
+  // stage the column tile in shared memory so every row thread can read it
+  __shared__ Box3D block_boxes[THREADS_PER_BLOCK_NMS];
+  if (threadIdx.x < col_size) {
+    block_boxes[threadIdx.x] = boxes[THREADS_PER_BLOCK_NMS * col_tile + threadIdx.x];
+  }
+  __syncthreads();
+
+  // threads past the end of the row tile have no box to compare
+  if (threadIdx.x >= row_size) return;
+
+  const std::size_t cur_box_idx = THREADS_PER_BLOCK_NMS * row_tile + threadIdx.x;
+  const Box3D * cur_box = boxes + cur_box_idx;
+
+  // on the diagonal tile compare only against later boxes, so no box marks itself
+  const std::size_t first = (row_tile == col_tile) ? threadIdx.x + 1 : 0;
+  std::uint64_t bits = 0;
+  for (std::size_t k = first; k < col_size; k++) {
+    const bool is_near = dist2dPow(cur_box, block_boxes + k) < dist2d_pow_threshold;
+    if (is_near) bits |= 1ULL << k;
+  }
+
+  mask[cur_box_idx * col_blocks + col_tile] = bits;
+}
+
+// Launches circleNMS_Kernel on `stream`; returns cudaGetLastError() right after the launch.
+cudaError_t circleNMS_launch(
+  const thrust::device_vector<Box3D> & boxes3d, const std::size_t num_boxes3d,
+  std::size_t col_blocks, const float distance_threshold,
+  thrust::device_vector<std::uint64_t> & mask, cudaStream_t stream)
+{
+  const float dist2d_pow_thres = powf(distance_threshold, 2);
+  // one thread block per (row tile, column tile) pair of THREADS_PER_BLOCK_NMS boxes
+  dim3 blocks(col_blocks, col_blocks);
+  dim3 threads(THREADS_PER_BLOCK_NMS);
+  circleNMS_Kernel<<<blocks, threads, 0, stream>>>(
+    thrust::raw_pointer_cast(boxes3d.data()), num_boxes3d, col_blocks, dist2d_pow_thres,
+    thrust::raw_pointer_cast(mask.data()));
+  return cudaGetLastError();
+}
+
+// Circle NMS in index order: box i survives unless an earlier survivor is closer than
+// distance_threshold in the x-y plane. keep_mask needs >= boxes3d.size() entries and receives
+// the per-box keep flags; returns the survivor count. Empty input returns 0, keep_mask untouched.
+std::size_t circleNMS(
+  thrust::device_vector<Box3D> & boxes3d, const float distance_threshold,
+  thrust::device_vector<bool> & keep_mask, cudaStream_t stream)
+{
+  const auto num_boxes3d = boxes3d.size();
+  if (num_boxes3d == 0) return 0;  // avoid launching the kernel with a zero-sized grid
+  const auto col_blocks = divup(num_boxes3d, THREADS_PER_BLOCK_NMS);
+  thrust::device_vector<std::uint64_t> mask_d(num_boxes3d * col_blocks);
+  CHECK_CUDA_ERROR(
+    circleNMS_launch(boxes3d, num_boxes3d, col_blocks, distance_threshold, mask_d, stream));
+
+  // thrust::copy is not issued on `stream`: wait for the kernel before reading its output
+  CHECK_CUDA_ERROR(cudaStreamSynchronize(stream));
+  thrust::host_vector<std::uint64_t> mask_h(mask_d.size());
+  thrust::copy(mask_d.begin(), mask_d.end(), mask_h.begin());
+
+  // remv_h accumulates, per column block, the bits of boxes suppressed by kept boxes
+  std::vector<std::uint64_t> remv_h(col_blocks);
+  thrust::host_vector<bool> keep_mask_h(keep_mask.size());
+  std::size_t num_to_keep = 0;
+  for (std::size_t i = 0; i < num_boxes3d; i++) {
+    auto nblock = i / THREADS_PER_BLOCK_NMS;
+    auto inblock = i % THREADS_PER_BLOCK_NMS;
+    if (!(remv_h[nblock] & (1ULL << inblock))) {
+      keep_mask_h[i] = true;
+      num_to_keep++;
+      std::uint64_t * p = &mask_h[0] + i * col_blocks;
+      for (std::size_t j = nblock; j < col_blocks; j++) {
+        remv_h[j] |= p[j];
+      }
+    } else {
+      keep_mask_h[i] = false;
+    }
+  }
+
+  keep_mask = keep_mask_h;  // memcpy host to device
+  return num_to_keep;
+}
+
+}  // namespace centerpoint
diff --git a/perception/lidar_centerpoint/lib/src/network_trt.cpp b/perception/lidar_centerpoint/lib/src/network_trt.cpp
index 3733c12d4f8e5..379809d9e44fb 100644
--- a/perception/lidar_centerpoint/lib/src/network_trt.cpp
+++ b/perception/lidar_centerpoint/lib/src/network_trt.cpp
@@ -1,4 +1,4 @@
-// Copyright 2021 Tier IV, Inc.
+// Copyright 2021 TIER IV, Inc.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -22,23 +22,24 @@ bool VoxelEncoderTRT::setProfile(
   nvinfer1::IBuilderConfig & config)
 {
   auto profile = builder.createOptimizationProfile();
-  auto input_name = network.getInput(0)->getName();
+  auto in_name = network.getInput(0)->getName();
   auto in_dims = nvinfer1::Dims3(
-    Config::max_num_voxels, Config::max_num_points_per_voxel, Config::num_encoder_input_features);
-  profile->setDimensions(input_name, nvinfer1::OptProfileSelector::kMIN, in_dims);
-  profile->setDimensions(input_name, nvinfer1::OptProfileSelector::kOPT, in_dims);
-  profile->setDimensions(input_name, nvinfer1::OptProfileSelector::kMAX, in_dims);
-  auto output_name = network.getOutput(0)->getName();
-  auto out_dims = nvinfer1::Dims2(Config::max_num_voxels, Config::num_encoder_output_features);
-  profile->setDimensions(output_name, nvinfer1::OptProfileSelector::kMIN, out_dims);
-  profile->setDimensions(output_name, nvinfer1::OptProfileSelector::kOPT, out_dims);
-  profile->setDimensions(output_name, nvinfer1::OptProfileSelector::kMAX, out_dims);
+    Config::max_num_voxels, Config::max_num_points_per_voxel, Config::encoder_in_feature_size);
+  profile->setDimensions(in_name, nvinfer1::OptProfileSelector::kMIN, in_dims);
+  profile->setDimensions(in_name, nvinfer1::OptProfileSelector::kOPT, in_dims);
+  profile->setDimensions(in_name, nvinfer1::OptProfileSelector::kMAX, in_dims);
+
+  auto out_name = network.getOutput(0)->getName();
+  auto out_dims = nvinfer1::Dims2(Config::max_num_voxels, Config::encoder_out_feature_size);
+  profile->setDimensions(out_name, nvinfer1::OptProfileSelector::kMIN, out_dims);
+  profile->setDimensions(out_name, nvinfer1::OptProfileSelector::kOPT, out_dims);
+  profile->setDimensions(out_name, nvinfer1::OptProfileSelector::kMAX, out_dims);
   config.addOptimizationProfile(profile);
 
   return true;
 }
 
-HeadTRT::HeadTRT(const int num_class, const bool verbose)
+HeadTRT::HeadTRT(const std::size_t num_class, const bool verbose)
 : TensorRTWrapper(verbose), num_class_(num_class)
 {
 }
@@ -48,33 +49,30 @@ bool HeadTRT::setProfile(
   nvinfer1::IBuilderConfig & config)
 {
   auto profile = builder.createOptimizationProfile();
-  auto input_name = network.getInput(0)->getName();
+  auto in_name = network.getInput(0)->getName();
   auto in_dims = nvinfer1::Dims4(
-    Config::batch_size, Config::num_encoder_output_features, Config::grid_size_y,
-    Config::grid_size_x);
-  profile->setDimensions(input_name, nvinfer1::OptProfileSelector::kMIN, in_dims);
-  profile->setDimensions(input_name, nvinfer1::OptProfileSelector::kOPT, in_dims);
-  profile->setDimensions(input_name, nvinfer1::OptProfileSelector::kMAX, in_dims);
+    Config::batch_size, Config::encoder_out_feature_size, Config::grid_size_y, Config::grid_size_x);
+  profile->setDimensions(in_name, nvinfer1::OptProfileSelector::kMIN, in_dims);
+  profile->setDimensions(in_name, nvinfer1::OptProfileSelector::kOPT, in_dims);
+  profile->setDimensions(in_name, nvinfer1::OptProfileSelector::kMAX, in_dims);
 
-  std::array<int, Config::num_output_features> output_channels = {
+  std::array<std::size_t, Config::head_out_size> output_channels = {
     num_class_,
-    Config::num_output_offset_features,
-    Config::num_output_z_features,
-    Config::num_output_dim_features,
-    Config::num_output_rot_features,
-    Config::num_output_vel_features};
-  const int downsample_grid_y =
-    static_cast<int>(static_cast<float>(Config::grid_size_y) / Config::downsample_factor);
-  const int downsample_grid_x =
-    static_cast<int>(static_cast<float>(Config::grid_size_x) / Config::downsample_factor);
-  for (int ci = 0; ci < Config::num_output_features; ci++) {
-    auto output_name = network.getOutput(ci)->getName();
-    auto out_dims = nvinfer1::Dims4(1, output_channels[ci], downsample_grid_y, downsample_grid_x);
-    profile->setDimensions(output_name, nvinfer1::OptProfileSelector::kMIN, out_dims);
-    profile->setDimensions(output_name, nvinfer1::OptProfileSelector::kOPT, out_dims);
-    profile->setDimensions(output_name, nvinfer1::OptProfileSelector::kMAX, out_dims);
+    Config::head_out_offset_size,
+    Config::head_out_z_size,
+    Config::head_out_dim_size,
+    Config::head_out_rot_size,
+    Config::head_out_vel_size};
+  for (std::size_t ci = 0; ci < Config::head_out_size; ci++) {
+    auto out_name = network.getOutput(ci)->getName();
+    auto out_dims = nvinfer1::Dims4(  // NCHW: grid rows (y) before columns (x), like in_dims
+      Config::batch_size, output_channels[ci], Config::down_grid_size_y, Config::down_grid_size_x);
+    profile->setDimensions(out_name, nvinfer1::OptProfileSelector::kMIN, out_dims);
+    profile->setDimensions(out_name, nvinfer1::OptProfileSelector::kOPT, out_dims);
+    profile->setDimensions(out_name, nvinfer1::OptProfileSelector::kMAX, out_dims);
   }
   config.addOptimizationProfile(profile);
+
   return true;
 }
 
diff --git a/perception/lidar_centerpoint/lib/src/pointcloud_densification.cpp b/perception/lidar_centerpoint/lib/src/pointcloud_densification.cpp
index e8cd32984f480..fad4f1e2a33d3 100644
--- a/perception/lidar_centerpoint/lib/src/pointcloud_densification.cpp
+++ b/perception/lidar_centerpoint/lib/src/pointcloud_densification.cpp
@@ -1,4 +1,4 @@
-// Copyright 2021 Tier IV, Inc.
+// Copyright 2021 TIER IV, Inc.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -55,7 +55,7 @@ PointCloudDensification::PointCloudDensification(const DensificationParam & para
 {
 }
 
-void PointCloudDensification::enqueuePointCloud(
+bool PointCloudDensification::enqueuePointCloud(
   const sensor_msgs::msg::PointCloud2 & pointcloud_msg, const tf2_ros::Buffer & tf_buffer)
 {
   const auto header = pointcloud_msg.header;
@@ -63,12 +63,14 @@ void PointCloudDensification::enqueuePointCloud(
   auto transform_world2current =
     getTransform(tf_buffer, header.frame_id, param_.world_frame_id(), header.stamp);
   if (!transform_world2current) {
-    return;
+    return false;
   }
   auto affine_world2current = transformToEigen(transform_world2current.get());
 
   enqueue(pointcloud_msg, affine_world2current);
   dequeue();
+
+  return true;
 }
 
 void PointCloudDensification::enqueue(
diff --git a/perception/lidar_centerpoint/lib/src/postprocess_kernel.cu b/perception/lidar_centerpoint/lib/src/postprocess_kernel.cu
new file mode 100644
index 0000000000000..8e01b7409d74f
--- /dev/null
+++ b/perception/lidar_centerpoint/lib/src/postprocess_kernel.cu
@@ -0,0 +1,159 @@
+// Copyright 2022 TIER IV, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <circle_nms_kernel.hpp>
+#include <postprocess_kernel.hpp>
+
+#include <thrust/count.h>
+#include <thrust/sort.h>
+
+namespace
+{
+const std::size_t THREADS_PER_BLOCK = 32;
+}  // namespace
+
+namespace centerpoint
+{
+
+struct is_score_greater  // unary predicate: true when a box's score is strictly above t_
+{
+  is_score_greater(float t) : t_(t) {}  // t: score threshold to compare against
+
+  __device__ bool operator()(const Box3D & b) { return b.score > t_; }
+
+private:
+  float t_{0.0};  // threshold; boxes with score == t_ do not pass
+};
+
+struct is_kept  // identity predicate over a boolean stencil value
+{
+  __device__ bool operator()(const bool keep) { return keep; }
+};
+
+struct score_greater  // comparator ordering boxes by descending score
+{
+  __device__ bool operator()(const Box3D & lb, const Box3D & rb) { return lb.score > rb.score; }
+};
+
+__device__ inline float sigmoid(float x) { return 1.0f / (1.0f + expf(-x)); }  // logistic, (0, 1)
+
+__global__ void generateBoxes3D_kernel(
+  const float * out_heatmap, const float * out_offset, const float * out_z, const float * out_dim,
+  const float * out_rot, const float * out_vel, const float voxel_size_x, const float voxel_size_y,
+  const float range_min_x, const float range_min_y, const std::size_t down_grid_size_x,
+  const std::size_t down_grid_size_y, const std::size_t downsample_factor, const int num_class,
+  Box3D * det_boxes3d)
+{
+  // generate one Box3D per cell of the downsampled grid from the outputs of the network.
+  // shape of out_*: (N, DOWN_GRID_SIZE_Y, DOWN_GRID_SIZE_X)
+  // heatmap: N = num_class, offset: N = 2, z: N = 1, dim: N = 3, rot: N = 2, vel: N = 2
+  const auto yi = blockIdx.x * THREADS_PER_BLOCK + threadIdx.x;  // grid row of this thread
+  const auto xi = blockIdx.y * THREADS_PER_BLOCK + threadIdx.y;  // grid column of this thread
+  const auto idx = down_grid_size_x * yi + xi;  // row-major cell index, also the output slot
+  const auto down_grid_size = down_grid_size_y * down_grid_size_x;  // stride between channels
+
+  if (yi >= down_grid_size_y || xi >= down_grid_size_x) {  // thread lies outside the grid
+    return;
+  }
+
+  int label = -1;
+  float max_score = -1;
+  for (int ci = 0; ci < num_class; ci++) {  // pick the class with the highest sigmoid score
+    float score = sigmoid(out_heatmap[down_grid_size * ci + idx]);
+    if (score > max_score) {
+      label = ci;
+      max_score = score;
+    }
+  }
+
+  const float offset_x = out_offset[down_grid_size * 0 + idx];
+  const float offset_y = out_offset[down_grid_size * 1 + idx];
+  const float x = voxel_size_x * downsample_factor * (xi + offset_x) + range_min_x;
+  const float y = voxel_size_y * downsample_factor * (yi + offset_y) + range_min_y;
+  const float z = out_z[idx];
+  const float w = out_dim[down_grid_size * 0 + idx];
+  const float l = out_dim[down_grid_size * 1 + idx];
+  const float h = out_dim[down_grid_size * 2 + idx];
+  const float yaw_sin = out_rot[down_grid_size * 0 + idx];
+  const float yaw_cos = out_rot[down_grid_size * 1 + idx];
+  const float vel_x = out_vel[down_grid_size * 0 + idx];
+  const float vel_y = out_vel[down_grid_size * 1 + idx];
+
+  det_boxes3d[idx].label = label;
+  det_boxes3d[idx].score = max_score;
+  det_boxes3d[idx].x = x;
+  det_boxes3d[idx].y = y;
+  det_boxes3d[idx].z = z;
+  det_boxes3d[idx].length = expf(l);  // dim channels go through expf (presumably log-sizes)
+  det_boxes3d[idx].width = expf(w);
+  det_boxes3d[idx].height = expf(h);
+  det_boxes3d[idx].yaw = atan2f(yaw_sin, yaw_cos);
+  det_boxes3d[idx].vel_x = vel_x;
+  det_boxes3d[idx].vel_y = vel_y;
+}
+
+PostProcessCUDA::PostProcessCUDA(const std::size_t num_class, const float score_threshold)
+: num_class_(num_class), score_threshold_(score_threshold)
+{
+  const auto num_raw_boxes3d = Config::down_grid_size_y * Config::down_grid_size_x;
+  boxes3d_d_ = thrust::device_vector<Box3D>(num_raw_boxes3d);  // one slot per downsampled cell
+}
+
+cudaError_t PostProcessCUDA::generateDetectedBoxes3D_launch(
+  const float * out_heatmap, const float * out_offset, const float * out_z, const float * out_dim,
+  const float * out_rot, const float * out_vel, std::vector<Box3D> & det_boxes3d,
+  cudaStream_t stream)
+{
+  dim3 blocks(
+    divup(Config::down_grid_size_y, THREADS_PER_BLOCK),
+    divup(Config::down_grid_size_x, THREADS_PER_BLOCK));
+  dim3 threads(THREADS_PER_BLOCK, THREADS_PER_BLOCK);
+  generateBoxes3D_kernel<<<blocks, threads, 0, stream>>>(
+    out_heatmap, out_offset, out_z, out_dim, out_rot, out_vel, Config::voxel_size_x,
+    Config::voxel_size_y, Config::range_min_x, Config::range_min_y, Config::down_grid_size_x,
+    Config::down_grid_size_y, Config::downsample_factor, num_class_,
+    thrust::raw_pointer_cast(boxes3d_d_.data()));
+
+  // suppress by score: keep only boxes whose score is above score_threshold_
+  const auto num_det_boxes3d = thrust::count_if(  // NOTE(review): not given `stream`; confirm
+    thrust::device, boxes3d_d_.begin(), boxes3d_d_.end(), is_score_greater(score_threshold_));
+  if (num_det_boxes3d == 0) {
+    return cudaGetLastError();  // nothing passed; det_boxes3d is left untouched
+  }
+  thrust::device_vector<Box3D> det_boxes3d_d(num_det_boxes3d);
+  thrust::copy_if(
+    thrust::device, boxes3d_d_.begin(), boxes3d_d_.end(), det_boxes3d_d.begin(),
+    is_score_greater(score_threshold_));
+
+  // sort by score (descending), so NMS visits higher-scored boxes first
+  thrust::sort(det_boxes3d_d.begin(), det_boxes3d_d.end(), score_greater());
+
+  // suppress by NMS
+  thrust::device_vector<bool> final_keep_mask_d(num_det_boxes3d);
+  const auto num_final_det_boxes3d =
+    circleNMS(det_boxes3d_d, dist_threshold_, final_keep_mask_d, stream);
+
+  thrust::device_vector<Box3D> final_det_boxes3d_d(num_final_det_boxes3d);
+  thrust::copy_if(
+    thrust::device, det_boxes3d_d.begin(), det_boxes3d_d.end(), final_keep_mask_d.begin(),
+    final_det_boxes3d_d.begin(), is_kept());
+
+  // memcpy device to host
+  det_boxes3d.resize(num_final_det_boxes3d);
+  thrust::copy(final_det_boxes3d_d.begin(), final_det_boxes3d_d.end(), det_boxes3d.begin());
+
+  return cudaGetLastError();
+}
+
+}  // namespace centerpoint
diff --git a/perception/lidar_centerpoint/lib/src/preprocess_kernel.cu b/perception/lidar_centerpoint/lib/src/preprocess_kernel.cu
new file mode 100644
index 0000000000000..d14d7eb9cd4ad
--- /dev/null
+++ b/perception/lidar_centerpoint/lib/src/preprocess_kernel.cu
@@ -0,0 +1,157 @@
+// Copyright 2022 TIER IV, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2021 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <config.hpp>
+#include <preprocess_kernel.hpp>
+#include <utils.hpp>
+
+namespace
+{
+const std::size_t WARP_SIZE = 32;
+const std::size_t WARPS_PER_BLOCK = 4;
+const std::size_t FEATURE_SIZE = 9;  // same as `box_feature_size` in config.hpp
+
+}  // namespace
+
+namespace centerpoint
+{
+__global__ void generateFeatures_kernel(
+  const float * voxel_features, const float * voxel_num_points, const int * coords,
+  const std::size_t num_voxels, const float voxel_x, const float voxel_y, const float voxel_z,
+  const float range_min_x, const float range_min_y, const float range_min_z, float * features)
+{
+  // voxel_features (float): (max_num_voxels, max_num_points_per_voxel, point_feature_size)
+  // voxel_num_points (float): (max_num_voxels)
+  // coords (int): (max_num_voxels, point_dim_size)
+  int pillar_idx = blockIdx.x * WARPS_PER_BLOCK + threadIdx.x / WARP_SIZE;  // pillar of this thread
+  int point_idx = threadIdx.x % WARP_SIZE;  // point slot inside the pillar
+  int pillar_idx_inBlock = threadIdx.x / WARP_SIZE;
+
+  if (pillar_idx >= num_voxels) return;  // NOTE(review): exits before the __syncthreads() below
+
+  // load src
+  __shared__ float4 pillarSM[WARPS_PER_BLOCK][WARP_SIZE];
+  __shared__ float3 pillarSumSM[WARPS_PER_BLOCK];
+  __shared__ int3 cordsSM[WARPS_PER_BLOCK];
+  __shared__ int pointsNumSM[WARPS_PER_BLOCK];
+  __shared__ float pillarOutSM[WARPS_PER_BLOCK][WARP_SIZE][FEATURE_SIZE];
+
+  if (threadIdx.x < WARPS_PER_BLOCK) {  // first WARPS_PER_BLOCK threads load per-pillar metadata
+    pointsNumSM[threadIdx.x] = voxel_num_points[blockIdx.x * WARPS_PER_BLOCK + threadIdx.x];
+    cordsSM[threadIdx.x] = ((int3 *)coords)[blockIdx.x * WARPS_PER_BLOCK + threadIdx.x];
+    pillarSumSM[threadIdx.x] = {0, 0, 0};
+  }
+
+  pillarSM[pillar_idx_inBlock][point_idx] =
+    ((float4 *)voxel_features)[pillar_idx * WARP_SIZE + point_idx];
+  __syncthreads();
+
+  // accumulate the xyz sum over the valid points of each pillar
+  if (point_idx < pointsNumSM[pillar_idx_inBlock]) {
+    atomicAdd(&(pillarSumSM[pillar_idx_inBlock].x), pillarSM[pillar_idx_inBlock][point_idx].x);
+    atomicAdd(&(pillarSumSM[pillar_idx_inBlock].y), pillarSM[pillar_idx_inBlock][point_idx].y);
+    atomicAdd(&(pillarSumSM[pillar_idx_inBlock].z), pillarSM[pillar_idx_inBlock][point_idx].z);
+  }
+  __syncthreads();
+
+  // feature-mean: the point's offset from the mean of the pillar's valid points
+  float3 mean;
+  float validPoints = pointsNumSM[pillar_idx_inBlock];
+  mean.x = pillarSumSM[pillar_idx_inBlock].x / validPoints;
+  mean.y = pillarSumSM[pillar_idx_inBlock].y / validPoints;
+  mean.z = pillarSumSM[pillar_idx_inBlock].z / validPoints;
+
+  mean.x = pillarSM[pillar_idx_inBlock][point_idx].x - mean.x;
+  mean.y = pillarSM[pillar_idx_inBlock][point_idx].y - mean.y;
+  mean.z = pillarSM[pillar_idx_inBlock][point_idx].z - mean.z;
+
+  // calculate the voxel center; coords are read as (z, y, x)
+  float x_offset = voxel_x / 2 + cordsSM[pillar_idx_inBlock].z * voxel_x + range_min_x;
+  float y_offset = voxel_y / 2 + cordsSM[pillar_idx_inBlock].y * voxel_y + range_min_y;
+  float z_offset = voxel_z / 2 + cordsSM[pillar_idx_inBlock].x * voxel_z + range_min_z;
+
+  // feature-offset: the point's offset from the voxel center (center.z is not stored below)
+  float3 center;
+  center.x = pillarSM[pillar_idx_inBlock][point_idx].x - x_offset;
+  center.y = pillarSM[pillar_idx_inBlock][point_idx].y - y_offset;
+  center.z = pillarSM[pillar_idx_inBlock][point_idx].z - z_offset;
+
+  // store output: FEATURE_SIZE values for valid points, zeros for the remaining slots
+  if (point_idx < pointsNumSM[pillar_idx_inBlock]) {
+    pillarOutSM[pillar_idx_inBlock][point_idx][0] = pillarSM[pillar_idx_inBlock][point_idx].x;
+    pillarOutSM[pillar_idx_inBlock][point_idx][1] = pillarSM[pillar_idx_inBlock][point_idx].y;
+    pillarOutSM[pillar_idx_inBlock][point_idx][2] = pillarSM[pillar_idx_inBlock][point_idx].z;
+    pillarOutSM[pillar_idx_inBlock][point_idx][3] = pillarSM[pillar_idx_inBlock][point_idx].w;
+
+    pillarOutSM[pillar_idx_inBlock][point_idx][4] = mean.x;
+    pillarOutSM[pillar_idx_inBlock][point_idx][5] = mean.y;
+    pillarOutSM[pillar_idx_inBlock][point_idx][6] = mean.z;
+
+    pillarOutSM[pillar_idx_inBlock][point_idx][7] = center.x;
+    pillarOutSM[pillar_idx_inBlock][point_idx][8] = center.y;
+
+  } else {
+    pillarOutSM[pillar_idx_inBlock][point_idx][0] = 0;
+    pillarOutSM[pillar_idx_inBlock][point_idx][1] = 0;
+    pillarOutSM[pillar_idx_inBlock][point_idx][2] = 0;
+    pillarOutSM[pillar_idx_inBlock][point_idx][3] = 0;
+
+    pillarOutSM[pillar_idx_inBlock][point_idx][4] = 0;
+    pillarOutSM[pillar_idx_inBlock][point_idx][5] = 0;
+    pillarOutSM[pillar_idx_inBlock][point_idx][6] = 0;
+
+    pillarOutSM[pillar_idx_inBlock][point_idx][7] = 0;
+    pillarOutSM[pillar_idx_inBlock][point_idx][8] = 0;
+  }
+
+  __syncthreads();
+
+  for (int i = 0; i < FEATURE_SIZE; i++) {  // copy the staged values of this pillar to `features`
+    int outputSMId = pillar_idx_inBlock * WARP_SIZE * FEATURE_SIZE + i * WARP_SIZE + point_idx;
+    int outputId = pillar_idx * WARP_SIZE * FEATURE_SIZE + i * WARP_SIZE + point_idx;
+    features[outputId] = ((float *)pillarOutSM)[outputSMId];
+  }
+}
+
+cudaError_t generateFeatures_launch(
+  const float * voxel_features, const float * voxel_num_points, const int * coords,
+  const std::size_t num_voxels, float * features, cudaStream_t stream)
+{
+  dim3 blocks(divup(Config::max_num_voxels, WARPS_PER_BLOCK));  // sized for the maximum count
+  dim3 threads(WARPS_PER_BLOCK * WARP_SIZE);
+  generateFeatures_kernel<<<blocks, threads, 0, stream>>>(
+    voxel_features, voxel_num_points, coords, num_voxels, Config::voxel_size_x,
+    Config::voxel_size_y, Config::voxel_size_z, Config::range_min_x, Config::range_min_y,
+    Config::range_min_z, features);
+
+  return cudaGetLastError();  // reports launch errors only; the kernel runs asynchronously
+}
+
+}  // namespace centerpoint
diff --git a/perception/lidar_centerpoint/lib/src/scatter_kernel.cu b/perception/lidar_centerpoint/lib/src/scatter_kernel.cu
new file mode 100644
index 0000000000000..c4fb48d943528
--- /dev/null
+++ b/perception/lidar_centerpoint/lib/src/scatter_kernel.cu
@@ -0,0 +1,66 @@
+// Copyright 2022 TIER IV, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <config.hpp>
+#include <scatter_kernel.hpp>
+#include <utils.hpp>
+
+namespace
+{
+const std::size_t THREADS_PER_BLOCK = 32;
+}  // namespace
+
+namespace centerpoint
+{
+__global__ void scatterFeatures_kernel(
+  const float * pillar_features, const int * coords, const std::size_t num_pillars,
+  const std::size_t pillar_feature_size, const std::size_t grid_size_x,
+  const std::size_t grid_size_y, float * scattered_features)
+{
+  // pillar_features: shape of (max_num_pillars, pillar_feature_size)
+  // coords: shape of (max_num_pillars, 3)
+  // scattered_features: shape of (pillar_feature_size, grid_size_y, grid_size_x)
+  const auto pillar_i = blockIdx.x * THREADS_PER_BLOCK + threadIdx.x;
+  const auto feature_i = blockIdx.y * THREADS_PER_BLOCK + threadIdx.y;
+
+  if (pillar_i >= num_pillars || feature_i >= pillar_feature_size) {
+    return;
+  }
+
+  const int3 coord = ((int3 *)coords)[pillar_i];  // zyx
+  if (coord.x < 0) {  // a negative first coordinate means this entry is skipped
+    return;
+  }
+
+  const auto feature = pillar_features[pillar_feature_size * pillar_i + feature_i];
+  scattered_features[grid_size_y * grid_size_x * feature_i + grid_size_x * coord.y + coord.z] =
+    feature;
+}
+
+cudaError_t scatterFeatures_launch(
+  const float * pillar_features, const int * coords, const std::size_t num_pillars,
+  float * scattered_features, cudaStream_t stream)
+{
+  dim3 blocks(  // x covers pillars (up to the maximum count), y covers feature channels
+    divup(Config::max_num_voxels, THREADS_PER_BLOCK),
+    divup(Config::encoder_out_feature_size, THREADS_PER_BLOCK));
+  dim3 threads(THREADS_PER_BLOCK, THREADS_PER_BLOCK);
+  scatterFeatures_kernel<<<blocks, threads, 0, stream>>>(
+    pillar_features, coords, num_pillars, Config::encoder_out_feature_size, Config::grid_size_x,
+    Config::grid_size_y, scattered_features);
+
+  return cudaGetLastError();  // reports launch errors only; the kernel runs asynchronously
+}
+
+}  // namespace centerpoint
diff --git a/perception/lidar_centerpoint/lib/src/tensorrt_wrapper.cpp b/perception/lidar_centerpoint/lib/src/tensorrt_wrapper.cpp
index 118c1829c9250..3691128137c39 100644
--- a/perception/lidar_centerpoint/lib/src/tensorrt_wrapper.cpp
+++ b/perception/lidar_centerpoint/lib/src/tensorrt_wrapper.cpp
@@ -1,4 +1,4 @@
-// Copyright 2021 Tier IV, Inc.
+// Copyright 2021 TIER IV, Inc.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
diff --git a/perception/lidar_centerpoint/lib/src/utils.cpp b/perception/lidar_centerpoint/lib/src/utils.cpp
new file mode 100644
index 0000000000000..8914615ff9f1d
--- /dev/null
+++ b/perception/lidar_centerpoint/lib/src/utils.cpp
@@ -0,0 +1,33 @@
+// Copyright 2022 TIER IV, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <utils.hpp>
+
+#include <stdexcept>
+
+namespace centerpoint
+{
+std::size_t divup(const std::size_t a, const std::size_t b)
+{
+  if (a == 0) {
+    throw std::runtime_error("A dividend of divup isn't positive.");
+  }
+  if (b == 0) {
+    throw std::runtime_error("A divisor of divup isn't positive.");
+  }
+
+  return (a - 1) / b + 1;  // ceil(a / b) for a > 0; unlike (a + b - 1) / b it cannot wrap around
+}
+
+}  // namespace centerpoint
diff --git a/perception/lidar_centerpoint/lib/src/voxel_generator.cpp b/perception/lidar_centerpoint/lib/src/voxel_generator.cpp
index 01f8b60b14f80..576c8d92d6d2a 100644
--- a/perception/lidar_centerpoint/lib/src/voxel_generator.cpp
+++ b/perception/lidar_centerpoint/lib/src/voxel_generator.cpp
@@ -1,4 +1,4 @@
-// Copyright 2021 Tier IV, Inc.
+// Copyright 2021 TIER IV, Inc.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -16,8 +16,6 @@
 
 #include <sensor_msgs/point_cloud2_iterator.hpp>
 
-#include <torch/torch.h>
-
 namespace centerpoint
 {
 VoxelGeneratorTemplate::VoxelGeneratorTemplate(const DensificationParam & param)
@@ -25,27 +23,29 @@ VoxelGeneratorTemplate::VoxelGeneratorTemplate(const DensificationParam & param)
   pd_ptr_ = std::make_unique<PointCloudDensification>(param);
 }
 
-int VoxelGenerator::pointsToVoxels(
-  at::Tensor & voxels, at::Tensor & coordinates, at::Tensor & num_points_per_voxel)
+bool VoxelGeneratorTemplate::enqueuePointCloud(
+  const sensor_msgs::msg::PointCloud2 & input_pointcloud_msg, const tf2_ros::Buffer & tf_buffer)
+{
+  return pd_ptr_->enqueuePointCloud(input_pointcloud_msg, tf_buffer);  // relay densifier result
+}
+
+std::size_t VoxelGenerator::pointsToVoxels(
+  std::vector<float> & voxels, std::vector<int> & coordinates,
+  std::vector<float> & num_points_per_voxel)
 {
-  // voxels (float): (max_num_voxels, max_num_points_per_voxel, num_point_features)
-  // coordinates (int): (max_num_voxels, num_point_dims)
-  // num_points_per_voxel (int): (max_num_voxels)
-
-  at::Tensor coord_to_voxel_idx = torch::full(
-    {Config::grid_size_z, Config::grid_size_y, Config::grid_size_x}, -1,
-    at::TensorOptions().dtype(torch::kInt));
-
-  auto voxels_p = voxels.data_ptr<float>();
-  auto coordinates_p = coordinates.data_ptr<int>();
-  auto num_points_per_voxel_p = num_points_per_voxel.data_ptr<int>();
-  auto coord_to_voxel_idx_p = coord_to_voxel_idx.data_ptr<int>();
-
-  int voxel_cnt = 0;  // @return
-  std::array<float, Config::num_point_features> point;
-  std::array<float, Config::num_point_dims> coord_zyx;
+  // voxels (float): (max_num_voxels * max_num_points_per_voxel * point_feature_size)
+  // coordinates (int): (max_num_voxels * point_dim_size)
+  // num_points_per_voxel (float): (max_num_voxels)
+
+  const std::size_t grid_size = Config::grid_size_z * Config::grid_size_y * Config::grid_size_x;
+  std::vector<int> coord_to_voxel_idx(grid_size, -1);
+
+  std::size_t voxel_cnt = 0;  // @return
+  std::array<float, Config::point_feature_size> point;
+  std::array<float, Config::point_dim_size> coord_zyx;
   bool out_of_range;
-  int c, coord_idx, voxel_idx, point_cnt;
+  std::size_t point_cnt;
+  int c, coord_idx, voxel_idx;
   Eigen::Vector3f point_current, point_past;
 
   for (auto pc_cache_iter = pd_ptr_->getPointCloudCacheIter(); !pd_ptr_->isCacheEnd(pc_cache_iter);
@@ -68,13 +68,13 @@ int VoxelGenerator::pointsToVoxels(
       point[3] = timelag;
 
       out_of_range = false;
-      for (int di = 0; di < Config::num_point_dims; di++) {
-        c = static_cast<int>((point[di] - pointcloud_range_[di]) * recip_voxel_size_[di]);
+      for (std::size_t di = 0; di < Config::point_dim_size; di++) {
+        c = static_cast<int>((point[di] - range_[di]) * recip_voxel_size_[di]);
         if (c < 0 || c >= grid_size_[di]) {
           out_of_range = true;
           break;
         }
-        coord_zyx[Config::num_point_dims - di - 1] = c;
+        coord_zyx[Config::point_dim_size - di - 1] = c;
       }
       if (out_of_range) {
         continue;
@@ -82,7 +82,7 @@ int VoxelGenerator::pointsToVoxels(
 
       coord_idx = coord_zyx[0] * Config::grid_size_y * Config::grid_size_x +
                   coord_zyx[1] * Config::grid_size_x + coord_zyx[2];
-      voxel_idx = coord_to_voxel_idx_p[coord_idx];
+      voxel_idx = coord_to_voxel_idx[coord_idx];
       if (voxel_idx == -1) {
         voxel_idx = voxel_cnt;
         if (voxel_cnt >= Config::max_num_voxels) {
@@ -90,20 +90,20 @@ int VoxelGenerator::pointsToVoxels(
         }
 
         voxel_cnt++;
-        coord_to_voxel_idx_p[coord_idx] = voxel_idx;
-        for (int di = 0; di < Config::num_point_dims; di++) {
-          coordinates_p[voxel_idx * Config::num_point_dims + di] = coord_zyx[di];
+        coord_to_voxel_idx[coord_idx] = voxel_idx;
+        for (std::size_t di = 0; di < Config::point_dim_size; di++) {
+          coordinates[voxel_idx * Config::point_dim_size + di] = coord_zyx[di];
         }
       }
 
-      point_cnt = num_points_per_voxel_p[voxel_idx];
+      point_cnt = num_points_per_voxel[voxel_idx];
       if (point_cnt < Config::max_num_points_per_voxel) {
-        for (int fi = 0; fi < Config::num_point_features; fi++) {
-          voxels_p
-            [voxel_idx * Config::max_num_points_per_voxel * Config::num_point_features +
-             point_cnt * Config::num_point_features + fi] = point[fi];
+        for (std::size_t fi = 0; fi < Config::point_feature_size; fi++) {
+          voxels
+            [voxel_idx * Config::max_num_points_per_voxel * Config::point_feature_size +
+             point_cnt * Config::point_feature_size + fi] = point[fi];
         }
-        num_points_per_voxel_p[voxel_idx]++;
+        num_points_per_voxel[voxel_idx]++;
       }
     }
   }
diff --git a/perception/lidar_centerpoint/src/node.cpp b/perception/lidar_centerpoint/src/node.cpp
index 403a6d83400c9..dbcd898c178d1 100644
--- a/perception/lidar_centerpoint/src/node.cpp
+++ b/perception/lidar_centerpoint/src/node.cpp
@@ -1,4 +1,4 @@
-// Copyright 2021 Tier IV, Inc.
+// Copyright 2021 TIER IV, Inc.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -18,6 +18,8 @@
 #include <pcl_ros/transforms.hpp>
 #include <pointcloud_densification.hpp>
 #include <tier4_autoware_utils/geometry/geometry.hpp>
+#include <tier4_autoware_utils/math/constants.hpp>
+#include <utils.hpp>
 
 #include <tf2_geometry_msgs/tf2_geometry_msgs.h>
 
@@ -34,34 +36,26 @@ LidarCenterPointNode::LidarCenterPointNode(const rclcpp::NodeOptions & node_opti
   std::string densification_world_frame_id =
     this->declare_parameter("densification_world_frame_id", "map");
   int densification_num_past_frames = this->declare_parameter("densification_num_past_frames", 1);
-  use_encoder_trt_ = this->declare_parameter("use_encoder_trt", false);
-  use_head_trt_ = this->declare_parameter("use_head_trt", true);
-  trt_precision_ = this->declare_parameter("trt_precision", "fp16");
-  encoder_onnx_path_ = this->declare_parameter("encoder_onnx_path", "");
-  encoder_engine_path_ = this->declare_parameter("encoder_engine_path", "");
-  encoder_pt_path_ = this->declare_parameter("encoder_pt_path", "");
-  head_onnx_path_ = this->declare_parameter("head_onnx_path", "");
-  head_engine_path_ = this->declare_parameter("head_engine_path", "");
-  head_pt_path_ = this->declare_parameter("head_pt_path", "");
+  std::string trt_precision = this->declare_parameter("trt_precision", "fp16");
+  std::string encoder_onnx_path = this->declare_parameter("encoder_onnx_path", "");
+  std::string encoder_engine_path = this->declare_parameter("encoder_engine_path", "");
+  std::string head_onnx_path = this->declare_parameter("head_onnx_path", "");
+  std::string head_engine_path = this->declare_parameter("head_engine_path", "");
   class_names_ = this->declare_parameter<std::vector<std::string>>("class_names");
   rename_car_to_truck_and_bus_ = this->declare_parameter("rename_car_to_truck_and_bus", false);
 
-  NetworkParam encoder_param(
-    encoder_onnx_path_, encoder_engine_path_, encoder_pt_path_, trt_precision_, use_encoder_trt_);
-  NetworkParam head_param(
-    head_onnx_path_, head_engine_path_, head_pt_path_, trt_precision_, use_head_trt_);
+  NetworkParam encoder_param(encoder_onnx_path, encoder_engine_path, trt_precision);
+  NetworkParam head_param(head_onnx_path, head_engine_path, trt_precision);
   DensificationParam densification_param(
     densification_world_frame_id, densification_num_past_frames);
   detector_ptr_ = std::make_unique<CenterPointTRT>(
-    static_cast<int>(class_names_.size()), encoder_param, head_param, densification_param);
+    class_names_.size(), score_threshold_, encoder_param, head_param, densification_param);
 
   pointcloud_sub_ = this->create_subscription<sensor_msgs::msg::PointCloud2>(
     "~/input/pointcloud", rclcpp::SensorDataQoS{}.keep_last(1),
     std::bind(&LidarCenterPointNode::pointCloudCallback, this, std::placeholders::_1));
   objects_pub_ = this->create_publisher<autoware_auto_perception_msgs::msg::DetectedObjects>(
     "~/output/objects", rclcpp::QoS{1});
-  pointcloud_pub_ = this->create_publisher<sensor_msgs::msg::PointCloud2>(
-    "~/debug/pointcloud_densification", rclcpp::SensorDataQoS{}.keep_last(1));
 }
 
 void LidarCenterPointNode::pointCloudCallback(
@@ -69,78 +63,84 @@ void LidarCenterPointNode::pointCloudCallback(
 {
   const auto objects_sub_count =
     objects_pub_->get_subscription_count() + objects_pub_->get_intra_process_subscription_count();
-  const auto pointcloud_sub_count = pointcloud_pub_->get_subscription_count() +
-                                    pointcloud_pub_->get_intra_process_subscription_count();
-  if (objects_sub_count < 1 && pointcloud_sub_count < 1) {
+  if (objects_sub_count < 1) {
     return;
   }
 
-  std::vector<float> boxes3d_vec = detector_ptr_->detect(*input_pointcloud_msg, tf_buffer_);
+  std::vector<Box3D> det_boxes3d;
+  bool is_success = detector_ptr_->detect(*input_pointcloud_msg, tf_buffer_, det_boxes3d);
+  if (!is_success) {
+    return;
+  }
 
   autoware_auto_perception_msgs::msg::DetectedObjects output_msg;
   output_msg.header = input_pointcloud_msg->header;
-  for (size_t obj_i = 0; obj_i < boxes3d_vec.size() / Config::num_box_features; obj_i++) {
-    float score = boxes3d_vec[obj_i * Config::num_box_features + 0];
-    if (score < score_threshold_) {
+  for (const auto & box3d : det_boxes3d) {
+    if (box3d.score < score_threshold_) {
       continue;
     }
-
-    int class_id = static_cast<int>(boxes3d_vec[obj_i * Config::num_box_features + 1]);
-    float x = boxes3d_vec[obj_i * Config::num_box_features + 2];
-    float y = boxes3d_vec[obj_i * Config::num_box_features + 3];
-    float z = boxes3d_vec[obj_i * Config::num_box_features + 4];
-    float w = boxes3d_vec[obj_i * Config::num_box_features + 5];
-    float l = boxes3d_vec[obj_i * Config::num_box_features + 6];
-    float h = boxes3d_vec[obj_i * Config::num_box_features + 7];
-    float yaw = boxes3d_vec[obj_i * Config::num_box_features + 8];
-    float vel_x = boxes3d_vec[obj_i * Config::num_box_features + 9];
-    float vel_y = boxes3d_vec[obj_i * Config::num_box_features + 10];
-
     autoware_auto_perception_msgs::msg::DetectedObject obj;
-    // TODO(yukke42): the value of classification confidence of DNN, not probability.
-    obj.existence_probability = score;
-    autoware_auto_perception_msgs::msg::ObjectClassification classification;
-    classification.probability = 1.0f;
-    classification.label = getSemanticType(class_names_[class_id]);
-
-    if (classification.label == Label::CAR && rename_car_to_truck_and_bus_) {
-      // Note: object size is referred from multi_object_tracker
-      if ((w * l > 2.2 * 5.5) && (w * l <= 2.5 * 7.9)) {
-        classification.label = Label::TRUCK;
-      } else if (w * l > 2.5 * 7.9) {
-        classification.label = Label::BUS;
-      }
-    }
-
-    if (isCarLikeVehicleLabel(classification.label)) {
-      obj.kinematics.orientation_availability =
-        autoware_auto_perception_msgs::msg::DetectedObjectKinematics::SIGN_UNKNOWN;
-    }
-
-    obj.classification.emplace_back(classification);
-
-    obj.kinematics.pose_with_covariance.pose.position = tier4_autoware_utils::createPoint(x, y, z);
-    obj.kinematics.pose_with_covariance.pose.orientation =
-      tier4_autoware_utils::createQuaternionFromYaw(yaw);
-    obj.shape.type = autoware_auto_perception_msgs::msg::Shape::BOUNDING_BOX;
-    obj.shape.dimensions = tier4_autoware_utils::createTranslation(l, w, h);
-
-    geometry_msgs::msg::Twist twist;
-    twist.linear.x = std::sqrt(std::pow(vel_x, 2) + std::pow(vel_y, 2));
-    twist.angular.z = 2 * (std::atan2(vel_y, vel_x) - yaw);
-    obj.kinematics.twist_with_covariance.twist = twist;
-    obj.kinematics.has_twist = true;
-
+    box3DToDetectedObject(box3d, obj);
     output_msg.objects.emplace_back(obj);
   }
 
   if (objects_sub_count > 0) {
     objects_pub_->publish(output_msg);
   }
-  if (pointcloud_sub_count > 0) {
-    // TODO(yukke42): change to densification pointcloud for debugging
-    pointcloud_pub_->publish(*input_pointcloud_msg);
+}
+
+void LidarCenterPointNode::box3DToDetectedObject(
+  const Box3D & box3d, autoware_auto_perception_msgs::msg::DetectedObject & obj)
+{  // Fills obj from one detected box: score, class label, pose, shape and twist.
+  // TODO(yukke42): box3d.score is the DNN's classification confidence, not a probability.
+  obj.existence_probability = box3d.score;
+
+  // classification: look up box3d.label in class_names_; out-of-range labels become UNKNOWN
+  autoware_auto_perception_msgs::msg::ObjectClassification classification;
+  classification.probability = 1.0f;  // constant; the score goes to existence_probability
+  if (box3d.label >= 0 && static_cast<size_t>(box3d.label) < class_names_.size()) {
+    classification.label = getSemanticType(class_names_[box3d.label]);
+  } else {
+    classification.label = Label::UNKNOWN;
+  }
+
+  float l = box3d.length;
+  float w = box3d.width;
+  if (classification.label == Label::CAR && rename_car_to_truck_and_bus_) {
+    // Note: the w * l thresholds below are referred from multi_object_tracker
+    if ((w * l > 2.2 * 5.5) && (w * l <= 2.5 * 7.9)) {
+      classification.label = Label::TRUCK;
+    } else if (w * l > 2.5 * 7.9) {
+      classification.label = Label::BUS;
+    }
   }
+
+  if (isCarLikeVehicleLabel(classification.label)) {
+    obj.kinematics.orientation_availability =
+      autoware_auto_perception_msgs::msg::DetectedObjectKinematics::SIGN_UNKNOWN;
+  }
+
+  obj.classification.emplace_back(classification);
+
+  // pose and shape
+  // mmdet3d yaw format to ros yaw format: negate, then subtract pi / 2
+  float yaw = -box3d.yaw - tier4_autoware_utils::pi / 2;
+  obj.kinematics.pose_with_covariance.pose.position =
+    tier4_autoware_utils::createPoint(box3d.x, box3d.y, box3d.z);
+  obj.kinematics.pose_with_covariance.pose.orientation =
+    tier4_autoware_utils::createQuaternionFromYaw(yaw);
+  obj.shape.type = autoware_auto_perception_msgs::msg::Shape::BOUNDING_BOX;
+  obj.shape.dimensions =
+    tier4_autoware_utils::createTranslation(box3d.length, box3d.width, box3d.height);
+
+  // twist: planar speed in linear.x; NOTE(review): angular.z is not angle-wrapped, confirm
+  float vel_x = box3d.vel_x;
+  float vel_y = box3d.vel_y;
+  geometry_msgs::msg::Twist twist;
+  twist.linear.x = std::sqrt(std::pow(vel_x, 2) + std::pow(vel_y, 2));
+  twist.angular.z = 2 * (std::atan2(vel_y, vel_x) - yaw);
+  obj.kinematics.twist_with_covariance.twist = twist;
+  obj.kinematics.has_twist = true;
 }
 
 uint8_t LidarCenterPointNode::getSemanticType(const std::string & class_name)