Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

perf(nebula): optimize AT128 decoder performance #37

Merged
merged 7 commits into from
Aug 9, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion .cspell.json
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,9 @@
"vccint",
"adctp",
"Adctp",

"stds",
"nproc",
"nohup",
"schedutil"
]
}
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,9 @@ build/
install/
log/

# profiling
profiling_output/

# Python
srv/_*.py
*.pyc
Expand Down
33 changes: 27 additions & 6 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -37,39 +37,39 @@ colcon build --symlink-install --cmake-args -DCMAKE_BUILD_TYPE=Release

Run tests:

```
```bash
colcon test --event-handlers console_cohesion+ --packages-above nebula_common
```

Show results:

```
```bash
colcon test-result --all
```

## Generic launch file

You can easily run the sensor hardware interface, the sensor hardware monitor and sensor driver using (e.g. Pandar64):

```
```bash
ros2 launch nebula_ros nebula_launch.py sensor_model:=Pandar64
```

If you don't want to launch the hardware (i.e. when you are working from a rosbag), set the `launch_hw` flag to false:

```
```bash
ros2 launch nebula_ros nebula_launch.py sensor_model:=Pandar64 launch_hw:=false
```

If you don't want the hardware driver to perform the sensor configuration communication (i.e. limited number of connections) set the `setup_sensor` flag to false:

```
```bash
ros2 launch nebula_ros nebula_launch.py sensor_model:=Pandar64 setup_sensor:=false
```

You should ideally provide a config file for your specific sensor, but default ones are provided `nebula_drivers/config`:

```
```bash
ros2 launch nebula_ros nebula_launch.py sensor_model:=Pandar64 config_file:=your_sensor.yaml
```

Expand Down Expand Up @@ -217,3 +217,24 @@ Parameters shared by all supported models:
## Software design overview

![DriverOrganization](docs/diagram.png)


## How to evaluate performance

You can evaluate Nebula performance on a given rosbag and sensor model using the below tools.
The profiling runner is most accurate when assigned isolated CPU cores via the `-c <core_id>` option.
CPU frequencies are locked/unlocked automatically by the runner to increase repeatability.

Run profiling for each version you want to compare:

```bash
./scripts/profiling_runner.bash baseline -m Pandar64 -b ~/my_rosbag -c 2 -t 20 -n 3
git checkout my_improved_branch
./scripts/profiling_runner.bash improved -m Pandar64 -b ~/my_rosbag -c 2 -t 20 -n 3
```

Show results:

```bash
pip3 install -r scripts/requirements.txt # first-time setup
python3 scripts/plot_times.py baseline improved
```
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,11 @@ const uint16_t MAX_AZIMUTH_DEGREE_NUM = 36000;
const uint16_t LIDAR_AZIMUTH_UNIT = 256;
const uint32_t MAX_AZI_LEN = 36000 * 256;

/// @brief The resolution of one scan (in single return mode)
const uint32_t SCAN_POINTS_NUM = 1200 * LASER_COUNT;
/// @brief This sensor supports a maximum of 2 returns (= dual return)
const uint16_t MAX_RETURN_COUNT = 2;

/// @brief Hesai LiDAR decoder (AT128)
class PandarATDecoder : public HesaiScanDecoder
{
Expand Down Expand Up @@ -79,6 +84,10 @@ class PandarATDecoder : public HesaiScanDecoder
/// @param block_id target block
/// @return Point cloud
drivers::NebulaPointCloudPtr convert(size_t block_id) override;
/// @brief Convert to point cloud (without temporary buffer)
/// @param block_id target block
/// @param out_pc Point cloud to append the decoded points to
void convert(size_t block_id, NebulaPointCloudPtr & out_pc);
/// @brief Convert to point cloud for dual return
/// @param block_id target block
/// @return Point cloud
Expand All @@ -93,13 +102,8 @@ class PandarATDecoder : public HesaiScanDecoder
std::array<float, BLOCKS_PER_PACKET> block_offset_dual_{};
std::array<float, BLOCKS_PER_PACKET> block_offset_triple_{};

std::vector<float> m_elevation_rad_map_;
std::vector<float> m_sin_elevation_map_;
std::vector<float> m_cos_elevation_map_;

std::vector<float> m_azimuth_rad_map_;
std::vector<float> m_sin_azimuth_map_;
std::vector<float> m_cos_azimuth_map_;
std::vector<float> m_sin_map_;
std::vector<float> m_cos_map_;

Packet packet_{};

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -32,19 +32,12 @@ PandarATDecoder::PandarATDecoder(
azimuth_offset_[laser] = calibration_configuration->azimuth_offset_map[laser];
}

m_elevation_rad_map_.resize(MAX_AZI_LEN);
m_sin_elevation_map_.resize(MAX_AZI_LEN);
m_cos_elevation_map_.resize(MAX_AZI_LEN);
m_azimuth_rad_map_.resize(MAX_AZI_LEN);
m_sin_azimuth_map_.resize(MAX_AZI_LEN);
m_cos_azimuth_map_.resize(MAX_AZI_LEN);
m_sin_map_.resize(MAX_AZI_LEN);
m_cos_map_.resize(MAX_AZI_LEN);
for (size_t i = 0; i < MAX_AZI_LEN; ++i) {
m_elevation_rad_map_[i] = 2.f * i * M_PI / MAX_AZI_LEN;
m_sin_elevation_map_[i] = sinf(m_elevation_rad_map_[i]);
m_cos_elevation_map_[i] = cosf(m_elevation_rad_map_[i]);
m_azimuth_rad_map_[i] = 2.f * i * M_PI / MAX_AZI_LEN;
m_sin_azimuth_map_[i] = sinf(m_azimuth_rad_map_[i]);
m_cos_azimuth_map_[i] = cosf(m_azimuth_rad_map_[i]);
const float rad = 2.f * i * M_PI / MAX_AZI_LEN;
m_sin_map_[i] = sinf(rad);
m_cos_map_[i] = cosf(rad);
}

dual_return_distance_threshold_ = sensor_configuration_->dual_return_distance_threshold;
Expand All @@ -57,14 +50,16 @@ PandarATDecoder::PandarATDecoder(
last_field_ = -1;

scan_pc_.reset(new NebulaPointCloud);
scan_pc_->reserve(SCAN_POINTS_NUM * MAX_RETURN_COUNT);
overflow_pc_.reset(new NebulaPointCloud);
overflow_pc_->reserve(SCAN_POINTS_NUM * MAX_RETURN_COUNT);
}

/// @brief Whether a complete scan has been decoded and is ready for retrieval
bool PandarATDecoder::hasScanned()
{
  return has_scanned_;
}

/// @brief Get the last completed scan and its timestamp.
/// @return Tuple of (point cloud, scan timestamp in seconds)
std::tuple<drivers::NebulaPointCloudPtr, double> PandarATDecoder::get_pointcloud()
{
  // On field change the completed scan is swapped into overflow_pc_
  // (see unpack()), so that buffer holds the finished cloud to hand out.
  // The earlier duplicate `return std::make_tuple(scan_pc_, ...)` was
  // unreachable dead code and has been removed.
  return std::make_tuple(overflow_pc_, scan_timestamp_);
}

int PandarATDecoder::unpack(const pandar_msgs::msg::PandarPacket & pandar_packet)
Expand All @@ -75,8 +70,6 @@ int PandarATDecoder::unpack(const pandar_msgs::msg::PandarPacket & pandar_packet
}

if (has_scanned_) {
scan_pc_ = overflow_pc_;
overflow_pc_.reset(new NebulaPointCloud);
has_scanned_ = false;
}

Expand All @@ -96,17 +89,19 @@ int PandarATDecoder::unpack(const pandar_msgs::msg::PandarPacket & pandar_packet
field = (field + 1) % correction_configuration_->frameNumber;
count++;
}

// The field (= mirror) has changed, meaning the previous scan is complete
if (last_field_ != field) {
if (max_azimuth_ < azimuth) {
max_azimuth_ = azimuth;
}
scan_timestamp_ = packet_.unix_second + static_cast<double>(packet_.usec) / 1000000.f;
auto block_pc = convert(block_id);
*overflow_pc_ += *block_pc;
std::swap(scan_pc_, overflow_pc_);
scan_pc_->points.clear();
convert(block_id, scan_pc_);
has_scanned_ = true;
} else {
auto block_pc = convert(block_id);
*scan_pc_ += *block_pc;
convert(block_id, scan_pc_);
}
last_azimuth_ = azimuth;
last_field_ = field;
Expand Down Expand Up @@ -171,10 +166,10 @@ void PandarATDecoder::CalcXTPointXYZIT(
point.distance = unit.distance;
point.elevation = 2.f * elevation * M_PI / MAX_AZI_LEN;
{
float xyDistance = unit.distance * m_cos_elevation_map_[elevation];
point.x = static_cast<float>(xyDistance * m_sin_azimuth_map_[azimuth]);
point.y = static_cast<float>(xyDistance * m_cos_azimuth_map_[azimuth]);
point.z = static_cast<float>(unit.distance * m_sin_elevation_map_[elevation]);
float xyDistance = unit.distance * m_cos_map_[elevation];
point.x = xyDistance * m_sin_map_[azimuth];
point.y = xyDistance * m_cos_map_[azimuth];
point.z = unit.distance * m_sin_map_[elevation];
}
if (scan_timestamp_ < 0) { // invalid timestamp
scan_timestamp_ = packet_.unix_second + static_cast<double>(packet_.usec) / 1000000.;
Expand Down Expand Up @@ -227,17 +222,18 @@ void PandarATDecoder::CalcXTPointXYZIT(
}
}

/// @brief Decode the given block of the current packet, appending the points to out_pc
/// @param block_id target block
/// @param out_pc point cloud to append the decoded points to
void PandarATDecoder::convert(size_t block_id, NebulaPointCloudPtr & out_pc)
{
  const auto laser_count = static_cast<int>(packet_.header.chLaserNumber);
  CalcXTPointXYZIT(block_id, laser_count, out_pc);
}

/// @brief Deprecated buffer-allocating overload; decoding now appends directly
/// to an existing cloud via convert(size_t, NebulaPointCloudPtr &) to avoid a
/// temporary buffer.
/// @throws std::runtime_error always — not implemented
drivers::NebulaPointCloudPtr PandarATDecoder::convert(size_t block_id)
{
  // The old allocate-and-return body was unreachable dead code sitting in
  // front of this throw; it has been removed.
  (void)block_id;
  throw std::runtime_error("Not implemented");
}

/// @brief Deprecated dual-return conversion; superseded by the appending
/// convert(size_t, NebulaPointCloudPtr &) overload.
/// @throws std::runtime_error always — not implemented
drivers::NebulaPointCloudPtr PandarATDecoder::convert_dual(size_t block_id)
{
  // The old `return convert(block_id);` made this throw unreachable dead
  // code; the stale return has been removed.
  (void)block_id;
  throw std::runtime_error("Not implemented");
}

bool PandarATDecoder::parsePacket(const pandar_msgs::msg::PandarPacket & pandar_packet)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@
#include <rclcpp/rclcpp.hpp>
#include <rclcpp_components/register_node_macro.hpp>

#include <chrono>

#include "pandar_msgs/msg/pandar_packet.hpp"
#include "pandar_msgs/msg/pandar_scan.hpp"

Expand Down
6 changes: 6 additions & 0 deletions nebula_ros/launch/nebula_launch.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,10 @@ def launch_setup(context, *args, **kwargs):
target_container=LaunchConfiguration("container"),
)

container_kwargs = {}
if LaunchConfiguration("debug_logging").perform(context) == "true":
container_kwargs["ros_arguments"] = ['--log-level', 'debug']

container = ComposableNodeContainer(
name="nebula_ros_node",
namespace="",
Expand All @@ -110,6 +114,7 @@ def launch_setup(context, *args, **kwargs):
composable_node_descriptions=nodes,
output="screen",
condition=LaunchConfigurationEquals("container", ""),
**container_kwargs,
)

group = GroupAction(
Expand All @@ -135,6 +140,7 @@ def add_launch_arg(name: str, default_value=None):
add_launch_arg("return_mode", "Dual"),
add_launch_arg("launch_hw", "true"),
add_launch_arg("setup_sensor", "true"),
add_launch_arg("debug_logging", "false"),
]
+ [OpaqueFunction(function=launch_setup)]
)
6 changes: 4 additions & 2 deletions nebula_ros/src/hesai/hesai_decoder_ros_wrapper.cpp
mojomex marked this conversation as resolved.
Show resolved Hide resolved
Original file line number Diff line number Diff line change
Expand Up @@ -56,8 +56,7 @@ HesaiDriverRosWrapper::HesaiDriverRosWrapper(const rclcpp::NodeOptions & options
void HesaiDriverRosWrapper::ReceiveScanMsgCallback(
const pandar_msgs::msg::PandarScan::SharedPtr scan_msg)
{
// take packets out of scan msg
std::vector<pandar_msgs::msg::PandarPacket> pkt_msgs = scan_msg->packets;
auto t_start = std::chrono::high_resolution_clock::now();

std::tuple<nebula::drivers::NebulaPointCloudPtr, double> pointcloud_ts =
driver_ptr_->ConvertScanToPointcloud(scan_msg);
Expand Down Expand Up @@ -98,6 +97,9 @@ void HesaiDriverRosWrapper::ReceiveScanMsgCallback(
rclcpp::Time(SecondsToChronoNanoSeconds(std::get<1>(pointcloud_ts)).count());
PublishCloud(std::move(ros_pc_msg_ptr), aw_points_ex_pub_);
}

auto runtime = std::chrono::high_resolution_clock::now() - t_start;
RCLCPP_DEBUG(get_logger(), "PROFILING {'d_total': %lu, 'n_out': %lu}", runtime.count(), pointcloud->size());
}

void HesaiDriverRosWrapper::PublishCloud(
Expand Down
88 changes: 88 additions & 0 deletions scripts/plot_times.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
#%%

import argparse
import ast
import glob
import os
import re

import pandas as pd
from matplotlib import pyplot as plt

def parse_logs(run_name):
    """Parse the profiling logs of one run into a single DataFrame.

    Reads all files matching ``profiling_output/<run_name>-[0-9].log``
    (resolved relative to this script's parent directory), extracts the
    ``PROFILING {...}`` records, converts all duration (``d_*``) columns
    from nanoseconds to milliseconds, and adds a ``d_total`` column that is
    the sum of all duration columns.

    :param run_name: run name as passed to profiling_runner.bash
    :return: DataFrame with one row per PROFILING record
    :raises FileNotFoundError: if no log files match the run name
    """
    dfs = []
    script_dir = os.path.dirname(os.path.abspath(__file__))
    glob_pattern = os.path.join(script_dir, f"../profiling_output/{run_name}-[0-9].log")
    for path in glob.glob(glob_pattern):
        with open(path) as f:
            lines = f.readlines()
        lines = filter(lambda line: "PROFILING" in line, lines)
        lines = [re.sub(r'.*PROFILING (\{.*?\}).*', r'\1', line) for line in lines]
        # literal_eval instead of eval: log lines are data, not trusted code
        records = [ast.literal_eval(line) for line in lines]

        dfs.append(pd.DataFrame(records))

    # pd.concat([]) raises an opaque ValueError; fail with a clear message instead
    if not dfs:
        raise FileNotFoundError(f"No profiling logs found matching '{glob_pattern}'")

    df = pd.concat(dfs)

    for col in [c for c in df.columns if c.startswith("d_")]:
        df[col] /= 1e6  # ns to ms
    df['d_total'] = sum([df[c] for c in df.columns if c.startswith("d_")])  # type: ignore
    return df

def plot_timing_comparison(run_names):
    """Plot and tabulate timing statistics for the given profiling runs.

    Produces a scatter plot of total duration per iteration (with the number
    of published points on a secondary axis), one box plot per duration
    column, and prints a Markdown table of per-run means and standard
    deviations including percentages relative to the first run.

    :param run_names: run names to compare; the first one is the baseline
    """
    scenario_dfs = {
        run_name: parse_logs(run_name) for run_name in run_names
    }

    # One box-plot axis per duration column (columns are assumed identical
    # across runs, so the first run's columns are representative).
    n_duration_cols = sum(
        1 for c in next(iter(scenario_dfs.values())).columns if c.startswith("d_"))

    fig, axs = plt.subplots(1 + n_duration_cols, figsize=(10, 10), num="Timing comparison")
    ax_d = axs[0]
    ax_n = ax_d.twinx()
    ax_n.set_ylabel("# points published")
    boxes = axs[1:]

    for i, (label, df) in enumerate(scenario_dfs.items()):
        durations = df['d_total']

        ax_d.plot(durations.index, durations, label=label, linestyle='', marker='.')
        for col in filter(lambda col: col.startswith("n_"), df.columns):
            ax_n.plot(df.index, df[col], label=f"{label}::{col}", linestyle='', marker='.', color='black')

        d_columns = [col for col in df.columns if col.startswith("d_")]
        # (the old `n_cols = len(d_columns)` here shadowed the outer count and
        # was never read — removed)
        for j, col in enumerate(d_columns):
            if i == 0:
                # Label each box-plot row only once, on the first run
                boxes[j].set_ylabel(f"{col} (ms)")
            boxes[j].boxplot(df[col], positions=[i], labels=[label])

    ax_d.legend(loc="upper right")
    ax_d.set_ylabel("time (ms)")
    ax_d.set_xlabel("iteration")

    df_means = pd.DataFrame({
        label: df.describe().loc["mean"] for label, df in scenario_dfs.items()
    })

    df_stds = pd.DataFrame({
        label: df.describe().loc["std"] for label, df in scenario_dfs.items()
    })

    df_means = df_means.rename(index=lambda label: f"{label} AVG")
    df_stds = df_stds.rename(index=lambda label: f"{label} STD")

    df_stat = pd.concat([df_means, df_stds], axis=0)

    # Express every statistic as a percentage of the baseline (first) run
    baseline_name = run_names[0]
    df_base = df_stat[baseline_name]
    for label in df_base.index:
        df_stat.loc[f"{label} % rel to {baseline_name}", :] = df_stat.loc[label, :] / df_base.loc[label] * 100

    print(df_stat.to_markdown())
    plt.show()
# %%

if __name__ == "__main__":
    # CLI entry point: plot all runs named on the command line.
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument(
        "run_names",
        nargs='+',
        help="The names of the runs to plot, as entered in test_runner.bash (e.g. 'baseline', 'optimized')\n"
             "All files of the form <run-name>-[0-9].log will be plotted.")
    cli_args = arg_parser.parse_args()

    plot_timing_comparison(cli_args.run_names)
Loading
Loading