From 2b0c3624d83900ad6103c37c74d18d1d338e760b Mon Sep 17 00:00:00 2001
From: Abel Soares Siqueira <abel.s.siqueira@gmail.com>
Date: Tue, 9 May 2023 10:49:18 +0000
Subject: [PATCH 1/6] Fix small typo

---
 bird_cloud_gnn/radar_dataset.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/bird_cloud_gnn/radar_dataset.py b/bird_cloud_gnn/radar_dataset.py
index 8c1e6b6..5004203 100644
--- a/bird_cloud_gnn/radar_dataset.py
+++ b/bird_cloud_gnn/radar_dataset.py
@@ -58,7 +58,7 @@ def __init__(
             ValueError: If `data_folder` is not a valid folder.
         """
         if not os.path.isdir(data_folder):
-            raise ValueError(f"'${data_folder}' is not a folder")
+            raise ValueError(f"'{data_folder}' is not a folder")
         self._name = name
         self.data_folder = data_folder
         self.features = features

From 51655c01060353df7bf24afcca36e329609dff58 Mon Sep 17 00:00:00 2001
From: Abel Soares Siqueira <abel.s.siqueira@gmail.com>
Date: Tue, 9 May 2023 10:58:16 +0000
Subject: [PATCH 2/6] Move internal loop of RadarDataset.process to a function

---
 bird_cloud_gnn/radar_dataset.py | 90 ++++++++++++++++++---------------
 1 file changed, 48 insertions(+), 42 deletions(-)

diff --git a/bird_cloud_gnn/radar_dataset.py b/bird_cloud_gnn/radar_dataset.py
index 5004203..d44e99c 100644
--- a/bird_cloud_gnn/radar_dataset.py
+++ b/bird_cloud_gnn/radar_dataset.py
@@ -81,53 +81,59 @@ def __init__(
             ),
         )
 
+    def _read_one_file(self, data_file):
+        """Reads a file and creates the graphs and labels for it."""
+
+        xyz = ["x", "y", "z"]
+        split_on_dots = data_file.split(".")
+        if split_on_dots[-1] != "csv" and ".".join(split_on_dots[-2:]) != "csv.gz":
+            return
+        data = pd.read_csv(os.path.join(self.data_folder, data_file))
+        data = data.drop(
+            data[
+                np.logical_or(
+                    data.range > 100000,
+                    np.logical_or(data.z > 10000, data.range < 5000),
+                )
+            ].index
+        ).reset_index(drop=True)
+        na_index = data[data[self.target].isna()].index
+        data_notna = data.drop(na_index)
+        notna_index = data_notna.index
+        data_notna.reset_index(drop=True, inplace=True)
+        tree = KDTree(data.loc[:, xyz])
+        tree_notna = KDTree(data_notna.loc[:, xyz])
+        distance_matrix = tree_notna.sparse_distance_matrix(tree, self.max_distance)
+        number_neighbours = (
+            np.array(np.sum(distance_matrix > 0, axis=1)).reshape(-1) + 1
+        )
+        points_of_interest = np.where(number_neighbours >= self.min_neighbours)[0]
+
+        for point in points_of_interest:
+            _, indexes = tree.query(
+                data.loc[notna_index[point], xyz], self.min_neighbours
+            )
+            local_tree = KDTree(data.loc[indexes, xyz])
+            distances = local_tree.sparse_distance_matrix(
+                local_tree, self.max_edge_distance, output_type="coo_matrix"
+            )
+            graph = dgl.graph((distances.row, distances.col))
+
+            # TODO: Better fillna
+            local_data = data.loc[indexes, self.features].fillna(0)
+            assert not np.any(np.isnan(local_data))
+            graph.ndata["x"] = torch.tensor(local_data.values)
+            graph.edata["a"] = torch.tensor(distances.data)
+            self.graphs.append(graph)
+            self.labels.append(data_notna.loc[point, self.target])
+
     def process(self):
         """Internal function for the DGLDataset. Process the folder to create the graphs."""
-        xyz = ["x", "y", "z"]
+
         self.graphs = []
         self.labels = []
         for data_file in os.listdir(self.data_folder):
-            split_on_dots = data_file.split(".")
-            if split_on_dots[-1] != "csv" and ".".join(split_on_dots[-2:]) != "csv.gz":
-                continue
-            data = pd.read_csv(os.path.join(self.data_folder, data_file))
-            data = data.drop(
-                data[
-                    np.logical_or(
-                        data.range > 100000,
-                        np.logical_or(data.z > 10000, data.range < 5000),
-                    )
-                ].index
-            ).reset_index(drop=True)
-            na_index = data[data[self.target].isna()].index
-            data_notna = data.drop(na_index)
-            notna_index = data_notna.index
-            data_notna.reset_index(drop=True, inplace=True)
-            tree = KDTree(data.loc[:, xyz])
-            tree_notna = KDTree(data_notna.loc[:, xyz])
-            distance_matrix = tree_notna.sparse_distance_matrix(tree, self.max_distance)
-            number_neighbours = (
-                np.array(np.sum(distance_matrix > 0, axis=1)).reshape(-1) + 1
-            )
-            points_of_interest = np.where(number_neighbours >= self.min_neighbours)[0]
-
-            for point in points_of_interest:
-                _, indexes = tree.query(
-                    data.loc[notna_index[point], xyz], self.min_neighbours
-                )
-                local_tree = KDTree(data.loc[indexes, xyz])
-                distances = local_tree.sparse_distance_matrix(
-                    local_tree, self.max_edge_distance, output_type="coo_matrix"
-                )
-                graph = dgl.graph((distances.row, distances.col))
-
-                # TODO: Better fillna
-                local_data = data.loc[indexes, self.features].fillna(0)
-                assert not np.any(np.isnan(local_data))
-                graph.ndata["x"] = torch.tensor(local_data.values)
-                graph.edata["a"] = torch.tensor(distances.data)
-                self.graphs.append(graph)
-                self.labels.append(data_notna.loc[point, self.target])
+            self._read_one_file(data_file)
 
         if len(self.graphs) == 0:
             raise ValueError("No graphs selected under rules passed")

From 61602211db86749bdfc45619a58de9155f938075 Mon Sep 17 00:00:00 2001
From: Abel Soares Siqueira <abel.s.siqueira@gmail.com>
Date: Tue, 9 May 2023 13:32:24 +0000
Subject: [PATCH 3/6] Allow reading parquet

---
 bird_cloud_gnn/radar_dataset.py | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/bird_cloud_gnn/radar_dataset.py b/bird_cloud_gnn/radar_dataset.py
index d44e99c..b5d8251 100644
--- a/bird_cloud_gnn/radar_dataset.py
+++ b/bird_cloud_gnn/radar_dataset.py
@@ -86,9 +86,15 @@ def _read_one_file(self, data_file):
 
         xyz = ["x", "y", "z"]
         split_on_dots = data_file.split(".")
-        if split_on_dots[-1] != "csv" and ".".join(split_on_dots[-2:]) != "csv.gz":
+        if (
+            split_on_dots[-1] not in ["csv", "parquet"]
+            and ".".join(split_on_dots[-2:]) != "csv.gz"
+        ):
             return
-        data = pd.read_csv(os.path.join(self.data_folder, data_file))
+        if split_on_dots[-1] == "parquet":
+            data = pd.read_parquet(os.path.join(self.data_folder, data_file))
+        else:
+            data = pd.read_csv(os.path.join(self.data_folder, data_file))
         data = data.drop(
             data[
                 np.logical_or(

From 977f55dbeda8dcdb7a22ceafa80516e0fa41ecb6 Mon Sep 17 00:00:00 2001
From: Abel Soares Siqueira <abel.s.siqueira@gmail.com>
Date: Tue, 9 May 2023 13:35:00 +0000
Subject: [PATCH 4/6] Change how the number_neighbours is computed to improve
 speed

---
 bird_cloud_gnn/radar_dataset.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/bird_cloud_gnn/radar_dataset.py b/bird_cloud_gnn/radar_dataset.py
index b5d8251..4891286 100644
--- a/bird_cloud_gnn/radar_dataset.py
+++ b/bird_cloud_gnn/radar_dataset.py
@@ -109,10 +109,11 @@ def _read_one_file(self, data_file):
         data_notna.reset_index(drop=True, inplace=True)
         tree = KDTree(data.loc[:, xyz])
         tree_notna = KDTree(data_notna.loc[:, xyz])
-        distance_matrix = tree_notna.sparse_distance_matrix(tree, self.max_distance)
-        number_neighbours = (
-            np.array(np.sum(distance_matrix > 0, axis=1)).reshape(-1) + 1
+        distance_matrix = tree_notna.sparse_distance_matrix(
+            tree, self.max_distance, output_type="coo_matrix"
         )
+
+        number_neighbours = distance_matrix.getnnz(1)
         points_of_interest = np.where(number_neighbours >= self.min_neighbours)[0]
 
         for point in points_of_interest:

From 898406e94c1c76da15d539bd2d23bbd7c52e7326 Mon Sep 17 00:00:00 2001
From: Abel Soares Siqueira <abel.s.siqueira@gmail.com>
Date: Tue, 9 May 2023 13:59:00 +0000
Subject: [PATCH 5/6] Speed up RadarDataset by moving tree.query out of the
 loop

---
 bird_cloud_gnn/radar_dataset.py | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

diff --git a/bird_cloud_gnn/radar_dataset.py b/bird_cloud_gnn/radar_dataset.py
index 4891286..d680860 100644
--- a/bird_cloud_gnn/radar_dataset.py
+++ b/bird_cloud_gnn/radar_dataset.py
@@ -105,7 +105,6 @@ def _read_one_file(self, data_file):
         ).reset_index(drop=True)
         na_index = data[data[self.target].isna()].index
         data_notna = data.drop(na_index)
-        notna_index = data_notna.index
         data_notna.reset_index(drop=True, inplace=True)
         tree = KDTree(data.loc[:, xyz])
         tree_notna = KDTree(data_notna.loc[:, xyz])
@@ -116,11 +115,14 @@ def _read_one_file(self, data_file):
         number_neighbours = distance_matrix.getnnz(1)
         points_of_interest = np.where(number_neighbours >= self.min_neighbours)[0]
 
-        for point in points_of_interest:
-            _, indexes = tree.query(
-                data.loc[notna_index[point], xyz], self.min_neighbours
-            )
-            local_tree = KDTree(data.loc[indexes, xyz])
+        _, poi_indexes = tree.query(
+            data_notna.loc[points_of_interest, xyz], self.min_neighbours
+        )
+        self.labels = np.concatenate(
+            (self.labels, data_notna[self.target].values[points_of_interest])
+        )
+        for _, indexes in enumerate(poi_indexes):
+            local_tree = KDTree(data.loc[indexes, xyz])  # slow
             distances = local_tree.sparse_distance_matrix(
                 local_tree, self.max_edge_distance, output_type="coo_matrix"
             )
@@ -132,13 +134,12 @@ def _read_one_file(self, data_file):
             graph.ndata["x"] = torch.tensor(local_data.values)
             graph.edata["a"] = torch.tensor(distances.data)
             self.graphs.append(graph)
-            self.labels.append(data_notna.loc[point, self.target])
 
     def process(self):
         """Internal function for the DGLDataset. Process the folder to create the graphs."""
 
         self.graphs = []
-        self.labels = []
+        self.labels = np.array([])
         for data_file in os.listdir(self.data_folder):
             self._read_one_file(data_file)
 

From e3b777272a7aaade4de8856d591c632968971ab6 Mon Sep 17 00:00:00 2001
From: Abel Soares Siqueira <abel.s.siqueira@gmail.com>
Date: Tue, 9 May 2023 14:20:25 +0000
Subject: [PATCH 6/6] Improve RadarDataset speed by storing extra structures
 outside the loop

---
 bird_cloud_gnn/radar_dataset.py | 30 +++++++++++++++++++++---------
 1 file changed, 21 insertions(+), 9 deletions(-)

diff --git a/bird_cloud_gnn/radar_dataset.py b/bird_cloud_gnn/radar_dataset.py
index d680860..49b9b1b 100644
--- a/bird_cloud_gnn/radar_dataset.py
+++ b/bird_cloud_gnn/radar_dataset.py
@@ -103,11 +103,24 @@ def _read_one_file(self, data_file):
                 )
             ].index
         ).reset_index(drop=True)
+
+        data_xyz = data[xyz]
+        data_features = data[self.features]
+
         na_index = data[data[self.target].isna()].index
-        data_notna = data.drop(na_index)
-        data_notna.reset_index(drop=True, inplace=True)
-        tree = KDTree(data.loc[:, xyz])
-        tree_notna = KDTree(data_notna.loc[:, xyz])
+
+        data_xyz_notna = data_xyz.drop(na_index)
+        data_features_notna = data_features.drop(na_index)
+
+        data_target = data[self.target]
+        data_target_notna = data_target[data_xyz_notna.index]
+
+        data_xyz_notna.reset_index(drop=True, inplace=True)
+        data_features_notna.reset_index(drop=True, inplace=True)
+
+        tree = KDTree(data_xyz)
+        tree_notna = KDTree(data_xyz_notna)
+
         distance_matrix = tree_notna.sparse_distance_matrix(
             tree, self.max_distance, output_type="coo_matrix"
         )
@@ -116,21 +129,20 @@ def _read_one_file(self, data_file):
         points_of_interest = np.where(number_neighbours >= self.min_neighbours)[0]
 
         _, poi_indexes = tree.query(
-            data_notna.loc[points_of_interest, xyz], self.min_neighbours
+            data_xyz_notna.loc[points_of_interest], self.min_neighbours
         )
         self.labels = np.concatenate(
-            (self.labels, data_notna[self.target].values[points_of_interest])
+            (self.labels, data_target_notna.values[points_of_interest])
         )
         for _, indexes in enumerate(poi_indexes):
-            local_tree = KDTree(data.loc[indexes, xyz])  # slow
+            local_tree = KDTree(data_xyz.iloc[indexes])  # slow
             distances = local_tree.sparse_distance_matrix(
                 local_tree, self.max_edge_distance, output_type="coo_matrix"
             )
             graph = dgl.graph((distances.row, distances.col))
 
             # TODO: Better fillna
-            local_data = data.loc[indexes, self.features].fillna(0)
-            assert not np.any(np.isnan(local_data))
+            local_data = data_features.iloc[indexes].fillna(0)
             graph.ndata["x"] = torch.tensor(local_data.values)
             graph.edata["a"] = torch.tensor(distances.data)
             self.graphs.append(graph)