From 2b0c3624d83900ad6103c37c74d18d1d338e760b Mon Sep 17 00:00:00 2001 From: Abel Soares Siqueira Date: Tue, 9 May 2023 10:49:18 +0000 Subject: [PATCH 1/6] Fix small typo --- bird_cloud_gnn/radar_dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bird_cloud_gnn/radar_dataset.py b/bird_cloud_gnn/radar_dataset.py index 8c1e6b6..5004203 100644 --- a/bird_cloud_gnn/radar_dataset.py +++ b/bird_cloud_gnn/radar_dataset.py @@ -58,7 +58,7 @@ def __init__( ValueError: If `data_folder` is not a valid folder. """ if not os.path.isdir(data_folder): - raise ValueError(f"'${data_folder}' is not a folder") + raise ValueError(f"'{data_folder}' is not a folder") self._name = name self.data_folder = data_folder self.features = features From 51655c01060353df7bf24afcca36e329609dff58 Mon Sep 17 00:00:00 2001 From: Abel Soares Siqueira Date: Tue, 9 May 2023 10:58:16 +0000 Subject: [PATCH 2/6] Move internal loop of RadarDataset.process to a function --- bird_cloud_gnn/radar_dataset.py | 90 ++++++++++++++++++--------------- 1 file changed, 48 insertions(+), 42 deletions(-) diff --git a/bird_cloud_gnn/radar_dataset.py b/bird_cloud_gnn/radar_dataset.py index 5004203..d44e99c 100644 --- a/bird_cloud_gnn/radar_dataset.py +++ b/bird_cloud_gnn/radar_dataset.py @@ -81,53 +81,59 @@ def __init__( ), ) + def _read_one_file(self, data_file): + """Reads a file and creates the graphs and labels for it.""" + + xyz = ["x", "y", "z"] + split_on_dots = data_file.split(".") + if split_on_dots[-1] != "csv" and ".".join(split_on_dots[-2:]) != "csv.gz": + return + data = pd.read_csv(os.path.join(self.data_folder, data_file)) + data = data.drop( + data[ + np.logical_or( + data.range > 100000, + np.logical_or(data.z > 10000, data.range < 5000), + ) + ].index + ).reset_index(drop=True) + na_index = data[data[self.target].isna()].index + data_notna = data.drop(na_index) + notna_index = data_notna.index + data_notna.reset_index(drop=True, inplace=True) + tree = KDTree(data.loc[:, xyz]) + tree_notna = KDTree(data_notna.loc[:, xyz]) + distance_matrix = tree_notna.sparse_distance_matrix(tree, self.max_distance) + number_neighbours = ( + np.array(np.sum(distance_matrix > 0, axis=1)).reshape(-1) + 1 + ) + points_of_interest = np.where(number_neighbours >= self.min_neighbours)[0] + + for point in points_of_interest: + _, indexes = tree.query( + data.loc[notna_index[point], xyz], self.min_neighbours + ) + local_tree = KDTree(data.loc[indexes, xyz]) + distances = local_tree.sparse_distance_matrix( + local_tree, self.max_edge_distance, output_type="coo_matrix" + ) + graph = dgl.graph((distances.row, distances.col)) + + # TODO: Better fillna + local_data = data.loc[indexes, self.features].fillna(0) + assert not np.any(np.isnan(local_data)) + graph.ndata["x"] = torch.tensor(local_data.values) + graph.edata["a"] = torch.tensor(distances.data) + self.graphs.append(graph) + self.labels.append(data_notna.loc[point, self.target]) + def process(self): """Internal function for the DGLDataset. Process the folder to create the graphs.""" - xyz = ["x", "y", "z"] + self.graphs = [] self.labels = [] for data_file in os.listdir(self.data_folder): - split_on_dots = data_file.split(".") - if split_on_dots[-1] != "csv" and ".".join(split_on_dots[-2:]) != "csv.gz": - continue - data = pd.read_csv(os.path.join(self.data_folder, data_file)) - data = data.drop( - data[ - np.logical_or( - data.range > 100000, - np.logical_or(data.z > 10000, data.range < 5000), - ) - ].index - ).reset_index(drop=True) - na_index = data[data[self.target].isna()].index - data_notna = data.drop(na_index) - notna_index = data_notna.index - data_notna.reset_index(drop=True, inplace=True) - tree = KDTree(data.loc[:, xyz]) - tree_notna = KDTree(data_notna.loc[:, xyz]) - distance_matrix = tree_notna.sparse_distance_matrix(tree, self.max_distance) - number_neighbours = ( - np.array(np.sum(distance_matrix > 0, axis=1)).reshape(-1) + 1 - ) - points_of_interest = np.where(number_neighbours >= self.min_neighbours)[0] - - for point in points_of_interest: - _, indexes = tree.query( - data.loc[notna_index[point], xyz], self.min_neighbours - ) - local_tree = KDTree(data.loc[indexes, xyz]) - distances = local_tree.sparse_distance_matrix( - local_tree, self.max_edge_distance, output_type="coo_matrix" - ) - graph = dgl.graph((distances.row, distances.col)) - - # TODO: Better fillna - local_data = data.loc[indexes, self.features].fillna(0) - assert not np.any(np.isnan(local_data)) - graph.ndata["x"] = torch.tensor(local_data.values) - graph.edata["a"] = torch.tensor(distances.data) - self.graphs.append(graph) - self.labels.append(data_notna.loc[point, self.target]) + self._read_one_file(data_file) if len(self.graphs) == 0: raise ValueError("No graphs selected under rules passed") From 61602211db86749bdfc45619a58de9155f938075 Mon Sep 17 00:00:00 2001 From: Abel Soares Siqueira Date: Tue, 9 May 2023 13:32:24 +0000 Subject: [PATCH 3/6] Allow reading parquet --- bird_cloud_gnn/radar_dataset.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/bird_cloud_gnn/radar_dataset.py b/bird_cloud_gnn/radar_dataset.py index d44e99c..b5d8251 100644 --- a/bird_cloud_gnn/radar_dataset.py +++ b/bird_cloud_gnn/radar_dataset.py @@ -86,9 +86,15 @@ def _read_one_file(self, data_file): xyz = ["x", "y", "z"] split_on_dots = data_file.split(".") - if split_on_dots[-1] != "csv" and ".".join(split_on_dots[-2:]) != "csv.gz": + if ( + split_on_dots[-1] not in ["csv", "parquet"] + and ".".join(split_on_dots[-2:]) != "csv.gz" + ): return - data = pd.read_csv(os.path.join(self.data_folder, data_file)) + if split_on_dots[-1] == "parquet": + data = pd.read_parquet(os.path.join(self.data_folder, data_file)) + else: + data = pd.read_csv(os.path.join(self.data_folder, data_file)) data = data.drop( data[ np.logical_or( From 977f55dbeda8dcdb7a22ceafa80516e0fa41ecb6 Mon Sep 17 00:00:00 2001 From: Abel Soares Siqueira Date: Tue, 9 May 2023 13:35:00 +0000 Subject: [PATCH 4/6] Change how the number_neighbours is computed to improve speed --- bird_cloud_gnn/radar_dataset.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/bird_cloud_gnn/radar_dataset.py b/bird_cloud_gnn/radar_dataset.py index b5d8251..4891286 100644 --- a/bird_cloud_gnn/radar_dataset.py +++ b/bird_cloud_gnn/radar_dataset.py @@ -109,10 +109,11 @@ def _read_one_file(self, data_file): data_notna.reset_index(drop=True, inplace=True) tree = KDTree(data.loc[:, xyz]) tree_notna = KDTree(data_notna.loc[:, xyz]) - distance_matrix = tree_notna.sparse_distance_matrix(tree, self.max_distance) - number_neighbours = ( - np.array(np.sum(distance_matrix > 0, axis=1)).reshape(-1) + 1 + distance_matrix = tree_notna.sparse_distance_matrix( + tree, self.max_distance, output_type="coo_matrix" ) + + number_neighbours = distance_matrix.getnnz(1) points_of_interest = np.where(number_neighbours >= self.min_neighbours)[0] for point in points_of_interest: From 898406e94c1c76da15d539bd2d23bbd7c52e7326 Mon Sep 17 00:00:00 2001 From: Abel Soares Siqueira Date: Tue, 9 May 2023 13:59:00 +0000 Subject: [PATCH 5/6] Speed up RadarDataset by moving tree.query out of the loop --- bird_cloud_gnn/radar_dataset.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/bird_cloud_gnn/radar_dataset.py b/bird_cloud_gnn/radar_dataset.py index 4891286..d680860 100644 --- a/bird_cloud_gnn/radar_dataset.py +++ b/bird_cloud_gnn/radar_dataset.py @@ -105,7 +105,6 @@ def _read_one_file(self, data_file): ).reset_index(drop=True) na_index = data[data[self.target].isna()].index data_notna = data.drop(na_index) - notna_index = data_notna.index data_notna.reset_index(drop=True, inplace=True) tree = KDTree(data.loc[:, xyz]) tree_notna = KDTree(data_notna.loc[:, xyz]) @@ -116,11 +115,14 @@ def _read_one_file(self, data_file): number_neighbours = distance_matrix.getnnz(1) points_of_interest = np.where(number_neighbours >= self.min_neighbours)[0] - for point in points_of_interest: - _, indexes = tree.query( - data.loc[notna_index[point], xyz], self.min_neighbours - ) - local_tree = KDTree(data.loc[indexes, xyz]) + _, poi_indexes = tree.query( + data_notna.loc[points_of_interest, xyz], self.min_neighbours + ) + self.labels = np.concatenate( + (self.labels, data_notna[self.target].values[points_of_interest]) + ) + for _, indexes in enumerate(poi_indexes): + local_tree = KDTree(data.loc[indexes, xyz]) # slow distances = local_tree.sparse_distance_matrix( local_tree, self.max_edge_distance, output_type="coo_matrix" ) @@ -132,13 +134,12 @@ def _read_one_file(self, data_file): graph.ndata["x"] = torch.tensor(local_data.values) graph.edata["a"] = torch.tensor(distances.data) self.graphs.append(graph) - self.labels.append(data_notna.loc[point, self.target]) def process(self): """Internal function for the DGLDataset. Process the folder to create the graphs.""" self.graphs = [] - self.labels = [] + self.labels = np.array([]) for data_file in os.listdir(self.data_folder): self._read_one_file(data_file) From e3b777272a7aaade4de8856d591c632968971ab6 Mon Sep 17 00:00:00 2001 From: Abel Soares Siqueira Date: Tue, 9 May 2023 14:20:25 +0000 Subject: [PATCH 6/6] Improve RadarDataset speed storing extra structures outside loop --- bird_cloud_gnn/radar_dataset.py | 30 +++++++++++++++++++++--------- 1 file changed, 21 insertions(+), 9 deletions(-) diff --git a/bird_cloud_gnn/radar_dataset.py b/bird_cloud_gnn/radar_dataset.py index d680860..49b9b1b 100644 --- a/bird_cloud_gnn/radar_dataset.py +++ b/bird_cloud_gnn/radar_dataset.py @@ -103,11 +103,24 @@ def _read_one_file(self, data_file): ) ].index ).reset_index(drop=True) + + data_xyz = data[xyz] + data_features = data[self.features] + na_index = data[data[self.target].isna()].index - data_notna = data.drop(na_index) - data_notna.reset_index(drop=True, inplace=True) - tree = KDTree(data.loc[:, xyz]) - tree_notna = KDTree(data_notna.loc[:, xyz]) + + data_xyz_notna = data_xyz.drop(na_index) + data_features_notna = data_features.drop(na_index) + + data_target = data[self.target] + data_target_notna = data_target[data_xyz_notna.index] + + data_xyz_notna.reset_index(drop=True, inplace=True) + data_features_notna.reset_index(drop=True, inplace=True) + + tree = KDTree(data_xyz) + tree_notna = KDTree(data_xyz_notna) + distance_matrix = tree_notna.sparse_distance_matrix( tree, self.max_distance, output_type="coo_matrix" ) @@ -116,21 +129,20 @@ def _read_one_file(self, data_file): points_of_interest = np.where(number_neighbours >= self.min_neighbours)[0] _, poi_indexes = tree.query( - data_notna.loc[points_of_interest, xyz], self.min_neighbours + data_xyz_notna.loc[points_of_interest], self.min_neighbours ) self.labels = np.concatenate( - (self.labels, data_notna[self.target].values[points_of_interest]) + (self.labels, data_target_notna.values[points_of_interest]) ) for _, indexes in enumerate(poi_indexes): - local_tree = KDTree(data.loc[indexes, xyz]) # slow + local_tree = KDTree(data_xyz.iloc[indexes]) # slow distances = local_tree.sparse_distance_matrix( local_tree, self.max_edge_distance, output_type="coo_matrix" ) graph = dgl.graph((distances.row, distances.col)) # TODO: Better fillna - local_data = data.loc[indexes, self.features].fillna(0) - assert not np.any(np.isnan(local_data)) + local_data = data_features.iloc[indexes].fillna(0) graph.ndata["x"] = torch.tensor(local_data.values) graph.edata["a"] = torch.tensor(distances.data) self.graphs.append(graph)