Add a class to differentiate between Tabular and Graph CSV files (#517)
* use networkx to differentiate graph

* use networkx to differentiate graph

* add tests, update is_match

* simplify graph differentiator, add tests

* remove extra comments, clean up imports

* add input options handling, add tests, reformat and condense file

* remove outdated test file

* add EOF new line

* cleanup, make is_match a classmethod

* format test data file

* cleanup test file

* cleanup GraphData, integrated options

* options now updated in is_match, CSVData.is_match call is properly executing in Graph Data (issue with csv files), tests were cleaned up

* remove superfluous code

* remove headers, remove update on delimiter in is_match

* add networkx to requirements

* add networkx 2.5.1 to requirements

Co-authored-by: Taylor Turner <[email protected]>
MisterPNP and taylorfturner authored Jul 11, 2022
1 parent 358e22a commit 89f69a2
Showing 7 changed files with 209 additions and 0 deletions.
96 changes: 96 additions & 0 deletions dataprofiler/data_readers/graph_data.py
@@ -0,0 +1,96 @@
import csv

import networkx as nx
from numpy import source

from .base_data import BaseData
from .csv_data import CSVData
from .filepath_or_buffer import FileOrBufferHandler


class GraphData(BaseData):

    def __init__(self, input_file_path=None, data=None, options=None):
        if options is None:
            options = dict()
        BaseData.__init__(self, input_file_path, data, options)

    @classmethod
    def _find_target_string_in_column(cls, column_names, keyword_list):
        '''
        Find whether one of the column names contains a keyword that could refer to a target node column
        '''
        column_name_symbols = ['_', '.', '-']
        has_target = False
        target_index = -1

        # iterate through columns, keywords, and column name symbols to see if any permutation is contained in the column names
        for column in range(0, len(column_names)):
            for keyword in keyword_list:
                for symbol in column_name_symbols:

                    append_start_word = symbol + keyword
                    append_end_word = keyword + symbol

                    if append_start_word in column_names[column] or append_end_word in column_names[column]:
                        target_index = column
                        has_target = True
                        break
                if has_target:
                    break

        return target_index

    @classmethod
    def csv_column_names(cls, file_path, options):
        '''
        Fetches a list of column names from the csv file
        '''
        column_names = []

        with FileOrBufferHandler(file_path) as csv_file:
            csv_reader = csv.reader(csv_file, delimiter=options.get("delimiter", ","))

            # fetch only column names
            for row in csv_reader:
                column_names.append(row)
                break
            column_names = column_names[0]

        # replace all whitespaces in the column names
        for index in range(0, len(column_names)):
            column_names[index] = column_names[index].replace(" ", "")

        return column_names

    @classmethod
    def is_match(cls, file_path, options=None):
        '''
        Determines whether the file is a graph.
        Current formats checked:
            - attributed edge list
        This works by finding whether the file contains both a source and a target node column.
        '''
        if options is None:
            options = dict()
        if not CSVData.is_match(file_path, options):
            return False
        column_names = cls.csv_column_names(file_path, options)
        source_keywords = ['source', 'src', 'origin']
        target_keywords = ['target', 'destination', 'dst']
        source_index = cls._find_target_string_in_column(column_names, source_keywords)
        destination_index = cls._find_target_string_in_column(column_names, target_keywords)
        has_source = source_index >= 0
        has_target = destination_index >= 0

        if has_target and has_source:
            options.update(source_node=source_index)
            options.update(destination_node=destination_index)
            options.update(destination_list=target_keywords)
            options.update(source_list=source_keywords)
            options.update(column_name=column_names)
            return True

        return False
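
For illustration, a minimal sketch (not part of this commit) of how the column-keyword heuristic above behaves; the column names here are hypothetical, and the result follows from _find_target_string_in_column as defined above:

from dataprofiler.data_readers.graph_data import GraphData

columns = ["node_id_dst", "node_id_src", "attrib_id"]
source_keywords = ["source", "src", "origin"]
target_keywords = ["target", "destination", "dst"]

# "_src" occurs in "node_id_src" and "_dst" occurs in "node_id_dst",
# so both lookups find a column and is_match would treat the file as a graph
src_idx = GraphData._find_target_string_in_column(columns, source_keywords)
dst_idx = GraphData._find_target_string_in_column(columns, target_keywords)
print(src_idx, dst_idx)  # 1 0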
1 change: 1 addition & 0 deletions dataprofiler/tests/data/csv/graph-data-input-json.json
@@ -0,0 +1 @@
{"name":"John", "age":30, "car":null}
4 changes: 4 additions & 0 deletions dataprofiler/tests/data/csv/graph-differentiator-input-negative.csv
@@ -0,0 +1,4 @@
node_id, node_id, attrib_id, attrib_type, edge_date, open_date, open_date
1, 2, 0,0,0,0,0
3, 1, 0,0,0,0,0
4, 2, 0,0,0,0,0
10 changes: 10 additions & 0 deletions dataprofiler/tests/data/csv/graph-differentiator-input-positive.csv
@@ -0,0 +1,10 @@
node_id_dst, node_id_src, dst_node, src_node, weight, edge_attribute1, edge_attribute2, edge_attribute3, edge_destination_attribute1
1,2,0,0,0,0,0,0,0
2,3,0,0,0,0,0,0,0
2,4,0,0,0,0,0,0,0
2,5,0,0,0,0,0,0,0
2,6,0,0,0,0,0,0,0
2,7,0,0,0,0,0,0,0
2,8,0,0,0,0,0,0,0
2,9,0,0,0,0,0,0,0
2,10,0,0,0,0,0,0,0
4 changes: 4 additions & 0 deletions dataprofiler/tests/data/csv/graph-differentiator-input-standard-positive.csv
@@ -0,0 +1,4 @@
node_id_dst, node_id_src,attrib_id,attrib_type,edge_date,open_date_src,open_date_dst
1, 2, 0,0,0,0,0
3, 1, 0,0,0,0,0
4, 2, 0,0,0,0,0
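
A short usage sketch (assuming the repository root as the working directory and the standard-positive fixture above) of how is_match consumes one of these data files and records the detected columns in the options dict:

from dataprofiler.data_readers.graph_data import GraphData

options = {}
path = "dataprofiler/tests/data/csv/graph-differentiator-input-standard-positive.csv"

if GraphData.is_match(path, options):
    # is_match fills in the detected source/destination column indices as a side effect
    print(options["source_node"], options["destination_node"])  # 1 0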
93 changes: 93 additions & 0 deletions dataprofiler/tests/data_readers/test_csv_graph_data.py
@@ -0,0 +1,93 @@
import os
import unittest
from io import BytesIO, StringIO, TextIOWrapper

import networkx as nx

from dataprofiler.data_readers.graph_data import GraphData

test_root_path = os.path.dirname(os.path.dirname(os.path.realpath(__file__)))


class TestGraphDataClass(unittest.TestCase):

    @classmethod
    def setUpClass(cls):

        test_dir = os.path.join(test_root_path, 'data')
        cls.input_file_names_pos = [
            dict(path=os.path.join(test_dir, 'csv/graph-differentiator-input-positive.csv'), encoding='utf-8'),
            dict(path=os.path.join(test_dir, 'csv/graph-differentiator-input-standard-positive.csv'), encoding='utf-8'),
        ]

        cls.input_file_names_neg = [
            dict(path=os.path.join(test_dir, 'csv/graph-differentiator-input-negative.csv'), encoding='utf-8'),
            dict(path=os.path.join(test_dir, 'csv/graph-data-input-json.json'), encoding='utf-8'),
        ]

        cls.buffer_list = []
        for input_file in cls.input_file_names_pos:
            # add StringIO
            buffer_info = input_file.copy()
            with open(input_file["path"], "r", encoding=input_file["encoding"]) as fp:
                buffer_info["path"] = StringIO(fp.read())
            cls.buffer_list.append(buffer_info)

            # add BytesIO
            buffer_info = input_file.copy()
            with open(input_file["path"], "rb") as fp:
                buffer_info["path"] = BytesIO(fp.read())
            cls.buffer_list.append(buffer_info)

        cls.file_or_buf_list = cls.input_file_names_pos + cls.buffer_list

    def test_finding_string_in_column_positive(self):
        '''
        Determine whether keywords can be detected with an underscore before or after them
        '''
        column_names_after = ['node_src', 'node_dst', 'attribute1']
        column_names_before = ['src_node', 'dst_node', 'attribute1']
        keyword_list = ["src", "destination"]
        self.assertEqual(GraphData._find_target_string_in_column(column_names_after, keyword_list), 0)
        self.assertEqual(GraphData._find_target_string_in_column(column_names_before, keyword_list), 0)

    def test_finding_string_in_column_negative(self):
        '''
        Determine whether -1 is returned when keywords are not found or appear without delimiter symbols
        '''
        column_names_no_keywords = ['movie', 'audience_type', 'audience_source']
        column_names_no_delimiter = ['flight_number', 'destination', 'price']
        keyword_list = ['dst', 'destination', 'target']
        self.assertEqual(GraphData._find_target_string_in_column(column_names_no_keywords, keyword_list), -1)
        self.assertEqual(GraphData._find_target_string_in_column(column_names_no_delimiter, keyword_list), -1)

    # test csv_column_names
    def test_csv_column_names(self):
        """
        Determine if column names are fetched correctly and in the right format
        """
        column_names = ['node_id_dst', 'node_id_src', 'attrib_id', 'attrib_type',
                        'edge_date', 'open_date_src', 'open_date_dst']
        input_file = self.input_file_names_pos[1]['path']
        options = {"delimiter": ","}
        self.assertEqual(GraphData.csv_column_names(input_file, options), column_names)

    # test is_match for true output w/ different options
    def test_is_graph_positive_1(self):
        """
        Determine if the input CSV file can automatically be recognized as being a graph
        """
        for input_file in self.file_or_buf_list:
            self.assertTrue(GraphData.is_match(input_file["path"]))

    # test is_match for false output w/ different options
    def test_is_graph_negative_1(self):
        """
        Determine if the input CSV file can be automatically recognized as not being a graph w/ no options selected
        """
        for input_file in self.input_file_names_neg:
            self.assertFalse(GraphData.is_match(input_file["path"]))


if __name__ == '__main__':
    unittest.main()
1 change: 1 addition & 0 deletions requirements.txt
@@ -14,3 +14,4 @@
charset-normalizer>=1.3.6
psutil>=4.0.0
scipy>=1.4.1
requests==2.27.1
networkx==2.5.1
