capitalone · JGSweets · Jul 11, 2022 · Jun 29, 2022 · Jun 29, 2022 · Jun 30, 2022
@@ -0,0 +1,114 @@
+import csv
+
+import networkx as nx
+
+from .base_data import BaseData
+from .csv_data import CSVData
+
+class GraphData(BaseData):
+    """
+    Data reader for graphs stored as delimited edge-list files.
+
+    A file is treated as a graph when its header row has one column whose
+    name marks a source node and one whose name marks a destination node
+    (see ``is_match``).
+    """
+
+    # Separators that must sit directly before or after a keyword inside a
+    # column name (``node_src``, ``dst.node``) for the keyword to count.
+    _COLUMN_NAME_SYMBOLS = ("_", ".", "-")
+    # Default keywords marking a source / destination node column.
+    _SOURCE_KEYWORDS = ("source", "src", "origin")
+    _DESTINATION_KEYWORDS = ("target", "destination", "dst")
+    # Delimiter used whenever the options do not provide one.
+    _DEFAULT_DELIMITER = ","
+
+    def __init__(self, input_file_path=None, data=None, options=None):
+        """
+        Initialize the reader and fill in every graph option not supplied.
+
+        :param input_file_path: path to the edge-list file
+        :param data: passed through unchanged to ``BaseData.__init__``
+        :param options: dict of reader options; entries that are missing or
+            ``None`` for ``delimiter``, ``column_names``, ``source_list``,
+            ``destination_list``, ``source_node`` and ``destination_node``
+            are filled in place
+        """
+        # Normalize first so BaseData receives the same dict object that the
+        # defaults below are written into.
+        if options is None:
+            options = dict()
+
+        BaseData.__init__(self, input_file_path, data, options)
+
+        if options.get("delimiter") is None:
+            options.update(delimiter=self._DEFAULT_DELIMITER)
+        if options.get("column_names") is None:
+            # Without a file there is no header row to read.
+            if self.input_file_path is None:
+                column_names = []
+            else:
+                column_names = self.csv_column_names(
+                    self.input_file_path, options
+                )
+            options.update(column_names=column_names)
+        if options.get("source_list") is None:
+            options.update(source_list=list(self._SOURCE_KEYWORDS))
+        if options.get("destination_list") is None:
+            options.update(destination_list=list(self._DESTINATION_KEYWORDS))
+        if options.get("source_node") is None:
+            options.update(
+                source_node=self._find_target_string_in_column(
+                    options["column_names"], options["source_list"]
+                )
+            )
+        if options.get("destination_node") is None:
+            options.update(
+                destination_node=self._find_target_string_in_column(
+                    options["column_names"], options["destination_list"]
+                )
+            )
+
+        # TODO(review): the data itself is not loaded here; the call to
+        # self._load_data() is still disabled.
+
+    @classmethod
+    def _find_target_string_in_column(cls, column_names, keyword_list):
+        """
+        Find the first column whose name contains a delimited keyword.
+
+        A keyword only counts when one of the column-name separators
+        (``_``, ``.``, ``-``) sits directly before or after it, so ``node_src``
+        matches the keyword ``src`` while a bare ``src`` column does not.
+
+        :param column_names: list of column names to search
+        :param keyword_list: keywords to look for
+        :return: index of the first matching column, or -1 if none matches
+        """
+        if not column_names or not keyword_list:
+            return -1
+
+        # Every keyword with each separator attached on either side.
+        candidates = [
+            affixed
+            for keyword in keyword_list
+            for symbol in cls._COLUMN_NAME_SYMBOLS
+            for affixed in (symbol + keyword, keyword + symbol)
+        ]
+
+        for index, column_name in enumerate(column_names):
+            if any(candidate in column_name for candidate in candidates):
+                return index
+
+        return -1
+
+    @classmethod
+    def csv_column_names(cls, file_path, options):
+        """
+        Fetch the column names from the first row of a delimited file.
+
+        :param file_path: path of the file to read
+        :param options: reader options; ``delimiter`` is used when set,
+            otherwise ``,``
+        :return: list of column names with all spaces removed; empty when
+            the file has no rows
+        """
+        delimiter = (options or {}).get("delimiter") or cls._DEFAULT_DELIMITER
+
+        # newline="" is what the csv module asks for when opening files.
+        with open(file_path, newline="") as csv_file:
+            # Only the header row is needed; an empty file yields no row.
+            header = next(csv.reader(csv_file, delimiter=delimiter), [])
+
+        return [name.replace(" ", "") for name in header]
+
+    @classmethod
+    def is_match(cls, file_path, options=None):
+        """
+        Determine whether the file is a graph.
+
+        Current formats checked:
+            - attributed edge list
+
+        The file must first be accepted by ``CSVData.is_match``; it is then
+        a graph when its header has both a source and a destination node
+        column. On a match, ``delimiter``, ``source_node``,
+        ``destination_node``, ``source_list`` and ``destination_list`` are
+        written into ``options``.
+
+        :param file_path: path of the file to check
+        :param options: reader options dict, updated in place on a match
+        :return: True if the file looks like a graph, False otherwise
+        """
+        if options is None:
+            options = dict()
+
+        # An edge list is a delimited file first: anything CSVData rejects
+        # cannot be a graph.
+        if not CSVData.is_match(file_path, options):
+            return False
+
+        column_names = cls.csv_column_names(file_path, options)
+        source_index = cls._find_target_string_in_column(
+            column_names, cls._SOURCE_KEYWORDS
+        )
+        destination_index = cls._find_target_string_in_column(
+            column_names, cls._DESTINATION_KEYWORDS
+        )
+
+        if source_index < 0 or destination_index < 0:
+            return False
+
+        options.update(
+            # Keep the delimiter the header was actually parsed with.
+            delimiter=options.get("delimiter") or cls._DEFAULT_DELIMITER,
+            source_node=source_index,
+            destination_node=destination_index,
+            destination_list=list(cls._DESTINATION_KEYWORDS),
+            source_list=list(cls._SOURCE_KEYWORDS),
+        )
+        return True
@@ -0,0 +1 @@
+{"name":"John", "age":30, "car":null}
@@ -0,0 +1,4 @@
+node_id, node_id, attrib_id, attrib_type, edge_date, open_date, open_date
+1, 2, 0,0,0,0,0
+3, 1, 0,0,0,0,0
+4, 2, 0,0,0,0,0
@@ -0,0 +1 @@
+node_id_dst, node_id_src, dst_node, src_node, weight, edge_attribute1, edge_attribute2, edge_attribute3, edge_destination_attribute1
@@ -0,0 +1,4 @@
+node_id_dst, node_id_src,attrib_id,attrib_type,edge_date,open_date_src,open_date_dst
+1, 2, 0,0,0,0,0
+3, 1, 0,0,0,0,0
+4, 2, 0,0,0,0,0
@@ -0,0 +1,84 @@
+import os
+import unittest
+from io import BytesIO, StringIO, TextIOWrapper
+
+import networkx as nx
+
+from dataprofiler.data_readers.data_utils import is_stream_buffer
+from dataprofiler.data_readers.graph_data import GraphData
+
+# Parent of the directory that holds this test module; fixture paths are
+# built relative to it.
+test_root_path = os.path.dirname(
+    os.path.dirname(os.path.realpath(__file__))
+)
+
+
+class TestGraphDataClass(unittest.TestCase):
+    """Tests for GraphData keyword search, header parsing and is_match."""
+
+    @classmethod
+    def setUpClass(cls):
+        """Build the fixture list; the tests index into it by position."""
+        test_dir = os.path.join(test_root_path, 'data')
+        fixture_names = [
+            'csv/graph-differentiator-input-positive.csv',
+            'csv/graph-differentiator-input-standard-positive.csv',
+            'csv/graph-differentiator-input-negative.csv',
+            'csv/graph-data-input-json.json',
+        ]
+        cls.input_file_names = [
+            dict(path=os.path.join(test_dir, name)) for name in fixture_names
+        ]
+
+    def test_finding_string_in_column_positive(self):
+        """
+        Determine whether keywords can be detected with underscore before and after
+        """
+        column_names_after = ['node_src', 'node_dst', 'attribute1']
+        column_names_before = ['src_node', 'dst_node', 'attribute1']
+        keyword_list = ["src", "destination"]
+        self.assertEqual(
+            GraphData._find_target_string_in_column(
+                column_names_after, keyword_list
+            ),
+            0,
+        )
+        self.assertEqual(
+            GraphData._find_target_string_in_column(
+                column_names_before, keyword_list
+            ),
+            0,
+        )
+
+    def test_finding_string_in_column_negative(self):
+        """
+        Determine whether -1 is returned when keywords are not found or
+        appear without a substring delimiter
+        """
+        column_names_no_keywords = ['movie', 'audience_type', 'audience_source']
+        column_names_no_delimiter = ['flight_number', 'destination', 'price']
+        keyword_list = ['dst', 'destination', 'target']
+        self.assertEqual(
+            GraphData._find_target_string_in_column(
+                column_names_no_keywords, keyword_list
+            ),
+            -1,
+        )
+        self.assertEqual(
+            GraphData._find_target_string_in_column(
+                column_names_no_delimiter, keyword_list
+            ),
+            -1,
+        )
+
+    def test_csv_column_names(self):
+        """
+        Determine if column names are fetched correctly and in the right format
+        """
+        column_names = [
+            'node_id_dst', 'node_id_src', 'attrib_id', 'attrib_type',
+            'edge_date', 'open_date_src', 'open_date_dst',
+        ]
+        input_file = self.input_file_names[1]['path']
+        options = {"header": True, "delimiter": ","}
+        self.assertEqual(
+            GraphData.csv_column_names(input_file, options), column_names
+        )
+
+    def test_is_graph_positive_1(self):
+        """
+        Determine if the input CSV file can automatically be recognized as being a graph
+        """
+        input_file_1 = self.input_file_names[0]['path']
+        input_file_2 = self.input_file_names[1]['path']
+        # is_match writes into the options it is given, so each call gets
+        # its own dict.
+        options_1 = {"header": True, "delimiter": ","}
+        options_2 = {"header": True, "delimiter": ","}
+        self.assertTrue(GraphData.is_match(input_file_1, options_1))
+        self.assertTrue(GraphData.is_match(input_file_2, options_2))
+
+    def test_is_graph_negative_1(self):
+        """
+        Determine if the input file can be automatically recognized as not being a graph
+        """
+        positive_file = self.input_file_names[1]['path']
+        negative_file = self.input_file_names[2]['path']
+        json_file = self.input_file_names[3]['path']
+        options = {"header": True, "delimiter": ","}
+        options_no_header = {"header": False, "delimiter": ","}
+        self.assertFalse(GraphData.is_match(negative_file, options))
+        self.assertFalse(GraphData.is_match(json_file, options))
+        # NOTE(review): GraphData.is_match does not read the "header" option
+        # itself, so this result depends on how CSVData.is_match treats
+        # header=False - TODO confirm.
+        self.assertFalse(GraphData.is_match(positive_file, options_no_header))
+
+# Run the test suite when this module is executed directly.
+if __name__ == "__main__":
+    unittest.main()