Add a class to differentiate between Tabular and Graph CSV files (#517)
* use networkx to differentiate graph

* use networkx to differentiate graph

* add tests, update is_match

* simplify graph differentiator, add tests

* remove extra comments, clean up imports

* add input options handling, add tests, reformat and condense file

* remove outdated test file

* add EOF new line

* cleanup, make is_match a classmethod

* format test data file

* cleanup test file

* cleanup GraphData, integrated options

* options now updated in is_match, CSVData.is_match call is properly executing in Graph Data (issue with csv files), tests were cleaned up

* remove superfluous code

* remove headers, remove update on delimiter in is_match

* add networkx to requirements

* add networkx 2.5.1 to requirements

Co-authored-by: Taylor Turner <[email protected]>
MisterPNP and taylorfturner authored Jul 11, 2022
1 parent 358e22a commit 89f69a2
Showing 7 changed files with 209 additions and 0 deletions.
96 changes: 96 additions & 0 deletions dataprofiler/data_readers/graph_data.py
@@ -0,0 +1,96 @@
import csv

import networkx as nx
from numpy import source

from .base_data import BaseData
from .csv_data import CSVData
from .filepath_or_buffer import FileOrBufferHandler


class GraphData(BaseData):

    def __init__(self, input_file_path=None, data=None, options=None):
        if options is None:
            options = dict()
        BaseData.__init__(self, input_file_path, data, options)

    @classmethod
    def _find_target_string_in_column(cls, column_names, keyword_list):
        '''
        Find whether one of the column names contains a keyword that could refer to a target node column
        '''
        column_name_symbols = ['_', '.', '-']
        has_target = False
        target_index = -1

        # iterate through columns, keywords, and column name symbols to see if any permutation is contained in the column names
        for column in range(0, len(column_names)):
            for keyword in keyword_list:
                for symbol in column_name_symbols:

                    append_start_word = symbol + keyword
                    append_end_word = keyword + symbol

                    if append_start_word in column_names[column] or append_end_word in column_names[column]:
                        target_index = column
                        has_target = True
                        break
                if has_target:
                    break

        return target_index

    @classmethod
    def csv_column_names(cls, file_path, options):
        '''
        Fetches a list of column names from the csv file
        '''
        column_names = []

        with FileOrBufferHandler(file_path) as csv_file:
            csv_reader = csv.reader(csv_file, delimiter=options.get("delimiter", ","))

            # fetch only column names
            for row in csv_reader:
                column_names.append(row)
                break
            column_names = column_names[0]

        # replace all whitespaces in the column names
        for index in range(0, len(column_names)):
            column_names[index] = column_names[index].replace(" ", "")

        return column_names

    @classmethod
    def is_match(cls, file_path, options=None):
        '''
        Determines whether the file is a graph.
        Current formats checked:
            - attributed edge list
        This works by finding whether the file contains both a source and a target node column.
        '''
        if options is None:
            options = dict()
        if not CSVData.is_match(file_path, options):
            return False
        column_names = cls.csv_column_names(file_path, options)
        source_keywords = ['source', 'src', 'origin']
        target_keywords = ['target', 'destination', 'dst']
        source_index = cls._find_target_string_in_column(column_names, source_keywords)
        destination_index = cls._find_target_string_in_column(column_names, target_keywords)
        has_source = source_index >= 0
        has_target = destination_index >= 0

        if has_target and has_source:
            options.update(source_node=source_index)
            options.update(destination_node=destination_index)
            options.update(destination_list=target_keywords)
            options.update(source_list=source_keywords)
            options.update(column_name=column_names)
            return True

        return False
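
For illustration, a minimal sketch (not part of this commit) of how the column-keyword heuristic above behaves; the column names here are hypothetical, and the result follows from _find_target_string_in_column as defined above:

from dataprofiler.data_readers.graph_data import GraphData

columns = ["node_id_dst", "node_id_src", "attrib_id"]
source_keywords = ["source", "src", "origin"]
target_keywords = ["target", "destination", "dst"]

# "_src" occurs in "node_id_src" and "_dst" occurs in "node_id_dst",
# so both lookups find a column and is_match would treat the file as a graph
src_idx = GraphData._find_target_string_in_column(columns, source_keywords)
dst_idx = GraphData._find_target_string_in_column(columns, target_keywords)
print(src_idx, dst_idx)  # 1 0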
1 change: 1 addition & 0 deletions dataprofiler/tests/data/csv/graph-data-input-json.json
@@ -0,0 +1 @@
{"name":"John", "age":30, "car":null}
4 changes: 4 additions & 0 deletions dataprofiler/tests/data/csv/graph-differentiator-input-negative.csv
@@ -0,0 +1,4 @@
node_id, node_id, attrib_id, attrib_type, edge_date, open_date, open_date
1, 2, 0,0,0,0,0
3, 1, 0,0,0,0,0
4, 2, 0,0,0,0,0
10 changes: 10 additions & 0 deletions dataprofiler/tests/data/csv/graph-differentiator-input-positive.csv
@@ -0,0 +1,10 @@
node_id_dst, node_id_src, dst_node, src_node, weight, edge_attribute1, edge_attribute2, edge_attribute3, edge_destination_attribute1
1,2,0,0,0,0,0,0,0
2,3,0,0,0,0,0,0,0
2,4,0,0,0,0,0,0,0
2,5,0,0,0,0,0,0,0
2,6,0,0,0,0,0,0,0
2,7,0,0,0,0,0,0,0
2,8,0,0,0,0,0,0,0
2,9,0,0,0,0,0,0,0
2,10,0,0,0,0,0,0,0
4 changes: 4 additions & 0 deletions dataprofiler/tests/data/csv/graph-differentiator-input-standard-positive.csv
@@ -0,0 +1,4 @@
node_id_dst, node_id_src,attrib_id,attrib_type,edge_date,open_date_src,open_date_dst
1, 2, 0,0,0,0,0
3, 1, 0,0,0,0,0
4, 2, 0,0,0,0,0
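
A short usage sketch (assuming the repository root as the working directory and the standard-positive fixture above) of how is_match consumes one of these data files and records the detected columns in the options dict:

from dataprofiler.data_readers.graph_data import GraphData

options = {}
path = "dataprofiler/tests/data/csv/graph-differentiator-input-standard-positive.csv"

if GraphData.is_match(path, options):
    # is_match fills in the detected source/destination column indices as a side effect
    print(options["source_node"], options["destination_node"])  # 1 0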
93 changes: 93 additions & 0 deletions dataprofiler/tests/data_readers/test_csv_graph_data.py
@@ -0,0 +1,93 @@
import os
import unittest
from io import BytesIO, StringIO, TextIOWrapper

import networkx as nx

from dataprofiler.data_readers.graph_data import GraphData

test_root_path = os.path.dirname(os.path.dirname(os.path.realpath(__file__)))


class TestGraphDataClass(unittest.TestCase):

    @classmethod
    def setUpClass(cls):

        test_dir = os.path.join(test_root_path, 'data')
        cls.input_file_names_pos = [
            dict(path=os.path.join(test_dir, 'csv/graph-differentiator-input-positive.csv'), encoding='utf-8'),
            dict(path=os.path.join(test_dir, 'csv/graph-differentiator-input-standard-positive.csv'), encoding='utf-8'),
        ]

        cls.input_file_names_neg = [
            dict(path=os.path.join(test_dir, 'csv/graph-differentiator-input-negative.csv'), encoding='utf-8'),
            dict(path=os.path.join(test_dir, 'csv/graph-data-input-json.json'), encoding='utf-8'),
        ]

        cls.buffer_list = []
        for input_file in cls.input_file_names_pos:
            # add StringIO
            buffer_info = input_file.copy()
            with open(input_file["path"], "r", encoding=input_file["encoding"]) as fp:
                buffer_info["path"] = StringIO(fp.read())
            cls.buffer_list.append(buffer_info)

            # add BytesIO
            buffer_info = input_file.copy()
            with open(input_file["path"], "rb") as fp:
                buffer_info["path"] = BytesIO(fp.read())
            cls.buffer_list.append(buffer_info)

        cls.file_or_buf_list = cls.input_file_names_pos + cls.buffer_list

    def test_finding_string_in_column_positive(self):
        '''
        Determine whether keywords can be detected with an underscore before or after them
        '''
        column_names_after = ['node_src', 'node_dst', 'attribute1']
        column_names_before = ['src_node', 'dst_node', 'attribute1']
        keyword_list = ["src", "destination"]
        self.assertEqual(GraphData._find_target_string_in_column(column_names_after, keyword_list), 0)
        self.assertEqual(GraphData._find_target_string_in_column(column_names_before, keyword_list), 0)

    def test_finding_string_in_column_negative(self):
        '''
        Determine whether -1 is returned when keywords are not found or appear without delimiter symbols
        '''
        column_names_no_keywords = ['movie', 'audience_type', 'audience_source']
        column_names_no_delimiter = ['flight_number', 'destination', 'price']
        keyword_list = ['dst', 'destination', 'target']
        self.assertEqual(GraphData._find_target_string_in_column(column_names_no_keywords, keyword_list), -1)
        self.assertEqual(GraphData._find_target_string_in_column(column_names_no_delimiter, keyword_list), -1)

    # test csv_column_names
    def test_csv_column_names(self):
        """
        Determine if column names are fetched correctly and in the right format
        """
        column_names = ['node_id_dst', 'node_id_src', 'attrib_id', 'attrib_type',
                        'edge_date', 'open_date_src', 'open_date_dst']
        input_file = self.input_file_names_pos[1]['path']
        options = {"delimiter": ","}
        self.assertEqual(GraphData.csv_column_names(input_file, options), column_names)

    # test is_match for true output w/ different options
    def test_is_graph_positive_1(self):
        """
        Determine if the input CSV file can automatically be recognized as being a graph
        """
        for input_file in self.file_or_buf_list:
            self.assertTrue(GraphData.is_match(input_file["path"]))

    # test is_match for false output w/ different options
    def test_is_graph_negative_1(self):
        """
        Determine if the input CSV file can be automatically recognized as not being a graph w/ no options selected
        """
        for input_file in self.input_file_names_neg:
            self.assertFalse(GraphData.is_match(input_file["path"]))


if __name__ == '__main__':
    unittest.main()
1 change: 1 addition & 0 deletions requirements.txt
@@ -14,3 +14,4 @@
charset-normalizer>=1.3.6
psutil>=4.0.0
scipy>=1.4.1
requests==2.27.1
networkx==2.5.1
