Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add a class to differentiate between Tabular and Graph CSV files #517

Merged
merged 22 commits into from
Jul 11, 2022
Merged
Show file tree
Hide file tree
Changes from 16 commits
Commits
Show all changes
22 commits
Select commit Hold shift + click to select a range
48361ca
use networkx to differentiate graph
MisterPNP Jun 29, 2022
04f7a28
use networkx to differentiate graph
MisterPNP Jun 29, 2022
da3d524
add tests, update is_match
MisterPNP Jun 30, 2022
7d72c6e
rebase, accept changes to tests
MisterPNP Jun 30, 2022
78b5803
simplify graph differentiator, add tests
MisterPNP Jul 7, 2022
c195341
Merge remote-tracking branch 'origin/main' into graph
MisterPNP Jul 7, 2022
fdd955c
remove extra comments, clean up imports/usr/bin/python3
MisterPNP Jul 7, 2022
2a484b0
add input options handling, add tests, reformat and condense file
MisterPNP Jul 7, 2022
833ef04
remove outdated test file
MisterPNP Jul 7, 2022
33d0f38
add EOF new line
MisterPNP Jul 7, 2022
e1cb04b
cleanup, make is_match a classmethod
MisterPNP Jul 8, 2022
968406f
format test data file
MisterPNP Jul 8, 2022
4f717eb
Merge branch 'main' into graph
taylorfturner Jul 8, 2022
3feb32d
cleanup test file
MisterPNP Jul 11, 2022
7636af7
cleanup GraphData, integrated options
MisterPNP Jul 11, 2022
fa954eb
Merge branch 'graph' of https://github.com/MisterPNP/DataProfiler int…
MisterPNP Jul 11, 2022
f2f7975
options now updated in is_match, CSVData.is_match call is properly ex…
MisterPNP Jul 11, 2022
353c710
remove superfluous code
MisterPNP Jul 11, 2022
5359ccf
remove headers, remove update on delimiter in is_match
MisterPNP Jul 11, 2022
81e2c53
add networkx to requirements
MisterPNP Jul 11, 2022
c226077
add networkx 2.5.1 to requirements
MisterPNP Jul 11, 2022
388e075
Merge branch 'main' into graph
MisterPNP Jul 11, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
114 changes: 114 additions & 0 deletions dataprofiler/data_readers/graph_data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,114 @@
import csv

import networkx as nx

from .base_data import BaseData
from .csv_data import CSVData

JGSweets marked this conversation as resolved.
Show resolved Hide resolved
class GraphData(BaseData):
    """Reader for CSV files that describe a graph.

    Current formats checked:
        - attributed edge list

    A file is treated as a graph when its header row contains both a
    column that looks like a source node and a column that looks like a
    destination node (see ``_find_target_string_in_column``).
    """

    # Keywords that mark a column as holding source / destination nodes.
    _SOURCE_KEYWORDS = ("source", "src", "origin")
    _DESTINATION_KEYWORDS = ("target", "destination", "dst")
    # Symbols expected to sit directly before or after a keyword inside a
    # column name, e.g. ``node_src`` or ``dst.node``.
    _COLUMN_NAME_SYMBOLS = ("_", ".", "-")

    def __init__(self, input_file_path=None, data=None, options=None):
        """Initialise the reader and fill in any missing graph options.

        :param input_file_path: path of the CSV file to read
        :param data: data handed straight to ``BaseData``
        :param options: optional dict; the keys read and defaulted here are
            ``delimiter``, ``column_names``, ``source_list``,
            ``destination_list``, ``source_node`` and ``destination_node``
        """
        # Normalise first so the base class and this class work on the same
        # dict (assumes BaseData treats an empty dict like None - TODO
        # confirm against BaseData).
        if options is None:
            options = dict()

        super().__init__(input_file_path, data, options)

        # NOTE(review): a reviewer flagged this option-defaulting logic as
        # something to refactor in a follow-up PR.
        if options.get("delimiter") is None:
            options.update(delimiter=",")
        # The header can only be sniffed when there is a file to open.
        if (
            options.get("column_names") is None
            and self.input_file_path is not None
        ):
            # Stored under the same key that is checked above; the previous
            # code wrote ``column_name`` and so ignored caller-supplied
            # ``column_names``.
            options.update(
                column_names=self.csv_column_names(
                    self.input_file_path, options
                )
            )
        if options.get("source_list") is None:
            options.update(source_list=list(self._SOURCE_KEYWORDS))
        if options.get("destination_list") is None:
            options.update(destination_list=list(self._DESTINATION_KEYWORDS))

        column_names = options.get("column_names")
        if options.get("source_node") is None:
            options.update(
                source_node=self._find_target_string_in_column(
                    column_names, options.get("source_list")
                )
            )
        if options.get("destination_node") is None:
            options.update(
                destination_node=self._find_target_string_in_column(
                    column_names, options.get("destination_list")
                )
            )

        # NOTE: loading the graph itself (``_load_data``) is not wired up
        # here yet.

    @classmethod
    def _find_target_string_in_column(cls, column_names, keyword_list):
        """Find the first column whose name contains a delimited keyword.

        A keyword only counts when it is directly preceded or followed by
        one of ``_``, ``.`` or ``-`` inside the column name, so a column
        named exactly ``destination`` does not match.

        :param column_names: sequence of column-name strings (may be empty
            or None)
        :param keyword_list: keywords to look for (may be empty or None)
        :return: index of the first matching column, or -1 if none matches
        """
        if not column_names or not keyword_list:
            return -1

        # Every "<symbol><keyword>" and "<keyword><symbol>" combination.
        candidates = [
            affixed
            for keyword in keyword_list
            for symbol in cls._COLUMN_NAME_SYMBOLS
            for affixed in (symbol + keyword, keyword + symbol)
        ]

        # Return on the first hit; the earlier flag-based breaks could not
        # leave all three nested loops, so scanning continued after a match.
        for index, name in enumerate(column_names):
            if any(candidate in name for candidate in candidates):
                return index
        return -1

    @classmethod
    def csv_column_names(cls, file_path, options):
        """Fetch the list of column names from the first row of a CSV file.

        :param file_path: path of the CSV file
        :param options: dict whose ``delimiter`` entry is used to split the
            row; falls back to ``","`` when missing or None
        :return: column names with all spaces removed; an empty list when
            the file has no rows
        """
        # csv.reader rejects delimiter=None, so always hand it a character.
        delimiter = (options or {}).get("delimiter") or ","

        # newline="" is what the csv module asks for when given a file.
        with open(file_path, newline="") as csv_file:
            # Only the header row is needed; default covers an empty file.
            header = next(csv.reader(csv_file, delimiter=delimiter), [])

        return [name.replace(" ", "") for name in header]

    @classmethod
    def is_match(cls, file_path, options=None):
        """Determine whether the file is a graph.

        Current formats checked:
            - attributed edge list

        This works by finding whether the file contains a target and a
        source node column. On a match ``options`` is updated in place with
        ``source_node``, ``destination_node``, ``source_list`` and
        ``destination_list`` (and ``delimiter`` when it was not set).

        :param file_path: path of the file to inspect
        :param options: optional dict of reader options
        :return: True if the file looks like a graph, otherwise False
        """
        if options is None:
            options = dict()

        # NOTE(review): the file has to be a readable CSV before it can be
        # an edge list, so anything CSVData rejects is not a graph. The
        # check was previously inverted.
        if not CSVData.is_match(file_path, options):
            return False

        column_names = cls.csv_column_names(file_path, options)
        source_index = cls._find_target_string_in_column(
            column_names, cls._SOURCE_KEYWORDS
        )
        destination_index = cls._find_target_string_in_column(
            column_names, cls._DESTINATION_KEYWORDS
        )
        if source_index < 0 or destination_index < 0:
            return False

        # Keep a delimiter the caller already chose instead of overwriting
        # it with the default.
        if options.get("delimiter") is None:
            options.update(delimiter=",")
        options.update(source_node=source_index)
        options.update(destination_node=destination_index)
        options.update(destination_list=list(cls._DESTINATION_KEYWORDS))
        options.update(source_list=list(cls._SOURCE_KEYWORDS))
        return True
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

new line EOF

1 change: 1 addition & 0 deletions dataprofiler/tests/data/csv/graph-data-input-json.json
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"name":"John", "age":30, "car":null}
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

json serializable should be a "null", no?

Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
node_id, node_id, attrib_id, attrib_type, edge_date, open_date, open_date
1, 2, 0,0,0,0,0
3, 1, 0,0,0,0,0
4, 2, 0,0,0,0,0
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
node_id_dst, node_id_src, dst_node, src_node, weight, edge_attribute1, edge_attribute2, edge_attribute3, edge_destination_attribute1
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
node_id_dst, node_id_src,attrib_id,attrib_type,edge_date,open_date_src,open_date_dst
1, 2, 0,0,0,0,0
3, 1, 0,0,0,0,0
4, 2, 0,0,0,0,0
84 changes: 84 additions & 0 deletions dataprofiler/tests/data_readers/test_csv_graph_data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
import os
JGSweets marked this conversation as resolved.
Show resolved Hide resolved
import unittest
from io import BytesIO, StringIO, TextIOWrapper

import networkx as nx

from dataprofiler.data_readers.data_utils import is_stream_buffer
from dataprofiler.data_readers.graph_data import GraphData

# Directory two levels above this file's resolved location; the fixture
# paths used by the tests are built relative to it.
test_root_path = os.path.dirname(os.path.dirname(os.path.realpath(__file__)))


class TestGraphDataClass(unittest.TestCase):
    """Tests for GraphData's column-name heuristics and graph detection."""

    @classmethod
    def setUpClass(cls):
        """Collect the fixture files shared by the tests below."""
        test_dir = os.path.join(test_root_path, 'data')
        # Order matters: the tests pick fixtures by index.
        cls.input_file_names = [
            dict(path=os.path.join(test_dir, 'csv/graph-differentiator-input-positive.csv')),
            dict(path=os.path.join(test_dir, 'csv/graph-differentiator-input-standard-positive.csv')),
            dict(path=os.path.join(test_dir, 'csv/graph-differentiator-input-negative.csv')),
            dict(path=os.path.join(test_dir, 'csv/graph-data-input-json.json')),
        ]

    def test_finding_string_in_column_positive(self):
        """
        Determine whether keywords can be detected with underscore before and after
        """
        column_names_after = ['node_src', 'node_dst', 'attribute1']
        column_names_before = ['src_node', 'dst_node', 'attribute1']
        keyword_list = ["src", "destination"]
        self.assertEqual(GraphData._find_target_string_in_column(column_names_after, keyword_list), 0)
        self.assertEqual(GraphData._find_target_string_in_column(column_names_before, keyword_list), 0)

    def test_finding_string_in_column_negative(self):
        """
        Determine whether the output is false when keywords are not found or without substring delimiters
        """
        column_names_no_keywords = ['movie', 'audience_type', 'audience_source']
        column_names_no_delimiter = ['flight_number', 'destination', 'price']
        keyword_list = ['dst', 'destination', 'target']
        self.assertEqual(GraphData._find_target_string_in_column(column_names_no_keywords, keyword_list), -1)
        self.assertEqual(GraphData._find_target_string_in_column(column_names_no_delimiter, keyword_list), -1)

    # test csv_column_names
    def test_csv_column_names(self):
        """
        Determine if column names are fetched correctly and in the right format
        """
        column_names = ['node_id_dst', 'node_id_src', 'attrib_id', 'attrib_type',
                        'edge_date', 'open_date_src', 'open_date_dst']
        input_file = self.input_file_names[1]['path']
        options = {"header": True, "delimiter": ","}
        self.assertEqual(GraphData.csv_column_names(input_file, options), column_names)

    # test is_match for true output w/ different options
    def test_is_graph_positive_1(self):
        """
        Determine if the input CSV file can automatically be recognized as being a graph
        """
        input_file_1 = self.input_file_names[0]['path']
        input_file_2 = self.input_file_names[1]['path']
        # Separate dicts because is_match updates the options it is given.
        options_1 = {"header": True, "delimiter": ","}
        options_2 = {"header": True, "delimiter": ","}
        # Each positive fixture is checked once; previously the first file
        # was asserted twice and the second never exercised.
        self.assertTrue(GraphData.is_match(input_file_1, options_1))
        self.assertTrue(GraphData.is_match(input_file_2, options_2))

    # test is_match for false output w/ different options
    def test_is_graph_negative_1(self):
        """
        Determine if the input CSV file can be automatically recognized as not being a graph w/ no options selected
        """
        input_file = self.input_file_names[1]['path']
        input_file_1 = self.input_file_names[2]['path']
        input_file_2 = self.input_file_names[3]['path']
        options_1 = {"header": False, "delimiter": ","}
        options = {"header": True, "delimiter": ","}
        self.assertFalse(GraphData.is_match(input_file_1, options))
        self.assertFalse(GraphData.is_match(input_file_2, options))
        self.assertFalse(GraphData.is_match(input_file, options_1))

# Run the test suite when this file is executed directly.
if __name__ == '__main__':
    unittest.main()