Add a class to differentiate between Tabular and Graph CSV files (#517)
* use networkx to differentiate graph
* add tests, update is_match
* simplify graph differentiator, add tests
* remove extra comments, clean up imports
* add input options handling, add tests, reformat and condense file
* remove outdated test file
* add EOF new line
* cleanup, make is_match a classmethod
* format test data file
* cleanup test file
* cleanup GraphData, integrated options
* options now updated in is_match, CSVData.is_match call is properly executing in GraphData (issue with csv files), tests were cleaned up
* remove superfluous code
* remove headers, remove update on delimiter in is_match
* add networkx to requirements
* add networkx 2.5.1 to requirements

Co-authored-by: Taylor Turner <[email protected]>
1 parent 358e22a · commit 89f69a2
Showing 7 changed files with 209 additions and 0 deletions.
dataprofiler/data_readers/graph_data.py
@@ -0,0 +1,96 @@
import csv

import networkx as nx

from .base_data import BaseData
from .csv_data import CSVData
from .filepath_or_buffer import FileOrBufferHandler


class GraphData(BaseData):

    def __init__(self, input_file_path=None, data=None, options=None):
        BaseData.__init__(self, input_file_path, data, options)
        if options is None:
            options = dict()

    @classmethod
    def _find_target_string_in_column(cls, column_names, keyword_list):
        """
        Find whether one of the column names contains a keyword that could
        refer to a target node column.
        """
        column_name_symbols = ['_', '.', '-']
        has_target = False
        target_index = -1

        # iterate through columns, keywords, and delimiter symbols to see if
        # any permutation is contained in the column names
        for column in range(0, len(column_names)):
            for keyword in keyword_list:
                for symbol in column_name_symbols:

                    append_start_word = symbol + keyword
                    append_end_word = keyword + symbol

                    if append_start_word in column_names[column] \
                            or append_end_word in column_names[column]:
                        target_index = column
                        has_target = True
                        break
                if has_target:
                    break

        return target_index

    @classmethod
    def csv_column_names(cls, file_path, options):
        """
        Fetches a list of column names from the csv file.
        """
        column_names = []

        with FileOrBufferHandler(file_path) as csv_file:
            csv_reader = csv.reader(
                csv_file, delimiter=options.get("delimiter", ","))

            # fetch only the first row (the column names)
            for row in csv_reader:
                column_names.append(row)
                break
            column_names = column_names[0]

        # remove all whitespace from the column names
        for index in range(0, len(column_names)):
            column_names[index] = column_names[index].replace(" ", "")

        return column_names

    @classmethod
    def is_match(cls, file_path, options=None):
        """
        Determines whether the file is a graph.
        Current formats checked:
            - attributed edge list

        This works by finding whether the file contains both a source and a
        target node column.
        """
        if options is None:
            options = dict()
        if not CSVData.is_match(file_path, options):
            return False
        column_names = cls.csv_column_names(file_path, options)
        source_keywords = ['source', 'src', 'origin']
        target_keywords = ['target', 'destination', 'dst']
        source_index = cls._find_target_string_in_column(
            column_names, source_keywords)
        destination_index = cls._find_target_string_in_column(
            column_names, target_keywords)
        has_source = source_index >= 0
        has_target = destination_index >= 0

        if has_target and has_source:
            options.update(source_node=source_index)
            options.update(destination_node=destination_index)
            options.update(destination_list=target_keywords)
            options.update(source_list=source_keywords)
            options.update(column_name=column_names)
            return True

        return False
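For context, a minimal usage sketch of the new reader, not part of the commit itself: it assumes a local DataProfiler checkout containing this change, is run from the repository root, and points at the test fixtures added below.

# Rough usage sketch (assumes a checkout with this commit applied, run from repo root).
from dataprofiler.data_readers.graph_data import GraphData

options = {}
graph_csv = "dataprofiler/tests/data/csv/graph-differentiator-input-positive.csv"
tabular_csv = "dataprofiler/tests/data/csv/graph-differentiator-input-negative.csv"

# An edge-list style header ("node_id_src", "node_id_dst", ...) is detected as a graph.
print(GraphData.is_match(graph_csv, options))    # expected: True

# A plain tabular header with no source/destination columns is not.
print(GraphData.is_match(tabular_csv, {}))       # expected: False

# On a match, is_match records what it found in the options dict,
# e.g. the indices of the detected source and destination node columns.
print(options.get("source_node"), options.get("destination_node"))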
dataprofiler/tests/data/csv/graph-data-input-json.json
@@ -0,0 +1 @@
{"name":"John", "age":30, "car":null}
4 changes: 4 additions & 0 deletions
dataprofiler/tests/data/csv/graph-differentiator-input-negative.csv
@@ -0,0 +1,4 @@
node_id, node_id, attrib_id, attrib_type, edge_date, open_date, open_date
1, 2, 0,0,0,0,0
3, 1, 0,0,0,0,0
4, 2, 0,0,0,0,0
10 changes: 10 additions & 0 deletions
dataprofiler/tests/data/csv/graph-differentiator-input-positive.csv
@@ -0,0 +1,10 @@
node_id_dst, node_id_src, dst_node, src_node, weight, edge_attribute1, edge_attribute2, edge_attribute3, edge_destination_attribute1
1,2,0,0,0,0,0,0,0
2,3,0,0,0,0,0,0,0
2,4,0,0,0,0,0,0,0
2,5,0,0,0,0,0,0,0
2,6,0,0,0,0,0,0,0
2,7,0,0,0,0,0,0,0
2,8,0,0,0,0,0,0,0
2,9,0,0,0,0,0,0,0
2,10,0,0,0,0,0,0,0
4 changes: 4 additions & 0 deletions
dataprofiler/tests/data/csv/graph-differentiator-input-standard-positive.csv
@@ -0,0 +1,4 @@
node_id_dst, node_id_src,attrib_id,attrib_type,edge_date,open_date_src,open_date_dst
1, 2, 0,0,0,0,0
3, 1, 0,0,0,0,0
4, 2, 0,0,0,0,0
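To make the difference between the negative and the standard-positive fixtures concrete, here is a small sketch, not part of the commit, of the keyword search that is_match relies on; it assumes the module path used in the tests below, and the headers are written as csv_column_names would return them (whitespace stripped).

# Hypothetical check of the two fixture headers against the detection keywords.
from dataprofiler.data_readers.graph_data import GraphData

negative_header = ['node_id', 'node_id', 'attrib_id', 'attrib_type',
                   'edge_date', 'open_date', 'open_date']
positive_header = ['node_id_dst', 'node_id_src', 'attrib_id', 'attrib_type',
                   'edge_date', 'open_date_src', 'open_date_dst']

source_keywords = ['source', 'src', 'origin']
target_keywords = ['target', 'destination', 'dst']

# no "_src"/"_dst" style names, so both lookups return -1 and the file stays tabular
print(GraphData._find_target_string_in_column(negative_header, source_keywords) >= 0)  # False
print(GraphData._find_target_string_in_column(negative_header, target_keywords) >= 0)  # False

# "node_id_src" and "node_id_dst" contain "_src"/"_dst", so both lookups succeed
print(GraphData._find_target_string_in_column(positive_header, source_keywords) >= 0)  # True
print(GraphData._find_target_string_in_column(positive_header, target_keywords) >= 0)  # True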
@@ -0,0 +1,93 @@
import os
import unittest
from io import BytesIO, StringIO, TextIOWrapper

import networkx as nx

from dataprofiler.data_readers.graph_data import GraphData


test_root_path = os.path.dirname(os.path.dirname(os.path.realpath(__file__)))


class TestGraphDataClass(unittest.TestCase):

    @classmethod
    def setUpClass(cls):

        test_dir = os.path.join(test_root_path, 'data')
        cls.input_file_names_pos = [
            dict(path=os.path.join(test_dir, 'csv/graph-differentiator-input-positive.csv'), encoding='utf-8'),
            dict(path=os.path.join(test_dir, 'csv/graph-differentiator-input-standard-positive.csv'), encoding='utf-8'),
        ]

        cls.input_file_names_neg = [
            dict(path=os.path.join(test_dir, 'csv/graph-differentiator-input-negative.csv'), encoding='utf-8'),
            dict(path=os.path.join(test_dir, 'csv/graph-data-input-json.json'), encoding='utf-8'),
        ]

        cls.buffer_list = []
        for input_file in cls.input_file_names_pos:
            # add StringIO
            buffer_info = input_file.copy()
            with open(input_file["path"], "r", encoding=input_file["encoding"]) as fp:
                buffer_info["path"] = StringIO(fp.read())
            cls.buffer_list.append(buffer_info)

            # add BytesIO
            buffer_info = input_file.copy()
            with open(input_file["path"], "rb") as fp:
                buffer_info["path"] = BytesIO(fp.read())
            cls.buffer_list.append(buffer_info)

        cls.file_or_buf_list = cls.input_file_names_pos + cls.buffer_list

    def test_finding_string_in_column_positive(self):
        """
        Determine whether keywords can be detected with an underscore before
        and after the keyword.
        """
        column_names_after = ['node_src', 'node_dst', 'attribute1']
        column_names_before = ['src_node', 'dst_node', 'attribute1']
        keyword_list = ["src", "destination"]
        self.assertEqual(GraphData._find_target_string_in_column(column_names_after, keyword_list), 0)
        self.assertEqual(GraphData._find_target_string_in_column(column_names_before, keyword_list), 0)

    def test_finding_string_in_column_negative(self):
        """
        Determine whether the output is -1 when keywords are not found or when
        they appear without a delimiter symbol.
        """
        column_names_no_keywords = ['movie', 'audience_type', 'audience_source']
        column_names_no_delimiter = ['flight_number', 'destination', 'price']
        keyword_list = ['dst', 'destination', 'target']
        self.assertEqual(GraphData._find_target_string_in_column(column_names_no_keywords, keyword_list), -1)
        self.assertEqual(GraphData._find_target_string_in_column(column_names_no_delimiter, keyword_list), -1)

    # test csv_column_names
    def test_csv_column_names(self):
        """
        Determine if column names are fetched correctly and in the right format.
        """
        column_names = ['node_id_dst', 'node_id_src', 'attrib_id', 'attrib_type',
                        'edge_date', 'open_date_src', 'open_date_dst']
        input_file = self.input_file_names_pos[1]['path']
        options = {"delimiter": ","}
        self.assertEqual(GraphData.csv_column_names(input_file, options), column_names)

    # test is_match for true output w/ different options
    def test_is_graph_positive_1(self):
        """
        Determine if the input CSV file can automatically be recognized as a graph.
        """
        for input_file in self.file_or_buf_list:
            self.assertTrue(GraphData.is_match(input_file["path"]))

    # test is_match for false output w/ different options
    def test_is_graph_negative_1(self):
        """
        Determine if the input CSV file can automatically be recognized as not
        being a graph when no options are selected.
        """
        for input_file in self.input_file_names_neg:
            self.assertFalse(GraphData.is_match(input_file["path"]))


if __name__ == '__main__':
    unittest.main()
requirements.txt
@@ -14,3 +14,4 @@ charset-normalizer>=1.3.6
psutil>=4.0.0
scipy>=1.4.1
requests==2.27.1
networkx==2.5.1