Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add property array to continuous distribution profile in GraphProfiler/ Integer Node ID identified as Integer in GraphData #579

Merged
merged 6 commits into from
Aug 5, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 17 additions & 6 deletions dataprofiler/data_readers/graph_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,8 +70,7 @@ def __init__(self, input_file_path=None, options=None, data=None):
self._quotechar = options.get("quotechar", None)
self._header = options.get("header", "auto")

if data is not None:
self._load_data(data)
self._load_data(data)

@classmethod
def _find_target_string_in_column(self, column_names, keyword_list):
Expand Down Expand Up @@ -159,7 +158,7 @@ def is_match(cls, file_path, options=None):

def _format_data_networkx(self):
"""Format the input file into a networkX graph."""
networkx_graph = nx.DiGraph()
networkx_graph = nx.Graph()

# read lines from csv
csv_as_list = []
Expand Down Expand Up @@ -189,10 +188,12 @@ def _format_data_networkx(self):
csv_as_list[line][column]
)
elif column is self._source_node or column is self._destination_node:
networkx_graph.add_node(csv_as_list[line][column])
networkx_graph.add_node(
self.check_integer(csv_as_list[line][column])
)
networkx_graph.add_edge(
csv_as_list[line][self._source_node],
csv_as_list[line][self._destination_node],
self.check_integer(csv_as_list[line][self._source_node]),
self.check_integer(csv_as_list[line][self._destination_node]),
**attributes
)

Expand All @@ -206,3 +207,13 @@ def _load_data(self, data=None):
self._data = data
else:
self._data = self._format_data_networkx()

def check_integer(self, string):
    """
    Convert a string to an int when it spells a (optionally signed) integer.

    :param string: raw text value, e.g. a node identifier
    :type string: str
    :return: ``int(string)`` if the text is an optional leading ``+``/``-``
        followed only by decimal digits, otherwise ``string`` unchanged
    :rtype: Union[int, str]
    """
    # startswith() accepts a tuple of prefixes and is safe on the empty
    # string, unlike indexing string[0]; comparing a character to a tuple
    # with == is always False, so the sign was never stripped before.
    digits = string[1:] if string.startswith(("-", "+")) else string

    # isdecimal() only accepts characters that int() can parse, whereas
    # isdigit() also accepts e.g. superscripts that make int() raise.
    if digits.isdecimal():
        return int(string)
    return string
28 changes: 27 additions & 1 deletion dataprofiler/profilers/graph_profiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -273,7 +273,17 @@ def _get_global_max_component_size(self, graph):

@BaseColumnProfiler._timeit(name="continuous_distribution")
def _get_continuous_distribution(self, graph, continuous_attributes):
"""Compute the continuous distribution of graph edge continuous attributes."""
"""
Compute the continuous distribution of graph edge continuous attributes.

Returns properties array in the profile:
[optional: shape, loc, scale, mean, variance, skew, kurtosis]

- 6-property length: norm, uniform, expon, logistic
- 7-property length: gamma, lognorm
- gamma: shape=a
- lognorm: shape=s
"""
attributes = self._find_all_attributes(graph)
continuous_distributions = dict()

Expand All @@ -291,6 +301,8 @@ def _get_continuous_distribution(self, graph, continuous_attributes):
df = pd.Series(data_as_list)
best_fit = None
best_mle = 1000
best_fit_properties = None

for distribution in distribution_candidates:
# compute fit, mle, kolmogorov-smirnov test to test fit, and pdf
fit = distribution.fit(df)
Expand All @@ -299,10 +311,24 @@ def _get_continuous_distribution(self, graph, continuous_attributes):
if mle <= best_mle:
best_fit = distribution.name
best_mle = mle
best_fit_properties = fit

mean, variance, skew, kurtosis = distribution.stats(
best_fit_properties, moments="mvsk"
)
properties = list(best_fit_properties) + [
mean,
variance,
skew,
kurtosis,
]

continuous_distributions[attribute] = {
"name": best_fit,
"scale": best_mle,
"properties": properties,
}

else:
continuous_distributions[attribute] = None

Expand Down
42 changes: 21 additions & 21 deletions dataprofiler/tests/data_readers/test_csv_graph_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,16 +20,16 @@ def setUpClass(cls):
path=os.path.join(
test_dir, "csv/graph-differentiator-input-positive.csv"
),
list_nodes=["1", "2", "3", "4", "5", "6", "7", "8", "9"],
list_nodes=[1, 2, 3, 4, 5, 6, 7, 8, 9],
list_edges=[
("2", "1"),
("3", "2"),
("4", "2"),
("5", "2"),
("6", "2"),
("7", "2"),
("8", "2"),
("9", "2"),
(2, 1),
(3, 2),
(4, 2),
(5, 2),
(6, 2),
(7, 2),
(8, 2),
(9, 2),
],
options={"header": 0, "delimiter": ","},
encoding="utf-8",
Expand All @@ -38,23 +38,23 @@ def setUpClass(cls):
path=os.path.join(
test_dir, "csv/graph-differentiator-input-standard-positive.csv"
),
list_nodes=["1", "2", "3", "4"],
list_edges=[("2", "1"), ("1", "3"), ("2", "4")],
list_nodes=[1, 2, 3, 4],
list_edges=[(2, 1), (1, 3), (2, 4)],
options={"header": 0, "delimiter": ","},
encoding="utf-8",
),
dict(
path=os.path.join(test_dir, "csv/graph-data-input-positive-header.csv"),
list_nodes=["1", "2", "3", "4", "5", "6", "7", "8", "9"],
list_nodes=[1, 2, 3, 4, 5, 6, 7, 8, 9],
list_edges=[
("2", "1"),
("3", "2"),
("4", "2"),
("5", "2"),
("6", "2"),
("7", "2"),
("8", "2"),
("9", "2"),
(2, 1),
(3, 2),
(4, 2),
(5, 2),
(6, 2),
(7, 2),
(8, 2),
(9, 2),
],
options={"header": 2, "delimiter": ","},
encoding="utf-8",
Expand Down Expand Up @@ -188,7 +188,7 @@ def test_data_loader_edges(self):
data_edges = list(data.edges)

for edge in input_file["list_edges"]:
if edge not in data_edges:
if edge not in data_edges and (edge[1], edge[0]) not in data_edges:
all_edges_present = False
self.assertTrue(all_edges_present)

Expand Down
34 changes: 34 additions & 0 deletions dataprofiler/tests/profilers/test_graph_profiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from collections import defaultdict

import networkx as nx
import numpy as np

from dataprofiler.data_readers.graph_data import GraphData
from dataprofiler.profilers.graph_profiler import GraphProfile
Expand Down Expand Up @@ -63,20 +64,49 @@ def setUpClass(cls):
),
)

cls.expected_props = [
8.646041719759628,
1.6999999999999997,
0.19403886939727638,
np.array([8.64604172, 1.7, 0.19403887]),
np.array([8.64604172, 1.7, 0.19403887]),
np.array([0.68017604, 1.53392998, 4.54031127]),
np.array([0.69395918, 3.52941176, 30.92163966]),
]

def check_continuous_properties(self, continuous_distribution_props):
    """
    Assert that a continuous-distribution properties list matches the fixture.

    Each entry is compared against the entry at the same index in
    ``self.expected_props``: numpy arrays with
    ``np.testing.assert_array_almost_equal`` and everything else with
    ``self.assertAlmostEqual``.

    :param continuous_distribution_props: properties list taken from the
        profile under test
    :type continuous_distribution_props: list
    :return: None
    """
    # Without this check a shorter-than-expected list would pass silently.
    self.assertEqual(len(self.expected_props), len(continuous_distribution_props))

    for index, actual in enumerate(continuous_distribution_props):
        expected = self.expected_props[index]
        if isinstance(actual, np.ndarray):
            np.testing.assert_array_almost_equal(expected, actual)
        else:
            self.assertAlmostEqual(expected, actual)

def test_profile(self):
    """
    Check the profile produced by ``GraphProfile.update`` on ``self.graph``.

    The "scale" and "properties" entries of the "weight" continuous
    distribution are popped and compared approximately; the remainder of
    the profile is compared exactly against ``self.expected_profile``.
    """
    graph_profile = GraphProfile("test_update")
    with utils.mock_timeit():
        profile = graph_profile.update(self.graph)

    # Float-valued entries are removed first so the dict comparison below
    # only sees the remaining entries.
    weight_distribution = profile.profile["continuous_distribution"]["weight"]
    scale = weight_distribution.pop("scale")
    continuous_distribution_props = weight_distribution.pop("properties")

    self.assertAlmostEqual(scale, -15.250985118262854)
    self.check_continuous_properties(continuous_distribution_props)
    self.assertDictEqual(self.expected_profile, profile.profile)

def test_report(self):
    """
    Check ``GraphProfile.report`` after updating with ``self.graph``.

    "scale" and "properties" are popped from the "weight" continuous
    distribution of the updated profile and compared approximately; the
    report is then compared against ``self.expected_profile``.
    """
    profiler = GraphProfile("test_report")
    with utils.mock_timeit():
        updated = profiler.update(self.graph)

    weight_stats = updated.profile["continuous_distribution"]["weight"]
    fitted_scale = weight_stats.pop("scale")
    fitted_props = weight_stats.pop("properties")

    self.assertAlmostEqual(fitted_scale, -15.250985118262854)
    self.check_continuous_properties(fitted_props)
    self.assertDictEqual(self.expected_profile, profiler.report())

def test_graph_data_object(self):
Expand All @@ -85,7 +115,11 @@ def test_graph_data_object(self):
with utils.mock_timeit():
profile = graph_profile.update(data)
scale = profile.profile["continuous_distribution"]["weight"].pop("scale")
continuous_distribution_props = profile.profile["continuous_distribution"][
"weight"
].pop("properties")
self.assertAlmostEqual(scale, -15.250985118262854)
self.check_continuous_properties(continuous_distribution_props)
self.assertDictEqual(self.expected_profile, profile.profile)


Expand Down