Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Add Graph Profiler Update to Profile Builder #587

Merged
10 changes: 4 additions & 6 deletions dataprofiler/profilers/graph_profiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,7 @@
import pandas as pd
import scipy.stats as st

from dataprofiler.data_readers.graph_data import GraphData

from ..data_readers.graph_data import GraphData
from . import BaseColumnProfiler


Expand All @@ -19,16 +18,15 @@ class GraphProfiler(object):
Statistical properties of graph
"""

def __init__(self, name, options=None):
def __init__(self, data, options=None):
"""
Initialize Graph Profiler.

:param name: Name of the data
:param data: data
:type name: String
:param options: Options for the Graph Profiler
:type options: GraphOptions
"""
self.name = name
self.sample_size = 0
self.times = defaultdict(float)

Expand Down Expand Up @@ -153,7 +151,7 @@ def update(self, graph):
if isinstance(graph, GraphData):
graph = graph.data
graph_size = graph.size()
if graph_size == 0:
if graph_size == 0 and graph.number_of_nodes() == 0:
return self
profile = dict(sample_size=graph_size)

Expand Down
4 changes: 3 additions & 1 deletion dataprofiler/profilers/profile_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -2609,7 +2609,9 @@ def __new__(

# Construct based off of initial kwarg input or inference
if profiler_type == "graph":
return GraphProfiler(data, options=options)
profile = GraphProfiler(data, options=options)
profile.update(data)
return profile
elif profiler_type == "structured":
return StructuredProfiler(
data, samples_per_update, min_true_samples, options
Expand Down
4 changes: 2 additions & 2 deletions dataprofiler/tests/data_readers/test_csv_graph_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -184,7 +184,7 @@ def test_data_loader_nodes(self):
if input_file["list_nodes"] is None:
continue
data = GraphData(
input_file_path=input_file["path"], data=None, options=options
input_file_path=input_file["path"], options=options
)
self.assertEqual(input_file["list_nodes"], sorted(data.nodes))

Expand All @@ -201,7 +201,7 @@ def test_data_loader_edges(self):
if input_file["list_edges"] is None:
continue
data = GraphData(
input_file_path=input_file["path"], data=None, options=options
input_file_path=input_file["path"], options=options
)
data_edges = list(data.edges)

Expand Down
1 change: 1 addition & 0 deletions dataprofiler/tests/profilers/test_graph_profiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,7 @@ def test_report(self):
def test_graph_data_object(self):
data = GraphData(input_file_path=None, data=self.graph)
graph_profile = GraphProfiler("test_graph_data_object_update")

with utils.mock_timeit():
profile = graph_profile.update(data)
scale = profile.profile["continuous_distribution"]["weight"].pop("scale")
Expand Down
10 changes: 6 additions & 4 deletions dataprofiler/tests/profilers/test_profile_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -3622,10 +3622,6 @@ def test_profiler_factory_class_bad_input(self):
"dataprofiler.profilers.profile_builder.UnstructuredProfiler",
spec=UnstructuredProfiler,
)
@mock.patch(
"dataprofiler.profilers.graph_profiler.GraphProfiler",
spec=GraphProfiler,
)
def test_profiler_factory_class_creates_correct_profiler(self, *mocks):
"""
Ensure Profiler factory class either respects user input or makes
Expand Down Expand Up @@ -3670,6 +3666,12 @@ def test_profiler_factory_class_creates_correct_profiler(self, *mocks):
data_str = "test"
self.assertIsInstance(Profiler(data_str), UnstructuredProfiler)

data_graph.add_node(1)
data_graph = dp.Data(data=data_graph, data_type="graph")
graph_profile = Profiler(data_graph)
profile = graph_profile.profile
self.assertIsNotNone(profile.get("num_nodes"))

def test_save_and_load_structured(self):
datapth = "dataprofiler/tests/data/"
test_files = ["csv/guns.csv", "csv/iris.csv"]
Expand Down