capitalone · JGSweets · Aug 5, 2022 · Aug 4, 2022 · Aug 4, 2022 · Aug 4, 2022
@@ -70,8 +70,7 @@ def __init__(self, input_file_path=None, options=None, data=None):
         self._quotechar = options.get("quotechar", None)
         self._header = options.get("header", "auto")
 
-        if data is not None:
-            self._load_data(data)
+        self._load_data(data)
 
     @classmethod
     def _find_target_string_in_column(self, column_names, keyword_list):
@@ -159,7 +158,7 @@ def is_match(cls, file_path, options=None):
 
     def _format_data_networkx(self):
         """Format the input file into a networkX graph."""
-        networkx_graph = nx.DiGraph()
+        networkx_graph = nx.Graph()
 
         # read lines from csv
         csv_as_list = []
@@ -189,10 +188,12 @@ def _format_data_networkx(self):
                         csv_as_list[line][column]
                     )
                 elif column is self._source_node or column is self._destination_node:
-                    networkx_graph.add_node(csv_as_list[line][column])
+                    networkx_graph.add_node(
+                        self.check_integer(csv_as_list[line][column])
+                    )
             networkx_graph.add_edge(
-                csv_as_list[line][self._source_node],
-                csv_as_list[line][self._destination_node],
+                self.check_integer(csv_as_list[line][self._source_node]),
+                self.check_integer(csv_as_list[line][self._destination_node]),
                 **attributes
             )
 
@@ -206,3 +207,13 @@ def _load_data(self, data=None):
             self._data = data
         else:
             self._data = self._format_data_networkx()
+
+    def check_integer(self, string):
+        """
+        Return ``string`` as an int when it is an optionally signed integer.
+
+        :param string: raw value read from the csv
+        :type string: str
+        :return: the parsed int, otherwise ``string`` unchanged
+        :rtype: Union[int, str]
+        """
+        # Membership test rather than ``==``: comparing a single character
+        # to the tuple is always False, which left signed values such as
+        # "-5" as strings. Slicing (``[:1]``) keeps "" from raising.
+        digits = string[1:] if string[:1] in ("-", "+") else string
+        # isdecimal() only accepts characters that int() can parse, whereas
+        # isdigit() also accepts e.g. "²" and would let int() raise.
+        if digits.isdecimal():
+            return int(string)
+        return string
@@ -273,7 +273,17 @@ def _get_global_max_component_size(self, graph):
 
     @BaseColumnProfiler._timeit(name="continuous_distribution")
     def _get_continuous_distribution(self, graph, continuous_attributes):
-        """Compute the continuous distribution of graph edge continuous attributes."""
+        """
+        Compute the continuous distribution of graph edge continuous attributes.
+
+        Returns properties array in the profile:
+        [optional: shape, loc, scale, mean, variance, skew, kurtosis]
+
+        - 6-property length: norm, uniform, expon, logistic
+        - 7-property length: gamma, lognorm
+            - gamma: shape=a
+            - lognorm: shape=s
+        """
         attributes = self._find_all_attributes(graph)
         continuous_distributions = dict()
 
@@ -291,6 +301,8 @@ def _get_continuous_distribution(self, graph, continuous_attributes):
                 df = pd.Series(data_as_list)
                 best_fit = None
                 best_mle = 1000
+                best_fit_properties = None
+
                 for distribution in distribution_candidates:
                     # compute fit, mle, kolmogorov-smirnov test to test fit, and pdf
                     fit = distribution.fit(df)
@@ -299,10 +311,24 @@ def _get_continuous_distribution(self, graph, continuous_attributes):
                     if mle <= best_mle:
                         best_fit = distribution.name
                         best_mle = mle
+                        best_fit_properties = fit
+
+                mean, variance, skew, kurtosis = distribution.stats(
+                    best_fit_properties, moments="mvsk"
+                )
+                properties = list(best_fit_properties) + [
+                    mean,
+                    variance,
+                    skew,
+                    kurtosis,
+                ]
+
                 continuous_distributions[attribute] = {
                     "name": best_fit,
                     "scale": best_mle,
+                    "properties": properties,
                 }
+
             else:
                 continuous_distributions[attribute] = None
 

@@ -20,16 +20,16 @@ def setUpClass(cls):
                 path=os.path.join(
                     test_dir, "csv/graph-differentiator-input-positive.csv"
                 ),
-                list_nodes=["1", "2", "3", "4", "5", "6", "7", "8", "9"],
+                list_nodes=[1, 2, 3, 4, 5, 6, 7, 8, 9],
                 list_edges=[
-                    ("2", "1"),
-                    ("3", "2"),
-                    ("4", "2"),
-                    ("5", "2"),
-                    ("6", "2"),
-                    ("7", "2"),
-                    ("8", "2"),
-                    ("9", "2"),
+                    (2, 1),
+                    (3, 2),
+                    (4, 2),
+                    (5, 2),
+                    (6, 2),
+                    (7, 2),
+                    (8, 2),
+                    (9, 2),
                 ],
                 options={"header": 0, "delimiter": ","},
                 encoding="utf-8",
@@ -38,23 +38,23 @@ def setUpClass(cls):
                 path=os.path.join(
                     test_dir, "csv/graph-differentiator-input-standard-positive.csv"
                 ),
-                list_nodes=["1", "2", "3", "4"],
-                list_edges=[("2", "1"), ("1", "3"), ("2", "4")],
+                list_nodes=[1, 2, 3, 4],
+                list_edges=[(2, 1), (1, 3), (2, 4)],
                 options={"header": 0, "delimiter": ","},
                 encoding="utf-8",
             ),
             dict(
                 path=os.path.join(test_dir, "csv/graph-data-input-positive-header.csv"),
-                list_nodes=["1", "2", "3", "4", "5", "6", "7", "8", "9"],
+                list_nodes=[1, 2, 3, 4, 5, 6, 7, 8, 9],
                 list_edges=[
-                    ("2", "1"),
-                    ("3", "2"),
-                    ("4", "2"),
-                    ("5", "2"),
-                    ("6", "2"),
-                    ("7", "2"),
-                    ("8", "2"),
-                    ("9", "2"),
+                    (2, 1),
+                    (3, 2),
+                    (4, 2),
+                    (5, 2),
+                    (6, 2),
+                    (7, 2),
+                    (8, 2),
+                    (9, 2),
                 ],
                 options={"header": 2, "delimiter": ","},
                 encoding="utf-8",
@@ -188,7 +188,7 @@ def test_data_loader_edges(self):
             data_edges = list(data.edges)
 
             for edge in input_file["list_edges"]:
-                if edge not in data_edges:
+                if edge not in data_edges and (edge[1], edge[0]) not in data_edges:
                     all_edges_present = False
             self.assertTrue(all_edges_present)
 

@@ -6,6 +6,7 @@
 from collections import defaultdict
 
 import networkx as nx
+import numpy as np
 
 from dataprofiler.data_readers.graph_data import GraphData
 from dataprofiler.profilers.graph_profiler import GraphProfile
@@ -63,20 +64,49 @@ def setUpClass(cls):
             ),
         )
 
+        cls.expected_props = [
+            8.646041719759628,
+            1.6999999999999997,
+            0.19403886939727638,
+            np.array([8.64604172, 1.7, 0.19403887]),
+            np.array([8.64604172, 1.7, 0.19403887]),
+            np.array([0.68017604, 1.53392998, 4.54031127]),
+            np.array([0.69395918, 3.52941176, 30.92163966]),
+        ]
+
+    def check_continuous_properties(self, continuous_distribution_props):
+        """
+        Assert the properties array matches ``self.expected_props``.
+
+        :param continuous_distribution_props: the "properties" entry popped
+            from a continuous distribution profile
+        """
+        # Compare lengths first: looping over the actual values alone would
+        # let an empty or truncated properties list pass without any check.
+        self.assertEqual(
+            len(self.expected_props), len(continuous_distribution_props)
+        )
+        for expected, actual in zip(
+            self.expected_props, continuous_distribution_props
+        ):
+            # Array-valued entries need an element-wise comparison.
+            if isinstance(actual, np.ndarray):
+                np.testing.assert_array_almost_equal(expected, actual)
+            else:
+                self.assertAlmostEqual(expected, actual)
+
     def test_profile(self):
+        """Check the profile returned by ``update`` for ``self.graph``."""
         graph_profile = GraphProfile("test_update")
         with utils.mock_timeit():
             profile = graph_profile.update(self.graph)
+        # Pop the values that are only checked approximately; what remains
+        # in the profile is compared exactly by assertDictEqual below.
         scale = profile.profile["continuous_distribution"]["weight"].pop("scale")
+        continuous_distribution_props = profile.profile["continuous_distribution"][
+            "weight"
+        ].pop("properties")
         self.assertAlmostEqual(scale, -15.250985118262854)
+        self.check_continuous_properties(continuous_distribution_props)
         self.assertDictEqual(self.expected_profile, profile.profile)
 
     def test_report(self):
+        """Check that ``report()`` matches the expected profile after an update."""
         graph_profile = GraphProfile("test_report")
         with utils.mock_timeit():
             profile = graph_profile.update(self.graph)
+        # Pop the values that are only checked approximately.
+        # NOTE(review): the pops mutate ``profile.profile``; the final assert
+        # presumably relies on ``report()`` exposing that same dict - confirm.
         scale = profile.profile["continuous_distribution"]["weight"].pop("scale")
+        continuous_distribution_props = profile.profile["continuous_distribution"][
+            "weight"
+        ].pop("properties")
         self.assertAlmostEqual(scale, -15.250985118262854)
+        self.check_continuous_properties(continuous_distribution_props)
         self.assertDictEqual(self.expected_profile, graph_profile.report())
 
     def test_graph_data_object(self):
@@ -85,7 +115,11 @@ def test_graph_data_object(self):
         with utils.mock_timeit():
             profile = graph_profile.update(data)
         scale = profile.profile["continuous_distribution"]["weight"].pop("scale")
+        continuous_distribution_props = profile.profile["continuous_distribution"][
+            "weight"
+        ].pop("properties")
         self.assertAlmostEqual(scale, -15.250985118262854)
+        self.check_continuous_properties(continuous_distribution_props)
         self.assertDictEqual(self.expected_profile, profile.profile)