Skip to content

Commit

Permalink
add value comparison, fix histogram
Browse files Browse the repository at this point in the history
  • Loading branch information
EdenWuyifan committed Jul 23, 2024
1 parent 320f7db commit 453156b
Showing 1 changed file with 99 additions and 7 deletions.
106 changes: 99 additions & 7 deletions bdikit/visualization/schema_matching.py
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,34 @@ def gdc_clean_heatmap_recommendations(
return candidates_dfs


def clean_heatmap_recommendations(
    heatmap_recommendations: List[Dict], target: pd.DataFrame
):
    """Build one candidate table per source column.

    Args:
        heatmap_recommendations: Entries that each carry a
            ``"source_column"`` name and a ``"top_k_columns"`` iterable of
            ``(candidate_name, similarity)`` pairs.
        target: Table whose columns are looked up by candidate name to
            produce the value samples.

    Returns:
        A dict mapping each source column name to a DataFrame with the
        columns ``Candidate``, ``Similarity`` and ``Values (sample)``. The
        sample is the first five distinct values of the candidate column,
        stringified and joined with ``", "``.

    Raises:
        KeyError: If an entry lacks one of the two keys, or a candidate
            name is not a column of ``target``.
    """
    output_columns = ["Candidate", "Similarity", "Values (sample)"]
    tables = {}

    for entry in heatmap_recommendations:
        source_name = entry["source_column"]
        rows = [
            (
                name,
                score,
                # Values are stringified before de-duplication, so the
                # sample is made of distinct string forms.
                ", ".join(target[name].astype(str).unique()[:5]),
            )
            for name, score in entry["top_k_columns"]
        ]
        tables[source_name] = pd.DataFrame(rows, columns=output_columns)

    return tables


def truncate_text(text: str, max_chars: int):
if len(text) > max_chars:
return text[:max_chars] + "..."
Expand Down Expand Up @@ -181,6 +209,11 @@ def __init__(
self.candidates_dfs = gdc_clean_heatmap_recommendations(
self.heatmap_recommendations, max_chars_samples=max_chars_samples
)
elif isinstance(target, pd.DataFrame):
self.candidates_dfs = clean_heatmap_recommendations(
self.heatmap_recommendations, target
)

self.height = height

# Undo/Redo
Expand Down Expand Up @@ -576,9 +609,19 @@ def _plot_column_histogram(
)
return layered

def _plot_source_histogram(
    self, source_column: str, heatmap_rec_list: pd.DataFrame, selection: List[int]
) -> "pn.pane.Markdown | alt.LayerChart":
    """Plot the histogram of a source column, following the heatmap selection.

    Args:
        source_column: Column plotted when nothing is selected.
        heatmap_rec_list: Recommendation table handed to
            ``_update_column_selection`` together with ``selection``.
        selection: Current heatmap selection; an empty value means
            "no selection".

    Returns:
        Whatever ``_plot_column_histogram`` produces for the chosen column
        of ``self.source``.
    """
    column = source_column
    if selection:
        # Only the first element of the returned pair is needed here;
        # presumably it is the source column of the selected cell (verify
        # against _update_column_selection).
        column, _ = self._update_column_selection(heatmap_rec_list, selection)
    return self._plot_column_histogram(column, self.source)

def _plot_target_histogram(
self, heatmap_rec_list: pd.DataFrame, selection: List[int]
) -> "pn.pane.Markdown | alt.Chart":
) -> "pn.pane.Markdown | alt.LayerChart":
if not isinstance(self.target, pd.DataFrame):
return pn.pane.Markdown("No ground truth provided.")
if not selection:
Expand Down Expand Up @@ -615,6 +658,37 @@ def _plot_value_matches(
height=200,
)

def _plot_value_comparisons(
    self, source_column: str, heatmap_rec_list: pd.DataFrame, selection: List[int]
) -> "pn.widgets.Tabulator | pn.pane.Markdown":
    """Tabulate sample source values next to each candidate's sample values.

    Args:
        source_column: Source column used when nothing is selected.
        heatmap_rec_list: Recommendation table handed to
            ``_update_column_selection`` together with ``selection``.
        selection: Current heatmap selection; an empty value means
            "no selection".

    Returns:
        A ``pn.widgets.Tabulator`` with a ``"Source Value"`` column (up to
        five distinct non-null values of the source column) followed by one
        column per candidate listed in ``self.candidates_dfs`` for that
        source column. ``"Source Value"`` is always frozen; the selected
        candidate is frozen too when there is a selection. This code path
        never returns the Markdown pane named in the annotation.

    Raises:
        KeyError: If the column is missing from ``self.source`` or from
            ``self.candidates_dfs``.
    """
    if not selection:
        column = source_column
        rec = None
    else:
        column, rec = self._update_column_selection(heatmap_rec_list, selection)

    value_comparisons = {
        "Source Value": self.source[column].dropna().unique()[:5],
    }

    candidate_df = self.candidates_dfs[column]
    # Walk the two needed columns directly instead of iterrows(): no
    # per-row Series is built and there is no unused index variable.
    for candidate, samples in zip(
        candidate_df["Candidate"], candidate_df["Values (sample)"]
    ):
        # The sample string is ", "-joined upstream; split it back into
        # individual cells and cap at five.
        value_comparisons[candidate] = samples.split(", ")[:5]

    frozen_columns = ["Source Value"]
    if rec:
        frozen_columns.append(rec)

    # Wrapping each list in a Series lets columns of unequal length share
    # one frame; the resulting gaps are blanked for display.
    comparison_table = pd.DataFrame(
        {name: pd.Series(values) for name, values in value_comparisons.items()}
    ).fillna("")

    return pn.widgets.Tabulator(
        comparison_table,
        frozen_columns=frozen_columns,
        show_index=False,
        width=700,
    )

def _plot_pane(
self,
select_column: Optional[str] = None,
Expand Down Expand Up @@ -676,16 +750,28 @@ def _plot_pane(
heatmap_pane.selection.param.single,
)

column_hist = self._plot_column_histogram(select_column, self.source)

value_matches = pn.bind(
self._plot_value_matches,
column_hist = pn.bind(
self._plot_source_histogram,
select_column,
heatmap_rec_list,
heatmap_pane.selection.param.single,
)

# value_matches = pn.bind(
# self._plot_value_matches,
# heatmap_rec_list,
# heatmap_pane.selection.param.single,
# )

plot_history = self._plot_history()

value_comparisons = pn.bind(
self._plot_value_comparisons,
select_column,
heatmap_rec_list,
heatmap_pane.selection.param.single,
)

return pn.Column(
pn.FloatPanel(
plot_history,
Expand All @@ -701,11 +787,17 @@ def _plot_pane(
),
pn.Spacer(height=5),
pn.Card(
value_matches,
title="Value Matches",
value_comparisons,
title="Value Comparisons",
styles={"background": "WhiteSmoke"},
scroll=True,
),
# pn.Card(
# value_matches,
# title="Value Matches",
# styles={"background": "WhiteSmoke"},
# scroll=True,
# ),
pn.Card(
pn.Row(
pn.Column(column_hist, width=500),
Expand Down

0 comments on commit 453156b

Please sign in to comment.