Skip to content

Commit

Permalink
Add a simple notebook that actually runs some diffs.
Browse files Browse the repository at this point in the history
  • Loading branch information
jdangerx committed Dec 6, 2023
1 parent c7b662c commit 5d83e0c
Show file tree
Hide file tree
Showing 2 changed files with 100 additions and 0 deletions.
91 changes: 91 additions & 0 deletions devtools/sqlite-table-diff.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"\"\"\"Example of diffing the same table between two SQLite DBs.\n",
"\n",
"The table must have the same name and schema in both DBs. This is intended\n",
"for use in investigating validation test errors.\n",
"\"\"\"\n",
"import sqlite3\n",
"from pathlib import Path\n",
"from typing import Iterable\n",
"\n",
"import pandas as pd\n",
"\n",
"from pudl.helpers import diff_wide_tables, TableDiff\n",
"from pudl.metadata.classes import Resource\n",
"from pudl.metadata.fields import apply_pudl_dtypes\n",
"\n",
"\n",
"def table_diff(\n",
"    table_name: str,\n",
"    old_conn,\n",
"    new_conn,\n",
"    ignore_cols: Iterable[str] = (\"plant_id_ferc1\",),\n",
"    addl_key_cols: Iterable[str] = (),\n",
") -> TableDiff:\n",
"    \"\"\"Diff two versions of the same table that live in SQL databases.\n",
"\n",
"    The table must have the same name and columns in both DBs.\n",
"\n",
"    Args:\n",
"        table_name: name of the table to read from both databases; it is also\n",
"            used to look up the table's primary key via ``Resource.from_id``.\n",
"        old_conn: connection to the database holding the old version.\n",
"        new_conn: connection to the database holding the new version.\n",
"        ignore_cols: columns to drop from both tables before diffing.\n",
"        addl_key_cols: extra columns to treat as key columns, in addition to\n",
"            the table's primary key.\n",
"\n",
"    Returns:\n",
"        The result of ``diff_wide_tables`` for the old and new tables.\n",
"    \"\"\"\n",
"    query = f\"SELECT * FROM {table_name}\"\n",
"    old_table = apply_pudl_dtypes(pd.read_sql(query, old_conn))\n",
"    new_table = apply_pudl_dtypes(pd.read_sql(query, new_conn))\n",
"\n",
"    # Filter in the table's own column order rather than via set arithmetic,\n",
"    # so the selected columns come out in the same order on every run.\n",
"    ignored = set(ignore_cols)\n",
"    cols = [col for col in old_table.columns if col not in ignored]\n",
"\n",
"    # dict.fromkeys de-duplicates while preserving order: primary key first,\n",
"    # then any additional key columns.\n",
"    primary_key = list(\n",
"        dict.fromkeys(\n",
"            [*Resource.from_id(table_name).schema.primary_key, *addl_key_cols]\n",
"        )\n",
"    )\n",
"    return diff_wide_tables(primary_key, old_table[cols], new_table[cols])\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Open both databases read-only via SQLite URIs: a plain sqlite3.connect()\n",
"# silently creates an empty database when the path does not exist, and\n",
"# diffing never needs write access.\n",
"new_path = Path(\"~/Downloads/pudl.sqlite\").expanduser()\n",
"old_path = Path(\"~/Downloads/pudl (2).sqlite\").expanduser()\n",
"new_db = sqlite3.connect(f\"{new_path.as_uri()}?mode=ro\", uri=True)\n",
"old_db = sqlite3.connect(f\"{old_path.as_uri()}?mode=ro\", uri=True)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"table_name = \"denorm_plants_steam_ferc1\"\n",
"# Leave the plant ID columns out of the comparison, and add report_year and\n",
"# utility_id_pudl to the table's key columns for this diff.\n",
"diff = table_diff(\n",
"    table_name,\n",
"    old_db,\n",
"    new_db,\n",
"    ignore_cols=(\"plant_id_ferc1\", \"plant_id_pudl\"),\n",
"    addl_key_cols=(\"report_year\", \"utility_id_pudl\"),\n",
")\n",
"# Show the `changed` frame of the resulting diff.\n",
"diff.changed"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "pudl-dev",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.6"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
9 changes: 9 additions & 0 deletions src/pudl/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -1889,6 +1889,15 @@ def diff_wide_tables(
)
old_aligned, new_aligned = old_melted.align(new_melted)
comparison = old_aligned.compare(new_aligned, result_names=("old", "new"))
if comparison.empty:
return TableDiff(
deleted=pd.DataFrame(),
added=pd.DataFrame(),
changed=pd.DataFrame(),
old_df=old,
new_df=new,
)

old_values = comparison[("value", "old")]
new_values = comparison[("value", "new")]
added = comparison[old_values.isna() & new_values.notna()]
Expand Down

0 comments on commit 5d83e0c

Please sign in to comment.