
Commit

Fixed handling of old-entities when the table becomes empty (having previously contained data).
cjohns-scottlogic authored Oct 17, 2024
1 parent 5fdf9fa commit 41deb56
Showing 12 changed files with 108 additions and 29 deletions.
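
The gist of the change: each dataset table is exported from sqlite to a pipe-delimited CSV and then loaded into postgres. When a table that previously held data becomes empty, the export produces a blank CSV, and before this fix the previously loaded rows stayed behind in postgres. The new code detects the blank export via csv.DictReader and cross-checks the sqlite table before clearing. A minimal illustration of the detection condition (not part of the commit; the file contents mirror the new test fixtures):

import csv
import io

# An emptied table exports as a zero-byte CSV: DictReader finds no header row,
# so fieldnames is None. This is the condition the new code path keys off.
empty_export = io.StringIO("")  # like tests/test_data/exported_old_entity_4.csv
print(csv.DictReader(empty_export, delimiter="|").fieldnames)  # None

# A header-only export (like exported_old_entity_3.csv) still yields fieldnames,
# just zero data rows.
header_only = io.StringIO("entity|old_entity|entry_date|start_date|end_date|status|notes|dataset\n")
print(csv.DictReader(header_only, delimiter="|").fieldnames)  # ['entity', 'old_entity', ...]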
4 changes: 2 additions & 2 deletions .gitignore
@@ -132,7 +132,7 @@ dmypy.json
 .data
 .db-data
 .idea
-*.csv
-*.sqlite*
+task/*.csv
+task/*.sqlite*
 .DS_Store
 .secrets
1 change: 1 addition & 0 deletions task/Dockerfile
@@ -11,6 +11,7 @@ RUN curl "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" -o "awscliv2
 ADD . /src
 WORKDIR /src
 
+RUN python3 --version
 RUN pip install --user -U pip
 RUN pip install --user --no-cache-dir -r requirements.txt
 
7 changes: 6 additions & 1 deletion task/load.sh
@@ -75,6 +75,11 @@ elif [[ $DATABASE_NAME != "entity" ]]; then
     echo "$EVENT_ID: extracting entity data from $DATABASE"
     cat sql/export_entity.sql | sed "s/\${DATABASE_NAME}/$DATABASE_NAME/g" | sqlite3 "$DATABASE"
 
+    if [ $? != 0 ]; then
+        echo "Failed to export data from $DATABASE"
+        exit 1
+    fi
+
     if ! [ -f exported_entity.csv ] || ! [ -f exported_old_entity.csv ]; then
         echo "$EVENT_ID: failed to extract data from $DATABASE"
         exit 1
@@ -84,7 +89,7 @@ fi
 echo "$EVENT_ID: successfully extracted data from $DATABASE"
 
 echo "$EVENT_ID: loading data into postgres"
-python3 -m pgload.load --source="$DATABASE_NAME" || \
+python3 -m pgload.load --source="$DATABASE_NAME" --sqlite-db="$DATABASE" || \
     (echo "$EVENT_ID: failed to load $DATABASE" && exit 1)
 
 echo "$EVENT_ID: loading of $DATABASE_NAME completed successfully"
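
Both load scripts now tell the Python loader which sqlite file the export came from. A hedged sketch of driving the new --sqlite-db option through click's test runner (the dataset name and database path are assumptions; a real run also needs WRITE_DATABASE_URL set for postgres):

from click.testing import CliRunner

from pgload.load import do_replace_cli

# Sketch only: invokes the CLI in-process instead of via `python3 -m pgload.load`.
runner = CliRunner()
result = runner.invoke(
    do_replace_cli,
    [
        "--source", "certificate-of-immunity",  # assumed dataset name
        "--sqlite-db", "certificate-of-immunity.sqlite3",  # assumed path
    ],
)
print(result.exit_code, result.output)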
7 changes: 6 additions & 1 deletion task/load_local.sh
@@ -76,6 +76,11 @@ elif [[ $DATABASE_NAME != "entity" ]]; then
     echo "$EVENT_ID: extracting entity data from $DATABASE"
     cat sql/export_entity.sql | sed "s/\${DATABASE_NAME}/$DATABASE_NAME/g" | sqlite3 "$DATABASE"
 
+    if [ $? != 0 ]; then
+        echo "Failed to export data from $DATABASE"
+        exit 1
+    fi
+
     if ! [ -f exported_entity.csv ] || ! [ -f exported_old_entity.csv ]; then
         echo "$EVENT_ID: failed to extract data from $DATABASE"
         exit 1
@@ -85,7 +90,7 @@ fi
 echo "$EVENT_ID: successfully extracted data from $DATABASE"
 
 echo "$EVENT_ID: loading data into postgres"
-python3 -m pgload.load --source="$DATABASE_NAME" || \
+python3 -m pgload.load --source="$DATABASE_NAME" --sqlite-db="$DATABASE" || \
     (echo "$EVENT_ID: failed to load $DATABASE" && exit 1)
 
 echo "$EVENT_ID: loading of $DATABASE_NAME completed successfully"
60 changes: 38 additions & 22 deletions task/pgload/load.py
@@ -6,6 +6,7 @@
 import psycopg2.extensions
 import urllib.parse as urlparse
 import click
+import sqlite3
 
 # load in specification
 from digital_land.specification import Specification
@@ -51,22 +52,24 @@ def get_valid_datasets(specification):
 
 @click.command()
 @click.option("--source", required=True)
+@click.option("--sqlite-db", required=True)
 @click.option(
     "--specification-dir", type=click.Path(exists=True), default="specification/"
 )
-def do_replace_cli(source, specification_dir):
+def do_replace_cli(source, sqlite_db, specification_dir):
     specification = Specification(path=specification_dir)
+    sqlite_conn = sqlite3.connect(sqlite_db)
     valid_datasets = get_valid_datasets(specification)
 
     if source == "digital-land" or source in valid_datasets:
-        do_replace(source)
+        do_replace(source, sqlite_conn)
         if source == "digital-land":
             remove_invalid_datasets(valid_datasets)
 
     return
 
 
-def get_connection():
+def get_pg_connection():
     try:
         url = urlparse.urlparse(os.getenv("WRITE_DATABASE_URL"))
         database = url.path[1:]
@@ -92,34 +95,47 @@ def get_connection():
     return connection
 
 
-def do_replace(source, tables_to_export=None):
-    if tables_to_export is None:
-        tables_to_export = export_tables[source]
+def do_replace_table(table, source, csv_filename, postgress_conn, sqlite_conn):
+    with open(csv_filename, "r") as f:
+        reader = csv.DictReader(f, delimiter="|")
+        fieldnames = reader.fieldnames
 
-    connection = get_connection()
+    sql = SQL(table=table, fields=fieldnames, source=source)
 
-    for table in tables_to_export:
-        logger.info(f"Loading from database: {source} table: {table}")
+    # If we don't get any fieldnames, the file is probably blank. Check the table in the sqlite3.
+    if not fieldnames:
+        rows = (
+            sqlite_conn.cursor().execute(f"SELECT COUNT(*) FROM {table}").fetchone()[0]
+        )
+        if rows == 0:
+            if source not in ["digital-land", "entity"]:
+                with postgress_conn.cursor() as cursor:
+                    cursor.execute(sql.update_tables())
+                postgress_conn.commit()
 
-        csv_filename = f"exported_{table}.csv"
+    with postgress_conn.cursor() as cursor:
+        call_sql_queries(source, table, csv_filename, fieldnames, sql, cursor)
 
-        with open(csv_filename, "r") as f:
-            reader = csv.DictReader(f, delimiter="|")
-            fieldnames = reader.fieldnames
+    postgress_conn.commit()
 
-        sql = SQL(table=table, fields=fieldnames, source=source)
+    logger.info(f"Finished loading from database: {source} table: {table}")
 
-        with connection.cursor() as cursor:
-            call_sql_queries(source, table, csv_filename, fieldnames, sql, cursor)
+    if source != "entity" and table == "entity":
+        make_valid_multipolygon(postgress_conn, source)
 
-        connection.commit()
+        make_valid_with_handle_geometry_collection(postgress_conn, source)
 
-        logger.info(f"Finished loading from database: {source} table: {table}")
 
-        if source != "entity" and table == "entity":
-            make_valid_multipolygon(connection, source)
+def do_replace(source, sqlite_conn, tables_to_export=None):
+    if tables_to_export is None:
+        tables_to_export = export_tables[source]
 
-            make_valid_with_handle_geometry_collection(connection, source)
+    for table in tables_to_export:
+        logger.info(f"Loading from database: {source} table: {table}")
+
+        csv_filename = f"exported_{table}.csv"
+
+        do_replace_table(table, source, csv_filename, get_pg_connection(), sqlite_conn)
 
 
 def remove_invalid_datasets(valid_datasets):
@@ -128,7 +144,7 @@ def remove_invalid_datasets(valid_datasets):
     from the postgres database. This keeps the site aligned with the spec
     """
     valid_datasets_str = "', '".join(valid_datasets)
-    connection = get_connection()
+    connection = get_pg_connection()
     # remove datasets not in valid_datasets from entity
    with connection.cursor() as cursor:
         sql = f"""
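
The heart of the fix sits in do_replace_table: a blank export on its own is ambiguous (it could equally mean a broken export), so the loader now consults the source sqlite table and only rewrites the postgres table when sqlite really has zero rows, and never for the shared digital-land and entity sources. A standalone sketch of that guard, under a hypothetical helper name (the commit runs sql.update_tables() inline rather than through such a function):

import sqlite3

def should_clear_table(fieldnames, sqlite_conn, table, source):
    """Mirror the new guard: clear postgres only when the export is blank
    AND the sqlite table is genuinely empty."""
    if fieldnames:  # normal export; rows get loaded as before
        return False
    count = sqlite_conn.execute(f"SELECT COUNT(*) FROM {table}").fetchone()[0]
    return count == 0 and source not in ("digital-land", "entity")

conn = sqlite3.connect(":memory:")
conn.execute("CREATE TABLE old_entity (old_entity TEXT PRIMARY KEY)")
# Blank CSV + empty sqlite table: stale postgres rows should be cleared.
print(should_clear_table(None, conn, "old_entity", "certificate-of-immunity"))  # True
conn.execute("INSERT INTO old_entity VALUES ('2300000')")
# Blank CSV but sqlite still holds data: leave postgres alone.
print(should_clear_table(None, conn, "old_entity", "certificate-of-immunity"))  # False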
1 change: 0 additions & 1 deletion task/pgload/load_facts.py
@@ -73,7 +73,6 @@ def load_facts():
 
 
 def load_facts_into_postgres(rows):
-
     url = urlparse.urlparse(os.getenv("WRITE_DATABASE_URL"))
     database = url.path[1:]
     user = url.username
1 change: 1 addition & 0 deletions task/requirements.txt
@@ -2,3 +2,4 @@
 click==8.0.3
 psycopg2-binary==2.9.3
 requests
+datasette<0.65.0
47 changes: 45 additions & 2 deletions tests/integration/pg_load/test_load.py
@@ -2,6 +2,7 @@
 import os
 import sys
 import pytest
+import sqlite3
 
 parent_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../../.."))
 sys.path.insert(0, parent_dir)
@@ -11,6 +12,7 @@
     SQL,
     call_sql_queries,
     export_tables,
+    do_replace_table,
 )
@@ -91,7 +93,6 @@ def test_do_replace(sources, postgresql_conn, create_db):
 
 
 def test_make_valid_multipolygon(postgresql_conn, sources):
-
     cursor = postgresql_conn.cursor()
     for source in sources:
         make_valid_multipolygon(postgresql_conn, source)
@@ -101,10 +102,52 @@ def test_make_valid_multipolygon(postgresql_conn, sources):
 
 
 def test_make_valid_with_handle_geometry_collection(postgresql_conn, sources):
-
     cursor = postgresql_conn.cursor()
     for source in sources:
         make_valid_with_handle_geometry_collection(postgresql_conn, source)
         handle_geometry_collection_check(cursor, source)
     postgresql_conn.commit()
     cursor.close()
+
+
+def test_unretired_entities(postgresql_conn):
+    source = "certificate-of-immunity"
+    table = "old_entity"
+
+    def make_sqlite3_conn(rows):
+        test_conn = sqlite3.connect(":memory:")
+        test_conn.cursor().execute(
+            "CREATE TABLE old_entity ("
+            "end_date TEXT, entity INTEGER, entry_date TEXT, notes TEXT, "
+            "old_entity TEXT PRIMARY KEY, start_date TEXT, status TEXT)"
+        )
+        for row in rows:
+            test_conn.cursor().execute(
+                "INSERT INTO old_entity VALUES (?, ?, ?, ?, ?, ?, ?)", row
+            )
+
+        return test_conn
+
+    for filename, sqlite_conn, expected_count in [
+        ("exported_old_entity_2.csv", make_sqlite3_conn([]), 1),
+        ("exported_old_entity_3.csv", make_sqlite3_conn([]), 0),
+        ("exported_old_entity_4.csv", make_sqlite3_conn([]), 0),
+        ("exported_old_entity_4.csv", make_sqlite3_conn([]), 0),
+        (
+            "exported_old_entity_4.csv",
+            make_sqlite3_conn([("", "", "", "", "2300000", "", "")]),
+            5,
+        ),
+    ]:
+        for file in ["exported_old_entity_1.csv", filename]:
+            csv_filename = os.path.join("tests/test_data/", file)
+
+            do_replace_table(table, source, csv_filename, postgresql_conn, sqlite_conn)
+
+        cursor = postgresql_conn.cursor()
+        cursor.execute(
+            "SELECT COUNT(*) FROM old_entity WHERE old_entity >= 2300000 AND old_entity <= 2300100",
+        )
+        rowcount = cursor.fetchone()[0]
+        cursor.close()
+        assert rowcount == expected_count
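
Each case in test_unretired_entities loads exported_old_entity_1.csv (five retired entities, 2300001-2300005) and then a second file, asserting how many rows in the 2300000-2300100 range survive: a one-row export replaces the set (1), a header-only or blank export against an empty sqlite table clears it (0), and a blank export while sqlite still holds an old_entity row leaves all five in place (5). A hedged way to run just this test, assuming the repo's postgres fixture is configured locally:

import subprocess

# Assumed invocation; the integration suite needs a reachable postgres
# (e.g. WRITE_DATABASE_URL exported) for the postgresql_conn fixture.
subprocess.run(
    ["pytest", "tests/integration/pg_load/test_load.py::test_unretired_entities", "-v"],
    check=False,
)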
6 changes: 6 additions & 0 deletions tests/test_data/exported_old_entity_1.csv
@@ -0,0 +1,6 @@
+entity|old_entity|entry_date|start_date|end_date|status|notes|dataset
+|2300001||||410||certificate-of-immunity
+|2300002||||410||certificate-of-immunity
+|2300003||||410||certificate-of-immunity
+|2300004||||410||certificate-of-immunity
+|2300005||||410||certificate-of-immunity
2 changes: 2 additions & 0 deletions tests/test_data/exported_old_entity_2.csv
@@ -0,0 +1,2 @@
+entity|old_entity|entry_date|start_date|end_date|status|notes|dataset
+|2300100||||410||certificate-of-immunity
1 change: 1 addition & 0 deletions tests/test_data/exported_old_entity_3.csv
@@ -0,0 +1 @@
+entity|old_entity|entry_date|start_date|end_date|status|notes|dataset
0 changes: 0 additions & 0 deletions tests/test_data/exported_old_entity_4.csv
Empty file.
