From 161183cccb4164bd0cddf633e5fe5267c6d5a99c Mon Sep 17 00:00:00 2001
From: Samriti Sadhu
Date: Tue, 10 Oct 2023 11:40:58 +0100
Subject: [PATCH] Optimizing the make valid geometry query

---
Notes (text between the "---" marker and the diffstat is ignored by `git am`):

 * Both make-valid UPDATE statements take a new `source` argument that is
   bound to `dataset = %s`, so each call only touches one dataset.
 * The row filter
       (ST_IsSimple(g) AND NOT ST_IsValid(g)) OR NOT ST_IsSimple(g)
   is logically the same as
       NOT ST_IsValid(g) OR NOT ST_IsSimple(g).
 * handle_geometry_collection_check filters on 'ST_GeometryCollection' so
   that it inspects the rows targeted by
   make_valid_with_handle_geometry_collection.
 * NOTE(review): indentation of context lines was reconstructed; confirm
   against the target tree before applying.

 task/pgload/load.py                    | 33 ++++++++++------
 tests/integration/pg_load/test_load.py | 52 ++++++++++++++------------
 2 files changed, 49 insertions(+), 36 deletions(-)

diff --git a/task/pgload/load.py b/task/pgload/load.py
index 3073fb8..01e03ad 100644
--- a/task/pgload/load.py
+++ b/task/pgload/load.py
@@ -117,9 +117,9 @@ def do_replace(source, tables_to_export=None):
         logger.info(f"Finished loading from database: {source} table: {table}")
 
         if source != "entity" and table == "entity":
-            make_valid_multipolygon(connection)
+            make_valid_multipolygon(connection, source)
 
-            make_valid_with_handle_geometry_collection(connection)
+            make_valid_with_handle_geometry_collection(connection, source)
 
 
 def remove_invalid_datasets(valid_datasets):
@@ -161,15 +161,18 @@ def call_sql_queries(source, table, csv_filename, fieldnames, sql, cursor):
         logger.info(f"No data found in database: {source} table: {table}")
 
 
-def make_valid_with_handle_geometry_collection(connection):
+def make_valid_with_handle_geometry_collection(connection, source):
     make_valid_with_handle_geometry_collection = """
         UPDATE entity SET geometry = ST_CollectionExtract(ST_MakeValid(geometry))
-        WHERE geometry IS NOT NULL AND NOT ST_IsValid(geometry)
-        AND ST_GeometryType(ST_MakeValid(geometry)) = 'ST_GeometryCollection';
+        WHERE geometry IS NOT NULL
+        AND ST_GeometryType(ST_MakeValid(geometry)) = 'ST_GeometryCollection' AND dataset = %s
+        AND (
+            (ST_IsSimple(geometry) AND NOT ST_IsValid(geometry))
+            OR NOT ST_IsSimple(geometry));
         """.strip()
 
     with connection.cursor() as cursor:
-        cursor.execute(make_valid_with_handle_geometry_collection)
+        cursor.execute(make_valid_with_handle_geometry_collection, (source,))
         rowcount = cursor.rowcount
 
     connection.commit()
@@ -178,15 +181,21 @@ def make_valid_with_handle_geometry_collection(connection):
     )
 
 
-def make_valid_multipolygon(connection):
+def make_valid_multipolygon(connection, source):
     make_valid_multipolygon = """
-        UPDATE entity set geometry = ST_MakeValid(geometry)
-        WHERE geometry IS NOT NULL AND NOT ST_IsValid(geometry)
-        AND ST_GeometryType(ST_MakeValid(geometry)) = 'ST_MultiPolygon';
-        """.strip()
+        UPDATE entity
+        SET geometry = ST_MakeValid(geometry)
+        WHERE geometry IS NOT NULL
+        AND ST_GeometryType(ST_MakeValid(geometry)) = 'ST_MultiPolygon'
+        AND dataset = %s
+        AND (
+            (ST_IsSimple(geometry) AND NOT ST_IsValid(geometry))
+            OR NOT ST_IsSimple(geometry));
+    """.strip()
+
 
     with connection.cursor() as cursor:
-        cursor.execute(make_valid_multipolygon)
+        cursor.execute(make_valid_multipolygon, (source,))
         rowcount = cursor.rowcount
 
     connection.commit()
diff --git a/tests/integration/pg_load/test_load.py b/tests/integration/pg_load/test_load.py
index 07ac50b..81647eb 100644
--- a/tests/integration/pg_load/test_load.py
+++ b/tests/integration/pg_load/test_load.py
@@ -22,27 +22,32 @@ def sources():
 
 
 # function to check if invalid data is updated correctly
-def multipolygon_check(cursor):
-    cursor.execute(
-        """
+def multipolygon_check(cursor, source):
+    cursor.execute("""
         SELECT COUNT(*) FROM entity
-        WHERE geometry IS NOT NULL AND NOT ST_IsValid(geometry)
-        AND ST_GeometryType(ST_MakeValid(geometry)) = 'ST_MultiPolygon';
-    """
-    )
+        WHERE geometry IS NOT NULL
+        AND ST_GeometryType(ST_MakeValid(geometry)) = 'ST_MultiPolygon'
+        AND dataset = %s
+        AND (
+            (ST_IsSimple(geometry) AND NOT ST_IsValid(geometry))
+            OR NOT ST_IsSimple(geometry));
+    """, (source,))
     rowcount = cursor.fetchone()[0]
     assert rowcount == 0
 
 
 # function to check if invalid data is updated correctly
-def handle_geometry_collection_check(cursor):
+def handle_geometry_collection_check(cursor, source):
     cursor.execute(
         """
         SELECT COUNT(*) FROM entity
-        WHERE geometry IS NOT NULL AND NOT ST_IsValid(geometry)
-        AND ST_GeometryType(ST_MakeValid(geometry)) = 'ST_GeometryCollection';
-    """
-    )
+        WHERE geometry IS NOT NULL
+        AND ST_GeometryType(ST_MakeValid(geometry)) = 'ST_GeometryCollection'
+        AND dataset = %s
+        AND (
+            (ST_IsSimple(geometry) AND NOT ST_IsValid(geometry))
+            OR NOT ST_IsSimple(geometry));
+    """, (source,))
     rowcount = cursor.fetchone()[0]
     assert rowcount == 0
 
@@ -80,21 +85,20 @@ def test_do_replace(sources, postgresql_conn, create_db):
         print("Testing do_replace method for source successful:: ", source)
 
 
-def test_make_valid_multipolygon(postgresql_conn):
-    cursor = postgresql_conn.cursor()
-    make_valid_multipolygon(postgresql_conn)
-
-    multipolygon_check(cursor)
+def test_make_valid_multipolygon(postgresql_conn, sources):
+    cursor = postgresql_conn.cursor()
+    for source in sources:
+        make_valid_multipolygon(postgresql_conn, source)
+        multipolygon_check(cursor, source)
 
     postgresql_conn.commit()
     cursor.close()
-
-
-def test_make_valid_with_handle_geometry_collection(postgresql_conn):
+
+
+def test_make_valid_with_handle_geometry_collection(postgresql_conn, sources):
     cursor = postgresql_conn.cursor()
-    make_valid_with_handle_geometry_collection(postgresql_conn)
-
-    handle_geometry_collection_check(cursor)
-
+    for source in sources:
+        make_valid_with_handle_geometry_collection(postgresql_conn, source)
+        handle_geometry_collection_check(cursor, source)
     postgresql_conn.commit()
     cursor.close()