Remove database connection during query init #3128

Open
wants to merge 92 commits into base: master
Changes from 72 of 92 commits
ba7f5c3
Preflight checks
greenape Sep 28, 2020
7978db5
Fixup
greenape Sep 28, 2020
0076706
Run preflight in get_query
greenape Sep 28, 2020
fd1d05c
Set columns post-init where all are required
greenape Sep 28, 2020
1428e83
Need the class for mro
greenape Sep 28, 2020
370a92c
Fix some typos, and call hooks in a sensible order
greenape Sep 29, 2020
aa09330
Limit the default tables
greenape Sep 29, 2020
4adaeac
Pass column names when creating table from query
greenape Sep 29, 2020
3056640
Ensure preflight is called before getting column names in head
greenape Sep 29, 2020
3446e70
Move geom column check to preflight
greenape Sep 29, 2020
4d557d8
Protect all tables, drop events prefix
greenape Oct 2, 2020
1b4b901
Fix not running preflight before store,
greenape Oct 2, 2020
a9e0757
Syntax, fix tests
greenape Oct 2, 2020
b6c8413
Update test to reflect one less link in dep chain
greenape Oct 2, 2020
f4802a8
Disallow None for start/stop
greenape Oct 2, 2020
5a75692
Small fixups
greenape Oct 2, 2020
9e21ef2
Fix error return on preflight fail
greenape Oct 5, 2020
26c964d
Wrap geom table arg in a table
greenape Oct 5, 2020
6e556ac
Update tests
greenape Oct 5, 2020
93e54e1
Update table names in test
greenape Oct 12, 2020
3a9b2ae
Match original table order
greenape Oct 12, 2020
85be37c
Update passed table in docs
greenape Oct 12, 2020
d33ec65
Update table order in dependency graph
greenape Oct 12, 2020
0045814
Sort provided tables
greenape Oct 12, 2020
b47d344
Update query hashed in approval test
greenape Oct 12, 2020
a86895c
Fix fast forwarding state machine multiple times, add missing classes…
greenape Oct 12, 2020
f809daa
Match cache half life
greenape Oct 12, 2020
43faa71
Preflight subs table
greenape Oct 12, 2020
bc7342e
Fix dependency graph order in test
greenape Oct 12, 2020
8c7bf49
Update fm tests
greenape Oct 13, 2020
d97c76f
Fix not checking table arg type and geom table construction,
greenape Oct 13, 2020
a4e9693
Handle None table case
greenape Oct 13, 2020
7f464ff
Store zero-score cache classes in db
greenape Oct 16, 2020
07dc063
Fix table name typo
greenape Oct 16, 2020
68b2cc9
Syntax for touch_cache
greenape Oct 19, 2020
9c13057
Syntax for sort
greenape Oct 19, 2020
b72b207
Wrap in subquery
greenape Oct 19, 2020
adadf5b
Missing bracket
greenape Oct 20, 2020
aef870e
Add missing cast
greenape Oct 20, 2020
1bce844
Use subquery for get size of cache
greenape Oct 20, 2020
d6d5920
Missed a bracket
greenape Oct 20, 2020
86e6c22
Remove zero cache from internal tables in test helper
greenape Oct 20, 2020
1efdd5e
Centralise list of internal tables in helper
greenape Oct 20, 2020
c20f4a9
Don't call the database from init of RandomSystemRows
greenape Oct 22, 2020
3f8ed90
Correct import
greenape Oct 22, 2020
96ca0eb
Update CHANGELOG.md
greenape Oct 22, 2020
d9d08e7
Move geotable column checks back into init (no db required)
greenape Oct 22, 2020
c3507a4
Update CHANGELOG.md
greenape Oct 22, 2020
4baea78
Handle empty table list
greenape Nov 13, 2020
0c6f125
Add missing arg
greenape Nov 13, 2020
1b78892
Fix columns check
greenape Dec 18, 2020
c048210
Clean up table signature a bit
greenape Dec 18, 2020
8c50b3b
Fixups after rebase
greenape Oct 21, 2021
61ed2fd
Fix another rebase change
greenape Oct 21, 2021
5a6bb11
More rebase issues
greenape Oct 21, 2021
d4a2e85
Fixup geotable, fast forward even if already fast forwarded
greenape Oct 21, 2021
102dbf5
Finish then write meta
greenape Oct 21, 2021
faebea2
Supply columns for Table
greenape Nov 8, 2021
f906ed0
Fix missing date preflight error test
greenape Nov 8, 2021
e2434bc
Update approval test
greenape Nov 8, 2021
23b2728
Call preflight to raise error
greenape Nov 8, 2021
8f2d3c5
Fix some usages of event. and update exceptions, Table usages
greenape Jun 16, 2022
2584008
lint
greenape Jun 27, 2022
6888c06
Update test_query_object_construction.test_construct_query.approved.txt
greenape Jun 27, 2022
6c1cf3b
Merge branch 'master' into asyncio
greenape Oct 23, 2024
0b5cf0c
Make FlowDBTable abstract
greenape Oct 23, 2024
db6c404
Cache protected classes
greenape Oct 23, 2024
95bf78b
Correct docstring
greenape Oct 23, 2024
8d1eb11
Fix column name typos
greenape Oct 23, 2024
0ce6a55
Remove unused imports in preflight
greenape Oct 23, 2024
dbbca00
Check for empty string as a table name
greenape Oct 23, 2024
a6bfdd8
Dedupe tables
greenape Oct 23, 2024
aece3d9
Simplify hooks iteration
greenape Oct 23, 2024
f317937
Remove a couple of unused imports
greenape Oct 23, 2024
0de3df1
Fix dodgy suggestion
greenape Oct 23, 2024
137d886
Lint
greenape Oct 23, 2024
ddd660d
Call preflight on the query object
greenape Oct 23, 2024
a57e843
Update approval tests
greenape Oct 23, 2024
8a09e6c
Need to actually depend on the nested query to force it to store first
greenape Oct 23, 2024
37de99e
Update table param
greenape Oct 23, 2024
1d77308
Add missing columns arguments
greenape Oct 23, 2024
52c0bdc
Pass up preflight errors, fix error test
greenape Oct 23, 2024
302abc7
Fix typo in example
greenape Oct 23, 2024
e1510e0
Log at error level for exception
greenape Oct 23, 2024
267fac5
Add missing mpl headers
greenape Oct 23, 2024
36b6b3e
Merge branch 'asyncio' of github.com:Flowminder/FlowKit into asyncio
greenape Oct 23, 2024
fea6c74
Delete flowmachine/tests/test_model.py
greenape Oct 23, 2024
489c94e
Correct test name
greenape Oct 23, 2024
22fd099
Add missing columns arg
greenape Oct 23, 2024
9a60732
Update default schema test
greenape Oct 24, 2024
bb04fb8
Types for tables
greenape Oct 24, 2024
127d2e9
Merge branch 'master' into asyncio
greenape Nov 5, 2024
6 changes: 6 additions & 0 deletions CHANGELOG.md
@@ -245,6 +245,8 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).

#### Added
- `inflows` and `outflows` exposed via API endpoint + added to flowclient [#2029](https://github.com/Flowminder/FlowKit/issues/2029), [#4866](https://github.com/Flowminder/FlowKit/issues/4866)
- Added a new `@pre_flight` decorator which `Query` subclasses may use to indicate that a method should be run to confirm the query is runnable (see the sketch after this changelog diff)
- Added a new `preflight()` method to `Query`, which calls all applicable pre-flight check methods for the query

### Changed
- __Action Needed__ Airflow updated to version 2.3.3; **backup flowetl_db before applying update** [#4940](https://github.com/Flowminder/FlowKit/pull/4940)
@@ -255,6 +257,9 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
- FlowDB now always creates a role named `flowmachine`.
- Flowmachine will set the state of a query being stored to cancelled if interrupted while the store is running.
- Flowmachine now supports sqlalchemy >=1.4 [#5140](https://github.com/Flowminder/FlowKit/issues/5140)
- `get_cached_query_objects_ordered_by_score` is now a generator. [#3116](https://github.com/Flowminder/FlowKit/issues/3116)
- Queries should no longer require communication with the database during `__init__`; any checks that require database access must now be implemented as a method of the class and use the `@pre_flight` decorator
- When specifying tables in Flowmachine, the `events.` prefix is no longer required.

### Fixed
- Flowmachine now makes the built-in `flowmachine` role owner of cache tables as a post-action when a query is `store`d. [#4714](https://github.com/Flowminder/FlowKit/issues/4714)
@@ -265,6 +270,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
### Removed
- `use_file_flux_sensor` removed entirely. [#2812](https://github.com/Flowminder/FlowKit/issues/2812)
- `Model`, `ModelResult` and `Louvain` have been removed. [#5168](https://github.com/Flowminder/FlowKit/issues/5168)
- `Table` no longer automatically infers columns from the database; they must be specified.

## [1.16.0]

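The changelog entries above describe the new pre-flight mechanism only in prose. Below is a minimal sketch of how a Query subclass might use it; the import path for pre_flight and the check body are assumptions — only the decorator name and the preflight() entry point are taken from this diff.

# Minimal sketch, not the PR's API verbatim: the import path for `pre_flight`
# is assumed, and the check body is illustrative.
from flowmachine.core.query import Query
from flowmachine.core.preflight import pre_flight  # assumed module path


class ExampleQuery(Query):
    def __init__(self, start, stop):
        # No database access here any more; __init__ only records state.
        self.start = start
        self.stop = stop
        super().__init__()

    @pre_flight
    def check_dates(self):
        # Runs when preflight() is called (e.g. just before storing), so
        # failures surface before any SQL is generated or executed.
        if self.start >= self.stop:
            raise ValueError("start must be strictly before stop")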
@@ -92,7 +92,7 @@
"data_events_query = flowmachine.features.TotalLocationEvents(\n",
" start=\"2016-01-01\",\n",
" stop=\"2016-01-08\",\n",
" table=\"events.mds\",\n",
" table=\"mds\",\n",
" spatial_unit=make_spatial_unit(\"versioned-cell\"),\n",
" interval=\"hour\",\n",
")"
5 changes: 4 additions & 1 deletion flowdb/bin/build/0020_schema_cache.sql
@@ -48,4 +48,7 @@ CREATE TABLE IF NOT EXISTS cache.dependencies
CREATE TABLE cache.cache_config (key text, value text);
INSERT INTO cache.cache_config (key, value) VALUES ('half_life', NULL);
INSERT INTO cache.cache_config (key, value) VALUES ('cache_size', NULL);
INSERT INTO cache.cache_config (key, value) VALUES ('cache_protected_period', NULL);

CREATE TABLE cache.zero_cache (object_class text);
INSERT INTO cache.zero_cache (object_class) VALUES ('Table'), ('GeoTable'), ('CallsTable'), ('SmsTable'), ('MdsTable'), ('TopupsTable'), ('ForwardsTable'), ('TacsTable'), ('CellsTable'), ('SitesTable');
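
Because the zero-score list now lives in a table rather than being hard-coded to 'Table', a deployment could in principle extend it. A hedged SQL sketch (the class name here is hypothetical):

-- Hypothetical example: give an additional class a permanent zero cache score.
-- Classes in cache.zero_cache are skipped by touch_cache's score multiplier
-- update (see the change to 0030_utilities.sql below).
INSERT INTO cache.zero_cache (object_class) VALUES ('MyCustomTable');

-- Inspect the currently zero-scored classes:
SELECT object_class FROM cache.zero_cache ORDER BY object_class;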
3 changes: 2 additions & 1 deletion flowdb/bin/build/0030_utilities.sql
@@ -243,9 +243,10 @@
DECLARE score float;
BEGIN
UPDATE cache.cached SET last_accessed = NOW(), access_count = access_count + 1,
cache_score_multiplier = CASE WHEN class='Table' THEN 0 ELSE
cache_score_multiplier = CASE WHEN class=ANY(no_score.classes) THEN 0 ELSE
cache_score_multiplier+POWER(1 + ln(2) / cache_half_life(), nextval('cache.cache_touches') - 2)
END
FROM (SELECT array_agg(object_class) as classes FROM cache.zero_cache) AS no_score
Comment on lines +246 to +249

🛠️ Refactor suggestion

Consider optimizing the cache score calculation performance.

The subquery (SELECT array_agg(object_class) as classes FROM cache.zero_cache) will be executed on every cache touch operation. For better performance, consider:

  1. Using a materialized view that's refreshed when zero_cache changes
  2. Or replacing the correlated subquery with a JOIN

Example optimization using an EXISTS subquery:

-        cache_score_multiplier = CASE WHEN class=ANY(no_score.classes) THEN 0 ELSE
-          cache_score_multiplier+POWER(1 + ln(2) / cache_half_life(), nextval('cache.cache_touches') - 2)
-        END
-        FROM (SELECT array_agg(object_class) as classes FROM cache.zero_cache) AS no_score
+        cache_score_multiplier = CASE 
+          WHEN EXISTS (SELECT 1 FROM cache.zero_cache WHERE object_class = class) THEN 0 
+          ELSE cache_score_multiplier+POWER(1 + ln(2) / cache_half_life(), nextval('cache.cache_touches') - 2)
+        END

WHERE query_id=cached_query_id
RETURNING cache_score(cache_score_multiplier, compute_time, greatest(table_size(tablename, schema), 0.00001)) INTO score;
IF NOT FOUND THEN RAISE EXCEPTION 'Cache record % not found', cached_query_id;
30 changes: 25 additions & 5 deletions flowmachine/flowmachine/core/cache.py
@@ -12,7 +12,7 @@
import sqlalchemy.engine
from contextvars import copy_context
from concurrent.futures import Executor, TimeoutError
from functools import partial
from functools import partial, lru_cache
from sqlalchemy.exc import ResourceClosedError

from typing import TYPE_CHECKING, Tuple, List, Callable, Optional
@@ -29,6 +29,7 @@
from flowmachine.core.query_state import QueryStateMachine, QueryEvent
from flowmachine import __version__


if TYPE_CHECKING:
from .query import Query
from .connection import Connection
@@ -191,8 +192,10 @@ def write_query_to_cache(
if this_thread_is_owner:
logger.debug(f"In charge of executing '{query.query_id}'.")
try:
query.preflight()
query_ddl_ops = ddl_ops_func(name, schema)
except Exception as exc:
q_state_machine.raise_error()
logger.error(f"Error generating SQL. Error was {exc}")
raise exc
logger.debug("Made SQL.")
@@ -204,6 +207,7 @@
)
logger.debug("Executed queries.")
except Exception as exc:
q_state_machine.raise_error()
logger.error(f"Error executing SQL. Error was {exc}")
raise exc
if analyze:
@@ -219,6 +223,7 @@
executed_sql=";\n".join(query_ddl_ops),
)
except Exception as exc:
q_state_machine.raise_error()
logger.error(f"Error writing cache metadata. Error was {exc}")
raise exc
q_state_machine.finish()
@@ -229,7 +234,6 @@
finally:
if this_thread_is_owner and not q_state_machine.is_finished_executing:
q_state_machine.cancel()

q_state_machine.wait_until_complete(sleep_duration=sleep_duration)
if q_state_machine.is_completed:
return query
@@ -301,6 +305,7 @@ def write_cache_metadata(
psycopg2.Binary(self_storage),
),
)
logger.debug("Touching cache.", query_id=query.query_id, query=str(query))
connection.exec_driver_sql(
"SELECT touch_cache(%(ident)s);", dict(ident=query.query_id)
)
@@ -334,6 +339,7 @@ def touch_cache(connection: "Connection", query_id: str) -> float:
The new cache score
"""
try:
logger.debug("Touching cache.", query_id=query_id)
with connection.engine.begin() as trans:
return float(
trans.exec_driver_sql(f"SELECT touch_cache('{query_id}')").fetchall()[
@@ -481,6 +487,19 @@ def get_query_object_by_id(connection: "Connection", query_id: str) -> "Query":
raise ValueError(f"Query id '{query_id}' is not in cache on this connection.")


@lru_cache(maxsize=1)
def _get_protected_classes():
from flowmachine.core.events_table import events_table_map
from flowmachine.core.infrastructure_table import infrastructure_table_map

return [
"Table",
"GeoTable",
*[cls.__name__ for cls in events_table_map.values()],
*[cls.__name__ for cls in infrastructure_table_map.values()],
]


def get_cached_query_objects_ordered_by_score(
connection: "Connection",
protected_period: Optional[int] = None,
@@ -502,14 +521,15 @@
Returns a list of cached Query objects with their on disk sizes

"""

protected_period_clause = (
(f" AND NOW()-created > INTERVAL '{protected_period} seconds'")
if protected_period is not None
else " AND NOW()-created > (cache_protected_period()*INTERVAL '1 seconds')"
)
qry = f"""SELECT query_id, table_size(tablename, schema) as table_size
FROM cache.cached
WHERE cached.class!='Table' AND cached.class!='GeoTable'
WHERE NOT (cached.class=ANY(ARRAY{_get_protected_classes()}))
{protected_period_clause}
ORDER BY cache_score(cache_score_multiplier, compute_time, table_size(tablename, schema)) ASC
"""
@@ -689,9 +709,9 @@ def get_size_of_cache(connection: "Connection") -> int:
Number of bytes in total used by cache tables

"""
sql = """SELECT sum(table_size(tablename, schema)) as total_bytes
sql = f"""SELECT sum(table_size(tablename, schema)) as total_bytes
FROM cache.cached
WHERE cached.class!='Table' AND cached.class!='GeoTable'"""
WHERE NOT (cached.class=ANY(ARRAY{_get_protected_classes()}))"""
cache_bytes = connection.fetch(sql)[0][0]
return 0 if cache_bytes is None else int(cache_bytes)

21 changes: 21 additions & 0 deletions flowmachine/flowmachine/core/errors/flowmachine_errors.py
@@ -6,6 +6,27 @@
"""
Custom errors raised by flowmachine.
"""
from typing import List, Dict


class PreFlightFailedException(Exception):
"""
Exception indicating that preflight checks for a query failed.

Parameters
----------
query_id : str
Identifier of the query
errors : dict
Mapping from query representations to lists of exceptions raised in preflight
"""

def __init__(self, query_id: str, errors: Dict[str, List[Exception]]):
self.errors = errors
self.query_id = query_id
Exception.__init__(
self, f"Pre-flight failed for '{self.query_id}'. Errors: {errors}"
)


class StoreFailedException(Exception):
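
A short sketch of consuming the new exception; the import path matches the file added above, and ExampleQuery is the hypothetical subclass from the earlier sketch.

from flowmachine.core.errors.flowmachine_errors import PreFlightFailedException

query = ExampleQuery(start="2016-01-08", stop="2016-01-01")  # bad range

try:
    query.preflight()
except PreFlightFailedException as exc:
    # exc.errors maps query representations to the list of exceptions
    # raised by their pre-flight checks.
    for rep, errors in exc.errors.items():
        for error in errors:
            print(f"{rep}: {error}")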
120 changes: 120 additions & 0 deletions flowmachine/flowmachine/core/events_table.py
@@ -0,0 +1,120 @@
from flowmachine.core.flowdb_table import FlowDBTable


class EventsTable(FlowDBTable):
def __init__(self, *, name, columns):
super().__init__(schema="events", name=name, columns=columns)


class CallsTable(EventsTable):
all_columns = [
"id",
"outgoing",
"datetime",
"duration",
"network",
"msisdn",
"msisdn_counterpart",
"location_id",
"imsi",
"imei",
"tac",
"operator_code",
"country_code",
]
Comment on lines +15 to +29

🛠️ Refactor suggestion

Refactor duplicate column definitions.

The all_columns lists in CallsTable, ForwardsTable, and SmsTable are identical. Consider extracting these to a common base class or shared constant to follow the DRY principle.

# Create a base class for common columns
class CommonEventsTable(EventsTable):
    all_columns = [
        "id",
        "outgoing",
        "datetime",
        "network",
        "msisdn",
        "msisdn_counterpart",
        "location_id",
        "imsi",
        "imei",
        "tac",
        "operator_code",
        "country_code",
    ]

class CallsTable(CommonEventsTable):
    def __init__(self, *, columns=None):
        super().__init__(name="calls", columns=columns)

# Similar changes for ForwardsTable and SmsTable

Also applies to: 31-44, 51-64


def __init__(self, *, columns=None):
super().__init__(name="calls", columns=columns)

⚠️ Potential issue

Standardise the constructor parameter requirements.

The columns parameter is inconsistently handled across table classes:

  • Optional in CallsTable and ForwardsTable
  • Required in SmsTable, MdsTable, and TopupsTable

This inconsistency could lead to confusion and maintenance issues.

Consider standardising the constructor signatures. If columns should be optional, apply this pattern:

-    def __init__(self, *, columns):
+    def __init__(self, *, columns=None):

Also applies to: 50-51, 70-71, 91-92, 114-115



class ForwardsTable(EventsTable):
all_columns = [
"id",
"outgoing",
"datetime",
"network",
"msisdn",
"msisdn_counterpart",
"location_id",
"imsi",
"imei",
"tac",
"operator_code",
"country_code",
]

def __init__(self, *, columns=None):
super().__init__(name="forwards", columns=columns)


class SmsTable(EventsTable):
all_columns = [
"id",
"outgoing",
"datetime",
"network",
"msisdn",
"msisdn_counterpart",
"location_id",
"imsi",
"imei",
"tac",
"operator_code",
"country_code",
]

def __init__(self, *, columns):
super().__init__(name="sms", columns=columns)


class MdsTable(EventsTable):
all_columns = [
"id",
"datetime",
"duration",
"volume_total",
"volume_upload",
"volume_download",
"msisdn",
"location_id",
"imsi",
"imei",
"tac",
"operator_code",
"country_code",
]

def __init__(self, *, columns):
super().__init__(name="mds", columns=columns)


class TopupsTable(EventsTable):
all_columns = [
"id",
"datetime",
"type",
"recharge_amount",
"airtime_fee",
"tax_and_fee",
"pre_event_balance",
"post_event_balance",
"msisdn",
"location_id",
"imsi",
"imei",
"tac",
"operator_code",
"country_code",
]

def __init__(self, *, columns):
super().__init__(name="topups", columns=columns)

🛠️ Refactor suggestion

Consider using composition to manage column definitions.

The current implementation has significant duplication in column definitions. Consider restructuring using composition to improve maintainability:

class ColumnSets:
    COMMON = [
        "id",
        "datetime",
        "location_id",
        "msisdn",
    ]
    
    DEVICE = [
        "imsi",
        "imei",
        "tac",
    ]
    
    NETWORK = [
        "operator_code",
        "country_code",
    ]
    
    INTERACTION = [
        "msisdn_counterpart",
        "network",
        "outgoing",
    ]

class CallsTable(EventsTable):
    all_columns = [
        *ColumnSets.COMMON,
        *ColumnSets.DEVICE,
        *ColumnSets.NETWORK,
        *ColumnSets.INTERACTION,
        "duration",
    ]

This approach would:

  • Make column groups more maintainable
  • Reduce duplication
  • Make it easier to ensure consistency across tables
  • Facilitate adding new tables with similar column patterns



events_table_map = dict(
calls=CallsTable,
sms=SmsTable,
mds=MdsTable,
topups=TopupsTable,
forwards=ForwardsTable,
)
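
The map above gives one plausible way to resolve bare table names now that the events. prefix is dropped. A sketch — only events_table_map comes from this file; the resolver helper is hypothetical:

from flowmachine.core.events_table import events_table_map


def resolve_events_table(name, columns=None):
    """Hypothetical helper: map a bare table name like 'calls' to its class."""
    try:
        table_cls = events_table_map[name]
    except KeyError:
        raise ValueError(
            f"'{name}' is not an events table; expected one of {sorted(events_table_map)}"
        )
    # Passing columns=None selects all_columns via FlowDBTable.__init__.
    return table_cls(columns=columns)


calls = resolve_events_table("calls")                  # all columns
mds = resolve_events_table("mds", columns=["msisdn"])  # subset of columns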
20 changes: 20 additions & 0 deletions flowmachine/flowmachine/core/flowdb_table.py
@@ -0,0 +1,20 @@
from abc import ABCMeta

from flowmachine.core.table import Table



class FlowDBTable(Table, metaclass=ABCMeta):
def __init__(self, *, name, schema, columns):
if columns is None:
columns = self.all_columns
if set(columns).issubset(self.all_columns):
super().__init__(schema=schema, name=name, columns=columns)
else:
raise ValueError(
f"Columns {columns} must be a subset of {self.all_columns}"
)

⚠️ Potential issue

Several improvements needed in constructor implementation.

The constructor has the following issues:

  1. self.all_columns is accessed before it's defined
  2. Missing type hints and parameter documentation
  3. Missing validation for name and schema parameters
  4. Potential AttributeError if all_columns is not defined in a subclass

Consider applying these improvements:

-    def __init__(self, *, name, schema, columns):
+    def __init__(self, *, name: str, schema: str, columns: list[str] | None = None) -> None:
+        """
+        Initialize a FlowDB table with specified name, schema, and columns.
+
+        Args:
+            name: The name of the table
+            schema: The database schema containing the table
+            columns: List of column names. If None, uses all available columns
+
+        Raises:
+            ValueError: If provided columns are not valid or if name/schema are empty
+            AttributeError: If all_columns is not defined in the subclass
+        """
+        if not name or not schema:
+            raise ValueError("Both name and schema must be non-empty strings")
+
+        if not hasattr(self, 'all_columns'):
+            raise AttributeError("Subclass must define all_columns")
+
         if columns is None:
             columns = self.all_columns
         if set(columns).issubset(self.all_columns):
             super().__init__(schema=schema, name=name, columns=columns)
         else:
             raise ValueError(
-                f"Columns {columns} must be a subset of {self.all_columns}"
+                f"Invalid columns: {set(columns) - set(self.all_columns)}. "
+                f"Must be a subset of: {self.all_columns}"
             )


@property
def all_columns(self):
raise NotImplementedError
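
A short sketch of the validation behaviour this base class gives every concrete table, using CallsTable from events_table.py above:

from flowmachine.core.events_table import CallsTable

# Subset of CallsTable.all_columns: accepted.
subset = CallsTable(columns=["msisdn", "datetime", "location_id"])

# Not a subset: FlowDBTable.__init__ raises.
try:
    CallsTable(columns=["msisdn", "no_such_column"])
except ValueError as exc:
    print(exc)  # Columns [...] must be a subset of [...]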
19 changes: 13 additions & 6 deletions flowmachine/flowmachine/core/geotable.py
@@ -6,6 +6,7 @@
"""
Simple utility class that represents tables with geometry.
"""
from typing import Optional, List

from . import Table
from .mixins import GeoDataMixin
@@ -47,19 +48,25 @@ class GeoTable(GeoDataMixin, Table):
"""

def __init__(
self, name=None, schema=None, columns=None, geom_column="geom", gid_column=None
self,
name: str,
*,
schema: Optional[str] = None,
columns: List[str],
geom_column: str = "geom",
gid_column: Optional[str] = None,
):
self.geom_column = geom_column
self.gid_column = gid_column
super().__init__(name=name, schema=schema, columns=columns)
if geom_column not in self.column_names:
if self.geom_column not in columns:
raise ValueError(
"geom_column: {} is not a column in this table.".format(geom_column)
f"geom_column: {self.geom_column} is not a column in this table."
)
if gid_column is not None and gid_column not in self.column_names:
if self.gid_column is not None and self.gid_column not in columns:
raise ValueError(
"gid_column: {} is not a column in this table.".format(gid_column)
f"gid_column: {self.gid_column} is not a column in this table."
)
super().__init__(name=name, schema=schema, columns=columns)
Comment on lines +51 to +69

💡 Codebase verification

Based on the comprehensive results from our verification scripts, I can now provide a final response:

Required columns parameter change is safe to proceed

The codebase analysis shows that:

  • All existing GeoTable instantiations in tests have already been updated to include the required columns parameter
  • GeoTable is only imported in the test file and core init file
  • No classes inherit from GeoTable
  • The example in the docstring needs to be updated to include the required columns parameter

The change to make columns a required parameter is safe, but the docstring example needs to be updated to match the new signature:

  • flowmachine/flowmachine/core/geotable.py: Update the example from >>> t = GeoTable(name="admin3", schema="geography") to include the required columns parameter
🔗 Analysis chain

Verify impact of required columns parameter

The change to make columns required aligns with removing database inference, but we should verify all existing GeoTable instantiations have been updated.


Let me generate additional scripts to get a more comprehensive view of the codebase.

The previous results showed test cases that have been updated with the required columns parameter, but we need to verify all actual usage in the codebase. These new scripts will help us get a complete picture of GeoTable usage and inheritance patterns.

🏁 Scripts executed

The following scripts were executed for the analysis:

Script:

#!/bin/bash
# Search for GeoTable instantiations that might need updating
rg "GeoTable\(" --type py -B 2 -A 2

# Look for any existing tests to ensure they've been updated
rg "test.*GeoTable" --type py

Length of output: 2844


Script:

#!/bin/bash
# Search for GeoTable instantiations in the entire codebase
ast-grep --pattern 'GeoTable($$$)'

# Search for any imports of GeoTable to find all files that might use it
rg "from.*import.*GeoTable" --type py

# Look for any class that might inherit from GeoTable
ast-grep --pattern 'class $_ ($$$GeoTable$$$)'

Length of output: 7209


def _geo_augmented_query(self):
if self.gid_column is None:
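
Per the verification above, the GeoTable docstring example needs updating to pass the now-required columns argument. A hedged sketch of what the updated example might look like — the column list is illustrative, not taken from the FlowDB geography schema:

from flowmachine.core import GeoTable

t = GeoTable(
    name="admin3",
    schema="geography",
    # Illustrative column names; geom_column defaults to "geom" and must
    # appear in the supplied columns, per the validation in __init__ above.
    columns=["admin3name", "geom"],
)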