ibis-project · NickCrews · Jun 29, 2024 · Aug 10, 2024 · Aug 10, 2024 · Aug 10, 2024
diff --git a/ibis/backends/duckdb/__init__.py b/ibis/backends/duckdb/__init__.py
@@ -1643,6 +1643,7 @@ def _register_udf(self, udf_node: ops.ScalarUDF):
             for param in udf_node.__signature__.parameters.values()
         ]
         output_type = type_mapper.to_string(udf_node.dtype)
+        config = udf_node.__config__
 
         def register_udf(con):
             return con.create_function(
@@ -1651,6 +1652,7 @@ def register_udf(con):
                 input_types,
                 output_type,
                 type=_UDF_INPUT_TYPE_MAPPING[udf_node.__input_type__],
+                **config,
             )
 
         return register_udf

diff --git a/ibis/backends/sql/compilers/postgres.py b/ibis/backends/sql/compilers/postgres.py
@@ -160,13 +160,13 @@ def _compile_python_udf(self, udf_node: ops.ScalarUDF):
         type_mapper = self.type_mapper
         argnames = udf_node.argnames
         return """\
-    CREATE OR REPLACE FUNCTION {ident}({signature})
-    RETURNS {return_type}
-    LANGUAGE {language}
-    AS $$
-    {source}
-    return {name}({args})
-    $$""".format(
+CREATE OR REPLACE FUNCTION {ident}({signature})
+RETURNS {return_type}
+LANGUAGE {language}
+AS $$
+{source}
+return {name}({args})
+$$""".format(
             name=type(udf_node).__name__,
             ident=self.__sql_name__(udf_node),
             signature=", ".join(

diff --git a/ibis/backends/sql/rewrites.py b/ibis/backends/sql/rewrites.py
@@ -3,6 +3,7 @@
 from __future__ import annotations
 
 import operator
+import sys
 from collections.abc import Mapping
 from functools import reduce
 from typing import TYPE_CHECKING, Any
@@ -230,6 +231,9 @@ def complexity(node):
     def accum(node, *args):
         if isinstance(node, ops.Field):
             return 1
+        elif isinstance(node, ops.Impure):
+            # consider (potentially) impure functions maximally complex
+            return sys.maxsize
         else:
             return 1 + sum(args)
 

diff --git a/ibis/backends/tests/test_impure.py b/ibis/backends/tests/test_impure.py
@@ -0,0 +1,221 @@
+from __future__ import annotations
+
+import sys
+
+import pytest
+
+import ibis
+import ibis.common.exceptions as com
+from ibis import _
+from ibis.backends.tests.errors import Py4JJavaError
+
+tm = pytest.importorskip("pandas.testing")
+
+pytestmark = pytest.mark.xdist_group("impure")
+
+no_randoms = [
+    pytest.mark.notimpl(
+        ["polars", "druid", "risingwave"], raises=com.OperationNotDefinedError
+    ),
+]
+
+no_udfs = [
+    pytest.mark.notyet("datafusion", raises=NotImplementedError),
+    pytest.mark.notimpl(
+        [
+            "bigquery",
+            "clickhouse",
+            "druid",
+            "exasol",
+            "impala",
+            "mssql",
+            "mysql",
+            "oracle",
+            "trino",
+            "risingwave",
+        ]
+    ),
+    pytest.mark.notyet(
+        "flink",
+        condition=sys.version_info >= (3, 11),
+        raises=Py4JJavaError,
+        reason="Docker image has Python 3.10, results in `cloudpickle` version mismatch",
+    ),
+]
+
+no_uuids = [
+    pytest.mark.notimpl(
+        ["druid", "exasol", "oracle", "polars", "pyspark", "risingwave"],
+        raises=com.OperationNotDefinedError,
+    ),
+    pytest.mark.notyet("mssql", reason="Unrelated bug: Incorrect syntax near '('"),
+]
+
+
+@ibis.udf.scalar.python(side_effects=True)
+def my_random(x: float) -> float:
+    # need to make the whole UDF self-contained for postgres to work
+    import random
+
+    return random.random()  # noqa: S311
+
+
+mark_impures = pytest.mark.parametrize(
+    "impure",
+    [
+        pytest.param(lambda _: ibis.random(), marks=no_randoms, id="random"),
+        pytest.param(
+            lambda _: ibis.uuid().cast(str).contains("a").ifelse(1, 0),
+            marks=[
+                *no_uuids,
+                pytest.mark.notyet("impala", reason="instances are uncorrelated"),
+            ],
+            id="uuid",
+        ),
+        pytest.param(
+            lambda table: my_random(table.float_col),
+            marks=[
+                *no_udfs,
+                pytest.mark.notyet(["flink"], reason="instances are uncorrelated"),
+            ],
+            id="udf",
+        ),
+    ],
+)
+
+
+# You can work around this by .cache()ing the table.
+@pytest.mark.notyet("sqlite", reason="instances are uncorrelated")
+@mark_impures
+def test_impure_correlated(alltypes, impure):
+    # An "impure" expression is random(), uuid(), or some other non-deterministic UDF.
+    # If we evaluate it for two different rows in the same relation,
+    # we might get different results. This is expected.
+    # But, as soon as we .select() it into a new relation, then that "locks in" the
+    # value, and any further references to it will be the same.
+    # eg if you look at the following SQL:
+    # WITH
+    #   t AS (SELECT random() AS common)
+    # SELECT common as x, common as y FROM t
+    # Then both x and y should have the same value.
+    expr = alltypes.select(common=impure(alltypes)).select(x=_.common, y=_.common)
+    df = expr.execute()
+    tm.assert_series_equal(df.x, df.y, check_names=False)
+
+
+# You can work around this by .cache()ing the table.
+@pytest.mark.notyet("sqlite", reason="instances are uncorrelated")
+@mark_impures
+def test_chained_selections(alltypes, impure):
+    # https://github.com/ibis-project/ibis/issues/8921#issue-2234327722
+    # This is a slightly more complex version of test_impure_correlated.
+    # consider this SQL:
+    # WITH
+    #   t AS (SELECT random() AS num)
+    # SELECT num, num > 0.5 AS isbig FROM t
+    # We would expect that the value of num and isbig are consistent,
+    # since we "lock in" the value of num by selecting it into t.
+    t = alltypes.select(num=impure(alltypes))
+    t = t.mutate(isbig=(t.num > 0.5))
+    df = t.execute()
+    df["expected"] = df.num > 0.5
+    tm.assert_series_equal(df.isbig, df.expected, check_names=False)
+
+
+impure_params_uncorrelated = pytest.mark.parametrize(
+    "impure",
+    [
+        pytest.param(
+            lambda _: ibis.random(),
+            marks=[
+                *no_randoms,
+                pytest.mark.notyet(["impala"], reason="instances are correlated"),
+            ],
+            id="random",
+        ),
+        pytest.param(
+            # make this numeric (1 or 0) so we can compare to .5
+            lambda _: ibis.uuid().cast(str).contains("a").ifelse(1, 0),
+            marks=[
+                *no_uuids,
+                pytest.mark.notyet(["mysql"], reason="instances are correlated"),
+            ],
+            id="uuid",
+        ),
+        pytest.param(
+            lambda table: my_random(table.float_col),
+            marks=[
+                *no_udfs,
+                # no "impure" argument for pyspark yet
+                pytest.mark.notimpl("pyspark"),
+            ],
+            id="udf",
+        ),
+    ],
+)
+
+
+# You can work around this by doing .select().cache().select()
+@pytest.mark.notyet(["clickhouse"], reason="instances are correlated")
+@impure_params_uncorrelated
+def test_impure_uncorrelated_different_id(alltypes, impure):
+    # This is the opposite of test_impure_correlated.
+    # If we evaluate an impure expression twice in the same relation (as two columns),
+    # they should be uncorrelated.
+    # eg if you look at the following SQL:
+    # select random() as x, random() as y
+    # Then x and y should be uncorrelated.
+    expr = alltypes.select(x=impure(alltypes), y=impure(alltypes))
+    df = expr.execute()
+    assert (df.x != df.y).any()
+
+
+# You can work around this by doing .select().cache().select()
+@pytest.mark.notyet(["clickhouse"], reason="instances are correlated")
+@impure_params_uncorrelated
+def test_impure_uncorrelated_same_id(alltypes, impure):
+    # Similar to test_impure_uncorrelated_different_id, but the two expressions
+    # have the same ID. Still, they should be uncorrelated.
+    common = impure(alltypes)
+    expr = alltypes.select(x=common, y=common)
+    df = expr.execute()
+    assert (df.x != df.y).any()
+
+
+@pytest.mark.notyet(
+    [
+        "duckdb",
+        "clickhouse",
+        "datafusion",
+        "mysql",
+        "impala",
+        "mssql",
+        "trino",
+        "flink",
+        "bigquery",
+    ],
+    raises=AssertionError,
+    reason="instances are not correlated but ideally they would be",
+)
+@pytest.mark.notyet(
+    ["sqlite"],
+    raises=AssertionError,
+    reason="instances are *sometimes* correlated but ideally they would always be",
+    strict=False,
+)
+@pytest.mark.notimpl(
+    ["polars", "risingwave", "druid", "exasol", "oracle", "pyspark"],
+    raises=com.OperationNotDefinedError,
+)
+def test_self_join_with_generated_keys(con):
+    # Even with CTEs in the generated SQL, the backends still
+    # materialize a new value every time it is referenced.
+    # This isn't ideal behavior, but there is nothing we can do about it
+    # on the ibis side. The best you can do is to .cache() the table
+    # right after you assign the uuid().
+    # https://github.com/ibis-project/ibis/pull/9014#issuecomment-2399449665
+    left = ibis.memtable({"idx": list(range(5))}).mutate(key=ibis.uuid())
+    right = left.filter(left.idx < 3)
+    expr = left.join(right, "key")
+    result = con.execute(expr.count())
+    assert result == 3
diff --git a/ibis/expr/decompile.py b/ibis/expr/decompile.py
@@ -38,7 +38,6 @@
     ops.StringContains: "contains",
     ops.StringSQLILike: "ilike",
     ops.StringSQLLike: "like",
-    ops.TimestampNow: "now",
 }
 
 
@@ -84,6 +83,11 @@ def translate(op, *args, **kwargs):
     raise NotImplementedError(op)
 
 
+@translate.register(ops.TimestampNow)
+def now(_):
+    return "ibis.now()"
+
+
 @translate.register(ops.Value)
 def value(op, *args, **kwargs):
     method = _get_method_name(op)

diff --git a/ibis/expr/operations/generic.py b/ibis/expr/operations/generic.py
@@ -189,17 +189,19 @@ class Impure(Value):
 
 
 @public
-class TimestampNow(Constant):
+class TimestampNow(Impure):
     """Return the current timestamp."""
 
     dtype = dt.timestamp
+    shape = ds.scalar
 
 
 @public
-class DateNow(Constant):
+class DateNow(Impure):
     """Return the current date."""
 
     dtype = dt.date
+    shape = ds.scalar
 
 
 @public

diff --git a/ibis/expr/operations/udf.py b/ibis/expr/operations/udf.py
@@ -51,7 +51,7 @@ class InputType(enum.Enum):
 
 
 @public
-class ScalarUDF(ops.Value):
+class ScalarUDF(ops.Impure):
     @attribute
     def shape(self):
         if not (args := getattr(self, "args")):  # noqa: B009
@@ -65,7 +65,7 @@ def shape(self):
 
 
 @public
-class AggUDF(ops.Reduction):
+class AggUDF(ops.Reduction, ops.Impure):
     where: Optional[ops.Value[dt.Boolean]] = None