Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix issues with step conversion times #5174

Merged
merged 12 commits into from
Jul 22, 2021
8 changes: 8 additions & 0 deletions ee/clickhouse/queries/funnels/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -312,6 +312,14 @@ def _get_count_columns(self, max_steps: int):

return ", ".join(cols)

def _get_step_time_names(self, max_steps: int):
names = []
for i in range(1, max_steps):
names.append(f"step_{i}_average_conversion_time")

formatted = ",".join(names)
return f", {formatted}" if formatted else ""

def _get_step_time_avgs(self, max_steps: int):
conditions: List[str] = []
for i in range(1, max_steps):
Expand Down
11 changes: 8 additions & 3 deletions ee/clickhouse/queries/funnels/funnel.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,9 +52,14 @@ def get_step_counts_query(self):
max_steps = len(self._filter.entities)
breakdown_clause = self._get_breakdown_prop()

return f"""SELECT person_id, max(steps) AS steps {self._get_step_time_avgs(max_steps)} {breakdown_clause} FROM (
{steps_per_person_query}
) GROUP BY person_id {breakdown_clause}
return f"""
SELECT person_id, steps {self._get_step_time_avgs(max_steps)} {breakdown_clause} FROM (
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

When some sequence of steps is a substring of itself, this previously led to incorrect results (since you want to average out times to convert of only the most-completed attempts)

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This wasn't fully clear^

wrote a new test to explain this: test_funnel_with_multiple_incomplete_tries - when you do multiple incomplete tries, and multiple complete tries, only take into account the complete tries, since those are the ones that get counted (max(steps))

SELECT person_id, steps, max(steps) over (PARTITION BY person_id {breakdown_clause}) as max_steps {self._get_step_time_names(max_steps)} {breakdown_clause} FROM (
{steps_per_person_query}
)
) GROUP BY person_id, steps {breakdown_clause}
HAVING steps = max_steps
SETTINGS allow_experimental_window_functions = 1
"""

def _format_results(self, results):
Expand Down
12 changes: 7 additions & 5 deletions ee/clickhouse/queries/funnels/funnel_strict.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,15 +17,17 @@ def get_query(self):

def get_step_counts_query(self):

steps_per_person_query = self.get_step_counts_without_aggregation_query()
max_steps = len(self._filter.entities)

formatted_query = self.get_step_counts_without_aggregation_query()
breakdown_clause = self._get_breakdown_prop()

return f"""
SELECT person_id, max(steps) AS steps {self._get_step_time_avgs(max_steps)} {breakdown_clause} FROM (
{formatted_query}
) GROUP BY person_id {breakdown_clause}
SELECT person_id, steps {self._get_step_time_avgs(max_steps)} {breakdown_clause} FROM (
SELECT person_id, steps, max(steps) over (PARTITION BY person_id {breakdown_clause}) as max_steps {self._get_step_time_names(max_steps)} {breakdown_clause} FROM (
{steps_per_person_query}
)
) GROUP BY person_id, steps {breakdown_clause}
HAVING steps = max_steps
"""

def get_step_counts_without_aggregation_query(self):
Expand Down
8 changes: 0 additions & 8 deletions ee/clickhouse/queries/funnels/funnel_unordered.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,14 +81,6 @@ def get_step_counts_without_aggregation_query(self):

return " UNION ALL ".join(union_queries)

def _get_step_time_names(self, max_steps: int):
names = []
for i in range(1, max_steps):
names.append(f"step_{i}_average_conversion_time")

formatted = ",".join(names)
return f", {formatted}" if formatted else ""

def _get_step_times(self, max_steps: int):
conditions: List[str] = []

Expand Down
22 changes: 18 additions & 4 deletions ee/clickhouse/queries/funnels/test/test_funnel.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from ee.clickhouse.client import sync_execute
from ee.clickhouse.models.event import create_event
from ee.clickhouse.queries.funnels.funnel import ClickhouseFunnel
from ee.clickhouse.queries.funnels.funnel_event_query import FunnelEventQuery
from ee.clickhouse.queries.funnels.funnel_persons import ClickhouseFunnelPersons
from ee.clickhouse.queries.funnels.test.breakdown_cases import funnel_breakdown_test_factory
from ee.clickhouse.util import ClickhouseTestMixin
Expand All @@ -15,6 +16,7 @@
from posthog.models.filters import Filter
from posthog.models.person import Person
from posthog.queries.test.test_funnel import funnel_test_factory
from posthog.settings import SHELL_PLUS_PRINT_SQL

FORMAT_TIME = "%Y-%m-%d 00:00:00"
MAX_STEP_COLUMN = 0
Expand Down Expand Up @@ -97,21 +99,33 @@ def test_basic_funnel_with_repeat_steps(self):
],
"insight": INSIGHT_FUNNELS,
"funnel_window_days": 14,
"date_from": "2020-01-01",
"date_to": "2020-01-14",
}

filter = Filter(data=filters)
funnel = ClickhouseFunnel(filter, self.team)

# event
person1_stopped_after_two_signups = _create_person(distinct_ids=["stopped_after_signup1"], team_id=self.team.pk)
_create_event(team=self.team, event="user signed up", distinct_id="stopped_after_signup1")
_create_event(team=self.team, event="user signed up", distinct_id="stopped_after_signup1")
_create_event(
team=self.team, event="user signed up", distinct_id="stopped_after_signup1", timestamp="2020-01-02T14:00:00"
)
_create_event(
team=self.team, event="user signed up", distinct_id="stopped_after_signup1", timestamp="2020-01-03T14:00:00"
)

person2_stopped_after_signup = _create_person(distinct_ids=["stopped_after_signup2"], team_id=self.team.pk)
_create_event(team=self.team, event="user signed up", distinct_id="stopped_after_signup2")
_create_event(
team=self.team, event="user signed up", distinct_id="stopped_after_signup2", timestamp="2020-01-02T14:00:00"
)

with self.settings(DEBUG=True):
event_query, params = FunnelEventQuery(filter=filter, team_id=self.team.pk).get_query()
print(sync_execute(event_query, params))
print(sync_execute(funnel.get_step_counts_query(), funnel.params))
result = funnel.run()

print(result)
self.assertEqual(result[0]["name"], "user signed up")
self.assertEqual(result[0]["count"], 2)
self.assertEqual(len(result[0]["people"]), 2)
Expand Down
55 changes: 55 additions & 0 deletions ee/clickhouse/queries/funnels/test/test_funnel_time_to_convert.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
from collections import defaultdict
from uuid import uuid4

from ee.clickhouse.client import sync_execute
from ee.clickhouse.models.event import create_event
from ee.clickhouse.queries.funnels import ClickhouseFunnel, ClickhouseFunnelStrict, ClickhouseFunnelUnordered
from ee.clickhouse.queries.funnels.funnel_time_to_convert import ClickhouseFunnelTimeToConvert
Expand Down Expand Up @@ -78,6 +80,59 @@ def test_auto_bin_count_single_step(self):
},
)

def test_auto_bin_count_single_step_duplicate_events(self):
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This demonstrates the CH bug, will probably move this into a separate PR, since this edge case will remain flakey till its fixed upstream.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thinking of removing these from consideration in our time to convert funnels as a patch.

_create_person(distinct_ids=["user a"], team=self.team)
_create_person(distinct_ids=["user b"], team=self.team)
_create_person(distinct_ids=["user c"], team=self.team)

_create_event(event="step one", distinct_id="user a", team=self.team, timestamp="2021-06-08 18:00:00")
_create_event(event="step one", distinct_id="user a", team=self.team, timestamp="2021-06-08 19:00:00")
# Converted from 0 to 1 in 3600 s
_create_event(event="step one", distinct_id="user a", team=self.team, timestamp="2021-06-08 21:00:00")

_create_event(event="step one", distinct_id="user b", team=self.team, timestamp="2021-06-09 13:00:00")
_create_event(event="step one", distinct_id="user b", team=self.team, timestamp="2021-06-09 13:37:00")
# Converted from 0 to 1 in 2200 s

_create_event(event="step one", distinct_id="user c", team=self.team, timestamp="2021-06-11 07:00:00")
_create_event(event="step one", distinct_id="user c", team=self.team, timestamp="2021-06-12 06:00:00")
# Converted from 0 to 1 in 82_800 s

from django.core.cache import cache

cache.clear()

filter = Filter(
data={
"insight": INSIGHT_FUNNELS,
"interval": "day",
"date_from": "2021-06-07 00:00:00",
"date_to": "2021-06-13 23:59:59",
"funnel_from_step": 0,
"funnel_to_step": 1,
"funnel_window_days": 7,
"events": [
{"id": "step one", "order": 0},
{"id": "step one", "order": 1},
{"id": "step one", "order": 2},
],
}
)

funnel_trends = ClickhouseFunnelTimeToConvert(filter, self.team, ClickhouseFunnel)
results = funnel_trends.run()

# Autobinned using the minimum time to convert, maximum time to convert, and sample count
self.assertEqual(
results,
[
(2220.0, 2), # Reached step 1 from step 0 in at least 2200 s but less than 29_080 s - users A and B
(29080.0, 0), # Analogous to above, just an interval (in this case 26_880 s) up - no users
(55940.0, 0), # Same as above
(82800.0, 1), # Reached step 1 from step 0 in at least 82_800 s but less than 109_680 s - user C
],
)

def test_custom_bin_count_single_step(self):
_create_person(distinct_ids=["user a"], team=self.team)
_create_person(distinct_ids=["user b"], team=self.team)
Expand Down