From 768d52bcb3e0c0f9d3a55706431670db102489d2 Mon Sep 17 00:00:00 2001 From: Matthew Hodgson Date: Tue, 15 Jan 2019 21:07:12 +0000 Subject: [PATCH 1/5] limit remote device lists to 1000 entries per user --- synapse/handlers/device.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/synapse/handlers/device.py b/synapse/handlers/device.py index 9e017116a904..6f80a7dce97a 100644 --- a/synapse/handlers/device.py +++ b/synapse/handlers/device.py @@ -532,6 +532,20 @@ def _handle_device_updates(self, user_id): stream_id = result["stream_id"] devices = result["devices"] + + # Emergency hack to prevent DoS from + # @bot:oliviervandertoorn.nl and @bot:matrix-beta.igalia.com + # on Jan 15 2019: only store the most recent 1000 devices for + # a given user. (We assume we receive them in chronological + # order, which is dubious given _get_e2e_device_keys_txn does + # not explicitly order its results). Otherwise it can take + # longer than 60s to persist the >100K devices, at which point + # the internal replication request to handle the + # m.device_list_update EDU times out, causing the remote + # server to retry the transaction and thus DoS synapse master + # CPU and DB. + devices = devices[-1000:] + yield self.store.update_remote_device_list_cache( user_id, devices, stream_id, ) From 8ff9c86d1e0dde6eaf27f361824b55729703fad8 Mon Sep 17 00:00:00 2001 From: Matthew Hodgson Date: Tue, 15 Jan 2019 21:38:07 +0000 Subject: [PATCH 2/5] don't store remote device lists if they have more than 10K devices --- synapse/handlers/device.py | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/synapse/handlers/device.py b/synapse/handlers/device.py index 6f80a7dce97a..5bca62418ef7 100644 --- a/synapse/handlers/device.py +++ b/synapse/handlers/device.py @@ -533,18 +533,19 @@ def _handle_device_updates(self, user_id): stream_id = result["stream_id"] devices = result["devices"] - # Emergency hack to prevent DoS from - # @bot:oliviervandertoorn.nl and @bot:matrix-beta.igalia.com - # on Jan 15 2019: only store the most recent 1000 devices for - # a given user. (We assume we receive them in chronological - # order, which is dubious given _get_e2e_device_keys_txn does - # not explicitly order its results). Otherwise it can take - # longer than 60s to persist the >100K devices, at which point - # the internal replication request to handle the - # m.device_list_update EDU times out, causing the remote - # server to retry the transaction and thus DoS synapse master - # CPU and DB. - devices = devices[-1000:] + # If the remote server has more than ~10000 devices for this user + # we assume that something is going horribly wrong (e.g. a bot + # that logs in and creates a new device every time it tries to + # send a message). Maintaining lots of devices per user in the + # cache can cause serious performance issues as if this request + # takes more than 60s to complete, internal replication from the + # inbound federation worker to the synapse master may time out + # causing the inbound federation to fail and causing the remote + # server to retry, causing a DoS. So in this scenario we give + # up on storing the total list of devices and only handle the + # delta instead. + if len(devices) > 10000: + devices = [] yield self.store.update_remote_device_list_cache( user_id, devices, stream_id, From 36e30403aeba3e78a11f4704eade6e70ff418921 Mon Sep 17 00:00:00 2001 From: Matthew Hodgson Date: Tue, 15 Jan 2019 21:46:29 +0000 Subject: [PATCH 3/5] drop the limit to 1K as e2e will be hosed beyond that point anyway --- synapse/handlers/device.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/synapse/handlers/device.py b/synapse/handlers/device.py index 5bca62418ef7..a93dfd1d631f 100644 --- a/synapse/handlers/device.py +++ b/synapse/handlers/device.py @@ -533,7 +533,7 @@ def _handle_device_updates(self, user_id): stream_id = result["stream_id"] devices = result["devices"] - # If the remote server has more than ~10000 devices for this user + # If the remote server has more than ~1000 devices for this user # we assume that something is going horribly wrong (e.g. a bot # that logs in and creates a new device every time it tries to # send a message). Maintaining lots of devices per user in the @@ -544,7 +544,7 @@ def _handle_device_updates(self, user_id): # server to retry, causing a DoS. So in this scenario we give # up on storing the total list of devices and only handle the # delta instead. - if len(devices) > 10000: + if len(devices) > 1000: devices = [] yield self.store.update_remote_device_list_cache( From aa70be22a3145e0caf56bdec64b1b7a762348a32 Mon Sep 17 00:00:00 2001 From: Richard van der Hoff Date: Tue, 15 Jan 2019 22:05:05 +0000 Subject: [PATCH 4/5] changelog --- changelog.d/4397.bugfix | 1 + 1 file changed, 1 insertion(+) create mode 100644 changelog.d/4397.bugfix diff --git a/changelog.d/4397.bugfix b/changelog.d/4397.bugfix new file mode 100644 index 000000000000..e7526d4454bd --- /dev/null +++ b/changelog.d/4397.bugfix @@ -0,0 +1 @@ +Fix high CPU usage due to remote devicelist updates From 3b31303f3eb49c53926365cfdc52ce884e60615c Mon Sep 17 00:00:00 2001 From: Matthew Hodgson Date: Tue, 15 Jan 2019 22:10:44 +0000 Subject: [PATCH 5/5] warn if we ignore device lists --- synapse/handlers/device.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/synapse/handlers/device.py b/synapse/handlers/device.py index a93dfd1d631f..8955cde4ed35 100644 --- a/synapse/handlers/device.py +++ b/synapse/handlers/device.py @@ -545,6 +545,10 @@ def _handle_device_updates(self, user_id): # up on storing the total list of devices and only handle the # delta instead. if len(devices) > 1000: + logger.warn( + "Ignoring device list snapshot for %s as it has >1K devs (%d)", + user_id, len(devices) + ) devices = [] yield self.store.update_remote_device_list_cache(