From 94377a45cc0f8b49acbf27c6821cc6daacdd07e7 Mon Sep 17 00:00:00 2001 From: ankushduacodes <61025943+ankushduacodes@users.noreply.github.com> Date: Tue, 10 Nov 2020 17:39:46 +0530 Subject: [PATCH 1/8] adding new manager_param -> memery_watchdog --- automation/default_manager_params.json | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/automation/default_manager_params.json b/automation/default_manager_params.json index e92cdae55..e3e7b8926 100644 --- a/automation/default_manager_params.json +++ b/automation/default_manager_params.json @@ -7,5 +7,6 @@ "failure_limit": null, "testing": false, "s3_bucket": null, - "s3_directory": null + "s3_directory": null, + "memory_watchdog": false } From 3d255a46f36d0006f9ee8855624b01b5bb6ab9b5 Mon Sep 17 00:00:00 2001 From: ankushduacodes <61025943+ankushduacodes@users.noreply.github.com> Date: Tue, 10 Nov 2020 17:40:58 +0530 Subject: [PATCH 2/8] adding check to see memory_watchdog is set or not --- automation/TaskManager.py | 50 ++++++++++++++++++++------------------- 1 file changed, 26 insertions(+), 24 deletions(-) diff --git a/automation/TaskManager.py b/automation/TaskManager.py index dea7ffa0e..850f092ce 100644 --- a/automation/TaskManager.py +++ b/automation/TaskManager.py @@ -30,6 +30,7 @@ BROWSER_MEMORY_LIMIT = 1500 # in MB AGGREGATOR_QUEUE_LIMIT = 10000 # number of records in the queue +MEMORY_WATCHDOG = "memory_watchdog" def load_default_params( @@ -224,30 +225,31 @@ def _manager_watchdog(self) -> None: time.sleep(10) # Check browser memory usage - for browser in self.browsers: - try: - # Sum the memory used by the geckodriver process, the - # main Firefox process and all its child processes. - # Use the USS metric for child processes, to avoid - # double-counting memory shared with their parent. 
- geckodriver = psutil.Process(browser.geckodriver_pid) - mem_bytes = geckodriver.memory_info().rss - children = geckodriver.children() - if children: - firefox = children[0] - mem_bytes += firefox.memory_info().rss - for child in firefox.children(): - mem_bytes += child.memory_full_info().uss - mem = mem_bytes / 2 ** 20 - if mem > BROWSER_MEMORY_LIMIT: - self.logger.info( - "BROWSER %i: Memory usage: %iMB" - ", exceeding limit of %iMB" - % (browser.browser_id, int(mem), BROWSER_MEMORY_LIMIT) - ) - browser.restart_required = True - except psutil.NoSuchProcess: - pass + if self.manager_params[MEMORY_WATCHDOG]: + for browser in self.browsers: + try: + # Sum the memory used by the geckodriver process, the + # main Firefox process and all its child processes. + # Use the USS metric for child processes, to avoid + # double-counting memory shared with their parent. + geckodriver = psutil.Process(browser.geckodriver_pid) + mem_bytes = geckodriver.memory_info().rss + children = geckodriver.children() + if children: + firefox = children[0] + mem_bytes += firefox.memory_info().rss + for child in firefox.children(): + mem_bytes += child.memory_full_info().uss + mem = mem_bytes / 2 ** 20 + if mem > BROWSER_MEMORY_LIMIT: + self.logger.info( + "BROWSER %i: Memory usage: %iMB" + ", exceeding limit of %iMB" + % (browser.browser_id, int(mem), BROWSER_MEMORY_LIMIT) + ) + browser.restart_required = True + except psutil.NoSuchProcess: + pass # Check for browsers or displays that were not closed correctly # 300 second buffer to avoid killing freshly launched browsers From ba90c6dce8b9ed35cd95a9b0eeaad88c93edf1e6 Mon Sep 17 00:00:00 2001 From: ankushduacodes <61025943+ankushduacodes@users.noreply.github.com> Date: Tue, 10 Nov 2020 17:39:46 +0530 Subject: [PATCH 3/8] adding new manager_param -> memery_watchdog --- automation/default_manager_params.json | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/automation/default_manager_params.json 
b/automation/default_manager_params.json index e92cdae55..e3e7b8926 100644 --- a/automation/default_manager_params.json +++ b/automation/default_manager_params.json @@ -7,5 +7,6 @@ "failure_limit": null, "testing": false, "s3_bucket": null, - "s3_directory": null + "s3_directory": null, + "memory_watchdog": false } From 2641fa30f03c15b8c5b89c1e035d108e607f0625 Mon Sep 17 00:00:00 2001 From: ankushduacodes <61025943+ankushduacodes@users.noreply.github.com> Date: Tue, 10 Nov 2020 17:40:58 +0530 Subject: [PATCH 4/8] adding check to see memory_watchdog is set or not --- automation/TaskManager.py | 50 ++++++++++++++++++++------------------- 1 file changed, 26 insertions(+), 24 deletions(-) diff --git a/automation/TaskManager.py b/automation/TaskManager.py index dea7ffa0e..850f092ce 100644 --- a/automation/TaskManager.py +++ b/automation/TaskManager.py @@ -30,6 +30,7 @@ BROWSER_MEMORY_LIMIT = 1500 # in MB AGGREGATOR_QUEUE_LIMIT = 10000 # number of records in the queue +MEMORY_WATCHDOG = "memory_watchdog" def load_default_params( @@ -224,30 +225,31 @@ def _manager_watchdog(self) -> None: time.sleep(10) # Check browser memory usage - for browser in self.browsers: - try: - # Sum the memory used by the geckodriver process, the - # main Firefox process and all its child processes. - # Use the USS metric for child processes, to avoid - # double-counting memory shared with their parent. 
- geckodriver = psutil.Process(browser.geckodriver_pid) - mem_bytes = geckodriver.memory_info().rss - children = geckodriver.children() - if children: - firefox = children[0] - mem_bytes += firefox.memory_info().rss - for child in firefox.children(): - mem_bytes += child.memory_full_info().uss - mem = mem_bytes / 2 ** 20 - if mem > BROWSER_MEMORY_LIMIT: - self.logger.info( - "BROWSER %i: Memory usage: %iMB" - ", exceeding limit of %iMB" - % (browser.browser_id, int(mem), BROWSER_MEMORY_LIMIT) - ) - browser.restart_required = True - except psutil.NoSuchProcess: - pass + if self.manager_params[MEMORY_WATCHDOG]: + for browser in self.browsers: + try: + # Sum the memory used by the geckodriver process, the + # main Firefox process and all its child processes. + # Use the USS metric for child processes, to avoid + # double-counting memory shared with their parent. + geckodriver = psutil.Process(browser.geckodriver_pid) + mem_bytes = geckodriver.memory_info().rss + children = geckodriver.children() + if children: + firefox = children[0] + mem_bytes += firefox.memory_info().rss + for child in firefox.children(): + mem_bytes += child.memory_full_info().uss + mem = mem_bytes / 2 ** 20 + if mem > BROWSER_MEMORY_LIMIT: + self.logger.info( + "BROWSER %i: Memory usage: %iMB" + ", exceeding limit of %iMB" + % (browser.browser_id, int(mem), BROWSER_MEMORY_LIMIT) + ) + browser.restart_required = True + except psutil.NoSuchProcess: + pass # Check for browsers or displays that were not closed correctly # 300 second buffer to avoid killing freshly launched browsers From c193ec7f1df7121bd17a8f78d4464a8ada184ea2 Mon Sep 17 00:00:00 2001 From: ankushduacodes <61025943+ankushduacodes@users.noreply.github.com> Date: Tue, 10 Nov 2020 18:23:12 +0530 Subject: [PATCH 5/8] set memory_watchdog --- demo.py | 1 + 1 file changed, 1 insertion(+) diff --git a/demo.py b/demo.py index 821b9e135..bfb5e1665 100644 --- a/demo.py +++ b/demo.py @@ -34,6 +34,7 @@ # Update TaskManager configuration (use this 
for crawl-wide settings) manager_params["data_directory"] = "~/Desktop/" manager_params["log_directory"] = "~/Desktop/" +manager_params["memory_watchdog"] = True # Instantiates the measurement platform # Commands time out by default after 60 seconds From bb311ecc1e312bbddb739b95a2c3294706172d51 Mon Sep 17 00:00:00 2001 From: ankushduacodes <61025943+ankushduacodes@users.noreply.github.com> Date: Tue, 10 Nov 2020 18:59:55 +0530 Subject: [PATCH 6/8] adding memory_watchdog documentation --- docs/Configuration.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/docs/Configuration.md b/docs/Configuration.md index 7241b2285..4355a87e1 100644 --- a/docs/Configuration.md +++ b/docs/Configuration.md @@ -63,6 +63,9 @@ of configuration dictionaries. on-the-fly. Depending on where you would like to add test functionality, you may need to propagate the flag. * This is not something you should enable during normal crawls. +* `memory_watchdog` + * A watchdog that tries to ensure that no Firefox instance takes up too much memory. It is set to false by default. + * It is mostly useful for long-running cloud crawls. # Browser Configuration Options From eec94bff25369a4a74a61778589533ef50d8173b Mon Sep 17 00:00:00 2001 From: ankushduacodes <61025943+ankushduacodes@users.noreply.github.com> Date: Tue, 10 Nov 2020 19:00:38 +0530 Subject: [PATCH 7/8] adding OpenWPM watchdogs documentation --- docs/Platform-Architecture.md | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/docs/Platform-Architecture.md b/docs/Platform-Architecture.md index 754860324..6797dda34 100644 --- a/docs/Platform-Architecture.md +++ b/docs/Platform-Architecture.md @@ -20,7 +20,13 @@ to get the default parameters. 
To learn more about the `manager_params` and `browser_params` have a look at [Configuration.md](Configuration.md) -`` is an optional parameter that can be passed to the `TaskManager` to create another thread that kills off all processes named `Xvfb` or `firefox` that haven't been spawned by OpenWPM. +## Watchdogs +In OpenWPM we have a so-called watchdog that tries to ensure two things. +- `process_watchdog` + * It is an optional parameter that can be passed to the `TaskManager` to create another thread that kills off `GeckoDriver` (or `Xvfb`) instances that haven't been spawned by OpenWPM. (GeckoDriver is used by Selenium to control Firefox, and Xvfb is a "virtual display" so we simulate having graphics when running on a server.) +- `memory_watchdog` + * A watchdog that tries to ensure that no Firefox instance takes up too much memory. It is set to false by default. + * It is mostly useful for long-running cloud crawls. ## Issuing commands From 51da058369c4b8598e59467dab5254c108774bd1 Mon Sep 17 00:00:00 2001 From: ankushduacodes <61025943+ankushduacodes@users.noreply.github.com> Date: Tue, 10 Nov 2020 19:12:05 +0530 Subject: [PATCH 8/8] chnge debug msg to refect current state of OpenWPM --- automation/TaskManager.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/automation/TaskManager.py b/automation/TaskManager.py index 850f092ce..1f10531ac 100644 --- a/automation/TaskManager.py +++ b/automation/TaskManager.py @@ -275,9 +275,9 @@ def _manager_watchdog(self) -> None: ) ): self.logger.debug( - "Process: %s (pid: %i) with start " - "time %s found running but not in " - "browser process list. Killing." + "Process %s (pid: %i) with start " + "time %s isn't controlled by any BrowserManager." + "Killing it now." % (process.name(), process.pid, process.create_time()) ) kill_process_and_children(process, self.logger)