diff --git a/queue_job/data/queue_data.xml b/queue_job/data/queue_data.xml
index ca5a747746..754f9316ff 100644
--- a/queue_job/data/queue_data.xml
+++ b/queue_job/data/queue_data.xml
@@ -10,6 +10,15 @@
code
model.requeue_stuck_jobs()
+
+ Take care of unresponsive jobs
+ 15
+ minutes
+ -1
+
+ code
+ model.fail_dead_jobs(240)
+
Job failed
diff --git a/queue_job/models/queue_job.py b/queue_job/models/queue_job.py
index 03aa60d60d..3e5710ebfb 100644
--- a/queue_job/models/queue_job.py
+++ b/queue_job/models/queue_job.py
@@ -418,6 +418,61 @@ def requeue_stuck_jobs(self, enqueued_delta=5, started_delta=0):
).requeue()
return True
+ def fail_dead_jobs(self, started_delta, force_low_delta=False):
+ """Set as failed job started since too long ago.
+
+ Workers can die without anyone noticing.
+ Dead workers clog the channel and cause
+ starvation.
+
+ This function marks jobs started a long time
+ ago as failed.
+
+ Cause of death can be CPU Time limit reached
+ a SIGTERM, a power shortage, we can't know, etc.
+
+ This mechanism should be very exceptional.
+ It may help, for instance, if someone forgot to
+ configure their system properly.
+
+ :param started_delta: lookup time in minutes for jobs
+ that are in the started state.
+
+ :param force_low_delta: allow a started_delta of less than 10min
+ — use only if you know what you are doing.
+ """
+ now = fields.datetime.now()
+ started_dl = now - timedelta(minutes=started_delta)
+ if started_delta <= 10 and not force_low_delta:
+ raise exceptions.ValidationError(
+ _(
+ "started_delta is too low. Set at least 10min"
+ " or set argument force_low_delta=True"
+ )
+ )
+ domain = [
+ "&",
+ ("date_started", "<=", fields.Datetime.to_string(started_dl)),
+ ("state", "=", "started"),
+ ]
+ job_model = self.env["queue.job"]
+ stuck_jobs = job_model.search(domain)
+ msg = {
+ "exc_info": "",
+ "exc_name": "Not responding worker. Is it dead ?",
+ "exc_message": (
+ "Check for odoo.service.server logs."
+ "Investigate logs for CPU time limit reached or check system log"
+ ),
+ }
+ for job in stuck_jobs:
+ # TODO: manage retry:
+ # if retry < max_retry: retry += 1 and enqueue job instead
+ # else: set_failed
+ job_ = Job.load(self.env, job.uuid)
+ job_.set_failed(**msg)
+ job_.store()
+
def _get_stuck_jobs_domain(self, queue_dl, started_dl):
domain = []
now = fields.datetime.now()
diff --git a/queue_job/readme/CONTRIBUTORS.rst b/queue_job/readme/CONTRIBUTORS.rst
index 4b34823abe..337601843a 100644
--- a/queue_job/readme/CONTRIBUTORS.rst
+++ b/queue_job/readme/CONTRIBUTORS.rst
@@ -10,3 +10,4 @@
* Souheil Bejaoui
* Eric Antones
* Simone Orsi
+* Raphaƫl Reverdy