
[14.0][IMP] queue_job: add cron to purge dead jobs. #653

Open · wants to merge 3 commits into base: 14.0
9 changes: 9 additions & 0 deletions queue_job/data/queue_data.xml
@@ -10,6 +10,15 @@
<field name="state">code</field>
<field name="code">model.requeue_stuck_jobs()</field>
</record>
<record id="ir_cron_queue_job_fail_dead_jobs" model="ir.cron">
<field name="name">Take care of unresponsive jobs</field>
Contributor:

While thinking of a better name for this cron, I wonder... why don't we add another method and use only one cron?

E.g.:

<field name="code">model.gc_stuck_jobs()</field>
[...]

def gc_stuck_jobs(self, started_delta=None, force_low_delta=False):
    self.requeue_stuck_jobs()
    if started_delta:
        self.gc_dead_jobs(started_delta, force_low_delta=force_low_delta)

WDYT?

<field name="interval_number">15</field>
<field name="interval_type">minutes</field>
<field name="numbercall">-1</field>
<field name="model_id" ref="model_queue_job" />
<field name="state">code</field>
<field name="code">model.fail_dead_jobs(240)</field>
Contributor:

I think it would be nice to add advice somewhere on how to choose this value, for anyone who wants to adapt it to their configuration.
In the README or in the method description? The ideal value would be the cpu_time limit from the queue job server config, right?
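
One way to pick the value, sketched here only as an illustration: the limit_time_cpu / limit_time_real keys are Odoo's standard server options, while the helper name and the margin factor below are made up, so treat this as a starting point rather than anything the module provides.

from odoo.tools import config

def _suggested_started_delta(margin_factor=2):
    # A job that is genuinely still running cannot stay in the "started"
    # state much longer than the real-time limit, so anything well past
    # that limit is probably orphaned. Config values are in seconds,
    # the cron argument is in minutes, and 10 is the method's safety floor.
    limit_real = config.get("limit_time_real") or 120
    return max(10, int(limit_real / 60 * margin_factor))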

Contributor:

I agree. I think a comment here in the code would be nice too 😉

</record>
<!-- Queue-job-related subtypes for messaging / Chatter -->
<record id="mt_job_failed" model="mail.message.subtype">
<field name="name">Job failed</field>
55 changes: 55 additions & 0 deletions queue_job/models/queue_job.py
@@ -418,6 +418,61 @@
).requeue()
return True

def fail_dead_jobs(self, started_delta, force_low_delta=False):
Contributor:

Not a required change, but I think it would be reasonable to set force_low_delta=True here to turn your safety check on by default. I'm going to override

            <field name="code">model.fail_dead_jobs(240)</field>

anyway to significantly decrease the started_delta for my own purposes, so it will be no inconvenience to me to also disable that check.

Contributor (author):

It's the opposite; you will have to change the cron to something like:

model.fail_dead_jobs(5, force_low_delta=True)

BTW, why are you planning to put a low value here? What's your use case?

Contributor:

> BTW, why are you planning to put a low value here? What's your use case?

I'm impatient. 😉 I don't use a dedicated queue_job server, so all my jobs run within the default 60/120 CPU/real time limits. Ten minutes is an eternity. 😉 I also plan to decrease the Dead Job cron interval.

Contributor:

Suggested change:
- def fail_dead_jobs(self, started_delta, force_low_delta=False):
+ def gc_dead_jobs(self, started_delta, force_low_delta=False):

"""Set as failed job started since too long ago.

Workers can be dead without anyone noticing
Dead workers stuck the channel and provoke
famine.

This function, mark jobs started longtime ago
as failed.

Cause of death can be CPU Time limit reached
a SIGTERM, a power shortage, we can't know, etc.

This mechanism should be very exceptionnal.
It may help, for instance, if someone forget to configure
properly his system.
Comment on lines +428 to +436
Contributor:

Suggested change:
- This function, mark jobs started longtime ago
- as failed.
- Cause of death can be CPU Time limit reached
- a SIGTERM, a power shortage, we can't know, etc.
- This mechanism should be very exceptionnal.
- It may help, for instance, if someone forget to configure
- properly his system.
+ This function marks jobs started longtime ago
+ as failed.
+ Cause of death can be CPU Time limit reached
+ a SIGTERM, a power shortage, etc. We can't know.
+ This mechanism should be very exceptional.
+ It may help, for instance, if someone forgot to configure
+ properly his system.


:param started_delta: lookup time in minutes for jobs
that are in the started state

:param force_low_delta: allow a started_delta below 10 minutes
(use only if you know what you are doing)
"""
now = fields.datetime.now()
started_dl = now - timedelta(minutes=started_delta)

Contributor:

Suggested change:
- started_dl = now - timedelta(minutes=started_delta)
+ started_dl = fields.Datetime.subtract(now, minutes=started_delta)
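
For context, fields.Datetime.subtract is the ORM helper that wraps relativedelta, so for plain minute offsets on naive datetimes both forms should give the same result; the snippet below is illustrative only and assumes the standard Odoo 14 fields API.

from datetime import timedelta
from odoo import fields

now = fields.Datetime.now()
# Both expressions subtract 240 minutes from the same naive datetime;
# the helper simply keeps the arithmetic inside the ORM's own API.
assert now - timedelta(minutes=240) == fields.Datetime.subtract(now, minutes=240)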

if started_delta <= 10 and not force_low_delta:
raise exceptions.ValidationError(

_(
"started_delta is too low. Set at least 10min"
" or set argument force_low_delta=True"
)
)
domain = [

"&",
("date_started", "<=", fields.Datetime.to_string(started_dl)),
("state", "=", "started"),
]
job_model = self.env["queue.job"]
stuck_jobs = job_model.search(domain)
msg = {

"exc_info": "",
"exc_name": "Not responding worker. Is it dead ?",
"exc_message": (
"Check for odoo.service.server logs."
"Investigate logs for CPU time limit reached or check system log"
),
}
for job in stuck_jobs:
# TODO: manage retry:
# if retry < max_retry: retry=+1 and enqueue job instead
# else: set_failed
job_ = Job.load(self.env, job.uuid)
job_.set_failed(**msg)
job_.store()

def _get_stuck_jobs_domain(self, queue_dl, started_dl):
domain = []
now = fields.datetime.now()
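
The TODO in the loop above hints at re-enqueueing a dead job while it still has retries left instead of failing it outright. A rough sketch of that idea, hedged: it assumes the Job class exposes retry, max_retries and set_pending(reset_retry=...) as used elsewhere in queue_job, and the helper name is made up for illustration.

def _fail_or_retry_dead_job(self, job_record, msg):
    # Illustrative only: put the job back in the queue while retries
    # remain, otherwise mark it as failed, mirroring the TODO above.
    job_ = Job.load(self.env, job_record.uuid)
    if job_.max_retries and job_.retry < job_.max_retries:
        job_.retry += 1
        job_.set_pending(reset_retry=False)
    else:
        job_.set_failed(**msg)
    job_.store()
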
1 change: 1 addition & 0 deletions queue_job/readme/CONTRIBUTORS.rst
@@ -10,3 +10,4 @@
* Souheil Bejaoui <[email protected]>
* Eric Antones <[email protected]>
* Simone Orsi <[email protected]>
* Raphaël Reverdy <[email protected]>