Skip to content

Commit c6c9349

Browse files
committed
Merge PR #872 into 19.0
Signed-off-by sbidoul
2 parents 1f23acd + 16bb31b commit c6c9349

File tree

15 files changed

+275
-117
lines changed

15 files changed

+275
-117
lines changed

queue_job/__manifest__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@
2929
},
3030
"installable": True,
3131
"development_status": "Mature",
32-
"maintainers": ["guewen"],
32+
"maintainers": ["guewen", "sbidoul"],
3333
"post_init_hook": "post_init_hook",
3434
"post_load": "post_load",
3535
}

queue_job/controllers/main.py

Lines changed: 78 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -27,15 +27,48 @@
2727

2828

2929
class RunJobController(http.Controller):
30-
def _try_perform_job(self, env, job):
31-
"""Try to perform the job."""
30+
@classmethod
31+
def _acquire_job(cls, env: api.Environment, job_uuid: str) -> Job | None:
32+
"""Acquire a job for execution.
33+
34+
- make sure it is in ENQUEUED state
35+
- mark it as STARTED and commit the state change
36+
- acquire the job lock
37+
38+
If successful, return the Job instance, otherwise return None. This
39+
function may fail to acquire the job if it is not in the expected state or is
40+
already locked by another worker.
41+
"""
42+
env.cr.execute(
43+
"SELECT uuid FROM queue_job WHERE uuid=%s AND state=%s "
44+
"FOR NO KEY UPDATE SKIP LOCKED",
45+
(job_uuid, ENQUEUED),
46+
)
47+
if not env.cr.fetchone():
48+
_logger.warning(
49+
"was requested to run job %s, but it does not exist, "
50+
"or is not in state %s, or is being handled by another worker",
51+
job_uuid,
52+
ENQUEUED,
53+
)
54+
return None
55+
job = Job.load(env, job_uuid)
56+
assert job and job.state == ENQUEUED
3257
job.set_started()
3358
job.store()
3459
env.cr.commit()
35-
job.lock()
60+
if not job.lock():
61+
_logger.warning(
62+
"was requested to run job %s, but it could not be locked",
63+
job_uuid,
64+
)
65+
return None
66+
return job
3667

68+
@classmethod
69+
def _try_perform_job(cls, env, job):
70+
"""Try to perform the job, mark it done and commit if successful."""
3771
_logger.debug("%s started", job)
38-
3972
job.perform()
4073
# Triggers any stored computed fields before calling 'set_done'
4174
# so that will be part of the 'exec_time'
@@ -46,18 +79,20 @@ def _try_perform_job(self, env, job):
4679
env.cr.commit()
4780
_logger.debug("%s done", job)
4881

49-
def _enqueue_dependent_jobs(self, env, job):
82+
@classmethod
83+
def _enqueue_dependent_jobs(cls, env, job):
5084
tries = 0
5185
while True:
5286
try:
53-
job.enqueue_waiting()
87+
with job.env.cr.savepoint():
88+
job.enqueue_waiting()
5489
except OperationalError as err:
5590
# Automatically retry the typical transaction serialization
5691
# errors
5792
if err.pgcode not in PG_CONCURRENCY_ERRORS_TO_RETRY:
5893
raise
5994
if tries >= DEPENDS_MAX_TRIES_ON_CONCURRENCY_FAILURE:
60-
_logger.info(
95+
_logger.error(
6196
"%s, maximum number of tries reached to update dependencies",
6297
errorcodes.lookup(err.pgcode),
6398
)
@@ -75,17 +110,8 @@ def _enqueue_dependent_jobs(self, env, job):
75110
else:
76111
break
77112

78-
@http.route(
79-
"/queue_job/runjob",
80-
type="http",
81-
auth="none",
82-
save_session=False,
83-
readonly=False,
84-
)
85-
def runjob(self, db, job_uuid, **kw):
86-
http.request.session.db = db
87-
env = http.request.env(user=SUPERUSER_ID)
88-
113+
@classmethod
114+
def _runjob(cls, env: api.Environment, job: Job) -> None:
89115
def retry_postpone(job, message, seconds=None):
90116
job.env.clear()
91117
with Registry(job.env.cr.dbname).cursor() as new_cr:
@@ -94,26 +120,9 @@ def retry_postpone(job, message, seconds=None):
94120
job.set_pending(reset_retry=False)
95121
job.store()
96122

97-
# ensure the job to run is in the correct state and lock the record
98-
env.cr.execute(
99-
"SELECT state FROM queue_job WHERE uuid=%s AND state=%s FOR UPDATE",
100-
(job_uuid, ENQUEUED),
101-
)
102-
if not env.cr.fetchone():
103-
_logger.warning(
104-
"was requested to run job %s, but it does not exist, "
105-
"or is not in state %s",
106-
job_uuid,
107-
ENQUEUED,
108-
)
109-
return ""
110-
111-
job = Job.load(env, job_uuid)
112-
assert job and job.state == ENQUEUED
113-
114123
try:
115124
try:
116-
self._try_perform_job(env, job)
125+
cls._try_perform_job(env, job)
117126
except OperationalError as err:
118127
# Automatically retry the typical transaction serialization
119128
# errors
@@ -131,7 +140,6 @@ def retry_postpone(job, message, seconds=None):
131140
# traceback in the logs we should have the traceback when all
132141
# retries are exhausted
133142
env.cr.rollback()
134-
return ""
135143

136144
except (FailedJobError, Exception) as orig_exception:
137145
buff = StringIO()
@@ -141,19 +149,18 @@ def retry_postpone(job, message, seconds=None):
141149
job.env.clear()
142150
with Registry(job.env.cr.dbname).cursor() as new_cr:
143151
job.env = job.env(cr=new_cr)
144-
vals = self._get_failure_values(job, traceback_txt, orig_exception)
152+
vals = cls._get_failure_values(job, traceback_txt, orig_exception)
145153
job.set_failed(**vals)
146154
job.store()
147155
buff.close()
148156
raise
149157

150158
_logger.debug("%s enqueue depends started", job)
151-
self._enqueue_dependent_jobs(env, job)
159+
cls._enqueue_dependent_jobs(env, job)
152160
_logger.debug("%s enqueue depends done", job)
153161

154-
return ""
155-
156-
def _get_failure_values(self, job, traceback_txt, orig_exception):
162+
@classmethod
163+
def _get_failure_values(cls, job, traceback_txt, orig_exception):
157164
"""Collect relevant data from exception."""
158165
exception_name = orig_exception.__class__.__name__
159166
if hasattr(orig_exception, "__module__"):
@@ -167,6 +174,22 @@ def _get_failure_values(self, job, traceback_txt, orig_exception):
167174
"exc_message": exc_message,
168175
}
169176

177+
@http.route(
178+
"/queue_job/runjob",
179+
type="http",
180+
auth="none",
181+
save_session=False,
182+
readonly=False,
183+
)
184+
def runjob(self, db, job_uuid, **kw):
185+
http.request.session.db = db
186+
env = http.request.env(user=SUPERUSER_ID)
187+
job = self._acquire_job(env, job_uuid)
188+
if not job:
189+
return ""
190+
self._runjob(env, job)
191+
return ""
192+
170193
# flake8: noqa: C901
171194
@http.route("/queue_job/create_test_job", type="http", auth="user")
172195
def create_test_job(
@@ -177,6 +200,7 @@ def create_test_job(
177200
description="Test job",
178201
size=1,
179202
failure_rate=0,
203+
job_duration=0,
180204
):
181205
if not http.request.env.user.has_group("base.group_erp_manager"):
182206
raise Forbidden(http.request.env._("Access Denied"))
@@ -187,6 +211,12 @@ def create_test_job(
187211
except (ValueError, TypeError):
188212
failure_rate = 0
189213

214+
if job_duration is not None:
215+
try:
216+
job_duration = float(job_duration)
217+
except (ValueError, TypeError):
218+
job_duration = 0
219+
190220
if not (0 <= failure_rate <= 1):
191221
raise BadRequest("failure_rate must be between 0 and 1")
192222

@@ -215,6 +245,7 @@ def create_test_job(
215245
channel=channel,
216246
description=description,
217247
failure_rate=failure_rate,
248+
job_duration=job_duration,
218249
)
219250

220251
if size > 1:
@@ -225,6 +256,7 @@ def create_test_job(
225256
channel=channel,
226257
description=description,
227258
failure_rate=failure_rate,
259+
job_duration=job_duration,
228260
)
229261
return ""
230262

@@ -236,6 +268,7 @@ def _create_single_test_job(
236268
description="Test job",
237269
size=1,
238270
failure_rate=0,
271+
job_duration=0,
239272
):
240273
delayed = (
241274
http.request.env["queue.job"]
@@ -245,7 +278,7 @@ def _create_single_test_job(
245278
channel=channel,
246279
description=description,
247280
)
248-
._test_job(failure_rate=failure_rate)
281+
._test_job(failure_rate=failure_rate, job_duration=job_duration)
249282
)
250283
return f"job uuid: {delayed.db_record().uuid}"
251284

@@ -259,6 +292,7 @@ def _create_graph_test_jobs(
259292
channel=None,
260293
description="Test job",
261294
failure_rate=0,
295+
job_duration=0,
262296
):
263297
model = http.request.env["queue.job"]
264298
current_count = 0
@@ -281,7 +315,7 @@ def _create_graph_test_jobs(
281315
max_retries=max_retries,
282316
channel=channel,
283317
description=f"{description} #{current_count}",
284-
)._test_job(failure_rate=failure_rate)
318+
)._test_job(failure_rate=failure_rate, job_duration=job_duration)
285319
)
286320

287321
grouping = random.choice(possible_grouping_methods)

queue_job/job.py

Lines changed: 9 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -222,7 +222,7 @@ def load_many(cls, env, job_uuids):
222222
recordset = cls.db_records_from_uuids(env, job_uuids)
223223
return {cls._load_from_db_record(record) for record in recordset}
224224

225-
def add_lock_record(self):
225+
def add_lock_record(self) -> None:
226226
"""
227227
Create row in db to be locked while the job is being performed.
228228
"""
@@ -242,13 +242,11 @@ def add_lock_record(self):
242242
[self.uuid],
243243
)
244244

245-
def lock(self):
246-
"""
247-
Lock row of job that is being performed
245+
def lock(self) -> bool:
246+
"""Lock row of job that is being performed.
248247
249-
If a job cannot be locked,
250-
it means that the job wasn't started,
251-
a RetryableJobError is thrown.
248+
Return False if a job cannot be locked: it means that the job is not in
249+
STARTED state or is already locked by another worker.
252250
"""
253251
self.env.cr.execute(
254252
"""
@@ -264,18 +262,15 @@ def lock(self):
264262
queue_job
265263
WHERE
266264
uuid = %s
267-
AND state='started'
265+
AND state = %s
268266
)
269-
FOR UPDATE;
267+
FOR NO KEY UPDATE SKIP LOCKED;
270268
""",
271-
[self.uuid],
269+
[self.uuid, STARTED],
272270
)
273271

274272
# 1 job should be locked
275-
if 1 != len(self.env.cr.fetchall()):
276-
raise RetryableJobError(
277-
f"Trying to lock job that wasn't started, uuid: {self.uuid}"
278-
)
273+
return bool(self.env.cr.fetchall())
279274

280275
@classmethod
281276
def _load_from_db_record(cls, job_db_record):

queue_job/jobrunner/runner.py

Lines changed: 26 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -357,23 +357,26 @@ def _query_requeue_dead_jobs(self):
357357
ELSE exc_info
358358
END)
359359
WHERE
360-
id in (
361-
SELECT
362-
queue_job_id
363-
FROM
364-
queue_job_lock
365-
WHERE
366-
queue_job_id in (
367-
SELECT
368-
id
369-
FROM
370-
queue_job
371-
WHERE
372-
state IN ('enqueued','started')
373-
AND date_enqueued <
374-
(now() AT TIME ZONE 'utc' - INTERVAL '10 sec')
375-
)
376-
FOR UPDATE SKIP LOCKED
360+
state IN ('enqueued','started')
361+
AND date_enqueued < (now() AT TIME ZONE 'utc' - INTERVAL '10 sec')
362+
AND (
363+
id in (
364+
SELECT
365+
queue_job_id
366+
FROM
367+
queue_job_lock
368+
WHERE
369+
queue_job_lock.queue_job_id = queue_job.id
370+
FOR NO KEY UPDATE SKIP LOCKED
371+
)
372+
OR NOT EXISTS (
373+
SELECT
374+
1
375+
FROM
376+
queue_job_lock
377+
WHERE
378+
queue_job_lock.queue_job_id = queue_job.id
379+
)
377380
)
378381
RETURNING uuid
379382
"""
@@ -396,6 +399,12 @@ def requeue_dead_jobs(self):
396399
However, when the Odoo server crashes or is otherwise force-stopped,
397400
running jobs are interrupted while the runner has no chance to know
398401
they have been aborted.
402+
403+
This also handles orphaned jobs (enqueued but never started, no lock).
404+
This edge case occurs when the runner marks a job as 'enqueued'
405+
but the HTTP request to start the job never reaches the Odoo server
406+
(e.g., due to server shutdown/crash between setting enqueued and
407+
the controller receiving the request).
399408
"""
400409

401410
with closing(self.conn.cursor()) as cr:

0 commit comments

Comments
 (0)