Commit 225f8752 authored by Philippe Pepiot's avatar Philippe Pepiot
Browse files

[ccplugin] celery-monitor: retry failed items

When multiple instances of celery-monitor are running, integrity errors can be
raised if two instances work on the same task_id; temporary (network, host)
failures can also occur. In both cases we want to retry handling the task_id
later.

Put each task_id being processed in a "pending queue"; every minute, if the
monitor queue is empty, requeue the pending items.

This change requires handling the "timeout" parameter of loop (only used in
tests) in a different way, to ensure we never block forever in redis "brpoplpush".
parent 95f71105bbaf
...@@ -30,6 +30,9 @@ from cw_celerytask_helpers.utils import get_redis_client ...@@ -30,6 +30,9 @@ from cw_celerytask_helpers.utils import get_redis_client
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
PENDING_KEY = ':'.join([MONITOR_KEY, 'pending'])
class CeleryMonitorCommand(Command): class CeleryMonitorCommand(Command):
"""Synchronize celery task statuses""" """Synchronize celery task statuses"""
...@@ -75,18 +78,35 @@ class CeleryMonitorCommand(Command): ...@@ -75,18 +78,35 @@ class CeleryMonitorCommand(Command):
else: else:
break break
def requeue(client):'requeue pending events')
while client.rpoplpush(PENDING_KEY, MONITOR_KEY):
@staticmethod @staticmethod
def loop(cnx, timeout=None): def loop(cnx, timeout=None):
client = get_redis_client() client = get_redis_client()'Connected to redis')'Connected to redis')
test = (cnx.repo.config.mode == "test") test = (cnx.repo.config.mode == "test")
requeue_timer = timer = time.time()
while True: while True:
data = client.brpop(MONITOR_KEY, timeout=timeout) # pop item from MONITOR_KEY and push it to PENDING_KEY
data = client.brpoplpush(MONITOR_KEY, PENDING_KEY, timeout=1)
if data is None: if data is None:
break now = time.time()
data = json.loads(data[1].decode()) if timeout is not None and abs(timer - now) > timeout:
task_id, task_name = data['task_id'], data['task_name'] break
if abs(requeue_timer - time.time()) > REQUEUE_TIMEOUT:
# no items left in MONITOR_KEY and we reached the
# REQUEUE_TIMEOUT requeue failed items from PENDING_KEY to
requeue_timer = time.time()
payload = json.loads(data.decode())
task_id, task_name = payload['task_id'], payload['task_name']
for adapter in cnx.vreg['adapters']['ICeleryTask']: for adapter in cnx.vreg['adapters']['ICeleryTask']:
try: try:
adapter.sync_task_state(cnx, task_id, adapter.sync_task_state(cnx, task_id,
...@@ -99,6 +119,9 @@ class CeleryMonitorCommand(Command): ...@@ -99,6 +119,9 @@ class CeleryMonitorCommand(Command):
if test: if test:
# we should not hide exceptions in tests # we should not hide exceptions in tests
raise raise
# success, drop item from PENDING_KEY
client.lrem(PENDING_KEY, data, num=1)
CWCTL.register(CeleryMonitorCommand) CWCTL.register(CeleryMonitorCommand)
...@@ -64,7 +64,7 @@ def run_all_tasks(cnx=None): ...@@ -64,7 +64,7 @@ def run_all_tasks(cnx=None):
wf.fire_transition(transition, result.traceback) wf.fire_transition(transition, result.traceback)
else: else:
from cubes.celerytask.ccplugin import CeleryMonitorCommand from cubes.celerytask.ccplugin import CeleryMonitorCommand
CeleryMonitorCommand.loop(cnx, 1) CeleryMonitorCommand.loop(cnx, timeout=0)
return results return results
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment