|SUCCESS| [v2,5/5] eal: avoid deadlock in async IPC alarm callback

dpdklab at iol.unh.edu dpdklab at iol.unh.edu
Sat May 30 02:02:56 CEST 2026
Previous message (by thread): |SUCCESS| raw/gdtc : update MAINTAINERS entry
Next message (by thread): |WARNING| pw164607-164612 [PATCH] [v3,6/6] eal/memory: add page size
Messages sorted by: [ date ] [ thread ] [ subject ] [ author ]
Test-Label: ai-code-review
Test-Status: SUCCESS
https://dpdk.org/patch/164578

_AI Code Review_

=== Patch Review: series.patch (via Claude) ===
Review date: 2026-05-30

commit d12f7e7fd9177180d4c8ac64cb984530ae7e23b5
Author: Anatoly Burakov <anatoly.burakov at intel.com>

eal: avoid deadlock in async IPC alarm callback

This is a review of the five patches in series.patch. The series fixes
multiple bugs in the EAL IPC asynchronous request handling code.

> diff --git a/lib/eal/common/eal_common_proc.c b/lib/eal/common/eal_common_proc.c
> index 06f151818c..799c6e81b0 100644
> --- a/lib/eal/common/eal_common_proc.c
> +++ b/lib/eal/common/eal_common_proc.c
> @@ -883,7 +883,7 @@ mp_request_async(const char *dst, struct rte_mp_msg *req,
>  	pending_req = calloc(1, sizeof(*pending_req));
>  	reply_msg = calloc(1, sizeof(*reply_msg));
>  	if (pending_req == NULL || reply_msg == NULL) {
> -		EAL_LOG(ERR, "Could not allocate space for sync request");
> +		EAL_LOG(ERR, "Could not allocate space for async request");
>  		rte_errno = ENOMEM;
>  		ret = -1;
>  		goto fail;

[ ... ]

> diff --git a/lib/eal/common/eal_common_proc.c b/lib/eal/common/eal_common_proc.c
> index 799c6e81b0..0ec79336a5 100644
> --- a/lib/eal/common/eal_common_proc.c
> +++ b/lib/eal/common/eal_common_proc.c
> @@ -1187,11 +1187,15 @@ rte_mp_request_async(struct rte_mp_msg *req, const struct timespec *ts,
>  	if (rte_eal_process_type() == RTE_PROC_SECONDARY) {
>  		ret = mp_request_async(eal_mp_socket_path(), copy, param, ts);
>  
> -		/* if we didn't send anything, put dummy request on the queue */
> +		/* if we didn't send anything, put dummy request on the queue
> +		 * and set a minimum-delay alarm so the callback fires immediately.
> +		 */
>  		if (ret == 0 && reply->nb_sent == 0) {
>  			TAILQ_INSERT_TAIL(&pending_requests.requests, dummy,
>  					next);
>  			dummy_used = true;
> +			if (rte_eal_alarm_set(1, async_reply_handle, dummy) < 0)
> +				EAL_LOG(ERR, "Fail to set alarm for dummy request");
>  		}

Does this code leak dummy if rte_eal_alarm_set fails? The dummy request
is already inserted into the queue. On alarm set failure, does the
callback mechanism ensure cleanup, or should this code remove dummy from
the queue and free it?

[ ... ]

> diff --git a/lib/eal/common/eal_common_proc.c b/lib/eal/common/eal_common_proc.c
> index 0ec79336a5..24d3557859 100644
> --- a/lib/eal/common/eal_common_proc.c
> +++ b/lib/eal/common/eal_common_proc.c
> @@ -1203,6 +1203,8 @@ rte_mp_request_async(struct rte_mp_msg *req, const struct timespec *ts,
>  		/* if we couldn't send anything, clean up */
>  		if (ret != 0)
>  			goto fail;
> +		if (!dummy_used)
> +			free(dummy);
>  		return 0;
>  	}

[ ... ]

> diff --git a/lib/eal/common/eal_common_proc.c b/lib/eal/common/eal_common_proc.c
> index 24d3557859..967186382e 100644
> --- a/lib/eal/common/eal_common_proc.c
> +++ b/lib/eal/common/eal_common_proc.c
> @@ -74,6 +74,7 @@ struct async_request_param {
>  
>  struct pending_request {
>  	TAILQ_ENTRY(pending_request) next;
> +	unsigned long id;
>  	enum {
>  		REQUEST_TYPE_SYNC,
>  		REQUEST_TYPE_ASYNC
> @@ -92,6 +93,8 @@ struct pending_request {
>  	};
>  };
>  
> +static unsigned long next_request_id;

Does this code have a race condition when incrementing next_request_id
without atomic operations or lock protection at the point of increment?
Multiple threads could call rte_mp_request_async concurrently.

[ ... ]

> @@ -878,7 +898,29 @@ mp_request_async(const char *dst, struct rte_mp_msg *req,
>  {
>  	struct rte_mp_msg *reply_msg;
>  	struct pending_request *pending_req, *exist;
> -	int ret = -1;
> +	unsigned long id;
> +	int ret;
> +
> +	/* queue already locked by caller */
> +
> +	exist = find_pending_request(dst, req->name);
> +	if (exist) {
> +		EAL_LOG(ERR, "A pending request %s:%s", dst, req->name);
> +		rte_errno = EEXIST;
> +		return -1;
> +	}
> +
> +	/* Set alarm before allocating or sending so request timeout tracking
> +	 * is active as soon as this request ID is reserved.
> +	 */
> +	id = ++next_request_id;
> +	if (rte_eal_alarm_set(ts->tv_sec * 1000000 + ts->tv_nsec / 1000,
> +			async_reply_handle,
> +			(void *)(uintptr_t)id) < 0) {
> +		EAL_LOG(ERR, "Fail to set alarm for request %s:%s",
> +			dst, req->name);
> +		return -1;
> +	}

Does this code leak the alarm if the function later fails after setting
it? The alarm is armed with id before any allocations or queue
insertions. If pending_req allocation fails, does async_reply_handle get
called with a stale id that never gets inserted?

> @@ -1238,13 +1265,39 @@ rte_mp_request_async(struct rte_mp_msg *req, const struct timespec *ts,
>  		} else if (mp_request_async(path, copy, param, ts))
>  			ret = -1;
>  	}
> +
> +	/*
> +	 * On partial failure, roll back all queued requests. We hold the lock
> +	 * so no one else touches the queue. All requests in this batch share
> +	 * the same param pointer. Stale alarms will fire and harmlessly find
> +	 * nothing via ID-based lookup.
> +	 */
> +	if (ret != 0 && reply->nb_sent > 0) {
> +		struct pending_request *r, *next;
> +
> +		for (r = TAILQ_FIRST(&pending_requests.requests);
> +				r != NULL; r = next) {
> +			next = TAILQ_NEXT(r, next);
> +			if (r->type == REQUEST_TYPE_ASYNC &&
> +					r->async.param == param) {
> +				TAILQ_REMOVE(&pending_requests.requests,
> +						r, next);
> +				free(r->reply);
> +				/* r->request == copy, freed below after the loop */
> +				free(r);
> +			}
> +		}
> +		reply->nb_sent = 0;
> +	}

Does this rollback code correctly handle the dummy request scenario? If
dummy was inserted but alarm_set failed in the earlier nb_sent == 0
path, could dummy be in the queue with the same param pointer, causing
it to be freed here and then double-freed later?

[ ... ]

> diff --git a/lib/eal/common/eal_common_proc.c b/lib/eal/common/eal_common_proc.c
> index 967186382e..67a393f53e 100644
> --- a/lib/eal/common/eal_common_proc.c
> +++ b/lib/eal/common/eal_common_proc.c
> @@ -549,19 +549,6 @@ async_reply_handle_thread_unsafe(struct pending_request *req)
>  
>  	TAILQ_REMOVE(&pending_requests.requests, req, next);
>  
> -	if (rte_eal_alarm_cancel(async_reply_handle,
> -			(void *)(uintptr_t)req->id) < 0) {
> -		/* if we failed to cancel the alarm because it's already in
> -		 * progress, don't proceed because otherwise we will end up
> -		 * handling the same message twice.
> -		 */
> -		if (rte_errno == EINPROGRESS) {
> -			EAL_LOG(DEBUG, "Request handling is already in progress");
> -			goto no_trigger;
> -		}
> -		EAL_LOG(ERR, "Failed to cancel alarm");
> -	}
> -
>  	if (action == ACTION_TRIGGER)
>  		return req;
>  no_trigger:
Previous message (by thread): |SUCCESS| raw/gdtc : update MAINTAINERS entry
Next message (by thread): |WARNING| pw164607-164612 [PATCH] [v3,6/6] eal/memory: add page size
Messages sorted by: [ date ] [ thread ] [ subject ] [ author ]
More information about the test-report mailing list