[dpdk-dev] [PATCH v2 1/4] eal: add channel for multi-process communication

Ananyev, Konstantin konstantin.ananyev at intel.com
Mon Jan 15 20:52:07 CET 2018


Hi Jianfeng,

> 
> Previously, there were three channels for multi-process
> (i.e., primary/secondary) communication.
>   1. Config-file based channel, in which, the primary process writes
>      info into a pre-defined config file, and the secondary process
>      reads the info out.
>   2. vfio submodule has its own channel based on unix socket for the
>      secondary process to get container fd and group fd from the
>      primary process.
>   3. pdump submodule also has its own channel based on unix socket for
>      packet dump.
> 
> It'd be good to have a generic communication channel for multi-process
> communication to accommodate the requirements including:
>   a. Secondary wants to send info to primary, for example, secondary
>      would like to send request (about some specific vdev to primary).
>   b. Sending info at any time, instead of just initialization time.
>   c. Share FDs with the other side, for vdev like vhost, related FDs
>      (memory region, kick) should be shared.
>   d. A send message request needs the other side to respond immediately.
> 
> This patch proposes to create a communication channel, based on datagram
> unix socket, for above requirements. Each process will block on a unix
> socket waiting for messages from the peers.
> 
> Three new APIs are added:
> 
>   1. rte_eal_mp_action_register() is used to register an action,
>      indexed by a string, when a component at receiver side would like
>      to respond to the messages from the peer process.
>   2. rte_eal_mp_action_unregister() is used to unregister the action
>      if the calling component does not want to respond to the messages.
>   3. rte_eal_mp_sendmsg() is used to send a message, and returns
>      immediately. If there are 1:n primary:secondary processes, the
>      primary process will send n messages.
> 
> Suggested-by: Konstantin Ananyev <konstantin.ananyev at intel.com>
> Signed-off-by: Jianfeng Tan <jianfeng.tan at intel.com>
> ---
>  lib/librte_eal/common/eal_common_proc.c | 388 ++++++++++++++++++++++++++++++++
>  lib/librte_eal/common/eal_filesystem.h  |  17 ++
>  lib/librte_eal/common/eal_private.h     |  10 +
>  lib/librte_eal/common/include/rte_eal.h |  69 ++++++
>  lib/librte_eal/linuxapp/eal/eal.c       |   8 +
>  lib/librte_eal/rte_eal_version.map      |   9 +
>  6 files changed, 501 insertions(+)
> 
> diff --git a/lib/librte_eal/common/eal_common_proc.c b/lib/librte_eal/common/eal_common_proc.c
> index 40fa982..d700e9e 100644
> --- a/lib/librte_eal/common/eal_common_proc.c
> +++ b/lib/librte_eal/common/eal_common_proc.c
> @@ -5,11 +5,55 @@
>  #include <stdio.h>
>  #include <fcntl.h>
>  #include <stdlib.h>
> +#include <sys/types.h>
> +#include <sys/socket.h>
> +#include <limits.h>
> +#include <unistd.h>
> +#include <sys/un.h>
> +#include <errno.h>
> +#include <pthread.h>
> +
> +#include <rte_log.h>
>  #include <rte_eal.h>
> +#include <rte_errno.h>
> +#include <rte_lcore.h>
> +#include <rte_common.h>
> 
> +#include "eal_private.h"
>  #include "eal_filesystem.h"
>  #include "eal_internal_cfg.h"
> 
> +#define MAX_SECONDARY_PROCS	8
> +#define MAX_ACTION_NAME_LEN	64
> +#define MAX_UNIX_PATH_LEN	104

Why do you need this?
Why not just PATH_MAX?

> +#define MAX_MSG_LENGTH		1024
> +#define SCM_MAX_FD		253 /* The max amount of fds */
> +
> +static int mp_fd = -1;
> +static char *mp_sec_sockets[MAX_SECONDARY_PROCS];

Who will initialize it, and why can it hold only 8 entries?

> +static pthread_mutex_t mp_mutex_action = PTHREAD_MUTEX_INITIALIZER;
> +
> +struct action_entry {
> +	TAILQ_ENTRY(action_entry) next;      /**< Next attached action entry */
> +
> +#define MAX_ACTION_NAME_LEN	64
> +	char action_name[MAX_ACTION_NAME_LEN];
> +	rte_eal_mp_t action;
> +};
> +
> +/** Double linked list of actions. */
> +TAILQ_HEAD(action_entry_list, action_entry);
> +
> +static struct action_entry_list action_entry_list =
> +	TAILQ_HEAD_INITIALIZER(action_entry_list);
> +
> +struct mp_msghdr {
> +	char action_name[MAX_ACTION_NAME_LEN];
> +	int fds_num;
> +	int len_params;
> +	char params[0];
> +} __rte_packed;
> +
>  int
>  rte_eal_primary_proc_alive(const char *config_file_path)
>  {
> @@ -31,3 +75,347 @@ rte_eal_primary_proc_alive(const char *config_file_path)
> 
>  	return !!ret;
>  }
> +
> +static struct action_entry *
> +find_action_entry_by_name(const char *name)
> +{
> +	int len = strlen(name);
> +	struct action_entry *entry;
> +
> +	TAILQ_FOREACH(entry, &action_entry_list, next) {
> +		if (strncmp(entry->action_name, name, len) == 0)

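I think it has to be just strcmp() here: strncmp() limited to strlen(name)
is only a prefix match, so it would also match any registered action whose
name merely starts with 'name'.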


> +			break;
> +	}
> +
> +	return entry;
> +}
> +
> +int
> +rte_eal_mp_action_register(const char *action_name, rte_eal_mp_t action)
> +{
> +	struct action_entry *entry = malloc(sizeof(struct action_entry));
> +
> +	if (entry == NULL) {
> +		rte_errno = -ENOMEM;
> +		return -1;
> +	}
> +
> +	if (strlen(action_name) > MAX_ACTION_NAME_LEN) {

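No space is left for the terminating '\0'.
Either check for >= MAX_ACTION_NAME_LEN, or make it entry.action_name[MAX_ACTION_NAME_LEN + 1];
Even better, just:
 - allocate the new action_entry;
 - then copy the name and check for truncation in one step:
if (snprintf(entry->action_name, sizeof(entry->action_name), "%s",
             action_name) >= (int)sizeof(entry->action_name)) {
    free(entry);
    return -E2BIG;
}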


> +		rte_errno = -E2BIG;
> +		return -1;
> +	}
> +
> +	pthread_mutex_lock(&mp_mutex_action);
> +	if (find_action_entry_by_name(action_name) != NULL) {
> +		free(entry);

Forgot to do mutex_unlock().

> +		rte_errno = -EEXIST;
> +		return -1;
> +	}
> +	strncpy(entry->action_name, action_name, MAX_ACTION_NAME_LEN);
> +	entry->action = action;
> +	TAILQ_INSERT_TAIL(&action_entry_list, entry, next);
> +	pthread_mutex_unlock(&mp_mutex_action);
> +	return 0;
> +}
> +
> +void
> +rte_eal_mp_action_unregister(const char *name)
> +{
> +	struct action_entry *entry;
> +
> +	pthread_mutex_lock(&mp_mutex_action);
> +	entry = find_action_entry_by_name(name);
> +	TAILQ_REMOVE(&action_entry_list, entry, next);
> +	free(entry);

Better to do free() after releasing the mutex.

> +	pthread_mutex_unlock(&mp_mutex_action);
> +}
> +
> +static int
> +read_msg(int fd, char *buf, int buflen, int *fds, int fds_num)
> +{
> +	int ret;
> +	struct iovec iov;
> +	struct msghdr msgh;
> +	size_t fdsize = fds_num * sizeof(int);
> +	char control[CMSG_SPACE(fdsize)];
> +	struct cmsghdr *cmsg;
> +
> +	memset(&msgh, 0, sizeof(msgh));
> +	iov.iov_base = buf;
> +	iov.iov_len  = buflen;
> +
> +	msgh.msg_iov = &iov;
> +	msgh.msg_iovlen = 1;
> +	msgh.msg_control = control;
> +	msgh.msg_controllen = sizeof(control);
> +
> +	ret = recvmsg(fd, &msgh, 0);
> +	if (ret < 0) {
> +		RTE_LOG(ERR, EAL, "recvmsg failed, %s\n", strerror(errno));
> +		return -1;
> +	}
> +
> +	if (msgh.msg_flags & (MSG_TRUNC | MSG_CTRUNC)) {
> +		RTE_LOG(ERR, EAL, "truncted msg\n");
> +		return -1;
> +	}
> +
> +	/* read auxiliary FDs if any */
> +	for (cmsg = CMSG_FIRSTHDR(&msgh); cmsg != NULL;
> +		cmsg = CMSG_NXTHDR(&msgh, cmsg)) {
> +		if ((cmsg->cmsg_level == SOL_SOCKET) &&
> +			(cmsg->cmsg_type == SCM_RIGHTS)) {
> +			memcpy(fds, CMSG_DATA(cmsg), fdsize);
> +			break;
> +		}
> +	}
> +
> +	return ret;
> +}
> +
> +static int
> +process_msg(struct mp_msghdr *hdr, int len, int fds[])
> +{
> +	int ret;
> +	int params_len;
> +	struct action_entry *entry;
> +
> +	RTE_LOG(DEBUG, EAL, "msg: %s\n", hdr->action_name);
> +
> +	pthread_mutex_lock(&mp_mutex_action);
> +	entry = find_action_entry_by_name(hdr->action_name);
> +	if (entry == NULL) {
> +		RTE_LOG(ERR, EAL, "cannot find action by: %s\n",
> +			hdr->action_name);
> +		pthread_mutex_unlock(&mp_mutex_action);
> +		return -1;

If no action is registered for that message - who will free it?
If an action() exists, is it the responsibility of action() to free the msg?

> +	}
> +
> +	params_len = len - sizeof(struct mp_msghdr);
> +	ret = entry->action(hdr->params, params_len, fds, hdr->fds_num);

Do you really need to do action() with lock held?

> +	pthread_mutex_unlock(&mp_mutex_action);
> +	return ret;
> +
> +}
> +
> +static void *

Why not just 'void' here?

> +mp_handle(void *arg __rte_unused)
> +{
> +	int len;
> +	int fds[SCM_MAX_FD];
> +	char buf[MAX_MSG_LENGTH];
> +
> +	while (1) {
> +		len = read_msg(mp_fd, buf, MAX_MSG_LENGTH, fds, SCM_MAX_FD);
> +		if (len > 0)
> +			process_msg((struct mp_msghdr *)buf, len, fds);
> +	}
> +
> +	return NULL;
> +}
> +
> +static inline const char *
> +get_unix_path(int is_server)
> +{
> +	static char unix_path[MAX_UNIX_PATH_LEN];

PATH_MAX?

Why not just make that function accept char path[PATH_MAX] as a parameter?

> +	const char *prefix = eal_mp_unix_path();
> +	const char *suffix = (is_server) ? "" : "_c";
> +
> +	if (rte_eal_process_type() == RTE_PROC_PRIMARY)
> +		snprintf(unix_path, MAX_UNIX_PATH_LEN, "%s%s", prefix, suffix);
> +	else
> +		snprintf(unix_path, MAX_UNIX_PATH_LEN, "%s%s_%d",
> +			 prefix, suffix, getpid());
> +	return unix_path;
> +}
> +
> +static int
> +open_unix_fd(int is_server)
> +{
> +	int fd;
> +	struct sockaddr_un un;
> +
> +	fd = socket(AF_UNIX, SOCK_DGRAM, 0);
> +	if (fd < 0) {
> +		RTE_LOG(ERR, EAL, "failed to create unix socket\n");
> +		return -1;
> +	}
> +
> +	memset(&un, 0, sizeof(un));
> +	un.sun_family = AF_UNIX;
> +	snprintf(un.sun_path, MAX_UNIX_PATH_LEN, "%s",
> +		 get_unix_path(is_server));
> +	unlink(un.sun_path); /* May still exist since last run */
> +	if (bind(fd, (struct sockaddr *)&un, sizeof(un)) < 0) {
> +		RTE_LOG(ERR, EAL, "failed to bind %s: %s\n",
> +			un.sun_path, strerror(errno));
> +		close(fd);
> +		return -1;
> +	}
> +
> +	RTE_LOG(INFO, EAL, "bind to %s\n", un.sun_path);
> +	return fd;
> +}
> +
> +int
> +rte_eal_mp_channel_init(void)
> +{
> +	pthread_t tid;
> +	char thread_name[RTE_MAX_THREAD_NAME_LEN];
> +
> +	mp_fd = open_unix_fd(1);
> +	if (mp_fd < 0)
> +		return -1;
> +
> +	if (pthread_create(&tid, NULL, mp_handle, NULL) < 0) {
> +		RTE_LOG(ERR, EAL, "failed to create mp handle thead: %s\n",
> +			strerror(errno));
> +		goto error;
> +	}
> +
> +	snprintf(thread_name, RTE_MAX_THREAD_NAME_LEN, "rte_mp_handle");
> +	if (rte_thread_setname(tid, thread_name) < 0) {
> +		RTE_LOG(ERR, EAL, "failed to set thead name\n");

Forgot to terminate thread?

> +		goto error;

As a nit - can we reorder code a bit to avoid 'goto's?

> +	}
> +
> +	return 0;
> +error:
> +	close(mp_fd);
> +	mp_fd = -1;
> +	return -1;
> +}
> +
> +static inline struct mp_msghdr *
> +format_msg(const char *act_name, const void *p, int len_params, int fds_num)
> +{
> +	int len_msg;
> +	struct mp_msghdr *msg;
> +
> +	len_msg = sizeof(struct mp_msghdr) + len_params;
> +	if (len_msg > MAX_MSG_LENGTH) {
> +		RTE_LOG(ERR, EAL, "Message is too long\n");
> +		rte_errno = -EINVAL;
> +		return NULL;
> +	}
> +
> +	msg = malloc(len_msg);
> +	if (!msg) {
> +		RTE_LOG(ERR, EAL, "Cannot alloc memory for msg\n");
> +		rte_errno = -ENOMEM;
> +		return NULL;
> +	}
> +	memset(msg, 0, len_msg);
> +	strcpy(msg->action_name, act_name);
> +	msg->fds_num = fds_num;
> +	msg->len_params = len_params;
> +	memcpy(msg->params, p, len_params);
> +	return msg;
> +}
> +
> +static int
> +send_msg(int fd, const char *dst_path, struct mp_msghdr *msg, int fds[])
> +{
> +	int ret;
> +	struct msghdr msgh;
> +	struct iovec iov;
> +	size_t fd_size = msg->fds_num * sizeof(int);
> +	char control[CMSG_SPACE(fd_size)];
> +	struct cmsghdr *cmsg;
> +	struct sockaddr_un dst;
> +
> +	memset(&dst, 0, sizeof(dst));
> +	dst.sun_family = AF_UNIX;
> +	snprintf(dst.sun_path, MAX_UNIX_PATH_LEN, "%s", dst_path);
> +
> +	memset(&msgh, 0, sizeof(msgh));
> +	memset(control, 0, sizeof(control));
> +
> +	iov.iov_base = (uint8_t *)msg;
> +	iov.iov_len = sizeof(struct mp_msghdr) + msg->len_params;
> +
> +	msgh.msg_name = &dst;
> +	msgh.msg_namelen = sizeof(dst);
> +	msgh.msg_iov = &iov;
> +	msgh.msg_iovlen = 1;
> +	msgh.msg_control = control;
> +	msgh.msg_controllen = sizeof(control);
> +
> +	cmsg = CMSG_FIRSTHDR(&msgh);
> +	cmsg->cmsg_len = CMSG_LEN(fd_size);
> +	cmsg->cmsg_level = SOL_SOCKET;
> +	cmsg->cmsg_type = SCM_RIGHTS;
> +	memcpy(CMSG_DATA(cmsg), fds, fd_size);
> +
> +	do {
> +		ret = sendmsg(fd, &msgh, 0);
> +	} while (ret < 0 && errno == EINTR);
> +
> +	if (ret < 0) {
> +		RTE_LOG(ERR, EAL, "failed to send msg: %s\n", strerror(errno));
> +
> +		if (rte_eal_process_type() == RTE_PROC_PRIMARY)
> +			RTE_LOG(ERR, EAL, "secondary process (%s) exited\n",
> +				dst_path);
> +		else if (!rte_eal_primary_proc_alive(NULL))
> +			RTE_LOG(ERR, EAL, "primary process exited\n");

So secondary-to-secondary messages are not allowed?

> +
> +		return 0;
> +	}
> +
> +	return 1;
> +}
> +
> +static int
> +mp_send(const char *action_name,
> +	const void *params,
> +	int len_params,
> +	int fds[],
> +	int fds_num)
> +{
> +	int i;
> +	int n = 0;
> +	int sockfd;
> +	struct mp_msghdr *msg;
> +
> +	if (fds_num > SCM_MAX_FD) {
> +		RTE_LOG(ERR, EAL, "Cannot send more than %d FDs\n", SCM_MAX_FD);
> +		rte_errno = -E2BIG;
> +		return 0;
> +	}
> +
> +	msg = format_msg(action_name, params, len_params, fds_num);
> +	if (msg == NULL)
> +		return 0;
> +
> +	if ((sockfd = open_unix_fd(0)) < 0) {
> +		free(msg);
> +		return 0;
> +	}
> +
> +	if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
> +		/* broadcast to all secondaries */
> +		for (i = 0; i < MAX_SECONDARY_PROCS; ++i) {
> +			if (mp_sec_sockets[i] == NULL)
> +				continue;
> +
> +			n += send_msg(sockfd, mp_sec_sockets[i], msg, fds);
> +		}
> +	} else
> +		n += send_msg(sockfd, eal_mp_unix_path(), msg, fds);
> +
> +	free(msg);
> +	close(sockfd);
> +	return n;
> +}
> +
> +int
> +rte_eal_mp_sendmsg(const char *action_name,
> +		   const void *params,
> +		   int len_params,
> +		   int fds[],
> +		   int fds_num)
> +{
> +	RTE_LOG(DEBUG, EAL, "sendmsg: %s\n", action_name);
> +	return mp_send(action_name, params, len_params, fds, fds_num);
> +}
> diff --git a/lib/librte_eal/common/eal_filesystem.h b/lib/librte_eal/common/eal_filesystem.h
> index e8959eb..e95399b 100644
> --- a/lib/librte_eal/common/eal_filesystem.h
> +++ b/lib/librte_eal/common/eal_filesystem.h
> @@ -38,6 +38,23 @@ eal_runtime_config_path(void)
>  	return buffer;
>  }
> 
> +/** Path of primary/secondary communication unix socket file. */
> +#define MP_UNIX_PATH_FMT "%s/.%s_unix"
> +static inline const char *
> +eal_mp_unix_path(void)
> +{
> +	static char buffer[PATH_MAX]; /* static so auto-zeroed */
> +	const char *directory = default_config_dir;
> +	const char *home_dir = getenv("HOME");
> +
> +	if (getuid() != 0 && home_dir != NULL)
> +		directory = home_dir;
> +	snprintf(buffer, sizeof(buffer) - 1, MP_UNIX_PATH_FMT,
> +		 directory, internal_config.hugefile_prefix);
> +
> +	return buffer;
> +}
> +
>  /** Path of hugepage info file. */
>  #define HUGEPAGE_INFO_FMT "%s/.%s_hugepage_info"
> 
> diff --git a/lib/librte_eal/common/eal_private.h b/lib/librte_eal/common/eal_private.h
> index c46dd8f..e36e3b5 100644
> --- a/lib/librte_eal/common/eal_private.h
> +++ b/lib/librte_eal/common/eal_private.h
> @@ -195,4 +195,14 @@ int rte_eal_hugepage_attach(void);
>   */
>  struct rte_bus *rte_bus_find_by_device_name(const char *str);
> 
> +/**
> + * Create the unix channel for primary/secondary communication.
> + *
> + * @return
> + *   0 on success;
> + *   (<0) on failure.
> + */
> +
> +int rte_eal_mp_channel_init(void);
> +
>  #endif /* _EAL_PRIVATE_H_ */
> diff --git a/lib/librte_eal/common/include/rte_eal.h b/lib/librte_eal/common/include/rte_eal.h
> index 02fa109..9884c0b 100644
> --- a/lib/librte_eal/common/include/rte_eal.h
> +++ b/lib/librte_eal/common/include/rte_eal.h
> @@ -186,6 +186,75 @@ int rte_eal_init(int argc, char **argv);
>  int rte_eal_primary_proc_alive(const char *config_file_path);
> 
>  /**
> + * Action function typedef used by other components.
> + *
> + * As we create unix socket channel for primary/secondary communication, use
> + * this function typedef to register action for coming messages.
> + */
> +typedef int (*rte_eal_mp_t)(const void *params, int len,
> +			    int fds[], int fds_num);
> +
> +/**
> + * Register an action function for primary/secondary communication.
> + *
> + * Call this function to register an action, if the calling component wants
> + * to response the messages from the corresponding component in its primary
> + * process or secondary processes.
> + *
> + * @param action_name
> + *   The action_name argument plays as the nonredundant key to find the action.
> + *
> + * @param action
> + *   The action argument is the function pointer to the action function.
> + *
> + * @return
> + *  - 0 on success.
> + *  - (<0) on failure.
> + */
> +int rte_eal_mp_action_register(const char *action_name, rte_eal_mp_t action);
> +
> +/**
> + * Unregister an action function for primary/secondary communication.
> + *
> + * Call this function to unregister an action  if the calling component does
> + * not want to response the messages from the corresponding component in its
> + * primary process or secondary processes.
> + *
> + * @param action_name
> + *   The action_name argument plays as the nonredundant key to find the action.
> + *
> + */
> +void rte_eal_mp_action_unregister(const char *name);
> +
> +/**
> + * Send a message to the peer process.
> + *
> + * This function will send a message which will be responsed by the action
> + * identified by action_name of the process on the other side.
> + *
> + * @param action_name
> + *   The action_name argument is used to identify which action will be used.
> + *
> + * @param params
> + *   The params argument contains the customized message.
> + *
> + * @param len_params
> + *   The len_params argument is the length of the customized message.
> + *
> + * @param fds
> + *   The fds argument is an array of fds sent with sendmsg.
> + *
> + * @param fds_num
> + *   The fds_num argument is number of fds to be sent with sendmsg.
> + *
> + * @return
> + *  - Returns the number of messages being sent successfully.
> + */
> +int
> +rte_eal_mp_sendmsg(const char *action_name, const void *params,
> +		   int len_params, int fds[], int fds_num);
> +
> +/**
>   * Usage function typedef used by the application usage function.
>   *
>   * Use this function typedef to define and call rte_set_application_usage_hook()
> diff --git a/lib/librte_eal/linuxapp/eal/eal.c b/lib/librte_eal/linuxapp/eal/eal.c
> index 229eec9..f231724 100644
> --- a/lib/librte_eal/linuxapp/eal/eal.c
> +++ b/lib/librte_eal/linuxapp/eal/eal.c
> @@ -896,6 +896,14 @@ rte_eal_init(int argc, char **argv)
> 
>  	eal_check_mem_on_local_socket();
> 
> +	if (rte_eal_mp_channel_init() < 0) {
> +		rte_eal_init_alert("failed to init mp channel\n");
> +		if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
> +			rte_errno = EFAULT;
> +			return -1;
> +		}
> +	}
> +
>  	eal_thread_init_master(rte_config.master_lcore);
> 
>  	ret = eal_thread_dump_affinity(cpuset, RTE_CPU_AFFINITY_STR_LEN);
> diff --git a/lib/librte_eal/rte_eal_version.map b/lib/librte_eal/rte_eal_version.map
> index f4f46c1..5dacde5 100644
> --- a/lib/librte_eal/rte_eal_version.map
> +++ b/lib/librte_eal/rte_eal_version.map
> @@ -235,4 +235,13 @@ EXPERIMENTAL {
>  	rte_service_set_stats_enable;
>  	rte_service_start_with_defaults;
> 
> +} DPDK_17.08;
> +
> +DPDK_18.02 {
> +	global:
> +
> +	rte_eal_mp_action_register;
> +	rte_eal_mp_action_unregister;
> +	rte_eal_mp_sendmsg;
> +
>  } DPDK_17.11;
> --
> 2.7.4



More information about the dev mailing list