On Fri, Nov 21, 2025 at 12:18:51PM +0530, Ravi Kishore Koppuravuri wrote:
> User space tool for querying GPU health monitoring RAS events via
> Generic Netlink Socket interface from Kernel's DRM Netlink Subsystem.
> Available Commands are
>       - List Nodes
>       - Get Error Counters
>       - Query Error Counter
> 
> Signed-off-by: Ravi Kishore Koppuravuri <[email protected]>
> Co-authored-by: Iddamsetty Aravind <[email protected]>
> Cc: Tauro Riana <[email protected]>
> Cc: Gupta Anshuman <[email protected]>
> Cc: Vivi Rodrigo <[email protected]>
> 
> ---
> V2 -> V3:
>       - Created handle_err() function to remove redundant code
>       - Handled more error scenarios while passing command line arguments
>       - Resolved formatting issues (Rodrigo)
> 
> V1 -> V2:
>       - Removed device_id from the input parameters
>       - Updated help() function
>       - Incorporated error handling logic
> ---
> ---
>  include/drm-uapi/drm_ras.h |  79 +++++++

Please sync up with Riana and make this work with her updated series,
while ensuring that both the error ID and the error string come from the
uapi header.

Thanks,
Rodrigo.

>  meson.build                |   5 +-
>  tools/drm_ras.c            | 425 +++++++++++++++++++++++++++++++++++++
>  tools/meson.build          |   5 +
>  4 files changed, 513 insertions(+), 1 deletion(-)
>  create mode 100644 include/drm-uapi/drm_ras.h
>  create mode 100644 tools/drm_ras.c
> 
> diff --git a/include/drm-uapi/drm_ras.h b/include/drm-uapi/drm_ras.h
> new file mode 100644
> index 000000000..af893aa36
> --- /dev/null
> +++ b/include/drm-uapi/drm_ras.h
> @@ -0,0 +1,79 @@
> +/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR 
> BSD-3-Clause) */
> +/* Do not edit directly, auto-generated from: */
> +/*   Documentation/netlink/specs/drm_ras.yaml */
> +/* YNL-GEN uapi header */
> +
> +#ifndef _LINUX_DRM_RAS_H
> +#define _LINUX_DRM_RAS_H
> +
> +#define DRM_RAS_GENL_NAME "drm-ras"
> +#define DRM_RAS_FAMILY_VERSION       1
> +
> +/*
> + * Type of the node. Currently, only error-counter nodes are supported, which
> + * expose reliability counters for a hardware/software component.
> + */
> +enum drm_ras_node_type {
> +     DRM_RAS_NODE_TYPE_ERROR_COUNTER = 1,
> +};
> +
> +enum {
> +     /* Unique identifier for the node*/
> +     DRM_RAS_NODE_ATTR_NODE_ID = 1,
> +
> +     /* Device name chosen by the driver at the time of registration */
> +     DRM_RAS_NODE_ATTR_DEVICE_NAME,
> +
> +     /* Node name chosen by the driver at registration to identify RAS node 
> inside the device */
> +     DRM_RAS_NODE_ATTR_NODE_NAME,
> +
> +     /* Type of the node, identifying its function */
> +     DRM_RAS_NODE_ATTR_NODE_TYPE,
> +
> +     __DRM_RAS_NODE_ATTR_MAX,
> +     DRM_RAS_NODE_ATTR_MAX = (__DRM_RAS_NODE_ATTR_MAX - 1)
> +};
> +
> +enum {
> +     /* Node ID targeted by this error counter operation */
> +     DRM_RAS_ERROR_COUNTER_ATTR_NODE_ID = 1,
> +
> +     /* Unique identifier for a specific error counter within a node */
> +     DRM_RAS_ERROR_COUNTER_ATTR_ERROR_ID,
> +
> +     /* Name of the requested error counter */
> +     DRM_RAS_ERROR_COUNTER_ATTR_ERROR_NAME,
> +
> +     /* Current value of the requested error counter */
> +     DRM_RAS_ERROR_COUNTER_ATTR_ERROR_VALUE,
> +
> +     __DRM_RAS_ERROR_COUNTER_ATTR_MAX,
> +     DRM_RAS_ERROR_COUNTER_ATTR_MAX = (__DRM_RAS_ERROR_COUNTER_ATTR_MAX - 1)
> +};
> +
> +enum drm_genl_error_cmds {
> +     /**
> +      * @DRM_RAS_CMD_LIST_NODES: Command to Retrieve the full list of 
> currently registered
> +      * DRM RAS nodes. Each node includes its dynamically assigned ID, name, 
> and type.
> +      * Obtain the Node IDs by calling this command and use it in the 
> subsequent operations
> +      * on the nodes.
> +      */
> +     DRM_RAS_CMD_LIST_NODES = 1,
> +
> +     /**
> +      * @DRM_RAS_CMD_GET_ERROR_COUNTERS: Retrieve the full list of error 
> counters for a given
> +      * node. The response includes id, name, and current value of each 
> counter.
> +      */
> +     DRM_RAS_CMD_GET_ERROR_COUNTERS,
> +
> +     /**
> +      * @DRM_RAS_CMD_QUERY_ERROR_COUNTER: Query the information of a 
> specific error counter
> +      * for a given node. Response contains id, name, and current value of 
> the counter.
> +      */
> +     DRM_RAS_CMD_QUERY_ERROR_COUNTER,
> +
> +     __DRM_RAS_CMD_MAX,
> +     DRM_RAS_CMD_MAX = (__DRM_RAS_CMD_MAX - 1)
> +};
> +
> +#endif /* _LINUX_DRM_RAS_H */
> diff --git a/meson.build b/meson.build
> index db6e09a94..f7807660e 100644
> --- a/meson.build
> +++ b/meson.build
> @@ -165,10 +165,13 @@ cairo = dependency('cairo', version : '>1.12.0', 
> required : true)
>  libudev = dependency('libudev', required : true)
>  glib = dependency('glib-2.0', required : true)
>  
> +libnl = dependency('libnl-3.0', required: false)
> +libnl_genl = dependency('libnl-genl-3.0', required: false)
> +libnl_cli = dependency('libnl-cli-3.0', required:false)
> +
>  xmlrpc = dependency('xmlrpc', required : false)
>  xmlrpc_util = dependency('xmlrpc_util', required : false)
>  xmlrpc_client = dependency('xmlrpc_client', required : false)
> -
>  xmlrpc_cmd = find_program('xmlrpc-c-config', required : false)
>  if not xmlrpc.found() and xmlrpc_cmd.found()
>       libs_cmd = run_command(xmlrpc_cmd, 'client', '--libs', check: false)
> diff --git a/tools/drm_ras.c b/tools/drm_ras.c
> new file mode 100644
> index 000000000..9bb58bc5e
> --- /dev/null
> +++ b/tools/drm_ras.c
> @@ -0,0 +1,425 @@
> +// SPDX-License-Identifier: MIT
> +/*
> + * Copyright © 2025 Intel Corporation
> + */
> +
> +#include <stdio.h>
> +#include <stdlib.h>
> +#include <string.h>
> +#include <sys/types.h>
> +#include <unistd.h>
> +#include <ctype.h>
> +#include <getopt.h>
> +#include <linux/genetlink.h>
> +#include <netlink/netlink.h>
> +#include <netlink/cache.h>
> +#include <netlink/genl/genl.h>
> +#include <netlink/genl/ctrl.h>
> +#include <netlink/cli/utils.h>
> +#include <netlink/cli/link.h>
> +#include "../include/drm-uapi/drm_ras.h"
> +#include "igt_device_scan.h"
> +
> +#define ARRAY_SIZE(array) (sizeof(array) / sizeof((array)[0]))
> +
> +struct nl_sock *mcsock;
> +
> +enum opt_val {
> +     OPT_UNKNOWN = '?',
> +     OPT_END = -1,
> +     OPT_NODEID,
> +     OPT_ERRORID,
> +     OPT_HELP,
> +};
> +
> +enum cmd_ids {
> +     INVALID_CMD = -1,
> +     LIST_NODES = 0,
> +     GET_ERROR_COUNTERS,
> +     QUERY_ERROR_COUNTER,
> +
> +     __MAX_CMDS,
> +};
> +
> +static const char * const cmd_names[] = {
> +     "list_nodes",
> +     "get_error_counters",
> +     "query_error_counter",
> +};
> +
> +struct app_context {
> +     enum drm_genl_error_cmds command;
> +     struct nl_sock *sock;
> +     struct nl_cb *cb;
> +     uint32_t node_id;
> +     uint32_t error_id;
> +     int error_id_set;
> +     int node_id_set;
> +     int error;
> +     int family_id;
> +};
> +
> +static void help(char **argv)
> +{
> +     int i;
> +
> +     printf("Usage: %s command [<command options>]\n", argv[0]);
> +     printf("commands:\n");
> +
> +     for (i = 0; i < __MAX_CMDS; i++) {
> +             switch (i) {
> +             case LIST_NODES:
> +                     printf("%s %s\n",
> +                            argv[0],
> +                            cmd_names[i]);
> +                     break;
> +             case GET_ERROR_COUNTERS:
> +                     printf("%s %s "
> +                             "--node-id=<node-id>\n",
> +                             argv[0],
> +                             cmd_names[i]);
> +                     break;
> +             case QUERY_ERROR_COUNTER:
> +                     printf("%s %s "
> +                             "--node-id=<node-id> "
> +                             "--error-id=<error-id>\n",
> +                             argv[0],
> +                             cmd_names[i]);
> +                     break;
> +             default:
> +                     printf("%s is Unknown Command\n",
> +                            (i < __MAX_CMDS && cmd_names[i]) ? cmd_names[i] 
> : "Unknown");
> +             }
> +     }
> +}
> +
> +static int list_nodes_handler(struct nl_msg *msg, void *arg)
> +{
> +     struct nlmsghdr *nlh = nlmsg_hdr(msg);
> +     struct nlattr *nla;
> +     int len, remain;
> +
> +     len = GENL_HDRLEN;
> +     nlmsg_for_each_attr(nla, nlh, len, remain) {
> +             if (nla_type(nla) > DRM_RAS_NODE_ATTR_MAX) {
> +                     printf("Unknown Node attribute type: %d\n", 
> nla_type(nla));
> +                     return NL_SKIP;
> +             }
> +
> +             switch (nla_type(nla)) {
> +             case DRM_RAS_NODE_ATTR_NODE_ID:
> +                     printf("%-18u\t", nla_get_u32(nla));
> +                     break;
> +             case DRM_RAS_NODE_ATTR_DEVICE_NAME:
> +                     printf("%-30s\t", nla_get_string(nla));
> +                     break;
> +             case DRM_RAS_NODE_ATTR_NODE_NAME:
> +                     printf("%-30s\t", nla_get_string(nla));
> +                     break;
> +             case DRM_RAS_NODE_ATTR_NODE_TYPE:
> +                     printf("%-18u\n", nla_get_u32(nla));
> +                     break;
> +             default:
> +                     printf("Unknown attribute type: %d\n", nla_type(nla));
> +                     break;
> +             }
> +     }
> +     return NL_OK;
> +}
> +
> +static int query_error_counter(struct nl_msg *msg, void *arg)
> +{
> +     struct nlmsghdr *nlh = nlmsg_hdr(msg);
> +     struct nlattr *attrs[256];
> +     int ret;
> +
> +     ret = genlmsg_parse(nlh, 0, attrs, 256, NULL);
> +     if (ret < 0) {
> +             fprintf(stderr, "Failed to parse attributes: %s\n", 
> nl_geterror(ret));
> +             return NL_SKIP;
> +     }
> +
> +     if (!attrs[DRM_RAS_ERROR_COUNTER_ATTR_ERROR_VALUE]) {
> +             nl_cli_fatal(NLE_FAILURE, 
> "DRM_RAS_ERROR_COUNTER_ATTR_ERROR_VALUE attribute is missing");
> +             return NL_SKIP;
> +     }
> +
> +     printf("counter value %u\n", 
> nla_get_u32(attrs[DRM_RAS_ERROR_COUNTER_ATTR_ERROR_VALUE]));
> +
> +     return NL_OK;
> +}
> +
> +static int get_error_counters(struct nl_msg *msg, void *arg)
> +{
> +     struct nlmsghdr *nlh = nlmsg_hdr(msg);
> +     struct nlattr *nla;
> +     int len, remain;
> +
> +     len = GENL_HDRLEN;
> +
> +     nlmsg_for_each_attr(nla, nlh, len, remain) {
> +             if (nla_type(nla) > DRM_RAS_ERROR_COUNTER_ATTR_MAX) {
> +                     printf("Unknown error counter attribute type: %d\n", 
> nla_type(nla));
> +                     return NL_SKIP;
> +             }
> +
> +             switch (nla_type(nla)) {
> +             case DRM_RAS_ERROR_COUNTER_ATTR_ERROR_ID:
> +                     printf("%-18u\t", nla_get_u32(nla));
> +                     break;
> +             case DRM_RAS_ERROR_COUNTER_ATTR_ERROR_NAME:
> +                     printf("%-30s\t", nla_get_string(nla));
> +                     break;
> +             case DRM_RAS_ERROR_COUNTER_ATTR_ERROR_VALUE:
> +                     printf("%-18u\n", nla_get_u32(nla));
> +                     break;
> +             default:
> +                     printf("Unknown attribute type: %d\n", nla_type(nla));
> +                     break;
> +             }
> +     }
> +     return NL_OK;
> +}
> +
> +static int drm_genl_handle_msg(struct nl_msg *msg, void *arg)
> +{
> +     struct app_context *ctx = (struct app_context *)arg;
> +     struct nlmsghdr *nlh = nlmsg_hdr(msg);
> +     struct genlmsghdr *gnlh = genlmsg_hdr(nlh);
> +
> +     if (gnlh->cmd != ctx->command) {
> +             fprintf(stderr,
> +                     "Unexpected command response: got %d, expected %d\n",
> +                     gnlh->cmd,
> +                     ctx->command);
> +             return NL_SKIP;
> +     }
> +
> +     switch (ctx->command) {
> +     case DRM_RAS_CMD_LIST_NODES:
> +             return list_nodes_handler(msg, arg);
> +     case DRM_RAS_CMD_GET_ERROR_COUNTERS:
> +             return get_error_counters(msg, arg);
> +     case DRM_RAS_CMD_QUERY_ERROR_COUNTER:
> +             return query_error_counter(msg, arg);
> +     default:
> +             fprintf(stderr, "Unknown command: %d\n", ctx->command);
> +             ctx->error = -EOPNOTSUPP;
> +             return NL_SKIP;
> +     }
> +}
> +
> +static void handle_err(struct nl_sock *sock, int ret, const char *err_msg)
> +{
> +     nl_close(sock);
> +     nl_socket_free(sock);
> +     nl_cli_fatal(ret, err_msg);
> +}
> +
> +static void send_cmd(int cmd, void *arg)
> +{
> +     struct app_context *ctx = (struct app_context *)arg;
> +     struct nl_msg *msg;
> +     void *msg_head;
> +     int ret;
> +
> +     msg = nlmsg_alloc();
> +     if (!msg)
> +             handle_err(ctx->sock, NLE_INVAL, "nlmsg_alloc failed\n");
> +
> +     switch (cmd) {
> +     case DRM_RAS_CMD_LIST_NODES:
> +             msg_head = genlmsg_put(msg, NL_AUTO_PORT, NL_AUTO_SEQ,
> +                                    ctx->family_id, 0,
> +                                    NLM_F_REQUEST | NLM_F_ACK | NLM_F_ROOT | 
> NLM_F_MATCH,
> +                                    cmd, 1);
> +             if (!msg_head)
> +                     nl_cli_fatal(ENOMEM, "genlmsg_put failed\n");
> +
> +             printf("%-18s\t%-30s\t%-30s\t%-18s\n",
> +                    "node-id", "device-name", "node-name", "node-type");
> +             break;
> +     case DRM_RAS_CMD_GET_ERROR_COUNTERS:
> +             if (ctx->node_id == -1) {
> +                     fprintf(stderr, "Error: --node-id is required for %s 
> command\n",
> +                             cmd_names[ctx->command - 1]);
> +                     exit(EXIT_FAILURE);
> +             }
> +             msg_head = genlmsg_put(msg, NL_AUTO_PORT, NL_AUTO_SEQ,
> +                                    ctx->family_id, 0,
> +                                    NLM_F_REQUEST | NLM_F_ACK | NLM_F_ROOT | 
> NLM_F_MATCH,
> +                                    cmd, 1);
> +
> +             if (!msg_head)
> +                     nl_cli_fatal(ENOMEM, "genlmsg_put failed\n");
> +
> +             nla_put_u32(msg, DRM_RAS_ERROR_COUNTER_ATTR_NODE_ID, 
> ctx->node_id);
> +             printf("%-18s\t%-30s\t%-18s\n",
> +                    "error-id", "error-name", "error-value");
> +             break;
> +     case DRM_RAS_CMD_QUERY_ERROR_COUNTER:
> +             if (ctx->node_id == -1 || ctx->error_id == -1) {
> +                     fprintf(stderr,
> +                             "Error: --node-id and --error-id are required "
> +                             "for %s command\n",
> +                             cmd_names[ctx->command - 1]);
> +                     exit(EXIT_FAILURE);
> +             }
> +             msg_head = genlmsg_put(msg, NL_AUTO_PORT, NL_AUTO_SEQ,
> +                                    ctx->family_id, 0,
> +                                    NLM_F_REQUEST | NLM_F_ACK,
> +                                    cmd, 1);
> +
> +             if (!msg_head)
> +                     nl_cli_fatal(ENOMEM, "genlmsg_put failed\n");
> +
> +             nla_put_u32(msg, DRM_RAS_ERROR_COUNTER_ATTR_NODE_ID, 
> ctx->node_id);
> +             nla_put_u32(msg, DRM_RAS_ERROR_COUNTER_ATTR_ERROR_ID, 
> ctx->error_id);
> +             break;
> +     default:
> +             break;
> +     }
> +
> +     ret = nl_send_auto(ctx->sock, msg);
> +     if (ret < 0)
> +             nl_cli_fatal(ret, "Unable to send message: %s", 
> nl_geterror(ret));
> +
> +     ret = nl_recvmsgs_default(ctx->sock);
> +     if (ret < 0)
> +             nl_cli_fatal(ret, "Unable to receive message: %s", 
> nl_geterror(ret));
> +
> +     nlmsg_free(msg);
> +}
> +
> +static int get_cmd(char *cmd_name)
> +{
> +     int i;
> +
> +     if (!cmd_name)
> +             return -1;
> +
> +     for (i = 0; i < __DRM_RAS_CMD_MAX; i++) {
> +             if (strcasecmp(cmd_name, cmd_names[i]) == 0)
> +                     return i + 1;
> +     }
> +     return -1;
> +}
> +
> +static int check_for_help(int argc, char **argv)
> +{
> +     for (int i = 1; i < argc; i++) {
> +             if (strcmp(argv[i], "--help") == 0 || strcmp(argv[i], "-h") == 
> 0)
> +                     return 1;
> +     }
> +     return 0;
> +}
> +
> +int main(int argc, char **argv)
> +{
> +     char *endptr;
> +     int ret, opt, option_index = 0;
> +     struct app_context ctx = {0};
> +
> +     ctx.error_id = -1;
> +     ctx.node_id = -1;
> +
> +     if (argc < 2) {
> +             fprintf(stderr, "\nNo Arguments were passed.\n\n"
> +                     "Use --help to see the correct usage.\n\n");
> +             exit(EXIT_FAILURE);
> +     }
> +     if (check_for_help(argc, argv)) {
> +             help(argv);
> +             exit(EXIT_SUCCESS);
> +     }
> +
> +     ctx.command = get_cmd(argv[1]);
> +     if (ctx.command < 0) {
> +             fprintf(stderr, "invalid command\n");
> +             help(argv);
> +             exit(EXIT_FAILURE);
> +     }
> +
> +     static struct option options[] = {
> +             {"error-id", optional_argument, NULL, OPT_ERRORID},
> +             {"node-id",  optional_argument, NULL, OPT_NODEID},
> +             {"help",     no_argument,       NULL, OPT_HELP},
> +             {0, 0, 0, 0}
> +     };
> +
> +     optind = 2;
> +     while ((opt = getopt_long(argc, argv, "h", options, &option_index)) != 
> -1) {
> +             switch (opt) {
> +             case OPT_ERRORID:
> +                     if (optarg) {
> +                             ctx.error_id = strtoul(optarg, &endptr, 10);
> +                             if (*endptr != '\0' || !ctx.error_id) {
> +                                     fprintf(stderr,
> +                                             "\ninvalid error-id %s\n\n"
> +                                             "Enter a valid error-id 
> received "
> +                                             "from get_error_counters 
> command\n\n",
> +                                             optarg);
> +                                     exit(EXIT_FAILURE);
> +                             }
> +                     } else {
> +                             printf("error-id not specified. check --help  
> for correct usage\n");
> +                             exit(EXIT_FAILURE);
> +                     }
> +                     break;
> +             case OPT_NODEID:
> +                     if (optarg) {
> +                             ctx.node_id = strtoul(optarg, &endptr, 10);
> +                             if (*endptr != '\0' || !ctx.node_id) {
> +                                     fprintf(stderr,
> +                                             "\ninvalid node id %s\n\n"
> +                                             "Enter a valid node-id received 
> "
> +                                             "from list_nodes command\n\n",
> +                                             optarg);
> +                                     exit(EXIT_FAILURE);
> +                             }
> +                     } else {
> +                             printf("node-id not specified. Check --help for 
> correct usage\n");
> +                             exit(EXIT_FAILURE);
> +                     }
> +                     break;
> +             case OPT_HELP:
> +             case 'h':
> +                     help(argv);
> +                     exit(EXIT_SUCCESS);
> +                     break;
> +             case '?':
> +                     fprintf(stderr,
> +                             "Unknown argument passed\n"
> +                             "Check --help for the correct usage\n\n");
> +                     exit(EXIT_FAILURE);
> +                     break;
> +             default:
> +                     fprintf(stderr, "Unexpected option: %c\n", opt);
> +                     exit(EXIT_FAILURE);
> +                     break;
> +             }
> +     }
> +
> +     ctx.sock = nl_cli_alloc_socket();
> +     if (!ctx.sock)
> +             nl_cli_fatal(NLE_NOMEM, "Cannot allocate nl_sock");
> +
> +     ret = nl_cli_connect(ctx.sock, NETLINK_GENERIC);
> +     if (ret < 0)
> +             handle_err(ctx.sock, ret, "Cannot connect handle\n");
> +
> +     ctx.family_id = genl_ctrl_resolve(ctx.sock, DRM_RAS_GENL_NAME);
> +     if (ctx.family_id < 0)
> +             handle_err(ctx.sock, NLE_INVAL, "Resolving of family name 
> failed\n");
> +
> +     ret = nl_socket_modify_cb(ctx.sock, NL_CB_VALID, NL_CB_CUSTOM, 
> drm_genl_handle_msg, &ctx);
> +     if (ret < 0)
> +             handle_err(ctx.sock, ret, "Unable to modify valid message 
> callback\n");
> +
> +     send_cmd(ctx.command, &ctx);
> +
> +     nl_close(ctx.sock);
> +     nl_socket_free(ctx.sock);
> +
> +     return 0;
> +}
> diff --git a/tools/meson.build b/tools/meson.build
> index 8185ba160..74ff97713 100644
> --- a/tools/meson.build
> +++ b/tools/meson.build
> @@ -70,6 +70,11 @@ if libudev.found()
>                  install : true)
>  endif
>  
> +executable('drm_ras', 'drm_ras.c',
> +                     dependencies : [tool_deps, libnl, libnl_cli, 
> libnl_genl],
> +                     install_rpath : bindir_rpathdir,
> +                     install : true)
> +
>  executable('gputop', 'gputop.c',
>             install : true,
>             install_rpath : bindir_rpathdir,
> -- 
> 2.34.1
> 

Reply via email to