通过 XRT API 报告错误 - 2022.1 简体中文

Versal ACAP AI 引擎编程环境 用户指南 (UG1076)

Document ID
UG1076
Release Date
2022-05-25
Version
2022.1 简体中文

XRT 提供了错误报告 API。错误报告 API 可分为两类:同步 API 和异步 API。同步错误是指在 XRT 运行时函数调用期间检测到的错误,此类错误符合 POSIX 标准。例如:



异步错误可能与当前 XRT 函数调用或正在运行的应用无关。异步错误缓存在驱动程序子系统内,可供用户应用通过异步错误报告 API 来访问。缓存的错误将长久保存直至被显式清除为止。持久存在的错误并不一定表示当前系统状态,例如,开发板可能已复位且正常工作,而先前缓存的错误仍可用。为避免混淆当前状态,异步错误附有时间戳以指示错误发生时间。例如,此时间戳可与最近的 xbutil 复位时间戳进行比较。

驱动程序缓存的错误包含系统错误代码和 xrt_error_code.h 中定义的额外元数据,此元数据在用户空间与内核空间之间共享。

异步错误的错误代码格式如下所示:

/**
 * xrtErrorCode layout
 *
 * This layout is internal to XRT (akin to a POSIX error code).
 *
 * The error code is populated by the driver and consumed by the XRT
 * implementation, where it is translated into an actual error / info /
 * warning that is propagated to the end user.
 *
 * 63 - 48  47 - 40   39 - 32   31 - 24   23 - 16    15 - 0
 * --------------------------------------------------------
 * |    |    |    |    |    |    |    |    |    |    |----| xrtErrorNum
 * |    |    |    |    |    |    |    |    |----|---------- xrtErrorDriver
 * |    |    |    |    |    |    |----|-------------------- xrtErrorSeverity
 * |    |    |    |    |----|------------------------------ xrtErrorModule
 * |    |    |----|---------------------------------------- xrtErrorClass
 * |----|-------------------------------------------------- reserved
 *
 * NOTE(review): the diagram gives driver, severity, module and class an
 * 8-bit slot each, but the matching XRT_ERROR_*_MASK values are 0xF, so
 * the build/extract macros only use the low 4 bits of each slot.
 * Confirm which width is intended.
 */
typedef uint64_t xrtErrorCode;	/* packed 64-bit error code, laid out as shown above */
typedef uint64_t xrtErrorTime;	/* NOTE(review): presumably the error timestamp type; units are not shown here */

/*
 * Field masks and bit offsets of the fields packed into an xrtErrorCode.
 * A mask is applied to the field value *before* it is shifted into place
 * (build) or *after* it has been shifted down (extract).
 */
#define XRT_ERROR_NUM_MASK		0xFFFFUL
#define XRT_ERROR_NUM_SHIFT		0
#define XRT_ERROR_DRIVER_MASK		0xFUL
#define XRT_ERROR_DRIVER_SHIFT		16
#define XRT_ERROR_SEVERITY_MASK		0xFUL
#define XRT_ERROR_SEVERITY_SHIFT	24
#define XRT_ERROR_MODULE_MASK		0xFUL
#define XRT_ERROR_MODULE_SHIFT		32
#define XRT_ERROR_CLASS_MASK		0xFUL
#define XRT_ERROR_CLASS_SHIFT		40

/**
 * XRT_ERROR_CODE_BUILD - Pack the individual fields into one 64-bit code.
 *
 * @num:       Error number (masked with XRT_ERROR_NUM_MASK).
 * @driver:    Driver id (masked with XRT_ERROR_DRIVER_MASK).
 * @severity:  Severity (masked with XRT_ERROR_SEVERITY_MASK).
 * @module:    Module id (masked with XRT_ERROR_MODULE_MASK).
 * @eclass:    Error class (masked with XRT_ERROR_CLASS_MASK).
 *
 * Every operand is widened to uint64_t before it is shifted. The masks
 * are unsigned long constants, which are only 32 bits wide on LLP64 and
 * ILP32 targets; shifting such a value by 32 or 40 is undefined
 * behaviour, so the shift must be carried out in a 64-bit type.
 */
#define	XRT_ERROR_CODE_BUILD(num, driver, severity, module, eclass) \
	((((uint64_t)(num) & XRT_ERROR_NUM_MASK) << XRT_ERROR_NUM_SHIFT) | \
	(((uint64_t)(driver) & XRT_ERROR_DRIVER_MASK) << XRT_ERROR_DRIVER_SHIFT) | \
	(((uint64_t)(severity) & XRT_ERROR_SEVERITY_MASK) << XRT_ERROR_SEVERITY_SHIFT) | \
	(((uint64_t)(module) & XRT_ERROR_MODULE_MASK) << XRT_ERROR_MODULE_SHIFT) | \
	(((uint64_t)(eclass) & XRT_ERROR_CLASS_MASK) << XRT_ERROR_CLASS_SHIFT))

/*
 * Field extractors: shift the requested field down to bit 0 and mask it.
 * `code` is expected to be an xrtErrorCode (64-bit) value.
 */
#define XRT_ERROR_NUM(code) (((code) >> XRT_ERROR_NUM_SHIFT) & XRT_ERROR_NUM_MASK)
#define XRT_ERROR_DRIVER(code) (((code) >> XRT_ERROR_DRIVER_SHIFT) & XRT_ERROR_DRIVER_MASK)
#define XRT_ERROR_SEVERITY(code) (((code) >> XRT_ERROR_SEVERITY_SHIFT) & XRT_ERROR_SEVERITY_MASK)
#define XRT_ERROR_MODULE(code) (((code) >> XRT_ERROR_MODULE_SHIFT) & XRT_ERROR_MODULE_MASK)
#define XRT_ERROR_CLASS(code) (((code) >> XRT_ERROR_CLASS_SHIFT) & XRT_ERROR_CLASS_MASK)

/**
 * enum xrtErrorNum - XRT specific error numbers
 *
 * Values occupy the xrtErrorNum field of an xrtErrorCode. Numbering
 * starts at 1 and is consecutive.
 */

enum xrtErrorNum {
  XRT_ERROR_NUM_FIRWWALL_TRIP = 1,
  /*
   * Correctly spelled alias for the enumerator above. The misspelled
   * name is kept so that code already using it continues to compile.
   */
  XRT_ERROR_NUM_FIREWALL_TRIP = XRT_ERROR_NUM_FIRWWALL_TRIP,
  XRT_ERROR_NUM_TEMP_HIGH,
  XRT_ERROR_NUM_AIE_SATURATION,
  XRT_ERROR_NUM_AIE_FP,
  XRT_ERROR_NUM_AIE_STREAM,
  XRT_ERROR_NUM_AIE_ACCESS,
  XRT_ERROR_NUM_AIE_BUS,
  XRT_ERROR_NUM_AIE_INSTRUCTION,
  XRT_ERROR_NUM_AIE_ECC,
  XRT_ERROR_NUM_AIE_LOCK,
  XRT_ERROR_NUM_AIE_DMA,
  XRT_ERROR_NUM_AIE_MEM_PARITY,
  XRT_ERROR_NUM_UNKNOWN
};

/*
 * enum xrtErrorDriver - identifiers carried in the xrtErrorDriver field
 * of an xrtErrorCode. Numbering is consecutive, starting at 0.
 */
enum xrtErrorDriver {
  XRT_ERROR_DRIVER_XOCL    = 0,
  XRT_ERROR_DRIVER_XCLMGMT = 1,
  XRT_ERROR_DRIVER_ZOCL    = 2,
  XRT_ERROR_DRIVER_AIE     = 3,
  XRT_ERROR_DRIVER_UNKNOWN = 4
};

/*
 * enum xrtErrorSeverity - severity levels carried in the
 * xrtErrorSeverity field of an xrtErrorCode. Numbering is consecutive,
 * starting at 0 for EMERGENCY.
 */
enum xrtErrorSeverity {
  XRT_ERROR_SEVERITY_EMERGENCY = 0,
  XRT_ERROR_SEVERITY_ALERT     = 1,
  XRT_ERROR_SEVERITY_CRITICAL  = 2,
  XRT_ERROR_SEVERITY_ERROR     = 3,
  XRT_ERROR_SEVERITY_WARNING   = 4,
  XRT_ERROR_SEVERITY_NOTICE    = 5,
  XRT_ERROR_SEVERITY_INFO      = 6,
  XRT_ERROR_SEVERITY_DEBUG     = 7,
  XRT_ERROR_SEVERITY_UNKNOWN   = 8
};

/*
 * enum xrtErrorModule - module identifiers carried in the
 * xrtErrorModule field of an xrtErrorCode. Numbering is consecutive,
 * starting at 0.
 */
enum xrtErrorModule {
  XRT_ERROR_MODULE_FIREWALL    = 0,
  XRT_ERROR_MODULE_CMC         = 1,
  XRT_ERROR_MODULE_AIE_CORE    = 2,
  XRT_ERROR_MODULE_AIE_MEMORY  = 3,
  XRT_ERROR_MODULE_AIE_SHIM    = 4,
  XRT_ERROR_MODULE_AIE_NOC     = 5,
  XRT_ERROR_MODULE_AIE_PL      = 6,
  XRT_ERROR_MODULE_AIE_UNKNOWN = 7
};

/*
 * enum xrtErrorClass - error classes carried in the xrtErrorClass field
 * of an xrtErrorCode. Real classes are numbered consecutively from 1;
 * FIRST_ENTRY / LAST_ENTRY alias the lowest and highest class value.
 */
enum xrtErrorClass {
  XRT_ERROR_CLASS_SYSTEM      = 1,
  XRT_ERROR_CLASS_AIE         = 2,
  XRT_ERROR_CLASS_HARDWARE    = 3,
  XRT_ERROR_CLASS_UNKNOWN     = 4,
  XRT_ERROR_CLASS_FIRST_ENTRY = XRT_ERROR_CLASS_SYSTEM,
  XRT_ERROR_CLASS_LAST_ENTRY  = XRT_ERROR_CLASS_UNKNOWN
};

API 头文件 experimental/xrt_error.h 定义了用于访问当前缓存错误的 API。它提供了 xrtErrorGetLast() 和 xrtErrorGetString() 这两个 API,用于检索系统级异步错误。

/**
 * xrtErrorGetLast - Get the last error code and its timestamp of a given error class.
 *
 * @handle:       Device handle.
 * @class:        Error Class for the last error to get.
 * @error:        Returned XRT error code.
 * @timestamp:    The timestamp when the error generated
 *
 * Return:        0 on success or appropriate XRT error code.
 */
int
xrtErrorGetLast(xrtDeviceHandle handle, xrtErrorClass ecl, xrtErrorCode* error, uint64_t* timestamp);

/**
 * xrtErrorGetString - Get the description string of a given error code.
 *
 * @handle:       Device handle.
 * @error:        XRT error code.
 * @out:          Preallocated output buffer for the error string.
 * @len:          Length of output buffer.
 * @out_len:      Output of length of message, ignored if null.
 *
 * Return:        0 on success or appropriate XRT error code.
 *
 * Specifying out_len while passing nullptr for output buffer will
 * return the message length, which can then be used to allocate the
 * output buffer itself.
 */
int
xrtErrorGetString(xrtDeviceHandle, xrtErrorCode error, char* out, size_t len, size_t* out_len);

应用可按给定错误类来调用 xrtErrorGetLast() 以获取最新错误代码。应用可按给定错误代码调用 xrtErrorGetString() 以获取对应于此错误代码的错误字符串。XRT 会维护每个类的最新代码和关联的时间戳(指示错误生成时间)。

xbutil 可用于报告错误。错误报告会汇总各个错误类中的所有错误,并按时间戳对其进行排序。此报告还会向驱动程序查询上次请求复位的时间,并依据时间戳将该复位事件合并到报告列表中。

$ xbutil examine -r error -d 0000:00:00.0               
Asynchronous Errors
  Time                               Class               Module              Driver              Severity            Error Code          
  2020-Oct-08 16:40:02               CLASS_SYSTEM        MODULE_FIREWALL     DRIVER_XOCL         SEVERITY_EMERGENCY  FIREWALL_TRIP


$ xbutil2 examine -r error -f JSON-2020.2 -o <OUTPUT_FILE> -d 0000:00:00.0
{
    "schema_version": {
        "schema": "JSON",
        "creation_date": "Fri Oct  9 11:04:24 2020 GMT"
    },
    "devices": [
        {
            "asynchronous_errors": [
                {
                    "timestamp": "1602175202572070700",
                    "class": "CLASS_SYSTEM",
                    "module": "MODULE_FIREWALL",
                    "severity": "SEVERITY_EMERGENCY",
                    "driver": "DRIVER_XOCL",
                    "error_code": {
                        "error_id": "1",
                        "error_msg": "FIREWALL_TRIP"
                    }
                }
            ]
        }
    ]
}

xbutil 还可用于报告 AI 引擎运行状态和读取寄存器以便调试。例如,以下命令会在执行 graph 后读取内核状态。

$ xbutil examine -r aie -d 0000:00:00.0

--------------------------
1/1 [0000:00:00.0] : edge
--------------------------
Aie
  Aie_Metadata
  GRAPH[ 0] Name : gr
          Status : running
    SNo. Core [C:R] Iteration_Memory [C:R] Iteration_Memory_Addresses 
    [ 0] 23:1 23:1 16388 
    [ 1] 23:2 23:0 6980 
    [ 2] 23:3 23:1 4 
    [ 3] 24:1 24:0 4 
    [ 4] 24:2 24:2 4 
    [ 5] 24:3 24:1 4 
    [ 6] 25:1 25:1 4 


Core [ 0]
  Column : 23
  Row : 1
  Core:
    Status : core_done
    Program Counter : 0x00000308
    Link Register : 0x00000290
    Stack Pointer : 0x000340a0
  DMA:
    MM2S:
      Channel:
        Id : 0
        Channel Status : idle
        Queue Size : 0
        Queue Status : okay
        Current BD : 0

        Id : 1
        Channel Status : idle
        Queue Size : 0
        Queue Status : okay
        Current BD : 0

    S2MM:
      Channel:
        Id : 0
        Channel Status : idle
        Queue Size : 0
        Queue Status : okay
        Current BD : 0

        Id : 1
        Channel Status : idle
        Queue Size : 0
        Queue Status : okay
        Current BD : 0

  Locks:
    0 : released_for_write
    1 : released_for_write
    2 : released_for_write
    3 : released_for_write
    4 : released_for_write
    5 : released_for_write
    6 : released_for_write
    7 : released_for_write
    8 : released_for_write
    9 : released_for_write
    10 : released_for_write
    11 : released_for_write
    12 : released_for_write
    13 : released_for_write
    14 : released_for_write
    15 : released_for_write


  Events:
    core : 1, 2, 5, 22, 23, 24, 28, 29, 31, 32, 35, 36, 38, 39, 40, 44, 45, 47, 68
    memory : 1, 43, 44, 45, 106, 113

......


Core [ 6]
  Column : 25
  Row : 1
  Core:
    Status : enabled, east_lock_stall
    Program Counter : 0x000001e6
    Link Register : 0x000000b0
    Stack Pointer : 0x00030020
  DMA:
    MM2S:
      Channel:
        Id : 0
        Channel Status : stalled_on_requesting_lock
        Queue Size : 0
        Queue Status : okay
        Current BD : 2

        Id : 1
        Channel Status : idle
        Queue Size : 0
        Queue Status : okay
        Current BD : 0

    S2MM:
      Channel:
        Id : 0
        Channel Status : running
        Queue Size : 0
        Queue Status : okay
        Current BD : 0

        Id : 1
        Channel Status : idle
        Queue Size : 0
        Queue Status : okay
        Current BD : 0


  Locks:
    0 : acquired_for_write
    1 : released_for_write
    2 : released_for_write
    3 : released_for_write
    4 : released_for_write
    5 : released_for_write
    6 : released_for_write
    7 : released_for_write
    8 : released_for_write
    9 : released_for_write
    10 : released_for_write
    11 : released_for_write
    12 : released_for_write
    13 : released_for_write
    14 : released_for_write
    15 : released_for_write

  Events:
    core : 1, 2, 5, 22, 26, 28, 29, 31, 32, 35, 38, 39, 44
    memory : 1, 20, 21, 23, 35, 43, 44, 106, 113

以下命令可用于读取特定寄存器以便调试。

$ xbutil advanced --read-aie-reg -d 0000:00:00.0 0 25 Core_Status 
Register Core_Status Value of Row:0 Column:25 is 0x00000201

如需了解 AI 引擎寄存器定义,请参阅 Versal ACAP AI 引擎寄存器参考资料(AM015)。如需了解有关 xbutil 命令使用的详细信息,请参阅 Xilinx Runtime (XRT) 架构