XRT 提供了错误报告 API。错误报告 API 可分为两种类型:同步 API 和异步 API。同步错误是在 XRT 运行时函数调用期间检测到的错误,此类错误符合 POSIX 标准。例如,XRT 函数在成功时返回 0,失败时返回相应的错误代码。
异步错误可能与当前 XRT 函数调用或正在运行的应用无关。异步错误缓存在驱动程序子系统内,用户应用可通过异步错误报告 API 来访问。缓存的错误将一直保留,直至被显式清除为止。持久存在的错误并不一定反映当前系统状态,例如,开发板可能已复位且正常工作,而先前缓存的错误仍可查询。为避免与当前状态混淆,异步错误附有时间戳以指示错误发生时间。例如,此时间戳可与最近一次 xbutil 复位的时间戳进行比较。
驱动程序缓存的错误包含系统错误代码和 xrt_error_code.h 中定义的额外元数据,此元数据在用户空间与内核空间之间共享。
异步错误的错误代码格式如下所示:
/**
 * xrtErrorCode layout
 *
 * This layout is internal to XRT (akin to a POSIX error code).
 *
 * The error code is populated by driver and consumed by XRT
 * implementation where it is translated into an actual error / info /
 * warning that is propagated to the end user.
 *
 *  63 - 48  47 - 40   39 - 32   31 - 24   23 - 16    15 - 0
 * --------------------------------------------------------
 * |    |    |    |    |    |    |    |    |    |    |----| xrtErrorNum
 * |    |    |    |    |    |    |    |    |----|---------- xrtErrorDriver
 * |    |    |    |    |    |    |----|-------------------- xrtErrorSeverity
 * |    |    |    |    |----|------------------------------ xrtErrorModule
 * |    |    |----|---------------------------------------- xrtErrorClass
 * |----|-------------------------------------------------- reserved
 *
 * Note: the diagram gives the driver, severity, module and class fields
 * an 8-bit slot each, but the *_MASK values below keep only the low
 * 4 bits of each of those fields.
 */
typedef uint64_t xrtErrorCode;
typedef uint64_t xrtErrorTime;

/* Field masks (applied to the right-aligned field) and bit offsets. */
#define XRT_ERROR_NUM_MASK        0xFFFFUL
#define XRT_ERROR_NUM_SHIFT       0
#define XRT_ERROR_DRIVER_MASK     0xFUL
#define XRT_ERROR_DRIVER_SHIFT    16
#define XRT_ERROR_SEVERITY_MASK   0xFUL
#define XRT_ERROR_SEVERITY_SHIFT  24
#define XRT_ERROR_MODULE_MASK     0xFUL
#define XRT_ERROR_MODULE_SHIFT    32
#define XRT_ERROR_CLASS_MASK      0xFUL
#define XRT_ERROR_CLASS_SHIFT     40

/*
 * Pack the five fields into one xrtErrorCode.  Each argument is masked to
 * its field width, then moved to its bit offset.
 *
 * Every operand is widened to xrtErrorCode (64 bits) before shifting:
 * the masks are `unsigned long`, which is only 32 bits wide on some ABIs,
 * and shifting a 32-bit value by 32 or 40 is undefined behaviour.
 */
#define XRT_ERROR_CODE_BUILD(num, driver, severity, module, eclass) \
  ((((xrtErrorCode)(num) & XRT_ERROR_NUM_MASK) << XRT_ERROR_NUM_SHIFT) | \
   (((xrtErrorCode)(driver) & XRT_ERROR_DRIVER_MASK) << XRT_ERROR_DRIVER_SHIFT) | \
   (((xrtErrorCode)(severity) & XRT_ERROR_SEVERITY_MASK) << XRT_ERROR_SEVERITY_SHIFT) | \
   (((xrtErrorCode)(module) & XRT_ERROR_MODULE_MASK) << XRT_ERROR_MODULE_SHIFT) | \
   (((xrtErrorCode)(eclass) & XRT_ERROR_CLASS_MASK) << XRT_ERROR_CLASS_SHIFT))

/*
 * Field extractors: shift the field down to bit 0, then mask it.
 * The code is widened first so a narrower argument cannot be shifted
 * by more than its own width.
 */
#define XRT_ERROR_NUM(code)      (((xrtErrorCode)(code) >> XRT_ERROR_NUM_SHIFT) & XRT_ERROR_NUM_MASK)
#define XRT_ERROR_DRIVER(code)   (((xrtErrorCode)(code) >> XRT_ERROR_DRIVER_SHIFT) & XRT_ERROR_DRIVER_MASK)
#define XRT_ERROR_SEVERITY(code) (((xrtErrorCode)(code) >> XRT_ERROR_SEVERITY_SHIFT) & XRT_ERROR_SEVERITY_MASK)
#define XRT_ERROR_MODULE(code)   (((xrtErrorCode)(code) >> XRT_ERROR_MODULE_SHIFT) & XRT_ERROR_MODULE_MASK)
#define XRT_ERROR_CLASS(code)    (((xrtErrorCode)(code) >> XRT_ERROR_CLASS_SHIFT) & XRT_ERROR_CLASS_MASK)
/**
 * enum xrtErrorNum - XRT specific error numbers
 *
 * Numbering starts at 1 and is contiguous; the values are written out
 * explicitly so they can be matched against a raw error code.
 *
 * NOTE(review): "FIRWWALL" looks like a misspelling of "FIREWALL"; the
 * identifier is kept unchanged because callers reference it by name.
 */
enum xrtErrorNum {
  XRT_ERROR_NUM_FIRWWALL_TRIP   = 1,
  XRT_ERROR_NUM_TEMP_HIGH       = 2,
  XRT_ERROR_NUM_AIE_SATURATION  = 3,
  XRT_ERROR_NUM_AIE_FP          = 4,
  XRT_ERROR_NUM_AIE_STREAM      = 5,
  XRT_ERROR_NUM_AIE_ACCESS      = 6,
  XRT_ERROR_NUM_AIE_BUS         = 7,
  XRT_ERROR_NUM_AIE_INSTRUCTION = 8,
  XRT_ERROR_NUM_AIE_ECC         = 9,
  XRT_ERROR_NUM_AIE_LOCK        = 10,
  XRT_ERROR_NUM_AIE_DMA         = 11,
  XRT_ERROR_NUM_AIE_MEM_PARITY  = 12,
  XRT_ERROR_NUM_UNKNOWN         = 13
};
/**
 * enum xrtErrorDriver - identifiers for the xrtErrorDriver field of an
 * xrtErrorCode.  Values are contiguous starting at 0.
 */
enum xrtErrorDriver {
  XRT_ERROR_DRIVER_XOCL    = 0,
  XRT_ERROR_DRIVER_XCLMGMT = 1,
  XRT_ERROR_DRIVER_ZOCL    = 2,
  XRT_ERROR_DRIVER_AIE     = 3,
  XRT_ERROR_DRIVER_UNKNOWN = 4
};
/**
 * enum xrtErrorSeverity - values for the xrtErrorSeverity field of an
 * xrtErrorCode.  Values are contiguous starting at 0.
 *
 * NOTE(review): the first eight levels appear to follow the syslog
 * severity ordering (0 = emergency ... 7 = debug) - confirm.
 */
enum xrtErrorSeverity {
  XRT_ERROR_SEVERITY_EMERGENCY = 0,
  XRT_ERROR_SEVERITY_ALERT     = 1,
  XRT_ERROR_SEVERITY_CRITICAL  = 2,
  XRT_ERROR_SEVERITY_ERROR     = 3,
  XRT_ERROR_SEVERITY_WARNING   = 4,
  XRT_ERROR_SEVERITY_NOTICE    = 5,
  XRT_ERROR_SEVERITY_INFO      = 6,
  XRT_ERROR_SEVERITY_DEBUG     = 7,
  XRT_ERROR_SEVERITY_UNKNOWN   = 8
};
/**
 * enum xrtErrorModule - values for the xrtErrorModule field of an
 * xrtErrorCode.  Values are contiguous starting at 0.
 */
enum xrtErrorModule {
  XRT_ERROR_MODULE_FIREWALL    = 0,
  XRT_ERROR_MODULE_CMC         = 1,
  XRT_ERROR_MODULE_AIE_CORE    = 2,
  XRT_ERROR_MODULE_AIE_MEMORY  = 3,
  XRT_ERROR_MODULE_AIE_SHIM    = 4,
  XRT_ERROR_MODULE_AIE_NOC     = 5,
  XRT_ERROR_MODULE_AIE_PL      = 6,
  XRT_ERROR_MODULE_AIE_UNKNOWN = 7
};
/**
 * enum xrtErrorClass - values for the xrtErrorClass field of an
 * xrtErrorCode.
 *
 * Real classes are numbered from 1.  FIRST_ENTRY and LAST_ENTRY are
 * aliases for the lowest and highest class value.
 */
enum xrtErrorClass {
  XRT_ERROR_CLASS_SYSTEM      = 1,
  XRT_ERROR_CLASS_AIE         = 2,
  XRT_ERROR_CLASS_HARDWARE    = 3,
  XRT_ERROR_CLASS_UNKNOWN     = 4,
  /* Range markers; they alias existing values rather than add new ones. */
  XRT_ERROR_CLASS_FIRST_ENTRY = XRT_ERROR_CLASS_SYSTEM,
  XRT_ERROR_CLASS_LAST_ENTRY  = XRT_ERROR_CLASS_UNKNOWN
};
API 头文件 experimental/xrt_error.h 定义了用于访问当前缓存错误的 API。它提供 xrtErrorGetLast() 和 xrtErrorGetString() 这两个 API,用于检索系统级异步错误。
/**
* xrtErrorGetLast - Get the last error code and its timestamp of a given error class.
*
* @handle: Device handle.
* @class: Error Class for the last error to get.
* @error: Returned XRT error code.
* @timestamp: The timestamp when the error generated
*
* Return: 0 on success or appropriate XRT error code.
*/
int
xrtErrorGetLast(xrtDeviceHandle handle, xrtErrorClass ecl, xrtErrorCode* error, uint64_t* timestamp);
/**
* xrtErrorGetString - Get the description string of a given error code.
*
* @handle: Device handle.
* @error: XRT error code.
* @out: Preallocated output buffer for the error string.
* @len: Length of output buffer.
* @out_len: Output of length of message, ignored if null.
*
* Return: 0 on success or appropriate XRT error code.
*
* Specifying out_len while passing nullptr for output buffer will
* return the message length, which can then be used to allocate the
* output buffer itself.
*/
int
xrtErrorGetString(xrtDeviceHandle, xrtErrorCode error, char* out, size_t len, size_t* out_len);
应用可针对给定的错误类调用 xrtErrorGetLast() 以获取该类的最新错误代码,也可针对给定的错误代码调用 xrtErrorGetString() 以获取与此错误代码对应的错误字符串。XRT 会为每个错误类维护最新的错误代码及其关联的时间戳(指示错误生成时间)。
xbutil 可用于报告错误。错误报告会汇总来自各个错误类的所有错误,并按时间戳对其进行排序。此报告还会查询驱动程序,以了解上次请求复位的时间,并根据时间戳将此复位合并到报告列表中。
$ xbutil examine -r error -d 0000:00:00.0
Asynchronous Errors
Time Class Module Driver Severity Error Code
2020-Oct-08 16:40:02 CLASS_SYSTEM MODULE_FIREWALL DRIVER_XOCL SEVERITY_EMERGENCY FIREWALL_TRIP
$ xbutil2 examine -r error -f JSON-2020.2 -o <OUTPUT_FILE> -d 0000:00:00.0
{
"schema_version": {
"schema": "JSON",
"creation_date": "Fri Oct 9 11:04:24 2020 GMT"
},
"devices": [
{
"asynchronous_errors": [
{
"timestamp": "1602175202572070700",
"class": "CLASS_SYSTEM",
"module": "MODULE_FIREWALL",
"severity": "SEVERITY_EMERGENCY",
"driver": "DRIVER_XOCL",
"error_code": {
"error_id": "1",
"error_msg": "FIREWALL_TRIP"
}
}
]
}
]
}
xbutil 还可用于报告 AI 引擎运行状态和读取寄存器以便调试。例如,以下命令会在执行 graph 后读取内核状态。
$ xbutil examine -r aie -d 0000:00:00.0
--------------------------
1/1 [0000:00:00.0] : edge
--------------------------
Aie
Aie_Metadata
GRAPH[ 0] Name : gr
Status : running
SNo. Core [C:R] Iteration_Memory [C:R] Iteration_Memory_Addresses
[ 0] 23:1 23:1 16388
[ 1] 23:2 23:0 6980
[ 2] 23:3 23:1 4
[ 3] 24:1 24:0 4
[ 4] 24:2 24:2 4
[ 5] 24:3 24:1 4
[ 6] 25:1 25:1 4
Core [ 0]
Column : 23
Row : 1
Core:
Status : core_done
Program Counter : 0x00000308
Link Register : 0x00000290
Stack Pointer : 0x000340a0
DMA:
MM2S:
Channel:
Id : 0
Channel Status : idle
Queue Size : 0
Queue Status : okay
Current BD : 0
Id : 1
Channel Status : idle
Queue Size : 0
Queue Status : okay
Current BD : 0
S2MM:
Channel:
Id : 0
Channel Status : idle
Queue Size : 0
Queue Status : okay
Current BD : 0
Id : 1
Channel Status : idle
Queue Size : 0
Queue Status : okay
Current BD : 0
Locks:
0 : released_for_write
1 : released_for_write
2 : released_for_write
3 : released_for_write
4 : released_for_write
5 : released_for_write
6 : released_for_write
7 : released_for_write
8 : released_for_write
9 : released_for_write
10 : released_for_write
11 : released_for_write
12 : released_for_write
13 : released_for_write
14 : released_for_write
15 : released_for_write
Events:
core : 1, 2, 5, 22, 23, 24, 28, 29, 31, 32, 35, 36, 38, 39, 40, 44, 45, 47, 68
memory : 1, 43, 44, 45, 106, 113
......
Core [ 6]
Column : 25
Row : 1
Core:
Status : enabled, east_lock_stall
Program Counter : 0x000001e6
Link Register : 0x000000b0
Stack Pointer : 0x00030020
DMA:
MM2S:
Channel:
Id : 0
Channel Status : stalled_on_requesting_lock
Queue Size : 0
Queue Status : okay
Current BD : 2
Id : 1
Channel Status : idle
Queue Size : 0
Queue Status : okay
Current BD : 0
S2MM:
Channel:
Id : 0
Channel Status : running
Queue Size : 0
Queue Status : okay
Current BD : 0
Id : 1
Channel Status : idle
Queue Size : 0
Queue Status : okay
Current BD : 0
Locks:
0 : acquired_for_write
1 : released_for_write
2 : released_for_write
3 : released_for_write
4 : released_for_write
5 : released_for_write
6 : released_for_write
7 : released_for_write
8 : released_for_write
9 : released_for_write
10 : released_for_write
11 : released_for_write
12 : released_for_write
13 : released_for_write
14 : released_for_write
15 : released_for_write
Events:
core : 1, 2, 5, 22, 26, 28, 29, 31, 32, 35, 38, 39, 44
memory : 1, 20, 21, 23, 35, 43, 44, 106, 113
以下命令可用于读取特定寄存器以便调试。
$ xbutil advanced --read-aie-reg -d 0000:00:00.0 0 25 Core_Status
Register Core_Status Value of Row:0 Column:25 is 0x00000201
如需了解 AI 引擎寄存器定义,请参阅《Versal ACAP AI 引擎寄存器参考资料》(AM015)。如需了解有关 xbutil 命令用法的详细信息,请参阅 Xilinx Runtime (XRT) 架构文档。