Error Reporting Through the XRT API - 2021.2 English

Versal ACAP AI Engine Programming Environment User Guide (UG1076)

Document ID
UG1076
Release Date
2021-12-17
Version
2021.2 English

XRT provides error reporting APIs. The error reporting APIs can be categorized into two types: synchronous and asynchronous APIs. Synchronous errors are errors that can be detected during the XRT run-time function call. They are POSIX-compliant. For example:



An asynchronous error might not be related to the current XRT function call or the application that is running. Asynchronous errors are cached in driver subsystems and can be accessed by the user application through the asynchronous error reporting APIs. Cached errors persist until they are explicitly cleared. Persistent errors are not necessarily indicative of the current system state; for example, a board might have been reset and be functioning correctly while previously cached errors are still available. To avoid confusion about the current state, each asynchronous error has a timestamp attached indicating when the error occurred. This timestamp can be compared with, for example, the timestamp of the last xbutil reset.

The errors cached by the driver contain a system error code and additional metadata as defined in xrt_error_code.h, which is shared between the user space and the kernel space.

The error code format for asynchronous errors is as shown here:

/**
 * xrtErrorCode layout
 *
 * This layout is internal to XRT (akin to a POSIX error code).
 *
 * The error code is populated by the driver and consumed by the XRT
 * implementation, where it is translated into an actual error / info /
 * warning that is propagated to the end user.
 *
 * Bit ranges (most significant on the left) and the field stored in each:
 *
 * 63 - 48  47 - 40   39 - 32   31 - 24   23 - 16    15 - 0
 * --------------------------------------------------------
 * |    |    |    |    |    |    |    |    |    |    |----| xrtErrorNum
 * |    |    |    |    |    |    |    |    |----|---------- xrtErrorDriver
 * |    |    |    |    |    |    |----|-------------------- xrtErrorSeverity
 * |    |    |    |    |----|------------------------------ xrtErrorModule
 * |    |    |----|---------------------------------------- xrtErrorClass
 * |----|-------------------------------------------------- reserved
 *
 * NOTE(review): the diagram gives driver, severity, module and class an
 * 8-bit slot each, while the corresponding XRT_ERROR_*_MASK values are 0xF
 * (4 bits), so only the low nibble of each slot is ever populated or read.
 * Confirm which width is intended.
 */
/* 64-bit packed error code; fields are laid out as in the diagram above. */
typedef uint64_t xrtErrorCode;
/*
 * 64-bit value; presumably the timestamp attached to an error. It is not
 * referenced elsewhere in this excerpt -- TODO confirm against the API.
 */
typedef uint64_t xrtErrorTime;

/*
 * Field masks and shifts for a 64-bit XRT error code.
 *
 * Every mask is expressed relative to bit 0: it is applied to a field value
 * before that value is shifted into place (build) or after the code has been
 * shifted down (extract).
 */
#define XRT_ERROR_NUM_MASK		0xFFFFUL
#define XRT_ERROR_NUM_SHIFT		0
#define XRT_ERROR_DRIVER_MASK		0xFUL
#define XRT_ERROR_DRIVER_SHIFT		16
#define XRT_ERROR_SEVERITY_MASK		0xFUL
#define XRT_ERROR_SEVERITY_SHIFT	24
#define XRT_ERROR_MODULE_MASK		0xFUL
#define XRT_ERROR_MODULE_SHIFT		32
#define XRT_ERROR_CLASS_MASK		0xFUL
#define XRT_ERROR_CLASS_SHIFT		40

/**
 * XRT_ERROR_CODE_BUILD - Pack the individual fields into one 64-bit code.
 *
 * @num:       Error number, truncated to XRT_ERROR_NUM_MASK.
 * @driver:    Driver identifier, truncated to XRT_ERROR_DRIVER_MASK.
 * @severity:  Severity, truncated to XRT_ERROR_SEVERITY_MASK.
 * @module:    Module identifier, truncated to XRT_ERROR_MODULE_MASK.
 * @eclass:    Error class, truncated to XRT_ERROR_CLASS_MASK.
 *
 * Evaluates to a uint64_t with each masked field placed at its shift.
 *
 * Each argument is widened to uint64_t before it is masked and shifted.
 * The masks are only `unsigned long` wide, so without the cast the module
 * and class fields would be shifted by 32 and 40 bits in a type that is
 * 32 bits on ILP32/LLP64 targets, which is undefined behavior.
 */
#define	XRT_ERROR_CODE_BUILD(num, driver, severity, module, eclass) \
	((((uint64_t)(num) & XRT_ERROR_NUM_MASK) << XRT_ERROR_NUM_SHIFT) | \
	(((uint64_t)(driver) & XRT_ERROR_DRIVER_MASK) << XRT_ERROR_DRIVER_SHIFT) | \
	(((uint64_t)(severity) & XRT_ERROR_SEVERITY_MASK) << XRT_ERROR_SEVERITY_SHIFT) | \
	(((uint64_t)(module) & XRT_ERROR_MODULE_MASK) << XRT_ERROR_MODULE_SHIFT) | \
	(((uint64_t)(eclass) & XRT_ERROR_CLASS_MASK) << XRT_ERROR_CLASS_SHIFT))

/*
 * Field accessors: shift the code down to bit 0, then mask off the field.
 * The code is widened to uint64_t first so the shift amount is always
 * smaller than the width of the shifted operand.
 */
#define XRT_ERROR_NUM(code) (((uint64_t)(code) >> XRT_ERROR_NUM_SHIFT) & XRT_ERROR_NUM_MASK)
#define XRT_ERROR_DRIVER(code) (((uint64_t)(code) >> XRT_ERROR_DRIVER_SHIFT) & XRT_ERROR_DRIVER_MASK)
#define XRT_ERROR_SEVERITY(code) (((uint64_t)(code) >> XRT_ERROR_SEVERITY_SHIFT) & XRT_ERROR_SEVERITY_MASK)
#define XRT_ERROR_MODULE(code) (((uint64_t)(code) >> XRT_ERROR_MODULE_SHIFT) & XRT_ERROR_MODULE_MASK)
#define XRT_ERROR_CLASS(code) (((uint64_t)(code) >> XRT_ERROR_CLASS_SHIFT) & XRT_ERROR_CLASS_MASK)

/**
 * enum xrtErrorNum - XRT-specific error numbers.
 *
 * Values are contiguous, start at 1, and end with XRT_ERROR_NUM_UNKNOWN.
 *
 * NOTE(review): "FIRWWALL" in the first enumerator looks like a misspelling
 * of "FIREWALL". The identifier is kept unchanged because renaming it would
 * alter a public name.
 */
enum xrtErrorNum {
  XRT_ERROR_NUM_FIRWWALL_TRIP   = 1,
  XRT_ERROR_NUM_TEMP_HIGH       = 2,
  XRT_ERROR_NUM_AIE_SATURATION  = 3,
  XRT_ERROR_NUM_AIE_FP          = 4,
  XRT_ERROR_NUM_AIE_STREAM      = 5,
  XRT_ERROR_NUM_AIE_ACCESS      = 6,
  XRT_ERROR_NUM_AIE_BUS         = 7,
  XRT_ERROR_NUM_AIE_INSTRUCTION = 8,
  XRT_ERROR_NUM_AIE_ECC         = 9,
  XRT_ERROR_NUM_AIE_LOCK        = 10,
  XRT_ERROR_NUM_AIE_DMA         = 11,
  XRT_ERROR_NUM_AIE_MEM_PARITY  = 12,
  XRT_ERROR_NUM_UNKNOWN         = 13
};

/**
 * enum xrtErrorDriver - Identifier of the driver that reported an error.
 *
 * Values are contiguous starting at 0; XRT_ERROR_DRIVER_UNKNOWN is last.
 */
enum xrtErrorDriver {
  XRT_ERROR_DRIVER_XOCL    = 0,
  XRT_ERROR_DRIVER_XCLMGMT = 1,
  XRT_ERROR_DRIVER_ZOCL    = 2,
  XRT_ERROR_DRIVER_AIE     = 3,
  XRT_ERROR_DRIVER_UNKNOWN = 4
};

/**
 * enum xrtErrorSeverity - Severity level carried in an error code.
 *
 * Values are contiguous starting at 0; XRT_ERROR_SEVERITY_UNKNOWN is last.
 * NOTE(review): the names and ordering of the first eight entries appear
 * to follow the syslog severity levels -- confirm before relying on it.
 */
enum xrtErrorSeverity {
  XRT_ERROR_SEVERITY_EMERGENCY = 0,
  XRT_ERROR_SEVERITY_ALERT     = 1,
  XRT_ERROR_SEVERITY_CRITICAL  = 2,
  XRT_ERROR_SEVERITY_ERROR     = 3,
  XRT_ERROR_SEVERITY_WARNING   = 4,
  XRT_ERROR_SEVERITY_NOTICE    = 5,
  XRT_ERROR_SEVERITY_INFO      = 6,
  XRT_ERROR_SEVERITY_DEBUG     = 7,
  XRT_ERROR_SEVERITY_UNKNOWN   = 8
};

/**
 * enum xrtErrorModule - Identifier of the module an error is attributed to.
 *
 * Values are contiguous starting at 0; XRT_ERROR_MODULE_AIE_UNKNOWN is last.
 */
enum xrtErrorModule {
  XRT_ERROR_MODULE_FIREWALL    = 0,
  XRT_ERROR_MODULE_CMC         = 1,
  XRT_ERROR_MODULE_AIE_CORE    = 2,
  XRT_ERROR_MODULE_AIE_MEMORY  = 3,
  XRT_ERROR_MODULE_AIE_SHIM    = 4,
  XRT_ERROR_MODULE_AIE_NOC     = 5,
  XRT_ERROR_MODULE_AIE_PL      = 6,
  XRT_ERROR_MODULE_AIE_UNKNOWN = 7
};

/**
 * enum xrtErrorClass - Class of an error.
 *
 * XRT_ERROR_CLASS_FIRST_ENTRY and XRT_ERROR_CLASS_LAST_ENTRY are aliases
 * for the lowest (SYSTEM) and highest (UNKNOWN) class values, bracketing
 * the valid range 1..4.
 */
enum xrtErrorClass {
  XRT_ERROR_CLASS_FIRST_ENTRY = 1,
  XRT_ERROR_CLASS_SYSTEM      = XRT_ERROR_CLASS_FIRST_ENTRY,
  XRT_ERROR_CLASS_AIE         = 2,
  XRT_ERROR_CLASS_HARDWARE    = 3,
  XRT_ERROR_CLASS_UNKNOWN     = 4,
  XRT_ERROR_CLASS_LAST_ENTRY  = XRT_ERROR_CLASS_UNKNOWN
};

The API header file experimental/xrt_error.h defines the APIs for accessing currently cached errors. It provides the xrtErrorGetLast() and xrtErrorGetString() APIs to retrieve the system-level asynchronous errors.

/**
 * xrtErrorGetLast - Get the last error code and its timestamp of a given error class.
 *
 * @handle:       Device handle.
 * @class:        Error Class for the last error to get.
 * @error:        Returned XRT error code.
 * @timestamp:    The timestamp when the error generated
 *
 * Return:        0 on success or appropriate XRT error code.
 */
int
xrtErrorGetLast(xrtDeviceHandle handle, xrtErrorClass ecl, xrtErrorCode* error, uint64_t* timestamp);

/**
 * xrtErrorGetString - Get the description string of a given error code.
 *
 * @handle:       Device handle.
 * @error:        XRT error code.
 * @out:          Preallocated output buffer for the error string.
 * @len:          Length of output buffer.
 * @out_len:      Output of length of message, ignored if null.
 *
 * Return:        0 on success or appropriate XRT error code.
 *
 * Specifying out_len while passing nullptr for output buffer will
 * return the message length, which can then be used to allocate the
 * output buffer itself.
 */
int
xrtErrorGetString(xrtDeviceHandle, xrtErrorCode error, char* out, size_t len, size_t* out_len);

The application can call xrtErrorGetLast() with a given error class to get the latest error code. The application can call xrtErrorGetString() with a given error code to get the error string corresponding to this error code. XRT maintains the latest error for each class and an associated timestamp for when the error was generated.

xbutil can be used to report errors. The error report accumulates all the errors from the various classes and sorts them by timestamp. The report queries the drivers as to when the last reset was requested. This reset will be merged (using the timestamp) into the report listing.

$ xbutil examine -r error -d 0000:00:00.0               
Asynchronous Errors
  Time                               Class               Module              Driver              Severity            Error Code          
  2020-Oct-08 16:40:02               CLASS_SYSTEM        MODULE_FIREWALL     DRIVER_XOCL         SEVERITY_EMERGENCY  FIREWALL_TRIP


$ xbutil2 examine -r error -f JSON-2020.2 -o <OUTPUT_FILE> -d 0000:00:00.0
{
    "schema_version": {
        "schema": "JSON",
        "creation_date": "Fri Oct  9 11:04:24 2020 GMT"
    },
    "devices": [
        {
            "asynchronous_errors": [
                {
                    "timestamp": "1602175202572070700",
                    "class": "CLASS_SYSTEM",
                    "module": "MODULE_FIREWALL",
                    "severity": "SEVERITY_EMERGENCY",
                    "driver": "DRIVER_XOCL",
                    "error_code": {
                        "error_id": "1",
                        "error_msg": "FIREWALL_TRIP"
                    }
                }
            ]
        }
    ]
}

xbutil can also be used to report AI Engine running status and read registers for debug purposes. For example, the following command reads the status of kernels after the graph has executed.

$ xbutil examine -r aie -d 0000:00:00.0

--------------------------
1/1 [0000:00:00.0] : edge
--------------------------
Aie
  Aie_Metadata
  GRAPH[ 0] Name : gr
          Status : running
    SNo. Core [C:R] Iteration_Memory [C:R] Iteration_Memory_Addresses 
    [ 0] 23:1 23:1 16388 
    [ 1] 23:2 23:0 6980 
    [ 2] 23:3 23:1 4 
    [ 3] 24:1 24:0 4 
    [ 4] 24:2 24:2 4 
    [ 5] 24:3 24:1 4 
    [ 6] 25:1 25:1 4 


Core [ 0]
  Column : 23
  Row : 1
  Core:
    Status : core_done
    Program Counter : 0x00000308
    Link Register : 0x00000290
    Stack Pointer : 0x000340a0
  DMA:
    MM2S:
      Channel:
        Id : 0
        Channel Status : idle
        Queue Size : 0
        Queue Status : okay
        Current BD : 0

        Id : 1
        Channel Status : idle
        Queue Size : 0
        Queue Status : okay
        Current BD : 0

    S2MM:
      Channel:
        Id : 0
        Channel Status : idle
        Queue Size : 0
        Queue Status : okay
        Current BD : 0

        Id : 1
        Channel Status : idle
        Queue Size : 0
        Queue Status : okay
        Current BD : 0

  Locks:
    0 : released_for_write
    1 : released_for_write
    2 : released_for_write
    3 : released_for_write
    4 : released_for_write
    5 : released_for_write
    6 : released_for_write
    7 : released_for_write
    8 : released_for_write
    9 : released_for_write
    10 : released_for_write
    11 : released_for_write
    12 : released_for_write
    13 : released_for_write
    14 : released_for_write
    15 : released_for_write


  Events:
    core : 1, 2, 5, 22, 23, 24, 28, 29, 31, 32, 35, 36, 38, 39, 40, 44, 45, 47, 68
    memory : 1, 43, 44, 45, 106, 113

......


Core [ 6]
  Column : 25
  Row : 1
  Core:
    Status : enabled, east_lock_stall
    Program Counter : 0x000001e6
    Link Register : 0x000000b0
    Stack Pointer : 0x00030020
  DMA:
    MM2S:
      Channel:
        Id : 0
        Channel Status : stalled_on_requesting_lock
        Queue Size : 0
        Queue Status : okay
        Current BD : 2

        Id : 1
        Channel Status : idle
        Queue Size : 0
        Queue Status : okay
        Current BD : 0

    S2MM:
      Channel:
        Id : 0
        Channel Status : running
        Queue Size : 0
        Queue Status : okay
        Current BD : 0

        Id : 1
        Channel Status : idle
        Queue Size : 0
        Queue Status : okay
        Current BD : 0


  Locks:
    0 : acquired_for_write
    1 : released_for_write
    2 : released_for_write
    3 : released_for_write
    4 : released_for_write
    5 : released_for_write
    6 : released_for_write
    7 : released_for_write
    8 : released_for_write
    9 : released_for_write
    10 : released_for_write
    11 : released_for_write
    12 : released_for_write
    13 : released_for_write
    14 : released_for_write
    15 : released_for_write

  Events:
    core : 1, 2, 5, 22, 26, 28, 29, 31, 32, 35, 38, 39, 44
    memory : 1, 20, 21, 23, 35, 43, 44, 106, 113

The following command can be used to read specific registers for debug purposes.

$ xbutil advanced --read-aie-reg -d 0000:00:00.0 0 25 Core_Status
Register Core_Status Value of Row:0 Column:25 is 0x00000201

For AI Engine register definitions, see the Versal ACAP AI Engine Register Reference (AM015). For details on xbutil command use, see Xilinx Runtime (XRT) Architecture.