Listening for SEU Detection - 2023.2 English

Standalone Library Documentation: BSP and Libraries Document Collection (UG643)

Document ID
UG643
Release Date
2023-12-13
Version
2023.2 English

The XilSEM library can maintain SEU mitigation operation without any need for the user design to listen for SEU detections. However, it may be desirable to maintain an event log, or to take design-level or system-level actions in response to an event.

Whenever an error is detected, be it correctable or uncorrectable, XilSEM has a mechanism to notify the errors over IPI. This equally applies for errors detected during NPI scanning and CRAM scanning. To receive notification of error detections in the configuration RAM or NPI registers, the following steps are required of an RPU/APU application:

  • GIC initialization
  • IPI initialization
    • IPI configuration and connection with GIC
    • IPI callback registration with IPI interrupt handler (CRAM and NPI)
  • Register error event notification (CRAM and NPI)
  • Check global variables that hold notified event information (CRAM and NPI)

For GIC initialization, use the following code snippet:


#define INTC_DEVICE_ID (XPAR_SCUGIC_SINGLE_DEVICE_ID)
/**
 * GicSetupInterruptSystem() - Initialize the GIC and hook it to the CPU
 * exception logic
 * @GicInst	Pointer to the GIC driver instance to initialize
 *
 * Looks up the GIC configuration for INTC_DEVICE_ID, initializes the driver
 * instance, registers XScuGic_InterruptHandler as the processor interrupt
 * exception handler (FIQ on __aarch64__ builds, IRQ on __arm__ builds) and
 * enables exceptions.
 *
 * Return: XST_SUCCESS on success, XST_FAILURE when the configuration lookup
 * fails, otherwise the status returned by XScuGic_CfgInitialize().
 */
s32 GicSetupInterruptSystem(XScuGic *GicInst)
{
	/* Start from failure so that every early exit returns a defined value */
	s32 Status = XST_FAILURE;

	XScuGic_Config *GicCfgPtr = XScuGic_LookupConfig(INTC_DEVICE_ID);
	if (NULL == GicCfgPtr) {
		xil_printf("XScuGic_LookupConfig() failed\r\n");
		goto END;
	}

	Status = XScuGic_CfgInitialize(GicInst, GicCfgPtr,
			GicCfgPtr->CpuBaseAddress);
	if (XST_SUCCESS != Status) {
		xil_printf("XScuGic_CfgInitialize() failed with error: %d\r\n",
				Status);
		goto END;
	}

	/*
	 * Connect the interrupt controller interrupt Handler to the
	 * hardware interrupt handling logic in the processor.
	 */
#if defined (__aarch64__)
	Xil_ExceptionRegisterHandler(XIL_EXCEPTION_ID_FIQ_INT,
#elif defined (__arm__)
	Xil_ExceptionRegisterHandler(XIL_EXCEPTION_ID_IRQ_INT,
#endif
		(Xil_ExceptionHandler)XScuGic_InterruptHandler, GicInst);
	Xil_ExceptionEnable();

END:
	return Status;
}

For IPI configuration and connection with GIC, use the following code snippet:

/* Device ID handed to XIpiPsu_LookupConfig() to select the IPI channel */
#define IPI_TEST_CHANNEL_ID	(XPAR_XIPIPSU_0_DEVICE_ID)
/* Interrupt ID used to connect and enable the IPI interrupt at the GIC */
#define IPI_INT_ID		(XPAR_XIPIPSU_0_INT_ID)

/* Allocate one callback pointer for each bit in the register */
static IpiCallback IpiCallbacks[11];

/**
 * ipimask2idx() - Convert an IPI source mask to an index into IpiCallbacks
 * @m	IPI source mask; the lowest set bit selects the slot
 *
 * Return: zero-based position of the lowest set bit of @m, or -1 when @m is
 * zero or the bit position has no slot in IpiCallbacks.
 */
static ssize_t ipimask2idx(u32 m)
{
	ssize_t idx;

	/* __builtin_ctz() is undefined for a zero argument */
	if (0U == m) {
		return -1;
	}

	idx = (ssize_t)__builtin_ctz(m);

	/* Callers index IpiCallbacks with the result, so reject bits beyond it */
	if (idx >= (ssize_t)(sizeof(IpiCallbacks) / sizeof(IpiCallbacks[0]))) {
		return -1;
	}

	return idx;
}

/**
 * IpiIrqHandler() - Dispatch pending IPIs to their registered callbacks
 * @InstancePtr	Pointer to the IPI data structure
 *
 * Reads the interrupt status once and walks it one set bit at a time, lowest
 * bit first. For each bit the registered callback (if any) is invoked and the
 * corresponding interrupt status is then cleared.
 */
static void IpiIrqHandler(XIpiPsu *InstancePtr)
{
	/* Status bits identify the sources that generated an IPI */
	u32 Pending = XIpiPsu_GetInterruptStatus(InstancePtr);

	/* "Pending & (Pending - 1)" drops the lowest set bit on each pass */
	for (; 0U != Pending; Pending &= (Pending - 1U)) {
		/* Isolate the lowest set bit: this is the source handled now */
		const u32 SrcBit = Pending & (~Pending + 1U);
		const ssize_t Slot = ipimask2idx(SrcBit);

		/* Run the callback for this source when one is registered */
		if (Slot >= 0 && IpiCallbacks[Slot]) {
			IpiCallbacks[Slot](InstancePtr);
		}

		/* Acknowledge this source */
		XIpiPsu_ClearInterruptStatus(InstancePtr, SrcBit);
	}
}
/**
 * IpiConfigure() - Initialize the IPI driver and connect its interrupt to
 * the GIC
 * @IpiInst	IPI driver instance to initialize
 * @GicInst	GIC driver instance that the IPI interrupt is connected to
 *
 * Looks up the configuration for IPI_TEST_CHANNEL_ID, initializes @IpiInst,
 * clears all IPI interrupt status bits, connects IpiIrqHandler to IPI_INT_ID
 * and enables that interrupt at the GIC.
 *
 * Return: XST_SUCCESS on success; XST_FAILURE when either pointer is NULL or
 * the configuration lookup fails; otherwise the status of the failing
 * XIpiPsu_CfgInitialize() or XScuGic_Connect() call.
 */
static XStatus IpiConfigure(XIpiPsu * IpiInst, XScuGic * GicInst)
{
	XIpiPsu_Config *Cfg;
	int Rc;

	/* Argument checks: a missing IPI instance fails silently */
	if (NULL == IpiInst) {
		return XST_FAILURE;
	}
	if (NULL == GicInst) {
		xil_printf("%s ERROR GIC Instance is NULL\n", __func__);
		return XST_FAILURE;
	}

	/* Fetch the configuration entry for the channel */
	Cfg = XIpiPsu_LookupConfig(IPI_TEST_CHANNEL_ID);
	if (NULL == Cfg) {
		xil_printf("%s ERROR in getting CfgPtr\n", __func__);
		return XST_FAILURE;
	}

	/* Bring up the driver instance from that configuration */
	Rc = XIpiPsu_CfgInitialize(IpiInst, Cfg, Cfg->BaseAddress);
	if (XST_SUCCESS != Rc) {
		xil_printf("%s ERROR #%d in configuring IPI\n", __func__,
				Rc);
		return Rc;
	}

	/* Drop anything that was pending before the handler is attached */
	XIpiPsu_ClearInterruptStatus(IpiInst, XIPIPSU_ALL_MASK);

	Rc = XScuGic_Connect(GicInst, IPI_INT_ID,
			(Xil_ExceptionHandler)IpiIrqHandler, IpiInst);
	if (XST_SUCCESS != Rc) {
		xil_printf("%s ERROR #%d in GIC connect\n", __func__, Rc);
		return Rc;
	}

	/* Enable IPI interrupt at GIC */
	XScuGic_Enable(GicInst, IPI_INT_ID);

	return Rc;
}

For IPI callback registration with IPI handler, see the following code snippet:

/**
 * IpiRegisterCallback() - Register a callback for an IPI source and enable
 * reception from that source
 * @IpiInst	IPI driver instance on which the source interrupt is enabled
 * @SrcMask	Mask of the IPI source; its lowest set bit selects the
 *		callback slot
 * @Callback	Function invoked by IpiIrqHandler when the source raises an IPI
 *
 * Return: XST_SUCCESS when the callback was stored and the interrupt enabled;
 * XST_INVALID_PARAM when @Callback is NULL, @SrcMask is zero, or the bit
 * position of @SrcMask has no slot in IpiCallbacks; XST_FAILURE when a
 * callback is already registered for that slot.
 */
XStatus IpiRegisterCallback(XIpiPsu *const IpiInst, const u32 SrcMask,
		IpiCallback Callback)
{
	ssize_t idx;

	if (!Callback) {
		return XST_INVALID_PARAM;
	}

	/* A zero mask names no source and must not reach __builtin_ctz() */
	if (0U == SrcMask) {
		return XST_INVALID_PARAM;
	}

	/* Get index into IpiCallbacks array and make sure it is inside it */
	idx = ipimask2idx(SrcMask);
	if ((idx < 0) ||
	    (idx >= (ssize_t)(sizeof(IpiCallbacks) / sizeof(IpiCallbacks[0])))) {
		return XST_INVALID_PARAM;
	}

	/* Check if callback is already registered, return failure if it is */
	if (IpiCallbacks[idx]) {
		return XST_FAILURE;
	}

	/* Entry is free, register callback */
	IpiCallbacks[idx] = Callback;

	/* Enable reception of IPI from the SrcMask/CPU */
	XIpiPsu_InterruptEnable(IpiInst, SrcMask);

	return XST_SUCCESS;
}

For IPI callback to receive event messages for CRAM, see the following code snippet:

/* IPI source mask used when registering the callback and reading messages */
#define SRC_IPI_MASK	(XPAR_XIPIPS_TARGET_PSV_PMC_0_CH0_MASK)

/*
 * Global variables to hold the event count when notified.
 * Each counter is incremented by the CRAM XSem_IpiCallback() when the
 * matching XSEM_EVENT_CRAM_* event is received, and is meant to be polled by
 * the application.
 * NOTE(review): if u8 is an 8-bit unsigned type, a counter returns to 0 after
 * 256 notifications, which a "> 0" poll would read as "no event" - confirm
 * whether a wider or saturating counter is needed.
 * NOTE(review): the counters are written from the IPI callback; confirm
 * whether they should be declared volatile for the polling code.
 */
u8 EventCnt_UnCorEcc = 0U;
u8 EventCnt_Crc = 0U;
u8 EventCnt_CorEcc = 0U;
u8 EventCnt_IntErr = 0U;

/**
 * XSem_IpiCallback() - Count CRAM error events notified over IPI
 * @InstancePtr	IPI instance passed in by the IPI interrupt handler; not
 *		referenced here, the message is read via XSem_IpiGetInst()
 *
 * Reads PAYLOAD_ARG_CNT message words sent by SRC_IPI_MASK. When word 0 is
 * XSEM_EVENT_ERROR and word 1 is XSEM_NOTIFY_CRAM, word 2 selects which of
 * the CRAM event counters is incremented. Every other message is only
 * printed.
 */
void XSem_IpiCallback(XIpiPsu *const InstancePtr)
{
	u32 Msg[PAYLOAD_ARG_CNT] = {0};
	int Rc;

	Rc = XIpiPsu_ReadMessage(XSem_IpiGetInst(), SRC_IPI_MASK, Msg,
			PAYLOAD_ARG_CNT, XIPIPSU_BUF_TYPE_MSG);
	if (XST_SUCCESS != Rc) {
		xil_printf("ERROR #%d while reading IPI buffer\n", Rc);
		return;
	}

	/* Anything that is not a CRAM error notification is just reported */
	if ((Msg[0] != XSEM_EVENT_ERROR) || (Msg[1] != XSEM_NOTIFY_CRAM)) {
		xil_printf("%s Some other callback received: %d\n",
				__func__, Msg[0]);
		return;
	}

	/* Word 2 carries the CRAM event; bump the matching counter */
	if (Msg[2] == XSEM_EVENT_CRAM_UNCOR_ECC_ERR) {
		EventCnt_UnCorEcc++;
	} else if (Msg[2] == XSEM_EVENT_CRAM_CRC_ERR) {
		EventCnt_Crc++;
	} else if (Msg[2] == XSEM_EVENT_CRAM_INT_ERR) {
		EventCnt_IntErr++;
	} else if (Msg[2] == XSEM_EVENT_CRAM_COR_ECC_ERR) {
		EventCnt_CorEcc++;
	} else {
		xil_printf("%s Some other callback received: %d:%d:%d\n",
				__func__, Msg[0], Msg[1], Msg[2]);
	}
}
Note: In the above code snippets, global counters are incremented when an event has been notified. It is up to you to define and implement any desired design and system response.

For IPI callback to receive event messages for NPI, see the following code snippet:

/*
 * Global variables to hold the event count when notified.
 * NPI_CRC_EventCnt and NPI_INT_EventCnt are incremented by the NPI
 * XSem_IpiCallback() for XSEM_EVENT_NPI_CRC_ERR and XSEM_EVENT_NPI_INT_ERR
 * respectively, and are meant to be polled by the application.
 * NOTE(review): if u8 is an 8-bit unsigned type, a counter returns to 0 after
 * 256 notifications - confirm whether that is acceptable for a "> 0" poll.
 */
u8 NPI_CRC_EventCnt = 0U;
u8 NPI_INT_EventCnt = 0U;

/**
 * XSem_IpiCallback() - Count NPI error events notified over IPI
 * @InstancePtr	IPI instance passed in by the IPI interrupt handler; not
 *		referenced here, the message is read through the global IpiInst
 *
 * Reads PAYLOAD_ARG_CNT message words sent by SRC_IPI_MASK. When word 0 is
 * XSEM_EVENT_ERROR and word 1 is XSEM_NOTIFY_NPI, word 2 selects which of the
 * NPI event counters is incremented. Every other message is only printed.
 */
void XSem_IpiCallback(XIpiPsu *const InstancePtr)
{
	u32 Msg[PAYLOAD_ARG_CNT] = {0};
	int Rc;

	Rc = XIpiPsu_ReadMessage(&IpiInst, SRC_IPI_MASK, Msg, PAYLOAD_ARG_CNT,
			XIPIPSU_BUF_TYPE_MSG);
	if (XST_SUCCESS != Rc) {
		xil_printf("ERROR #%d while reading IPI buffer\n", Rc);
		return;
	}

	/* Anything that is not an NPI error notification is just reported */
	if ((Msg[0] != XSEM_EVENT_ERROR) || (Msg[1] != XSEM_NOTIFY_NPI)) {
		xil_printf("%s Some other callback received: %d\n", __func__, Msg[0]);
		return;
	}

	/* Word 2 carries the NPI event; bump the matching counter */
	if (Msg[2] == XSEM_EVENT_NPI_CRC_ERR) {
		NPI_CRC_EventCnt++;
	} else if (Msg[2] == XSEM_EVENT_NPI_INT_ERR) {
		NPI_INT_EventCnt++;
	} else {
		xil_printf("%s Some other callback received: %d:%d:%d\n",
				__func__, Msg[0], Msg[1], Msg[2]);
	}
}

For IPI initialization (includes IPI configuration and connection with GIC, IPI callback registration with IPI handler), see the following code snippet:

/**
 * IpiInit() - Configure the IPI instance and register the XilSEM callback
 * @InstancePtr	IPI driver instance to configure
 * @GicInst	GIC driver instance the IPI interrupt is connected to
 *
 * Calls IpiConfigure() and, only when that succeeds, registers
 * XSem_IpiCallback for SRC_IPI_MASK.
 *
 * Return: XST_SUCCESS when both steps succeed, otherwise the status of the
 * step that failed.
 */
XStatus IpiInit(XIpiPsu * InstancePtr, XScuGic * GicInst)
{
	int Status;

	Status = IpiConfigure(InstancePtr, GicInst);
	if (XST_SUCCESS != Status) {
		xil_printf("IpiConfigure() failed with error: %d\r\n",
				Status);
		/*
		 * Stop here: registering the callback would overwrite Status
		 * and could report success for an unconfigured instance.
		 */
		goto END;
	}

	Status = IpiRegisterCallback(InstancePtr, SRC_IPI_MASK,
			XSem_IpiCallback);

END:
	return Status;
}
Note: CRAM and NPI should run one at a time.

For initializing GIC, XilSEM IPI instance and registering ISR handler to process XilSEM notifications from PLM, use the below code snippet:

XStatus XSem_IpiInitApi (void)
{
	XStatus Status = XST_FAILURE;

	/* GIC Initialize */
	Status = GicSetupInterruptSystem(&GicInst);
	if (Status != XST_SUCCESS) {
		xil_printf("GicSetupInterruptSystem failed with error: %d\r\n",\
				Status);
		goto END;
	}

	Status = IpiInit(&IpiInst, &GicInst);
	if (XST_SUCCESS != Status) {
		xil_printf("[%s] IPI Init Error: Status 0x%x\r\n", \
				__func__, Status);
		goto END;
	}

END:
	return Status;
}

To register for CRAM error event notification, see the following code snippet:

/*
 * Notifier describing what to subscribe to: the CRAM module and the
 * uncorrectable ECC, CRC, internal and correctable ECC error events.
 * NOTE(review): .Flag = 1U presumably enables the notification - confirm
 * against the XSem_Notifier definition.
 */
XSem_Notifier Notifier = {
        .Module = XSEM_NOTIFY_CRAM,
        .Event = XSEM_EVENT_CRAM_UNCOR_ECC_ERR | XSEM_EVENT_CRAM_CRC_ERR | \
		 XSEM_EVENT_CRAM_INT_ERR | XSEM_EVENT_CRAM_COR_ECC_ERR,
	.Flag = 1U,
};
int Status;
/* Register the notifier through the IPI instance */
Status = XSem_RegisterEvent(&IpiInst, &Notifier);
if (XST_SUCCESS == Status) {
	xil_printf("Success: Event registration \n\r");
} else {
	xil_printf("Error: Event registration failed \n\r");
	/* This fragment expects an END label in the enclosing function */
	goto END;
}

To register for NPI error event notification, see the following code snippet:

/*
 * Notifier describing what to subscribe to: the NPI module and its CRC and
 * internal error events.
 * NOTE(review): .Flag = 1U presumably enables the notification - confirm
 * against the XSem_Notifier definition.
 */
XSem_Notifier Notifier = {
        .Module = XSEM_NOTIFY_NPI,
        .Event = XSEM_EVENT_NPI_CRC_ERR | XSEM_EVENT_NPI_INT_ERR,
	.Flag = 1U,
};

int Status;
/* Register the notifier through the IPI instance */
Status = XSem_RegisterEvent(&IpiInst, &Notifier);
if (XST_SUCCESS == Status) {
	xil_printf("Success: Event registration \n\r");
} else {
	xil_printf("Error: Event registration failed \n\r");
	/* This fragment expects an END label in the enclosing function */
	goto END;
}

To check the global variables that hold the count of the notified events for CRAM, use the following code snippet:

/*
 * Report every CRAM event type whose counter is non-zero. The checks are
 * independent so that one event type does not hide another one that was
 * also notified.
 */
if (EventCnt_UnCorEcc > 0) {
	xil_printf("Uncorrectable error has been detected in CRAM\n\r");
}
if (EventCnt_Crc > 0) {
	xil_printf("CRC error has been detected in CRAM\n\r");
}
if (EventCnt_CorEcc > 0) {
	xil_printf("Correctable error has been detected and corrected in CRAM\n\r");
}
if (EventCnt_IntErr > 0) {
	xil_printf("Internal error has occurred in CRAM\n\r");
}

To check the global variables that hold the count of the notified events for NPI, use the following code snippet:

/*
 * Report every NPI event type whose counter is non-zero. The checks are
 * independent so that a CRC error does not hide an internal error that was
 * also notified.
 */
if (NPI_CRC_EventCnt > 0) {
	xil_printf("CRC error has been detected in NPI\n\r");
}
if (NPI_INT_EventCnt > 0) {
	xil_printf("Internal error has occurred in NPI\n\r");
}
Note:
  • The IPI notification happens only when you register IPI notifications in your application. Before the application enters mission mode, it is recommended that you read the XilSEM status using the XSem_CmdCfrGetStatus and XSem_CmdNpiGetStatus APIs.
  • CRAM scan and NPI Scan are independent. If any uncorrectable error is reported by CRAM, the XilSEM software on PLM stops only CRAM scan and NPI scan continues to run.
  • If any uncorrectable error is reported by NPI scan, the XilSEM software on PLM stops only NPI scan and CRAM scan continues to run.
Warning: Do not attempt to define actions for CRAM error bits in PMC HW EAM. This configuration conflicts with the correction options in the CIPS Customize IP GUI.

From 2022.2 release onwards, XilSEM error notification support is extended for A72 Linux and bare-metal application users.

  • A72 bare-metal application users can receive XilSEM error notifications through the XilPM client interface XPm_RegisterNotifier, with the event ID set to the XilSEM events (XIL_EVENT_ERROR_MASK_XSEM_CRAM_CE (0x00000020U), XIL_EVENT_ERROR_MASK_XSEM_CRAM_UE (0x00000040U), and XIL_EVENT_ERROR_MASK_XSEM_NPI_UE (0x00000080U)) and the node ID set to XIL_NODETYPE_EVENT_ERROR_SW_ERR (0x28110000U).
  • A72 Linux users can receive XilSEM error notifications through XilSEM EDAC driver.
Note: From the 2023.1 release onwards, the Linux XilSEM EDAC driver supports a sysfs interface to perform XilSEM scan operations: initialize, start scan, stop scan, error inject, read ECC, and read configuration values. For more information, see this page.

For more information, refer to Event Management Framework in Versal Adaptive SoC System Software Developers Guide (UG1304).

An additional method exists for the user design to listen for SEU detections. Unlike the IPI method, which is used for RPU user software, this method is used for PL user logic. It involves PMC_PL_GPO outputs of the PMC routed into the PL, supplying quick access to key event information. To access this feature, it must be enabled during XilSEM library configuration.

  • Bit 0: CRAM correctable error

    The GPO is set to High when the error is detected. The GPO is cleared when the error is corrected. If you have disabled correction in the CIPS configuration, the GPO is set and remains High until CRAM scan is re-initialized.

  • Bit 1: CRAM/NPI uncorrectable error

    The GPO is set to High when the error is detected.

  • Bit 2: CRAM/NPI/XilSEM internal error
  • Bit 3: Reserved

Additional error information related to CRAM and NPI scan is maintained in PMC RTCA locations 0xF2014050 to 0xF20140C4. For more information, see Versal Adaptive SoC Register Reference (AM012).

The following diagram provides a XilSEM-centric view of information conduits to user software and user logic implemented in a Versal adaptive SoC.

Figure 1. Flow of Information on a Versal adaptive SoC using a XilSEM Subsystem
Figure 2. Software Flow for CRAM Scan
Figure 3. Software Flow for NPI Scan

Selection of system integration methods necessarily depends on the system requirements, particularly with regard to control and status of a XilSEM subsystem. The following table shows a variety of use cases that can be supported with the available integration methods.

Table 1. Supported Use-cases with the Available Integration Method
Use Cases System Integration Methods
XilSEM Client on R5 PMC GPO to PL (Output)
Background operation without notification (requires use of immediate start options for Configuration RAM scan and NPI Register scan)
Background with critical event notification to PL design (requires use of immediate start options for Configuration RAM scan and NPI Register scan) Available
Interactive with event and detailed status notification by IPI Available
Interactive with event and detailed status notification by IPI with critical event notification to PL design Available Available