Kernel Code (AI Engine API) - 2023.2 English

Vitis Tutorials: AI Engine (XD100)

Document ID
XD100
Release Date
2024-03-05
Version
2023.2 English
#include <aie_api/aie_adf.hpp>

#include "kernel.hpp"

/**
 * Second-order-section filter kernel built on the AI Engine API.
 *
 * For each of `burst_cnt` blocks, the kernel reads one vector of 8 floats
 * from `idata`, prepends the 8-lane state carried over from the previous
 * block, and evaluates a matrix-vector product over lanes 4..15 of the
 * resulting 16-lane window: every lane is a scalar that scales one 8-float
 * coefficient column. Even lanes (4, 6, ..., 14) use successive columns of
 * `C_e`, odd lanes (5, 7, ..., 15) use successive columns of `C_o`. The two
 * partial sums are added and written to `odata` as one vector of 8 floats.
 *
 * The state is a function-local static, so it persists across calls and is
 * separate for every instantiation of the template. `id` is not referenced
 * in the body -- presumably its only role is to give each filter section its
 * own instantiation (and hence its own state); confirm against the graph
 * code.
 *
 * `burst_cnt`, `Vector8f`, `Vector16f` and `VAcc8f` are declared outside this
 * block (presumably in kernel.hpp).
 */
template<unsigned id>
void SecondOrderSection(
	adf::input_buffer<float> & __restrict idata,	// source buffer, read 8 samples at a time
	adf::output_buffer<float> & __restrict odata,	// destination buffer, written 8 samples at a time
	const float (&C_e)[48],		// run-time parameter: coefficient matrix, even columns (6 vectors of 8)
	const float (&C_o)[48]		// run-time parameter: coefficient matrix, odd columns (6 vectors of 8)
) {
	// History carried from block to block; zero on the very first use.
	static Vector8f history = aie::zeros<float, 8>();

	// Vector-wide cursors over the input and output buffers.
	auto src = aie::begin_vector<8>(idata);
	auto dst = aie::begin_vector<8>(odata);

	for (auto block = 0; block < burst_cnt; block++) {
		// Lanes 0..7 of the window: carried history; lanes 8..15: the new samples.
		Vector8f fresh = *src++;
		Vector16f window = aie::concat(history, fresh);

		// Coefficient cursors restart at the first column for every block.
		auto even_col = aie::begin_vector<8>(&C_e[0]);
		auto odd_col = aie::begin_vector<8>(&C_o[0]);

		VAcc8f even_sum = aie::zeros<accfloat, 8>();
		VAcc8f odd_sum = aie::zeros<accfloat, 8>();

		// Lanes 4..15 are consumed pairwise: `lane` feeds the even column,
		// `lane + 1` the odd column. Lanes 0..3 are never read.
		for (auto lane = 4; lane < 16; lane += 2) {
			even_sum = aie::mac(even_sum, window.get(lane), *even_col++);
			odd_sum = aie::mac(odd_sum, window.get(lane + 1), *odd_col++);
		}

		// Combine both partial products into the 8 output samples.
		VAcc8f total = aie::add(odd_sum, even_sum.to_vector());
		Vector8f result = total.to_vector();
		*dst++ = result;

		// Next block's history: this block's inputs, except that lanes 4 and 5
		// carry the two most recent outputs (lanes 6 and 7 keep the two most
		// recent inputs).
		history = fresh;
		history[4] = result[6];
		history[5] = result[7];
	}
}

Note the two loops in the function:

for (auto i = 0; i < burst_cnt; i++) {	// process more samples to reduce overhead
	...
	for (auto j = 0; j < 6; j++) {	// matrix-vector multiplication
		...
	}
}

The outer for loop is added so that more samples are processed during each function call, which reduces the ratio of function-call overhead cycles to processing cycles and improves throughput.