As a first step, we use the kernel code as follows:
/**
 * Second-order section kernel: reads one vector of 8 input samples and writes
 * one vector of 8 output samples per invocation.
 *
 * The output is formed by scaling each of the 12 columns of the coefficient
 * matrix C (8 consecutive floats per column) by one element of the combined
 * state/input vector and accumulating the scaled columns.
 *
 * The template parameter `id` is not referenced in the body; every distinct
 * instantiation carries its own copy of the function-local static state.
 *
 * @param idata input buffer, 8 samples consumed per iteration
 * @param odata output buffer, 8 samples produced per iteration
 * @param C     coefficient matrix (RTP port), read as 12 columns of 8 floats
 */
template<unsigned id>
void SecondOrderSection(
    adf::input_buffer<float> & __restrict idata,    // 8 input samples per iteration
    adf::output_buffer<float> & __restrict odata,   // 8 output samples per iteration
    const float (&C)[96]                            // RTP port for coefficient matrix
) {
    // Filter history kept between invocations; starts out as all zeros.
    static Vector8f history = aie::zeros<float, 8>();

    // Vector iterators over the input and output buffers.
    auto src = aie::begin_vector<8>(idata);
    auto dst = aie::begin_vector<8>(odata);

    // Fetch the new samples and place them behind the saved history.
    // Lane layout of `operand`:
    //   [4]: ym2, [5]: ym1, [6]: xm2, [7]: xm1, [8:15]: x0:x7
    Vector8f samples = *src++;
    Vector16f operand = aie::concat(history, samples);

    // Accumulate column * scalar over the 12 lanes operand[4] .. operand[15].
    VAcc8f sum = aie::zeros<accfloat, 8>();
    for (auto col = 0; col < 12; ++col) {
        Vector8f column = aie::load_v<8>(&C[8 * col]);
        const float scale = operand[col + 4];
        sum = aie::mac(sum, column, scale);
    }

    // Move the accumulator into a vector register so single lanes can be read.
    Vector8f result = sum;
    *dst++ = result;

    // Next call's history: the last two inputs stay in lanes 6/7 (xm2, xm1),
    // and the last two outputs go to lanes 4/5 (ym2, ym1).
    history = samples;
    history[4] = result[6];
    history[5] = result[7];
} // end SecondOrderSection()
The `for` loop scales each column of the coefficient matrix by the corresponding element of `xreg`
and accumulates the results. This performs the matrix-vector multiplication in eqn. (4).