#include <aie_api/aie_adf.hpp>
#include "kernel.hpp"
/**
 * Second-order-section kernel.
 *
 * Per call, runs `burst_cnt` iterations (burst_cnt is declared outside this
 * block -- presumably in kernel.hpp). Each iteration reads one 8-float vector
 * from `idata`, forms a 16-lane working vector from the carried-over state and
 * the new input, accumulates 12 scalar-times-coefficient-vector products
 * (6 from C_e, 6 from C_o), and writes one 8-float vector to `odata`.
 *
 * The template parameter `id` is not referenced in the body; as with any
 * function template, each distinct `id` is a separate instantiation and thus
 * owns its own copy of the function-local static state.
 */
template<unsigned id>
void SecondOrderSection(
    adf::input_buffer<float> & __restrict idata,    // 8 input samples per iteration
    adf::output_buffer<float> & __restrict odata,   // 8 output samples per iteration
    const float (&C_e)[48],     // run-time parameter: SIMD matrix of coefficients (even columns)
    const float (&C_o)[48]      // run-time parameter: SIMD matrix of coefficients (odd columns)
) {
    // Filter state kept between iterations and between calls; zeroed once.
    static Vector8f history = aie::zeros<float, 8>();

    // Vector-wise (8 floats at a time) cursors over the input and output buffers.
    auto src = aie::begin_vector<8>(idata);
    auto dst = aie::begin_vector<8>(odata);

    for (auto burst = 0; burst < burst_cnt; burst++) {
        // Restart at the first coefficient column and clear both partial sums.
        auto even_cols = aie::begin_vector<8>(&C_e[0]);
        auto odd_cols = aie::begin_vector<8>(&C_o[0]);
        VAcc8f even_sum = aie::zeros<accfloat, 8>();
        VAcc8f odd_sum = aie::zeros<accfloat, 8>();

        // Lanes 0..7 of the window hold the saved state, lanes 8..15 the new samples.
        Vector8f fresh = *src++;
        Vector16f window = aie::concat(history, fresh);

        // Lanes 4..15 each scale one coefficient column: even-numbered lanes
        // pair with C_e, the following odd-numbered lane pairs with C_o.
        for (auto lane = 4; lane < 16; lane += 2) {
            even_sum = aie::mac(even_sum, window.get(lane), *even_cols++);
            odd_sum = aie::mac(odd_sum, window.get(lane + 1), *odd_cols++);
        }

        // Fold the even partial sum into the odd one to obtain the result vector.
        odd_sum = aie::add(odd_sum, even_sum.to_vector());
        Vector8f result = odd_sum.to_vector();

        *dst++ = result;

        // Next state: start from the current input vector (so lanes 6..7 keep
        // its last two samples), then put the last two outputs in lanes 4..5.
        history = fresh;
        history[4] = result[6];
        history[5] = result[7];
    }
}   // end SecondOrderSection()
Note the two loops in the function:
for (auto i = 0; i < burst_cnt; i++) { // process more samples to reduce overhead
...
for (auto j = 0; j < 6; j++) { // matrix-vector multiplication
...
}
}
The outer `for` loop is added so that more samples can be processed during each function call, thereby reducing the ratio of function call cycles to processing cycles and improving throughput.