#include <aie_api/aie_adf.hpp>
#include "kernel.hpp"
// One IIR second-order section, written with low-level AI Engine intrinsics.
//
// Each outer iteration consumes one 8-float input vector and produces one
// 8-float output vector as a 12-term multiply-accumulate: twelve scalars
// taken from a 16-lane working register are each multiplied by an 8-lane
// coefficient column (six columns from C_e, six from C_o) and summed.
//
// Template parameter `id` is not referenced in the body; because the state
// register is a function-local static, every distinct `id` instantiation
// owns a separate copy of it.
//
// NOTE(review): burst_cnt is not declared in this block - presumably it comes
// from kernel.hpp; confirm its type and value there.
template<unsigned id>
void SecondOrderSection(
    adf::input_buffer<float> & __restrict idata,   // 8 input samples per iteration
    adf::output_buffer<float> & __restrict odata,  // 8 output samples per iteration
    const float (&C_e)[48],  // run-time parameter: SIMD matrix of coefficients (even columns)
    const float (&C_o)[48]   // run-time parameter: SIMD matrix of coefficients (odd columns)
) {
    // Filter history carried from one block to the next (and across calls,
    // since it is static). Starts out as all zeros.
    static v8float state_reg = null_v8float();

    // 8-lane vector iterators over the input and output buffers
    auto src = aie::begin_vector<8>(idata);
    auto dst = aie::begin_vector<8>(odata);

    // View each 48-float coefficient array as six consecutive 8-lane columns.
    const v8float *const cols_even = reinterpret_cast<const v8float *>(&C_e[0]);
    const v8float *const cols_odd = reinterpret_cast<const v8float *>(&C_o[0]);

    for (int blk = 0; blk < burst_cnt; blk++) {
        // Working register = previous state followed by the 8 new samples.
        v8float fresh = *src++;
        v16float window = concat(state_reg, fresh);

        // Two independent accumulators: one fed by the even-numbered window
        // elements (4, 6, ..., 14), one by the odd-numbered ones (5, 7, ..., 15).
        v8float sum_even = null_v8float();
        v8float sum_odd = null_v8float();

        for (int col = 0; col < 6; col++)
            chess_flatten_loop
        {
            // Start index 2*col+4 (resp. +5) with a zero offset word selects
            // the window element that multiplies coefficient column `col`
            // (presumably broadcast to all 8 lanes - see the fpmac reference).
            sum_even = fpmac(sum_even, window, (2 * col + 4), 0, cols_even[col], 0, 0x76543210);  // even columns
            sum_odd = fpmac(sum_odd, window, (2 * col + 5), 0, cols_odd[col], 0, 0x76543210);     // odd columns
        }

        // Combine the two partial sums and emit the 8 output samples.
        const v8float result = fpadd(sum_odd, sum_even);
        *dst++ = result;

        // Next state: this block's input vector, with elements 4 and 5
        // overwritten by the last two outputs (elements 6 and 7 of result).
        state_reg = upd_elem(upd_elem(fresh, 4, ext_elem(result, 6)), 5, ext_elem(result, 7));
    }
}  // end SecondOrderSection()
Note:
The code uses the `chess_flatten_loop` pragma. This pragma unrolls the loop completely, eliminating the loop construct. Documentation on compiler pragmas can be found in the AI Engine Lounge.
In the code provided, selecting between API and LLI is performed by defining or commenting out `USE_API` on line 17 of `kernel.hpp`.
The generated assembly code is as follows:
Note the tighter “spacing” between `VFPMAC`s. Also, the `SecondOrderSection<1>` function is “absorbed” into the main function, and there are two unrolled matrix-vector multiplication loops, effectively halving the number of iterations of the outer loop.
The measured throughput is as follows (see `lli_thruput.xlsx`):
| IIR Throughput (with LLI) |       |        |        |        |        |        |        |
|---------------------------|-------|--------|--------|--------|--------|--------|--------|
| burst_cnt                 | 1     | 8      | 16     | 32     | 64     | 128    | 256    |
| num_samples               | 8     | 64     | 128    | 256    | 512    | 1024   | 2048   |
| num_cycles (LLI)          | 186   | 250    | 458    | 874    | 1706   | 3370   | 6698   |
| LLI Throughput (Msa/sec)  | 43.01 | 256.00 | 279.48 | 292.91 | 300.12 | 303.86 | 305.76 |
*clk_freq: 1GHz
Comparing the API and LLI throughput:
LLI provides better throughput than API for the same `burst_cnt`.
The throughput “saturates” at around `burst_cnt` = 64.