如“设计分析”中估算所得,在每个周期输入一项数据的情况下,必须并行运行 4 个内核才能获得最佳吞吐量。在此类情况下,解决方案是将 32 个系数拆分到 4 个内核中,并将输入数据广播至这 4 个内核。各内核的部分累加结果通过级联逐级传递,最终在最后一个内核中生成结果。此实现如下图所示:
图 1. 数据广播至 4 个内核
请注意,在后续内核中会丢弃部分初始数据。例如,第二个内核将丢弃前 8 项输入数据。
对于 aie::sliding_mul,选中的是 4 条通道和 8 个点。数据读取和写入与计算交织。
第一个内核代码如下所示:
// Coefficients used by the first kernel: 8 of the 32 FIR coefficients.
// The initializer list is abbreviated with "..." in this listing.
alignas(aie::vector_decl_align) static cint16 eq_coef0[8]={{1,2},{3,4},...};
// Sample history carried over between graph iterations: the kernel copies it
// into its working buffer on entry and writes the buffer back on exit.
static aie::vector<cint16,16> delay_line;
// First kernel of the four-kernel 32-tap FIR.
// Multiplies the broadcast input samples by the 8 coefficients in eq_coef0 and
// sends 4-lane partial accumulations to the next kernel.
//   sig_in     - input sample stream (cint16), read 4 samples at a time
//   cascadeout - output carrying the cacc48 partial accumulations
// Each loop iteration reads 16 samples and writes 16 partial results, in four
// identical steps. buff acts as a circular 16-sample window: every step
// inserts 4 new samples and runs aie::sliding_mul with a data_start that
// advances by 4 (0, 4, 8, 12); reads past the end of buff wrap to its start.
// NOTE(review): the insert index (2, 3, 0, 1) presumably counts 4-sample
// chunks of buff, i.e. elements 8..11, 12..15, 0..3, 4..7 - confirm against
// the AIE API documentation for aie::vector::insert.
// The optional noinline attribute keeps this function as its own level in the
// function hierarchy.
__attribute__((noinline)) void fir_32tap_core0(input_stream<cint16> * sig_in,
output_stream<cacc48> * cascadeout){
// Load this kernel's 8 coefficients into a vector once, before the loop.
const cint16_t * restrict coeff = eq_coef0;
const aie::vector<cint16,8> coe = aie::load_v<8>(coeff);
// Resume from the samples saved in delay_line.
aie::vector<cint16,16> buff = delay_line;
aie::accum<cacc48,4> acc;
const unsigned LSIZE = (SAMPLES/4/4); // 16 samples per iteration; assumes SAMPLES is an integer power of 2 and greater than 16
main_loop:for (unsigned int i = 0; i < LSIZE; ++i)
chess_prepare_for_pipelining
{
// Step 1 (data_start 0): 8 MACs per lane produce 4 partial outputs
buff.insert(2,readincr_v<4>(sig_in));
acc = aie::sliding_mul<4,8>(coe,0,buff,0);
writeincr(cascadeout,acc);
// Step 2 (data_start 4): 8 MACs per lane produce 4 partial outputs
buff.insert(3,readincr_v<4>(sig_in));
acc = aie::sliding_mul<4,8>(coe,0,buff,4);
writeincr(cascadeout,acc);
// Step 3 (data_start 8): the window wraps around the end of buff
buff.insert(0,readincr_v<4>(sig_in));
acc = aie::sliding_mul<4,8>(coe,0,buff,8);
writeincr(cascadeout,acc);
// Step 4 (data_start 12): the window wraps around the end of buff
buff.insert(1,readincr_v<4>(sig_in));
acc = aie::sliding_mul<4,8>(coe,0,buff,12);
writeincr(cascadeout,acc);
}
// Save the window so the next invocation continues where this one stopped.
delay_line = buff;
}
void fir_32tap_core0_init(){
// Drop samples if not first block
int const Delay = 0;
for (int i = 0; i < Delay; ++i){
get_ss(0);
}
//initialize data
for (int i=0;i<8;i++){
int tmp=get_ss(0);
delay_line.set(*(cint16*)&tmp,i);
}
};
注释:
- 可选的 __attribute__((noinline)) 用于保留函数层级。
- chess_prepare_for_pipelining 为可选,因为工具可以自动执行流水线化。
- 每个 aie::sliding_mul<4,8> 都会执行 4 条通道、8 个点的 MAC 运算,并将部分结果通过级联链发送到下一个内核。
- 数据 buff 从 aie::sliding_mul 的 data_start 参数所指定的位置开始读取;读取到达 buff 末尾时,以循环方式返回起始位置。
编译报告可在 Work/aie/<COL_ROW>/<COL_ROW>.log 中找到;如需生成详细报告,则需要使用 -v 选项。在此日志中,搜索关键字(例如 do-loop)以查找循环的启动时间间隔。在以下 log 日志文件示例中,可以看到循环的启动时间间隔为 16:
(resume algo) -> after folding: 16 (folded over 1 iterations)
-> HW do-loop #128 in ".../Vitis/2023.2/aietools/include/adf/stream/me/stream_utils.h", line 1192: (loop #3) : 16 cycles
提示: 在 log 日志文件中获取最新报告的循环周期。
以上内核代码生成一个部分输出所需耗费的周期数约为 16 (cycles) / 16 (partial results) = 1 cycle。
其他三个内核也采用类似方式。第二个内核代码如下所示:
// Coefficients used by the second kernel: the second group of 8 of the 32 FIR
// coefficients. The array is named eq_coef1 because fir_32tap_core1 loads
// eq_coef1; the name eq_coef2 belongs to the third kernel's coefficients.
// The initializer list is abbreviated with "..." in this listing.
alignas(aie::vector_decl_align) static cint16 eq_coef1[8]={{17,18},{19,20},...};
// Sample history carried over between graph iterations: the kernel copies it
// into its working buffer on entry and writes the buffer back on exit.
alignas(aie::vector_decl_align) static aie::vector<cint16,16> delay_line;
// Second kernel of the four-kernel 32-tap FIR.
// Adds the contribution of the 8 coefficients in eq_coef1 to the partial
// accumulations received from the previous kernel and passes the updated
// accumulations on to the next kernel.
//   sig_in     - input sample stream (cint16), read 4 samples at a time
//   cascadein  - incoming cacc48 partial accumulations
//   cascadeout - outgoing cacc48 partial accumulations
// Each loop iteration handles 16 samples in four identical steps: read 4
// accumulator lanes, insert 4 new samples into buff, run aie::sliding_mac
// with data_start 0, 4, 8, 12 in turn, and write the 4 lanes out. buff acts as
// a circular 16-sample window; reads past its end wrap to its start.
// NOTE(review): the insert index (2, 3, 0, 1) presumably counts 4-sample
// chunks of buff - confirm against the AIE API documentation.
__attribute__((noinline)) void fir_32tap_core1(input_stream<cint16> * sig_in, input_stream<cacc48> * cascadein,
output_stream<cacc48> * cascadeout){
// Load this kernel's 8 coefficients into a vector once, before the loop.
const aie::vector<cint16,8> coe = aie::load_v<8>(eq_coef1);
// Resume from the samples saved in delay_line.
aie::vector<cint16,16> buff = delay_line;
aie::accum<cacc48,4> acc;
const unsigned LSIZE = (SAMPLES/4/4); // 16 samples per iteration; assumes SAMPLES is an integer power of 2 and greater than 16
for (unsigned int i = 0; i < LSIZE; ++i)
chess_prepare_for_pipelining
{
// Step 1 (data_start 0): 8 MACs per lane update 4 partial outputs
acc = readincr_v4(cascadein);
buff.insert(2,readincr_v<4>(sig_in));
acc = aie::sliding_mac<4,8>(acc,coe,0,buff,0);
writeincr_v4(cascadeout,acc);
// Step 2 (data_start 4)
acc = readincr_v4(cascadein);
buff.insert(3,readincr_v<4>(sig_in));
acc = aie::sliding_mac<4,8>(acc,coe,0,buff,4);
writeincr_v4(cascadeout,acc);
// Step 3 (data_start 8): the window wraps around the end of buff
acc = readincr_v4(cascadein);
buff.insert(0,readincr_v<4>(sig_in));
acc = aie::sliding_mac<4,8>(acc,coe,0,buff,8);
writeincr_v4(cascadeout,acc);
// Step 4 (data_start 12): the window wraps around the end of buff
acc = readincr_v4(cascadein);
buff.insert(1,readincr_v<4>(sig_in));
acc = aie::sliding_mac<4,8>(acc,coe,0,buff,12);
writeincr_v4(cascadeout,acc);
}
// Save the window so the next invocation continues where this one stopped.
delay_line = buff;
}
void fir_32tap_core1_init()
{
// Drop samples if not first block
int const Delay = 8;
for (int i = 0; i < Delay; ++i){
get_ss(0);
}
//initialize data
for (int i=0;i<8;i++){
int tmp=get_ss(0);
delay_line.set(*(cint16*)&tmp,i);
}
};
第三个内核代码如下所示:
// Coefficients used by the third kernel: 8 of the 32 FIR coefficients.
// The initializer list is abbreviated with "..." in this listing.
alignas(aie::vector_decl_align) static cint16 eq_coef2[8]={{33,34},{35,36},...};
// Sample history carried over between graph iterations: the kernel copies it
// into its working buffer on entry and writes the buffer back on exit.
alignas(aie::vector_decl_align) static aie::vector<cint16,16> delay_line;
// Third kernel of the four-kernel 32-tap FIR.
// Adds the contribution of the 8 coefficients in eq_coef2 to the partial
// accumulations received from the previous kernel and passes the updated
// accumulations on to the next kernel.
//   sig_in     - input sample stream (cint16), read 4 samples at a time
//   cascadein  - incoming cacc48 partial accumulations
//   cascadeout - outgoing cacc48 partial accumulations
// Each loop iteration handles 16 samples in four identical steps: read 4
// accumulator lanes, insert 4 new samples into buff, run aie::sliding_mac
// with data_start 0, 4, 8, 12 in turn, and write the 4 lanes out. buff acts as
// a circular 16-sample window; reads past its end wrap to its start.
// NOTE(review): the insert index (2, 3, 0, 1) presumably counts 4-sample
// chunks of buff - confirm against the AIE API documentation.
__attribute__((noinline)) void fir_32tap_core2(input_stream<cint16> * sig_in, input_stream<cacc48> * cascadein,
output_stream<cacc48> * cascadeout){
// Load this kernel's 8 coefficients into a vector once, before the loop.
const aie::vector<cint16,8> coe = aie::load_v<8>(eq_coef2);
// Resume from the samples saved in delay_line.
aie::vector<cint16,16> buff = delay_line;
aie::accum<cacc48,4> acc;
const unsigned LSIZE = (SAMPLES/4/4); // 16 samples per iteration; assumes SAMPLES is an integer power of 2 and greater than 16
for (unsigned int i = 0; i < LSIZE; ++i)
chess_prepare_for_pipelining
{
// Step 1 (data_start 0): 8 MACs per lane update 4 partial outputs
acc = readincr_v4(cascadein);
buff.insert(2,readincr_v<4>(sig_in));
acc = aie::sliding_mac<4,8>(acc,coe,0,buff,0);
writeincr_v4(cascadeout,acc);
// Step 2 (data_start 4)
acc = readincr_v4(cascadein);
buff.insert(3,readincr_v<4>(sig_in));
acc = aie::sliding_mac<4,8>(acc,coe,0,buff,4);
writeincr_v4(cascadeout,acc);
// Step 3 (data_start 8): the window wraps around the end of buff
acc = readincr_v4(cascadein);
buff.insert(0,readincr_v<4>(sig_in));
acc = aie::sliding_mac<4,8>(acc,coe,0,buff,8);
writeincr_v4(cascadeout,acc);
// Step 4 (data_start 12): the window wraps around the end of buff
acc = readincr_v4(cascadein);
buff.insert(1,readincr_v<4>(sig_in));
acc = aie::sliding_mac<4,8>(acc,coe,0,buff,12);
writeincr_v4(cascadeout,acc);
}
// Save the window so the next invocation continues where this one stopped.
delay_line = buff;
}
void fir_32tap_core2_init(){
// Drop samples if not first block
int const Delay = 16;
for (int i = 0; i < Delay; ++i)
{
get_ss(0);
}
//initialize data
for (int i=0;i<8;i++){
int tmp=get_ss(0);
delay_line.set(*(cint16*)&tmp,i);
}
};
最后一个内核代码如下所示:
// Coefficients used by the last kernel: 8 of the 32 FIR coefficients.
// The initializer list is abbreviated with "..." in this listing.
alignas(aie::vector_decl_align) static cint16 eq_coef3[8]={{49,50},{51,52},...};
// Sample history carried over between graph iterations: the kernel copies it
// into its working buffer on entry and writes the buffer back on exit.
alignas(aie::vector_decl_align) static aie::vector<cint16,16> delay_line;
// Last kernel of the four-kernel 32-tap FIR.
// Adds the contribution of the 8 coefficients in eq_coef3 to the partial
// accumulations received from the previous kernel, converts the finished
// accumulations to cint16 and writes them to the output stream.
//   sig_in    - input sample stream (cint16), read 4 samples at a time
//   cascadein - incoming cacc48 partial accumulations
//   data_out  - output stream of final cint16 results
// Each loop iteration handles 16 samples in four identical steps: read 4
// accumulator lanes, insert 4 new samples into buff, run aie::sliding_mac
// with data_start 0, 4, 8, 12 in turn, and write 4 results. buff acts as a
// circular 16-sample window; reads past its end wrap to its start.
// NOTE(review): the insert index (2, 3, 0, 1) presumably counts 4-sample
// chunks of buff, and to_vector<cint16>(SHIFT) presumably scales the
// accumulator down by SHIFT bits (SHIFT is defined outside this listing) -
// confirm both against the AIE API documentation.
__attribute__((noinline)) void fir_32tap_core3(input_stream<cint16> * sig_in, input_stream<cacc48> * cascadein,
output_stream<cint16> * data_out){
// Load this kernel's 8 coefficients into a vector once, before the loop.
const aie::vector<cint16,8> coe = aie::load_v<8>(eq_coef3);
// Resume from the samples saved in delay_line.
aie::vector<cint16,16> buff = delay_line;
aie::accum<cacc48,4> acc;
const unsigned LSIZE = (SAMPLES/4/4); // 16 samples per iteration; assumes SAMPLES is an integer power of 2 and greater than 16
for (unsigned int i = 0; i < LSIZE; ++i)
chess_prepare_for_pipelining
{
// Step 1 (data_start 0): 8 MACs per lane produce 4 final outputs
acc = readincr_v4(cascadein);
buff.insert(2,readincr_v<4>(sig_in));
acc = aie::sliding_mac<4,8>(acc,coe,0,buff,0);
writeincr_v4(data_out,acc.to_vector<cint16>(SHIFT));
// Step 2 (data_start 4)
acc = readincr_v4(cascadein);
buff.insert(3,readincr_v<4>(sig_in));
acc = aie::sliding_mac<4,8>(acc,coe,0,buff,4);
writeincr_v4(data_out,acc.to_vector<cint16>(SHIFT));
// Step 3 (data_start 8): the window wraps around the end of buff
acc = readincr_v4(cascadein);
buff.insert(0,readincr_v<4>(sig_in));
acc = aie::sliding_mac<4,8>(acc,coe,0,buff,8);
writeincr_v4(data_out,acc.to_vector<cint16>(SHIFT));
// Step 4 (data_start 12): the window wraps around the end of buff
acc = readincr_v4(cascadein);
buff.insert(1,readincr_v<4>(sig_in));
acc = aie::sliding_mac<4,8>(acc,coe,0,buff,12);
writeincr_v4(data_out,acc.to_vector<cint16>(SHIFT));
}
// Save the window so the next invocation continues where this one stopped.
delay_line = buff;
}
void fir_32tap_core3_init()
{
// Drop samples if not first block
int const Delay = 24;
for (int i = 0; i < Delay; ++i){
get_ss(0);
}
//initialize data
for (int i=0;i<8;i++){
int tmp=get_ss(0);
delay_line.set(*(cint16*)&tmp,i);
}
};
最后一个内核会使用 acc.to_vector<cint16>(SHIFT) 将结果写入输出串流。
每个内核都耗费 1 个周期来生成部分输出。当各内核同时工作时,系统性能为一个周期生成一个输出,这满足设计目标。
如需了解有关可在系统设计内满足的计算图构造、串流广播、DMA FIFO 插入、在仿真和硬件中进行剖析、设计停滞和死锁分析的更多信息,请参阅 AI 引擎工具和流程用户指南(UG1076)。