在 1 Gsps 实现中,4 个内核可各自使用一组不同的系数,内核之间通过级联串流相连。实现如下图所示。
输入数据从串流流至这 4 个内核。但第二个内核将丢弃前 8 项输入数据。第三个内核将丢弃前 16 项输入数据。同样,第四个内核将丢弃前 24 项输入数据。
第一个内核的代码如下所示。
#include <adf.h>
#include "fir_32tap.h"
// State carried across invocations of fir_32tap_core0: the kernel loads its
// 16-lane cint16 sample buffer from here on entry and stores it back on exit.
static v16cint16 delay_line;
// First kernel of the cascaded FIR chain.
//
// sig_in     : cint16 input sample stream, consumed four samples at a time.
// cascadeout : cacc48 cascade stream; receives one v4cacc48 partial sum for
//              every four samples read.
//
// Uses the 8 coefficients in eq_coef0. `samples` and `eq_coef0` are not
// declared in this block (presumably they come from fir_32tap.h - confirm).
void fir_32tap_core0(
input_stream_cint16 * sig_in,
output_stream_cacc48 * cascadeout)
{
// Load this kernel's coefficients once as a single 8-lane cint16 vector.
const cint16_t * __restrict coeff = eq_coef0;
const v8cint16 *coef_ = (v8cint16 const*)coeff;
const v8cint16 coe = *coef_;
// Resume from the sample history saved by the previous invocation.
v16cint16 buff = delay_line;
v4cacc48 acc;
// Each loop iteration below performs four readincr_v4 calls (16 samples),
// hence the division by 4 twice.
// NOTE(review): chess_loop_range(4,) presumably promises at least 4 iterations,
// i.e. samples >= 64, which is stricter than the "greater than 16" remark - confirm.
const unsigned LSIZE = (samples/4/4); // assuming samples is integer power of 2 and greater than 16
for (unsigned int i = 0; i < LSIZE; ++i)
chess_prepare_for_pipelining
chess_loop_range(4,)
{
// The body is four sections with the same shape: mul4 starts a fresh
// accumulator, three mac4 calls add to it, and four new samples are merged
// into buff midway. Across the four calls the coefficient start advances
// 0,2,4,6 (presumably two coefficients per call, covering all 8 lanes of coe
// - confirm against the intrinsic reference). The data start advances by 4
// from one section to the next, and the upd_v slot index cycles 2,3,0,1, so
// every slot of buff is rewritten once per iteration.

// Section 1: data starts 0,2,4,6; new samples go to slot 2.
acc = mul4(buff, 0 , 0x3210, 1, coe, 0, 0x0000, 1);
acc = mac4(acc, buff, 2 , 0x3210, 1, coe, 2, 0x0000, 1);
buff = upd_v(buff, 2, readincr_v4(sig_in));
acc = mac4(acc, buff, 4 , 0x3210, 1, coe, 4, 0x0000, 1);
acc = mac4(acc, buff, 6 , 0x3210, 1, coe, 6, 0x0000, 1);
writeincr_v4(cascadeout,acc);
// Section 2: data starts 4,6,8,10; new samples go to slot 3.
acc = mul4(buff, 4 , 0x3210, 1, coe, 0, 0x0000, 1);
acc = mac4(acc, buff, 6 , 0x3210, 1, coe, 2, 0x0000, 1);
buff = upd_v(buff, 3, readincr_v4(sig_in));
acc = mac4(acc, buff, 8 , 0x3210, 1, coe, 4, 0x0000, 1);
acc = mac4(acc, buff, 10, 0x3210, 1, coe, 6, 0x0000, 1);
writeincr_v4(cascadeout,acc);
// Section 3: data starts 8,10,12,14; new samples go to slot 0.
acc = mul4(buff, 8 , 0x3210, 1, coe, 0, 0x0000, 1);
acc = mac4(acc, buff, 10 , 0x3210, 1, coe, 2, 0x0000, 1);
buff = upd_v(buff, 0, readincr_v4(sig_in));
acc = mac4(acc, buff, 12 , 0x3210, 1, coe, 4, 0x0000, 1);
acc = mac4(acc, buff, 14 , 0x3210, 1, coe, 6, 0x0000, 1);
writeincr_v4(cascadeout,acc);
// Section 4: data starts 12,14 and then 0,2 (the start index returns to the
// beginning of buff); new samples go to slot 1.
acc = mul4(buff, 12 , 0x3210, 1, coe, 0, 0x0000, 1);
acc = mac4(acc, buff, 14 , 0x3210, 1, coe, 2, 0x0000, 1);
buff = upd_v(buff, 1, readincr_v4(sig_in));
acc = mac4(acc, buff, 0 , 0x3210, 1, coe, 4, 0x0000, 1);
acc = mac4(acc, buff, 2 , 0x3210, 1, coe, 6, 0x0000, 1);
writeincr_v4(cascadeout,acc);
}
// Save the sample history for the next invocation.
delay_line = buff;
}
// Initialization routine paired with fir_32tap_core0.
// Discards a fixed number of leading reads from get_ss(0); the count is zero
// for this kernel, so nothing is dropped.
void fir_32tap_core0_init()
{
    // Number of get_ss(0) reads to throw away before processing starts.
    const int kSkipCount = 0;
    int pending = kSkipCount;
    while (pending > 0)
    {
        get_ss(0); // value intentionally ignored
        --pending;
    }
}
请注意,fir_32tap_core0_init 函数将成为 AI 引擎内核 fir_32tap_core0 的初始化函数,在内核起始时仅执行一次。此初始化函数的用途是丢弃不必要的样本,以对齐输入串流。
同样,fir_32tap_core1_init 函数将成为 AI 引擎内核 fir_32tap_core1 的初始化函数,如以下代码所示。这对于初始化函数 fir_32tap_core2_init 和 fir_32tap_core3_init 也同样适用。
第二个内核代码如下所示。
#include <adf.h>
#include "fir_32tap.h"
// State carried across invocations of fir_32tap_core1: the kernel loads its
// 16-lane cint16 sample buffer from here on entry and stores it back on exit.
static v16cint16 delay_line;
// Intermediate kernel of the cascaded FIR chain.
//
// sig_in     : cint16 input sample stream, consumed four samples at a time.
// cascadein  : cacc48 cascade stream carrying the incoming partial sums.
// cascadeout : cacc48 cascade stream; receives the incoming partial sum plus
//              this kernel's contribution, one v4cacc48 per four samples read.
//
// Uses the 8 coefficients in eq_coef1. `samples` and `eq_coef1` are not
// declared in this block (presumably they come from fir_32tap.h - confirm).
void fir_32tap_core1(
input_stream_cint16 * sig_in,
input_stream_cacc48 * cascadein,
output_stream_cacc48 * cascadeout)
{
// Load this kernel's coefficients once as a single 8-lane cint16 vector.
const cint16_t * __restrict coeff = eq_coef1;
const v8cint16 *coef_ = (v8cint16 const*)coeff;
const v8cint16 coe = *coef_;
// Resume from the sample history saved by the previous invocation.
v16cint16 buff = delay_line;
v4cacc48 acc;
// Each loop iteration below performs four readincr_v4 calls on sig_in
// (16 samples), hence the division by 4 twice.
// NOTE(review): chess_loop_range(4,) presumably promises at least 4 iterations,
// i.e. samples >= 64, which is stricter than the "greater than 16" remark - confirm.
const unsigned LSIZE = (samples/4/4); // assuming samples is integer power of 2 and greater than 16
for (unsigned int i = 0; i < LSIZE; ++i)
chess_prepare_for_pipelining
chess_loop_range(4,)
{
// The body is four sections with the same shape: the accumulator is seeded
// from cascadein, four mac4 calls add this kernel's products, and four new
// samples are merged into buff midway. Across the four calls the coefficient
// start advances 0,2,4,6 (presumably two coefficients per call - confirm
// against the intrinsic reference). The data start advances by 4 from one
// section to the next, and the upd_v slot index cycles 2,3,0,1.

// Section 1: data starts 0,2,4,6; new samples go to slot 2.
acc = readincr_v4(cascadein);
acc = mac4(acc, buff, 0 , 0x3210, 1, coe, 0, 0x0000, 1);
acc = mac4(acc, buff, 2 , 0x3210, 1, coe, 2, 0x0000, 1);
buff = upd_v(buff, 2, readincr_v4(sig_in));
acc = mac4(acc, buff, 4 , 0x3210, 1, coe, 4, 0x0000, 1);
acc = mac4(acc, buff, 6 , 0x3210, 1, coe, 6, 0x0000, 1);
writeincr_v4(cascadeout,acc);
// Section 2: data starts 4,6,8,10; new samples go to slot 3.
acc = readincr_v4(cascadein);
acc = mac4(acc, buff, 4 , 0x3210, 1, coe, 0, 0x0000, 1);
acc = mac4(acc, buff, 6 , 0x3210, 1, coe, 2, 0x0000, 1);
buff = upd_v(buff, 3, readincr_v4(sig_in));
acc = mac4(acc, buff, 8 , 0x3210, 1, coe, 4, 0x0000, 1);
acc = mac4(acc, buff, 10, 0x3210, 1, coe, 6, 0x0000, 1);
writeincr_v4(cascadeout,acc);
// Section 3: data starts 8,10,12,14; new samples go to slot 0.
acc = readincr_v4(cascadein);
acc = mac4(acc, buff, 8 , 0x3210, 1, coe, 0, 0x0000, 1);
acc = mac4(acc, buff, 10 , 0x3210, 1, coe, 2, 0x0000, 1);
buff = upd_v(buff, 0, readincr_v4(sig_in));
acc = mac4(acc, buff, 12 , 0x3210, 1, coe, 4, 0x0000, 1);
acc = mac4(acc, buff, 14 , 0x3210, 1, coe, 6, 0x0000, 1);
writeincr_v4(cascadeout,acc);
// Section 4: data starts 12,14 and then 0,2 (the start index returns to the
// beginning of buff); new samples go to slot 1.
acc = readincr_v4(cascadein);
acc = mac4(acc, buff, 12 , 0x3210, 1, coe, 0, 0x0000, 1);
acc = mac4(acc, buff, 14 , 0x3210, 1, coe, 2, 0x0000, 1);
buff = upd_v(buff, 1, readincr_v4(sig_in));
acc = mac4(acc, buff, 0 , 0x3210, 1, coe, 4, 0x0000, 1);
acc = mac4(acc, buff, 2 , 0x3210, 1, coe, 6, 0x0000, 1);
writeincr_v4(cascadeout,acc);
}
// Save the sample history for the next invocation.
delay_line = buff;
}
// Initialization routine paired with fir_32tap_core1.
// Discards the first 8 reads from get_ss(0) so that this kernel's input is
// offset relative to the first kernel in the chain.
void fir_32tap_core1_init()
{
    // Number of get_ss(0) reads to throw away before processing starts.
    const int kSkipCount = 8;
    int pending = kSkipCount;
    while (pending > 0)
    {
        get_ss(0); // value intentionally ignored
        --pending;
    }
}
第三个内核与第二个内核类似。最后一个内核如下所示。
#include <adf.h>
#include "fir_32tap.h"
// State carried across invocations of fir_32tap_core3: the kernel loads its
// 16-lane cint16 sample buffer from here on entry and stores it back on exit.
static v16cint16 delay_line;
// Last kernel of the cascaded FIR chain.
//
// sig_in    : cint16 input sample stream, consumed four samples at a time.
// cascadein : cacc48 cascade stream carrying the incoming partial sums.
// data_out  : cint16 output stream; receives srs(acc, shift) of the final
//             accumulator, four values per four samples read.
//
// Uses the 8 coefficients in eq_coef3. `samples`, `shift` and `eq_coef3` are
// not declared in this block (presumably they come from fir_32tap.h - confirm).
void fir_32tap_core3(
input_stream_cint16 * sig_in,
input_stream_cacc48 * cascadein,
output_stream_cint16 * data_out)
{
// Load this kernel's coefficients once as a single 8-lane cint16 vector.
const cint16_t * __restrict coeff = eq_coef3;
const v8cint16 *coef_ = (v8cint16 const*)coeff;
const v8cint16 coe = *coef_;
// Resume from the sample history saved by the previous invocation.
v16cint16 buff = delay_line;
v4cacc48 acc;
// Configure rounding (rnd_pos_inf) and saturation before the srs calls below;
// presumably these modes govern the accumulator-to-cint16 conversion - confirm.
set_rnd(rnd_pos_inf);
set_sat();
// Each loop iteration below performs four readincr_v4 calls on sig_in
// (16 samples), hence the division by 4 twice.
// NOTE(review): chess_loop_range(4,) presumably promises at least 4 iterations,
// i.e. samples >= 64, which is stricter than the "greater than 16" remark - confirm.
const unsigned LSIZE = (samples/4/4); // assuming samples is integer power of 2 and greater than 16
for (unsigned int i = 0; i < LSIZE; ++i)
chess_prepare_for_pipelining
chess_loop_range(4,)
{
// The body is four sections with the same shape: the accumulator is seeded
// from cascadein, four mac4 calls add this kernel's products, four new
// samples are merged into buff midway, and the shifted result is written to
// data_out. Across the four calls the coefficient start advances 0,2,4,6
// (presumably two coefficients per call - confirm against the intrinsic
// reference). The data start advances by 4 from one section to the next, and
// the upd_v slot index cycles 2,3,0,1.

// Section 1: data starts 0,2,4,6; new samples go to slot 2.
acc = readincr_v4(cascadein);
acc = mac4(acc, buff, 0 , 0x3210, 1, coe, 0, 0x0000, 1);
acc = mac4(acc, buff, 2 , 0x3210, 1, coe, 2, 0x0000, 1);
buff = upd_v(buff, 2, readincr_v4(sig_in));
acc = mac4(acc, buff, 4 , 0x3210, 1, coe, 4, 0x0000, 1);
acc = mac4(acc, buff, 6 , 0x3210, 1, coe, 6, 0x0000, 1);
writeincr_v4(data_out,srs(acc,shift));
// Section 2: data starts 4,6,8,10; new samples go to slot 3.
acc = readincr_v4(cascadein);
acc = mac4(acc, buff, 4 , 0x3210, 1, coe, 0, 0x0000, 1);
acc = mac4(acc, buff, 6 , 0x3210, 1, coe, 2, 0x0000, 1);
buff = upd_v(buff, 3, readincr_v4(sig_in));
acc = mac4(acc, buff, 8 , 0x3210, 1, coe, 4, 0x0000, 1);
acc = mac4(acc, buff, 10, 0x3210, 1, coe, 6, 0x0000, 1);
writeincr_v4(data_out,srs(acc,shift));
// Section 3: data starts 8,10,12,14; new samples go to slot 0.
acc = readincr_v4(cascadein);
acc = mac4(acc, buff, 8 , 0x3210, 1, coe, 0, 0x0000, 1);
acc = mac4(acc, buff, 10 , 0x3210, 1, coe, 2, 0x0000, 1);
buff = upd_v(buff, 0, readincr_v4(sig_in));
acc = mac4(acc, buff, 12 , 0x3210, 1, coe, 4, 0x0000, 1);
acc = mac4(acc, buff, 14 , 0x3210, 1, coe, 6, 0x0000, 1);
writeincr_v4(data_out,srs(acc,shift));
// Section 4: data starts 12,14 and then 0,2 (the start index returns to the
// beginning of buff); new samples go to slot 1.
acc = readincr_v4(cascadein);
acc = mac4(acc, buff, 12 , 0x3210, 1, coe, 0, 0x0000, 1);
acc = mac4(acc, buff, 14 , 0x3210, 1, coe, 2, 0x0000, 1);
buff = upd_v(buff, 1, readincr_v4(sig_in));
acc = mac4(acc, buff, 0 , 0x3210, 1, coe, 4, 0x0000, 1);
acc = mac4(acc, buff, 2 , 0x3210, 1, coe, 6, 0x0000, 1);
writeincr_v4(data_out,srs(acc,shift));
}
// Save the sample history for the next invocation.
delay_line = buff;
}
// Initialization routine paired with fir_32tap_core3.
// Discards the first 24 reads from get_ss(0) so that this kernel's input is
// offset relative to the first kernel in the chain.
void fir_32tap_core3_init()
{
    // Number of get_ss(0) reads to throw away before processing starts.
    const int kSkipCount = 24;
    int pending = kSkipCount;
    while (pending > 0)
    {
        get_ss(0); // value intentionally ignored
        --pending;
    }
}
计算图代码如下所示。
#include <adf.h>
#include "kernels.h"
using namespace adf;
// Graph wiring four FIR kernels into a chain: every kernel reads the same
// input port on its stream input 0, partial sums travel k0 -> k1 -> k2 -> k3
// over cascade connections, and k3 drives the graph output.
class firGraph : public graph {
public:
    kernel k0;
    kernel k1;
    kernel k2;
    kernel k3;
    input_port in0123;
    output_port out;

    firGraph()
    {
        // Instantiate the four kernels.
        k0 = kernel::create(fir_32tap_core0);
        k1 = kernel::create(fir_32tap_core1);
        k2 = kernel::create(fir_32tap_core2);
        k3 = kernel::create(fir_32tap_core3);

        // Source file for each kernel.
        source(k0) = "fir_32tap_core0.cpp";
        source(k1) = "fir_32tap_core1.cpp";
        source(k2) = "fir_32tap_core2.cpp";
        source(k3) = "fir_32tap_core3.cpp";

        // Every kernel is given the same runtime ratio.
        runtime<ratio>(k0) = 0.9;
        runtime<ratio>(k1) = 0.9;
        runtime<ratio>(k2) = 0.9;
        runtime<ratio>(k3) = 0.9;

        // Per-kernel initialization functions.
        initialization_function(k0) = "fir_32tap_core0_init";
        initialization_function(k1) = "fir_32tap_core1_init";
        initialization_function(k2) = "fir_32tap_core2_init";
        initialization_function(k3) = "fir_32tap_core3_init";

        // The shared input port feeds stream input 0 of each kernel; the nets
        // are named so they can be referenced individually.
        connect<stream> n0(in0123, k0.in[0]);
        connect<stream> n1(in0123, k1.in[0]);
        connect<stream> n2(in0123, k2.in[0]);
        connect<stream> n3(in0123, k3.in[0]);

        // Cascade chain: each kernel's output 0 goes to the next kernel's input 1.
        connect<cascade> (k0.out[0], k1.in[1]);
        connect<cascade> (k1.out[0], k2.in[1]);
        connect<cascade> (k2.out[0], k3.in[1]);

        // Final result leaves the graph from the last kernel.
        connect<stream> (k3.out[0], out);
    }
};
通过级联串流连接的内核应同步操作。级联串流中的冲突可能导致内核停滞。内核中的循环必须具有可用的输入数据才能流畅运行。因此,至关重要的是,在每个内核的相应时间都有输入串流到达。要解决输入串流停滞(如有),可以将足够大的 FIFO 添加到与 AI 引擎内核相连的信号线中。例如:
// Fragment: n0..n3 are the named stream nets declared inside the firGraph
// constructor, so these statements belong in that constructor.
// A different depth is assigned to each net.
fifo_depth(n0)=175;
fifo_depth(n1)=150;
fifo_depth(n2)=125;
fifo_depth(n3)=100;
请注意,上述示例为各信号线指定了不同的 FIFO 深度,以防止 FIFO 被自动合并。当所有信号线使用相同的 FIFO 深度时,可能会发生这种合并。
为了节省 FIFO 资源,可以通过观察每个内核中发生 CORE_INSTREAM_WIDE 事件的时间来设置不同的 FIFO 深度。事件发生越早,FIFO 深度需越深。例如:
// Fragment: n0..n3 are the named stream nets declared inside the firGraph
// constructor, so these statements belong in that constructor.
// Smaller depths than the previous example, decreasing from n0 to n3.
fifo_depth(n0)=45;
fifo_depth(n1)=33;
fifo_depth(n2)=23;
fifo_depth(n3)=10;
如需了解有关计算图上编码的更多详细信息,请参阅 AI 引擎工具和流程用户指南 (UG1076)。