内部函数编码 - 2023.2 简体中文

AI 引擎内核与计算图编程指南 (UG1079)

Document ID
UG1079
Release Date
2023-12-04
Version
2023.2 简体中文

1 Gsps 实现中的 4 个内核之间可能具有多组不同的系数和级联串流。实现如下图所示。

图 1. 含拆分系数和级联串流的 4 个内核

输入数据从串流流至这 4 个内核。但第二个内核将丢弃前 8 项输入数据。第三个内核将丢弃前 16 项输入数据。同样,第四个内核将丢弃前 24 项输入数据。

第一个内核的代码如下所示。

#include <adf.h>
#include "fir_32tap.h"
// Sample history for fir_32tap_core0. It is loaded into a local vector at
// kernel entry and stored back at exit, so the state carries over from one
// invocation to the next.
static v16cint16 delay_line;

/**
 * First of the four cascaded kernels that together form the split 32-tap FIR.
 *
 * Loads eight coefficients from eq_coef0, reads samples from sig_in and
 * writes partial sums to cascadeout. Every loop iteration runs four
 * compute groups; each group issues one readincr_v4() on sig_in and one
 * writeincr_v4() on cascadeout.
 *
 * @param sig_in      input sample stream (cint16)
 * @param cascadeout  cascade stream that receives the v4cacc48 partial sums
 *
 * NOTE(review): samples and eq_coef0 are not declared in this listing;
 * presumably they come from fir_32tap.h -- confirm.
 */
void fir_32tap_core0(
	input_stream_cint16 * sig_in,
	output_stream_cacc48 * cascadeout)
{
	// View the coefficient array as one 8-element vector and load it once.
	const cint16_t * __restrict coeff = eq_coef0;
	const v8cint16 *coef_  =  (v8cint16 const*)coeff;
	const v8cint16 coe = *coef_;

	// Resume from the history saved by the previous invocation.
	v16cint16 buff = delay_line;
	v4cacc48 acc;
	const unsigned LSIZE = (samples/4/4); // assuming samples is integer power of 2 and greater than 16

	// NOTE(review): chess_loop_range(4,) presumably promises the compiler at
	// least four iterations, which would need samples >= 64 -- confirm against
	// the "greater than 16" assumption on LSIZE.
	for (unsigned int i = 0; i < LSIZE; ++i)
	chess_prepare_for_pipelining
	chess_loop_range(4,)
	{
		// Group 1: mul4 starts a fresh accumulator; data offsets 0/2/4/6,
		// coefficient offsets 0/2/4/6. New samples are inserted with
		// upd_v index 2 before the last two MACs read them.
		acc  = mul4(buff, 0 , 0x3210, 1,  coe, 0, 0x0000, 1);
		acc  = mac4(acc, buff, 2 , 0x3210, 1,  coe, 2, 0x0000, 1);
		buff = upd_v(buff, 2, readincr_v4(sig_in));
		acc  = mac4(acc, buff, 4 , 0x3210, 1,  coe, 4, 0x0000, 1);
		acc  = mac4(acc, buff, 6 , 0x3210, 1,  coe, 6, 0x0000, 1);
		writeincr_v4(cascadeout,acc);

		// Group 2: data offsets advance by 4 (4/6/8/10); upd_v index 3.
		acc  = mul4(buff, 4 , 0x3210, 1,  coe, 0, 0x0000, 1);
		acc  = mac4(acc, buff, 6 , 0x3210, 1,  coe, 2, 0x0000, 1);
		buff = upd_v(buff, 3, readincr_v4(sig_in));
		acc  = mac4(acc, buff, 8 , 0x3210, 1,  coe, 4, 0x0000, 1);
		acc  = mac4(acc, buff, 10, 0x3210, 1,  coe, 6, 0x0000, 1);
		writeincr_v4(cascadeout,acc);

		// Group 3: data offsets 8/10/12/14; upd_v index 0.
		acc  = mul4(buff, 8  , 0x3210, 1,  coe, 0, 0x0000, 1);
		acc  = mac4(acc, buff, 10 , 0x3210, 1,  coe, 2, 0x0000, 1);
		buff = upd_v(buff, 0, readincr_v4(sig_in));
		acc  = mac4(acc, buff, 12 , 0x3210, 1,  coe, 4, 0x0000, 1);
		acc  = mac4(acc, buff, 14 , 0x3210, 1,  coe, 6, 0x0000, 1);
		writeincr_v4(cascadeout,acc);

		// Group 4: data offsets 12/14, then 0/2 -- the offsets wrap around
		// the 16-element buffer; upd_v index 1.
		acc  = mul4(buff, 12 , 0x3210, 1,  coe, 0, 0x0000, 1);
		acc  = mac4(acc, buff, 14 , 0x3210, 1,  coe, 2, 0x0000, 1);
		buff = upd_v(buff, 1, readincr_v4(sig_in));
		acc  = mac4(acc, buff, 0  , 0x3210, 1,  coe, 4, 0x0000, 1);
		acc  = mac4(acc, buff, 2  , 0x3210, 1,  coe, 6, 0x0000, 1);
		writeincr_v4(cascadeout,acc);
    }
    // Save the history for the next invocation.
    delay_line = buff;
}

/**
 * Initialization function for the fir_32tap_core0 kernel.
 *
 * Discards the leading input values this kernel has to skip so that its
 * input stream is aligned with the other kernels. The first kernel skips
 * nothing, so the count is zero and no stream read is issued.
 */
void fir_32tap_core0_init()
{
	const int skip_count = 0;

	// One get_ss(0) call per value to discard.
	int remaining = skip_count;
	while (remaining > 0)
	{
		get_ss(0);
		--remaining;
	}
}

请注意,fir_32tap_core0_init 函数将成为 AI 引擎内核 fir_32tap_core0 的初始化函数,在内核起始时仅执行一次。此初始化函数的用途是丢弃不必要的样本,以对齐输入串流。

同样,fir_32tap_core1_init 函数将成为 AI 引擎内核 fir_32tap_core1 的初始化函数,如以下代码所示。这对于初始化函数 fir_32tap_core2_init 和 fir_32tap_core3_init 也同样适用。

第二个内核代码如下所示。

#include <adf.h>
#include "fir_32tap.h"
// Sample history for fir_32tap_core1. It is loaded into a local vector at
// kernel entry and stored back at exit, so the state carries over from one
// invocation to the next.
static v16cint16 delay_line;

/**
 * Second of the four cascaded kernels that together form the split 32-tap FIR.
 *
 * Loads eight coefficients from eq_coef1. In every compute group the
 * accumulator is first read from cascadein, four mac4 operations add this
 * kernel's contribution, and the result is written to cascadeout. Each
 * group also reads four new samples from sig_in.
 *
 * @param sig_in      input sample stream (cint16)
 * @param cascadein   cascade stream delivering the incoming v4cacc48 partial sums
 * @param cascadeout  cascade stream that receives the updated partial sums
 *
 * NOTE(review): samples and eq_coef1 are not declared in this listing;
 * presumably they come from fir_32tap.h -- confirm.
 */
void fir_32tap_core1(
	input_stream_cint16 * sig_in,
	input_stream_cacc48 * cascadein,
	output_stream_cacc48 * cascadeout)
{
    // View the coefficient array as one 8-element vector and load it once.
    const cint16_t * __restrict coeff = eq_coef1;
    const v8cint16 *coef_  =  (v8cint16 const*)coeff;
    const v8cint16 coe = *coef_;

    // Resume from the history saved by the previous invocation.
    v16cint16 buff = delay_line;
    v4cacc48 acc;
    const unsigned LSIZE = (samples/4/4); // assuming samples is integer power of 2 and greater than 16

    // NOTE(review): chess_loop_range(4,) presumably promises the compiler at
    // least four iterations, which would need samples >= 64 -- confirm against
    // the "greater than 16" assumption on LSIZE.
    for (unsigned int i = 0; i < LSIZE; ++i)
    chess_prepare_for_pipelining
    chess_loop_range(4,)
    {
        // Group 1: seed acc from the cascade, then data offsets 0/2/4/6 with
        // coefficient offsets 0/2/4/6. New samples are inserted with
        // upd_v index 2 before the last two MACs read them.
        acc = readincr_v4(cascadein);
        acc  = mac4(acc, buff, 0 , 0x3210, 1,  coe, 0, 0x0000, 1);
        acc  = mac4(acc, buff, 2 , 0x3210, 1,  coe, 2, 0x0000, 1);
        buff = upd_v(buff, 2, readincr_v4(sig_in));
        acc  = mac4(acc, buff, 4 , 0x3210, 1,  coe, 4, 0x0000, 1);
        acc  = mac4(acc, buff, 6 , 0x3210, 1,  coe, 6, 0x0000, 1);
        writeincr_v4(cascadeout,acc);

        // Group 2: data offsets advance by 4 (4/6/8/10); upd_v index 3.
        acc = readincr_v4(cascadein);
        acc  = mac4(acc, buff, 4 , 0x3210, 1,  coe, 0, 0x0000, 1);
        acc  = mac4(acc, buff, 6 , 0x3210, 1,  coe, 2, 0x0000, 1);
        buff = upd_v(buff, 3, readincr_v4(sig_in));
        acc  = mac4(acc, buff, 8 , 0x3210, 1,  coe, 4, 0x0000, 1);
        acc  = mac4(acc, buff, 10, 0x3210, 1,  coe, 6, 0x0000, 1);
        writeincr_v4(cascadeout,acc);

        // Group 3: data offsets 8/10/12/14; upd_v index 0.
        acc = readincr_v4(cascadein);
        acc  = mac4(acc, buff, 8  , 0x3210, 1,  coe, 0, 0x0000, 1);
        acc  = mac4(acc, buff, 10 , 0x3210, 1,  coe, 2, 0x0000, 1);
        buff = upd_v(buff, 0, readincr_v4(sig_in));
        acc  = mac4(acc, buff, 12 , 0x3210, 1,  coe, 4, 0x0000, 1);
        acc  = mac4(acc, buff, 14 , 0x3210, 1,  coe, 6, 0x0000, 1);
        writeincr_v4(cascadeout,acc);

        // Group 4: data offsets 12/14, then 0/2 -- the offsets wrap around
        // the 16-element buffer; upd_v index 1.
        acc = readincr_v4(cascadein);
        acc  = mac4(acc, buff, 12 , 0x3210, 1,  coe, 0, 0x0000, 1);
        acc  = mac4(acc, buff, 14 , 0x3210, 1,  coe, 2, 0x0000, 1);
        buff = upd_v(buff, 1, readincr_v4(sig_in));
        acc  = mac4(acc, buff, 0  , 0x3210, 1,  coe, 4, 0x0000, 1);
        acc  = mac4(acc, buff, 2  , 0x3210, 1,  coe, 6, 0x0000, 1);
        writeincr_v4(cascadeout,acc);
    }
    // Save the history for the next invocation.
    delay_line = buff;
}

/**
 * Initialization function for the fir_32tap_core1 kernel.
 *
 * Discards the first 8 input values so that this kernel's input stream is
 * aligned with its position in the cascade chain.
 */
void fir_32tap_core1_init()
{
    const int skip_count = 8;

    // One get_ss(0) call per value to discard.
    int remaining = skip_count;
    while (remaining > 0)
    {
        get_ss(0);
        --remaining;
    }
}

第三个内核与第二个内核类似。最后一个内核如下所示。

#include <adf.h>
#include "fir_32tap.h"
// Sample history for fir_32tap_core3. It is loaded into a local vector at
// kernel entry and stored back at exit, so the state carries over from one
// invocation to the next.
static v16cint16 delay_line;

/**
 * Last of the four cascaded kernels that together form the split 32-tap FIR.
 *
 * Loads eight coefficients from eq_coef3. In every compute group the
 * accumulator is first read from cascadein, four mac4 operations add this
 * kernel's contribution, and the result is passed through srs(acc, shift)
 * and written to data_out. Each group also reads four new samples from
 * sig_in.
 *
 * @param sig_in     input sample stream (cint16)
 * @param cascadein  cascade stream delivering the incoming v4cacc48 partial sums
 * @param data_out   output sample stream (cint16) carrying the final result
 *
 * NOTE(review): samples, shift and eq_coef3 are not declared in this
 * listing; presumably they come from fir_32tap.h -- confirm.
 */
void fir_32tap_core3(
	input_stream_cint16 * sig_in,
	input_stream_cacc48 * cascadein,
	output_stream_cint16 * data_out)
{
	// View the coefficient array as one 8-element vector and load it once.
	const cint16_t * __restrict coeff = eq_coef3;
	const v8cint16 *coef_  =  (v8cint16 const*)coeff;
	const v8cint16 coe = *coef_;

	// Resume from the history saved by the previous invocation.
	v16cint16 buff = delay_line;

	v4cacc48 acc;

	// NOTE(review): presumably these select the rounding mode and enable
	// saturation applied by the srs() calls below -- confirm.
	set_rnd(rnd_pos_inf);
	set_sat();
	const unsigned LSIZE = (samples/4/4); // assuming samples is integer power of 2 and greater than 16

	// NOTE(review): chess_loop_range(4,) presumably promises the compiler at
	// least four iterations, which would need samples >= 64 -- confirm against
	// the "greater than 16" assumption on LSIZE.
	for (unsigned int i = 0; i < LSIZE; ++i)
	chess_prepare_for_pipelining
	chess_loop_range(4,)
    	{
		// Group 1: seed acc from the cascade, then data offsets 0/2/4/6 with
		// coefficient offsets 0/2/4/6. New samples are inserted with
		// upd_v index 2 before the last two MACs read them.
		acc = readincr_v4(cascadein);
		acc  = mac4(acc, buff, 0 , 0x3210, 1,  coe, 0, 0x0000, 1);
		acc  = mac4(acc, buff, 2 , 0x3210, 1,  coe, 2, 0x0000, 1);
		buff = upd_v(buff, 2, readincr_v4(sig_in));
		acc  = mac4(acc, buff, 4 , 0x3210, 1,  coe, 4, 0x0000, 1);
		acc  = mac4(acc, buff, 6 , 0x3210, 1,  coe, 6, 0x0000, 1);
		writeincr_v4(data_out,srs(acc,shift));

		// Group 2: data offsets advance by 4 (4/6/8/10); upd_v index 3.
		acc = readincr_v4(cascadein);
		acc  = mac4(acc, buff, 4 , 0x3210, 1,  coe, 0, 0x0000, 1);
		acc  = mac4(acc, buff, 6 , 0x3210, 1,  coe, 2, 0x0000, 1);
		buff = upd_v(buff, 3, readincr_v4(sig_in));
		acc  = mac4(acc, buff, 8 , 0x3210, 1,  coe, 4, 0x0000, 1);
		acc  = mac4(acc, buff, 10, 0x3210, 1,  coe, 6, 0x0000, 1);
		writeincr_v4(data_out,srs(acc,shift));

		// Group 3: data offsets 8/10/12/14; upd_v index 0.
		acc = readincr_v4(cascadein);
		acc  = mac4(acc, buff, 8  , 0x3210, 1,  coe, 0, 0x0000, 1);
		acc  = mac4(acc, buff, 10 , 0x3210, 1,  coe, 2, 0x0000, 1);
		buff = upd_v(buff, 0, readincr_v4(sig_in));
		acc  = mac4(acc, buff, 12 , 0x3210, 1,  coe, 4, 0x0000, 1);
		acc  = mac4(acc, buff, 14 , 0x3210, 1,  coe, 6, 0x0000, 1);
		writeincr_v4(data_out,srs(acc,shift));

		// Group 4: data offsets 12/14, then 0/2 -- the offsets wrap around
		// the 16-element buffer; upd_v index 1.
		acc = readincr_v4(cascadein);
		acc  = mac4(acc, buff, 12 , 0x3210, 1,  coe, 0, 0x0000, 1);
		acc  = mac4(acc, buff, 14 , 0x3210, 1,  coe, 2, 0x0000, 1);
		buff = upd_v(buff, 1, readincr_v4(sig_in));
		acc  = mac4(acc, buff, 0  , 0x3210, 1,  coe, 4, 0x0000, 1);
		acc  = mac4(acc, buff, 2  , 0x3210, 1,  coe, 6, 0x0000, 1);
		writeincr_v4(data_out,srs(acc,shift));
	}
    	// Save the history for the next invocation.
    	delay_line = buff;
}

/**
 * Initialization function for the fir_32tap_core3 kernel.
 *
 * Discards the first 24 input values so that this kernel's input stream is
 * aligned with its position in the cascade chain.
 */
void fir_32tap_core3_init()
{
	const int skip_count = 24;

	// One get_ss(0) call per value to discard.
	int remaining = skip_count;
	while (remaining > 0)
	{
		get_ss(0);
		--remaining;
	}
}

计算图代码如下所示。

#include <adf.h>
#include "kernels.h"
using namespace adf;
/**
 * Graph that chains four FIR kernels (k0..k3) through cascade connections.
 *
 * The single input port in0123 is connected to in[0] of every kernel through
 * the named stream nets n0..n3. The cascade output of each kernel feeds
 * in[1] of the next one, and k3.out[0] drives the graph output. Each kernel
 * also has an initialization function registered by name.
 */
class firGraph : public graph {
	public:
	kernel k0,k1,k2,k3;
	// Single graph input shared by all four kernels.
	input_port in0123;
	output_port out;
	firGraph()
	{
		// k0: head of the chain; stream input only.
		k0 = kernel::create(fir_32tap_core0);
		runtime<ratio>(k0) = 0.9;
		source(k0) = "fir_32tap_core0.cpp";
		// The stream nets are named (n0..n3) so they can be referenced later,
		// e.g. by fifo_depth() constraints.
		connect<stream> n0(in0123,k0.in[0]);

		// k1: stream input plus cascade input from k0.
		k1 = kernel::create(fir_32tap_core1);
		runtime<ratio>(k1) = 0.9;
		source(k1) = "fir_32tap_core1.cpp";
		connect<stream> n1(in0123,k1.in[0]);
		connect<cascade> (k0.out[0],k1.in[1]);

		// k2: stream input plus cascade input from k1.
		k2 = kernel::create(fir_32tap_core2);
		runtime<ratio>(k2) = 0.9;
		source(k2) = "fir_32tap_core2.cpp";
		connect<stream> n2(in0123,k2.in[0]);
		connect<cascade> (k1.out[0],k2.in[1]);

		// k3: tail of the chain; cascade input from k2, stream output to the graph.
		k3 = kernel::create(fir_32tap_core3);
		runtime<ratio>(k3) = 0.9;
		source(k3) = "fir_32tap_core3.cpp";
		connect<stream> n3(in0123,k3.in[0]);
		connect<cascade> (k2.out[0],k3.in[1]);
		connect<stream> (k3.out[0],out);

		// Register each kernel's initialization function by name.
		initialization_function(k0) = "fir_32tap_core0_init";
		initialization_function(k1) = "fir_32tap_core1_init";
		initialization_function(k2) = "fir_32tap_core2_init";
		initialization_function(k3) = "fir_32tap_core3_init";
	};
};

通过级联串流连接的内核应同步操作。级联串流中的冲突可能导致内核停滞。内核中的循环必须具有可用的输入数据才能流畅运行。因此,至关重要的是,在每个内核的相应时间都有输入串流到达。要解决输入串流停滞(如有),可以将足够大的 FIFO 添加到与 AI 引擎内核相连的信号线中。例如:

// Example FIFO depths for the four input stream nets n0..n3.
// The values are deliberately all different from one another.
fifo_depth(n0)=175;
fifo_depth(n1)=150;
fifo_depth(n2)=125;
fifo_depth(n3)=100;

请注意,在先前示例中指定了不同的 FIFO 深度,以防止 FIFO 自动合并,对于所有信号线使用相同 FIFO 深度时可能发生此操作。

为了节省 FIFO 资源,通过观察每个内核中发生 CORE_INSTREAM_WIDE 事件的时间来设置不同的 FIFO 深度。事件发生越早,FIFO 深度需越深。例如:

// Example of reduced FIFO depths for the nets n0..n3: the depth shrinks from
// n0 to n3, with the deepest FIFO on the net whose kernel reads its input earliest.
fifo_depth(n0)=45;
fifo_depth(n1)=33;
fifo_depth(n2)=23;
fifo_depth(n3)=10;

如需了解有关计算图上编码的更多详细信息,请参阅 AI 引擎工具和流程用户指南 (UG1076)。