組み込み関数を使用したコード記述

組み込み関数を使用したコード記述 - 2023.2 日本語

AI エンジンカーネルおよびグラフプログラミングガイド (UG1079)

Document ID

UG1079

Release Date

2023-12-04

Version

2023.2 日本語

1 GSPS インプリメンテーションでは、4 つのカーネルにそれぞれ異なる係数セットを使用し、カーネル間をカスケードストリームで接続できます。このインプリメンテーションを次の図に示します。

図 1. 分割係数とカスケードストリームを使用した 4 つのカーネル

入力データは、ストリームからこれらの 4 つのカーネルに流れます。ただし、2 番目のカーネルでは、最初の 8 つの入力データが破棄されます。3 番目のカーネルでは、最初の 16 個の入力データが破棄されます。同様に、4 番目のカーネルでは、最初の 24 個の入力データが破棄されます。

最初のカーネルのコードは、次のとおりです。

#include <adf.h>
#include "fir_32tap.h"
// Filter state kept between invocations of the kernel in this file:
// fir_32tap_core0 copies it into a local vector on entry and stores the
// local vector back here just before returning.
static v16cint16 delay_line;

// First kernel of the four-kernel cascaded FIR. It applies the 8 coefficients
// loaded from eq_coef0 and starts the partial sums (mul4, no cascade input).
//
// sig_in     : cint16 input sample stream; 4 samples are read per output group.
// cascadeout : cacc48 cascade stream; one v4cacc48 partial sum is written per
//              output group.
//
// Each loop iteration consumes 16 input samples as four groups of 4. The
// sample history lives in the file-level delay_line, which is loaded into
// buff on entry and written back on exit.
void fir_32tap_core0(
	input_stream_cint16 * sig_in,
	output_stream_cacc48 * cascadeout)
{
	// Reinterpret the coefficient array as one 8-lane vector and load it once.
	const cint16_t * __restrict coeff = eq_coef0;
	const v8cint16 *coef_  =  (v8cint16 const*)coeff;
	const v8cint16 coe = *coef_;

	v16cint16 buff = delay_line;
	v4cacc48 acc;
	// One iteration per 16 samples. `samples` is not defined in this block
	// (presumably it comes from fir_32tap.h).
	// NOTE(review): chess_loop_range(4,) below declares a minimum of 4
	// iterations, which needs samples >= 64; the original remark only asked
	// for "greater than 16" — confirm the intended lower bound.
	const unsigned LSIZE = (samples/4/4); // assumes samples is an integer power of 2

	// Argument order of mul4/mac4 as used here (see the AI Engine intrinsics
	// reference to confirm): [acc,] xbuff, xstart, xoffsets, xstep,
	//                               zbuff, zstart, zoffsets, zstep.
	// Within each group xstart advances by 2 per call while zstart walks
	// 0,2,4,6 over coe — presumably two coefficients per call, so the four
	// calls together cover all 8 coefficients.
	for (unsigned int i = 0; i < LSIZE; ++i)
	chess_prepare_for_pipelining
	chess_loop_range(4,)
	{
		// Group 0: window starts at buff index 0; the 4 new samples are written
		// to sub-vector 2 of buff before the last two MACs read them.
		acc  = mul4(buff, 0 , 0x3210, 1,  coe, 0, 0x0000, 1);
		acc  = mac4(acc, buff, 2 , 0x3210, 1,  coe, 2, 0x0000, 1);
		buff = upd_v(buff, 2, readincr_v4(sig_in));
		acc  = mac4(acc, buff, 4 , 0x3210, 1,  coe, 4, 0x0000, 1);
		acc  = mac4(acc, buff, 6 , 0x3210, 1,  coe, 6, 0x0000, 1);
		writeincr_v4(cascadeout,acc);

		// Group 1: window starts at index 4; new samples go to sub-vector 3.
		acc  = mul4(buff, 4 , 0x3210, 1,  coe, 0, 0x0000, 1);
		acc  = mac4(acc, buff, 6 , 0x3210, 1,  coe, 2, 0x0000, 1);
		buff = upd_v(buff, 3, readincr_v4(sig_in));
		acc  = mac4(acc, buff, 8 , 0x3210, 1,  coe, 4, 0x0000, 1);
		acc  = mac4(acc, buff, 10, 0x3210, 1,  coe, 6, 0x0000, 1);
		writeincr_v4(cascadeout,acc);

		// Group 2: window starts at index 8; new samples go to sub-vector 0.
		acc  = mul4(buff, 8  , 0x3210, 1,  coe, 0, 0x0000, 1);
		acc  = mac4(acc, buff, 10 , 0x3210, 1,  coe, 2, 0x0000, 1);
		buff = upd_v(buff, 0, readincr_v4(sig_in));
		acc  = mac4(acc, buff, 12 , 0x3210, 1,  coe, 4, 0x0000, 1);
		acc  = mac4(acc, buff, 14 , 0x3210, 1,  coe, 6, 0x0000, 1);
		writeincr_v4(cascadeout,acc);

		// Group 3: window starts at index 12; new samples go to sub-vector 1.
		// The last two start indices wrap back to 0 and 2, i.e. buff is
		// addressed as a circular 16-sample window.
		acc  = mul4(buff, 12 , 0x3210, 1,  coe, 0, 0x0000, 1);
		acc  = mac4(acc, buff, 14 , 0x3210, 1,  coe, 2, 0x0000, 1);
		buff = upd_v(buff, 1, readincr_v4(sig_in));
		acc  = mac4(acc, buff, 0  , 0x3210, 1,  coe, 4, 0x0000, 1);
		acc  = mac4(acc, buff, 2  , 0x3210, 1,  coe, 6, 0x0000, 1);
		writeincr_v4(cascadeout,acc);
    }
    // Save the sample history for the next invocation.
    delay_line = buff;
}

// Initialization function of kernel fir_32tap_core0.
// It discards leading words from input stream 0 so that this kernel's input is
// aligned with the other kernels in the chain. The first kernel has nothing
// to discard, so its skip count is zero and the loop body never executes.
void fir_32tap_core0_init()
{
	const int samplesToSkip = 0;

	int remaining = samplesToSkip;
	while (remaining > 0)
	{
		get_ss(0);   // read one word and ignore it
		--remaining;
	}
}

関数 fir_32tap_core0_init は、AI エンジンカーネル fir_32tap_core0 の初期化関数となり、カーネルの起動時に一度だけ実行されることに注意してください。この初期化関数の目的は、入力ストリームを揃えるために不要なサンプルを破棄することです。

同様に、関数 fir_32tap_core1_init は AI エンジンカーネル fir_32tap_core1 の初期化関数となり、コードは次のとおりです。初期化関数 fir_32tap_core2_init および fir_32tap_core3_init も同様です。

2 番目のカーネルのコードは次のとおりです。

#include <adf.h>
#include "fir_32tap.h"
// Filter state kept between invocations of the kernel in this file:
// fir_32tap_core1 copies it into a local vector on entry and stores the
// local vector back here just before returning.
static v16cint16 delay_line;

// Second kernel of the four-kernel cascaded FIR. It applies the 8
// coefficients loaded from eq_coef1 and adds its products to the partial
// sums received from the previous kernel.
//
// sig_in     : cint16 input sample stream; 4 samples are read per output group.
// cascadein  : cacc48 cascade stream carrying the incoming partial sums.
// cascadeout : cacc48 cascade stream receiving the updated partial sums.
//
// Each loop iteration consumes 16 input samples as four groups of 4. The
// sample history lives in the file-level delay_line, which is loaded into
// buff on entry and written back on exit.
void fir_32tap_core1(
	input_stream_cint16 * sig_in,
	input_stream_cacc48 * cascadein,
	output_stream_cacc48 * cascadeout)
{
    // Reinterpret the coefficient array as one 8-lane vector and load it once.
    const cint16_t * __restrict coeff = eq_coef1;
    const v8cint16 *coef_  =  (v8cint16 const*)coeff;
    const v8cint16 coe = *coef_;

    v16cint16 buff = delay_line;
    v4cacc48 acc;
    // One iteration per 16 samples. `samples` is not defined in this block
    // (presumably it comes from fir_32tap.h).
    // NOTE(review): chess_loop_range(4,) below declares a minimum of 4
    // iterations, which needs samples >= 64; the original remark only asked
    // for "greater than 16" — confirm the intended lower bound.
    const unsigned LSIZE = (samples/4/4); // assumes samples is an integer power of 2

    // Argument order of mac4 as used here (see the AI Engine intrinsics
    // reference to confirm): acc, xbuff, xstart, xoffsets, xstep,
    //                             zbuff, zstart, zoffsets, zstep.
    // Within each group xstart advances by 2 per call while zstart walks
    // 0,2,4,6 over coe — presumably two coefficients per call, so the four
    // calls together cover all 8 coefficients.
    for (unsigned int i = 0; i < LSIZE; ++i)
    chess_prepare_for_pipelining
    chess_loop_range(4,)
    {
        // Group 0: start from the upstream partial sum; window starts at buff
        // index 0 and the 4 new samples are written to sub-vector 2.
        acc = readincr_v4(cascadein);
        acc  = mac4(acc, buff, 0 , 0x3210, 1,  coe, 0, 0x0000, 1);
        acc  = mac4(acc, buff, 2 , 0x3210, 1,  coe, 2, 0x0000, 1);
        buff = upd_v(buff, 2, readincr_v4(sig_in));
        acc  = mac4(acc, buff, 4 , 0x3210, 1,  coe, 4, 0x0000, 1);
        acc  = mac4(acc, buff, 6 , 0x3210, 1,  coe, 6, 0x0000, 1);
        writeincr_v4(cascadeout,acc);

        // Group 1: window starts at index 4; new samples go to sub-vector 3.
        acc = readincr_v4(cascadein);
        acc  = mac4(acc, buff, 4 , 0x3210, 1,  coe, 0, 0x0000, 1);
        acc  = mac4(acc, buff, 6 , 0x3210, 1,  coe, 2, 0x0000, 1);
        buff = upd_v(buff, 3, readincr_v4(sig_in));
        acc  = mac4(acc, buff, 8 , 0x3210, 1,  coe, 4, 0x0000, 1);
        acc  = mac4(acc, buff, 10, 0x3210, 1,  coe, 6, 0x0000, 1);
        writeincr_v4(cascadeout,acc);

        // Group 2: window starts at index 8; new samples go to sub-vector 0.
        acc = readincr_v4(cascadein);
        acc  = mac4(acc, buff, 8  , 0x3210, 1,  coe, 0, 0x0000, 1);
        acc  = mac4(acc, buff, 10 , 0x3210, 1,  coe, 2, 0x0000, 1);
        buff = upd_v(buff, 0, readincr_v4(sig_in));
        acc  = mac4(acc, buff, 12 , 0x3210, 1,  coe, 4, 0x0000, 1);
        acc  = mac4(acc, buff, 14 , 0x3210, 1,  coe, 6, 0x0000, 1);
        writeincr_v4(cascadeout,acc);

        // Group 3: window starts at index 12; new samples go to sub-vector 1.
        // The last two start indices wrap back to 0 and 2, i.e. buff is
        // addressed as a circular 16-sample window.
        acc = readincr_v4(cascadein);
        acc  = mac4(acc, buff, 12 , 0x3210, 1,  coe, 0, 0x0000, 1);
        acc  = mac4(acc, buff, 14 , 0x3210, 1,  coe, 2, 0x0000, 1);
        buff = upd_v(buff, 1, readincr_v4(sig_in));
        acc  = mac4(acc, buff, 0  , 0x3210, 1,  coe, 4, 0x0000, 1);
        acc  = mac4(acc, buff, 2  , 0x3210, 1,  coe, 6, 0x0000, 1);
        writeincr_v4(cascadeout,acc);
    }
    // Save the sample history for the next invocation.
    delay_line = buff;
}

// Initialization function of kernel fir_32tap_core1.
// It discards the first 8 words from input stream 0 so that this kernel's
// input is aligned with its position in the cascade chain.
void fir_32tap_core1_init()
{
    const int samplesToSkip = 8;

    int remaining = samplesToSkip;
    while (remaining > 0)
    {
        get_ss(0);   // read one word and ignore it
        --remaining;
    }
}

3 番目のカーネルのコードは、2 番目のカーネルのコードと同様です。最後のカーネルのコードは次のとおりです。

#include <adf.h>
#include "fir_32tap.h"
// Filter state kept between invocations of the kernel in this file:
// fir_32tap_core3 copies it into a local vector on entry and stores the
// local vector back here just before returning.
static v16cint16 delay_line;

// Last kernel of the four-kernel cascaded FIR. It applies the 8 coefficients
// loaded from eq_coef3, adds its products to the partial sums received from
// the previous kernel, and converts each final accumulator to cint16 output.
//
// sig_in    : cint16 input sample stream; 4 samples are read per output group.
// cascadein : cacc48 cascade stream carrying the incoming partial sums.
// data_out  : cint16 output stream; 4 results are written per output group
//             after srs(acc, shift).
//
// Each loop iteration consumes 16 input samples as four groups of 4. The
// sample history lives in the file-level delay_line, which is loaded into
// buff on entry and written back on exit.
void fir_32tap_core3(
	input_stream_cint16 * sig_in,
	input_stream_cacc48 * cascadein,
	output_stream_cint16 * data_out)
{
	// Reinterpret the coefficient array as one 8-lane vector and load it once.
	const cint16_t * __restrict coeff = eq_coef3;
	const v8cint16 *coef_  =  (v8cint16 const*)coeff;
	const v8cint16 coe = *coef_;

	v16cint16 buff = delay_line;

	v4cacc48 acc;

	// Select the rounding mode and turn saturation on; presumably these
	// govern the srs() conversions below — confirm in the intrinsics reference.
	set_rnd(rnd_pos_inf);
	set_sat();
	// One iteration per 16 samples. `samples` and `shift` are not defined in
	// this block (presumably they come from fir_32tap.h).
	// NOTE(review): chess_loop_range(4,) below declares a minimum of 4
	// iterations, which needs samples >= 64; the original remark only asked
	// for "greater than 16" — confirm the intended lower bound.
	const unsigned LSIZE = (samples/4/4); // assumes samples is an integer power of 2

	// Argument order of mac4 as used here (see the AI Engine intrinsics
	// reference to confirm): acc, xbuff, xstart, xoffsets, xstep,
	//                             zbuff, zstart, zoffsets, zstep.
	// Within each group xstart advances by 2 per call while zstart walks
	// 0,2,4,6 over coe — presumably two coefficients per call, so the four
	// calls together cover all 8 coefficients.
	for (unsigned int i = 0; i < LSIZE; ++i)
	chess_prepare_for_pipelining
	chess_loop_range(4,)
    	{
		// Group 0: start from the upstream partial sum; window starts at buff
		// index 0 and the 4 new samples are written to sub-vector 2.
		acc = readincr_v4(cascadein);
		acc  = mac4(acc, buff, 0 , 0x3210, 1,  coe, 0, 0x0000, 1);
		acc  = mac4(acc, buff, 2 , 0x3210, 1,  coe, 2, 0x0000, 1);
		buff = upd_v(buff, 2, readincr_v4(sig_in));
		acc  = mac4(acc, buff, 4 , 0x3210, 1,  coe, 4, 0x0000, 1);
		acc  = mac4(acc, buff, 6 , 0x3210, 1,  coe, 6, 0x0000, 1);
		writeincr_v4(data_out,srs(acc,shift));

		// Group 1: window starts at index 4; new samples go to sub-vector 3.
		acc = readincr_v4(cascadein);
		acc  = mac4(acc, buff, 4 , 0x3210, 1,  coe, 0, 0x0000, 1);
		acc  = mac4(acc, buff, 6 , 0x3210, 1,  coe, 2, 0x0000, 1);
		buff = upd_v(buff, 3, readincr_v4(sig_in));
		acc  = mac4(acc, buff, 8 , 0x3210, 1,  coe, 4, 0x0000, 1);
		acc  = mac4(acc, buff, 10, 0x3210, 1,  coe, 6, 0x0000, 1);
		writeincr_v4(data_out,srs(acc,shift));

		// Group 2: window starts at index 8; new samples go to sub-vector 0.
		acc = readincr_v4(cascadein);
		acc  = mac4(acc, buff, 8  , 0x3210, 1,  coe, 0, 0x0000, 1);
		acc  = mac4(acc, buff, 10 , 0x3210, 1,  coe, 2, 0x0000, 1);
		buff = upd_v(buff, 0, readincr_v4(sig_in));
		acc  = mac4(acc, buff, 12 , 0x3210, 1,  coe, 4, 0x0000, 1);
		acc  = mac4(acc, buff, 14 , 0x3210, 1,  coe, 6, 0x0000, 1);
		writeincr_v4(data_out,srs(acc,shift));

		// Group 3: window starts at index 12; new samples go to sub-vector 1.
		// The last two start indices wrap back to 0 and 2, i.e. buff is
		// addressed as a circular 16-sample window.
		acc = readincr_v4(cascadein);
		acc  = mac4(acc, buff, 12 , 0x3210, 1,  coe, 0, 0x0000, 1);
		acc  = mac4(acc, buff, 14 , 0x3210, 1,  coe, 2, 0x0000, 1);
		buff = upd_v(buff, 1, readincr_v4(sig_in));
		acc  = mac4(acc, buff, 0  , 0x3210, 1,  coe, 4, 0x0000, 1);
		acc  = mac4(acc, buff, 2  , 0x3210, 1,  coe, 6, 0x0000, 1);
		writeincr_v4(data_out,srs(acc,shift));
	}
    	// Save the sample history for the next invocation.
    	delay_line = buff;
}

// Initialization function of kernel fir_32tap_core3.
// It discards the first 24 words from input stream 0 so that this kernel's
// input is aligned with its position in the cascade chain.
void fir_32tap_core3_init()
{
	const int samplesToSkip = 24;

	int remaining = samplesToSkip;
	while (remaining > 0)
	{
		get_ss(0);   // read one word and ignore it
		--remaining;
	}
}

グラフコードは次のとおりです。

#include <adf.h>
#include "kernels.h"
using namespace adf;
// Graph for the four-kernel FIR: the single input port feeds every kernel
// over its own stream net (n0..n3), partial sums travel k0 -> k1 -> k2 -> k3
// over cascade connections, and k3 drives the output port.
class firGraph : public graph {
public:
	kernel k0;
	kernel k1;
	kernel k2;
	kernel k3;
	input_port in0123;   // shared by all four kernels
	output_port out;

	firGraph()
	{
		// k0: head of the cascade chain, stream input only.
		k0 = kernel::create(fir_32tap_core0);
		source(k0) = "fir_32tap_core0.cpp";
		runtime<ratio>(k0) = 0.9;
		initialization_function(k0) = "fir_32tap_core0_init";
		connect<stream> n0(in0123, k0.in[0]);

		// k1: stream input plus cascade from k0.
		k1 = kernel::create(fir_32tap_core1);
		source(k1) = "fir_32tap_core1.cpp";
		runtime<ratio>(k1) = 0.9;
		initialization_function(k1) = "fir_32tap_core1_init";
		connect<stream> n1(in0123, k1.in[0]);
		connect<cascade>(k0.out[0], k1.in[1]);

		// k2: stream input plus cascade from k1.
		k2 = kernel::create(fir_32tap_core2);
		source(k2) = "fir_32tap_core2.cpp";
		runtime<ratio>(k2) = 0.9;
		initialization_function(k2) = "fir_32tap_core2_init";
		connect<stream> n2(in0123, k2.in[0]);
		connect<cascade>(k1.out[0], k2.in[1]);

		// k3: stream input plus cascade from k2; produces the graph output.
		k3 = kernel::create(fir_32tap_core3);
		source(k3) = "fir_32tap_core3.cpp";
		runtime<ratio>(k3) = 0.9;
		initialization_function(k3) = "fir_32tap_core3_init";
		connect<stream> n3(in0123, k3.in[0]);
		connect<cascade>(k2.out[0], k3.in[1]);
		connect<stream>(k3.out[0], out);
	}
};

カスケードストリームで接続されたカーネルは、同期動作します。カスケードストリームで競合が発生すると、カーネルがストールする可能性があります。カーネル内のループをスムーズに動作させるためには、入力データが使用可能であることが必要です。入力ストリームが各カーネルに適切なときに到着することが重要です。入力ストリームのストールは、AI エンジンカーネルに接続されているネットに十分に大きな FIFO を追加することで解決できます。次に例を示します。

// FIFO depths for the four input stream nets n0..n3. The values are kept
// different from one another on purpose: the accompanying text notes that
// using the same depth on every net may cause the FIFOs to be merged
// automatically.
fifo_depth(n0)=175;
fifo_depth(n1)=150;
fifo_depth(n2)=125;
fifo_depth(n3)=100;

すべてのネットに同じ FIFO 深さを使用すると、自動的に FIFO が結合される可能性があるので、上記の例では異なる FIFO 深さが指定されています。

FIFO リソースを節約するため、各カーネルでイベント CORE_INSTREAM_WIDE がいつ発生するかを調べることにより、個々の FIFO 深さを設定できます。イベントが早く発生するほど、FIFO を深くする必要があります。次に例を示します。

// Smaller, individually sized FIFO depths that save FIFO resources. Per the
// accompanying text, a net whose kernel sees the CORE_INSTREAM_WIDE event
// earlier needs a deeper FIFO.
fifo_depth(n0)=45;
fifo_depth(n1)=33;
fifo_depth(n2)=23;
fifo_depth(n3)=10;

グラフコード記述の詳細は、『AI エンジンツールおよびフローユーザーガイド』 (UG1076) を参照してください。