# Coding with Intrinsics - 2021.2 English

## AI Engine Kernel Coding Best Practices Guide (UG1079)

Document ID
UG1079
Release Date
2021-11-10
Version
2021.2 English

The four kernels in the 1 Gsps implementation can have different sets of coefficients and cascade streams between them. An implementation is shown in the following figure.

Figure 1. Four Kernels with Split Coefficient and Cascade Stream

Input data flows from stream to these four kernels. However, the second kernel will discard the first eight input data. The third kernel will discard the first 16 input data. Similarly, the fourth kernel will discard the first 24 input data.

The code for the first kernel is as follows.

``````#include <adf.h>
#include "fir_32tap.h"
// buffer to keep state
static v16cint16 delay_line;

void fir_32tap_core0(
input_stream_cint16 * sig_in,
{
const cint16_t * __restrict coeff = eq_coef0;
const v8cint16 *coef_  =  (v8cint16 const*)coeff;
const v8cint16 coe = *coef_;

v16cint16 buff = delay_line;
v4cacc48 acc;
const unsigned LSIZE = (samples/4/4); // assuming samples is integer power of 2 and greater than 16

for (unsigned int i = 0; i < LSIZE; ++i)
chess_prepare_for_pipelining
chess_loop_range(4,)
{
acc  = mul4(buff, 0 , 0x3210, 1,  coe, 0, 0x0000, 1);
acc  = mac4(acc, buff, 2 , 0x3210, 1,  coe, 2, 0x0000, 1);
acc  = mac4(acc, buff, 4 , 0x3210, 1,  coe, 4, 0x0000, 1);
acc  = mac4(acc, buff, 6 , 0x3210, 1,  coe, 6, 0x0000, 1);

acc  = mul4(buff, 4 , 0x3210, 1,  coe, 0, 0x0000, 1);
acc  = mac4(acc, buff, 6 , 0x3210, 1,  coe, 2, 0x0000, 1);
acc  = mac4(acc, buff, 8 , 0x3210, 1,  coe, 4, 0x0000, 1);
acc  = mac4(acc, buff, 10, 0x3210, 1,  coe, 6, 0x0000, 1);

acc  = mul4(buff, 8  , 0x3210, 1,  coe, 0, 0x0000, 1);
acc  = mac4(acc, buff, 10 , 0x3210, 1,  coe, 2, 0x0000, 1);
acc  = mac4(acc, buff, 12 , 0x3210, 1,  coe, 4, 0x0000, 1);
acc  = mac4(acc, buff, 14 , 0x3210, 1,  coe, 6, 0x0000, 1);

acc  = mul4(buff, 12 , 0x3210, 1,  coe, 0, 0x0000, 1);
acc  = mac4(acc, buff, 14 , 0x3210, 1,  coe, 2, 0x0000, 1);
acc  = mac4(acc, buff, 0  , 0x3210, 1,  coe, 4, 0x0000, 1);
acc  = mac4(acc, buff, 2  , 0x3210, 1,  coe, 6, 0x0000, 1);
}
delay_line = buff;
}

void fir_32tap_core0_init()
{
// Drop samples if not first block
int const Delay = 0;
for (int i = 0; i < Delay; ++i)
{
get_ss(0);
}

};
``````

Note that the function, `fir_32tap_core0_init`, is going to be the initialization function for the AI Engine kernel, `fir_32tap_core0`, which is only executed once at the kernel start. The purpose of this initialization function is to discard the unnecessary samples to align the input stream.

Similarly, the function, `fir_32tap_core1_init`, is going to be the initialization function for the AI Engine kernel, `fir_32tap_core1`, in the following codes. Same applies for the initialization functions, `fir_32tap_core2_init` and `fir_32tap_core3_init`.

The second kernel code is as follows.

``````#include <adf.h>
#include "fir_32tap.h"
// buffer to keep state
static v16cint16 delay_line;

void fir_32tap_core1(
input_stream_cint16 * sig_in,
{
const cint16_t * __restrict coeff = eq_coef1;
const v8cint16 *coef_  =  (v8cint16 const*)coeff;
const v8cint16 coe = *coef_;

v16cint16 buff = delay_line;
v4cacc48 acc;
const unsigned LSIZE = (samples/4/4); // assuming samples is integer power of 2 and greater than 16

for (unsigned int i = 0; i < LSIZE; ++i)
chess_prepare_for_pipelining
chess_loop_range(4,)
{
acc  = mac4(acc, buff, 0 , 0x3210, 1,  coe, 0, 0x0000, 1);
acc  = mac4(acc, buff, 2 , 0x3210, 1,  coe, 2, 0x0000, 1);
acc  = mac4(acc, buff, 4 , 0x3210, 1,  coe, 4, 0x0000, 1);
acc  = mac4(acc, buff, 6 , 0x3210, 1,  coe, 6, 0x0000, 1);

acc  = mac4(acc, buff, 4 , 0x3210, 1,  coe, 0, 0x0000, 1);
acc  = mac4(acc, buff, 6 , 0x3210, 1,  coe, 2, 0x0000, 1);
acc  = mac4(acc, buff, 8 , 0x3210, 1,  coe, 4, 0x0000, 1);
acc  = mac4(acc, buff, 10, 0x3210, 1,  coe, 6, 0x0000, 1);

acc  = mac4(acc, buff, 8  , 0x3210, 1,  coe, 0, 0x0000, 1);
acc  = mac4(acc, buff, 10 , 0x3210, 1,  coe, 2, 0x0000, 1);
acc  = mac4(acc, buff, 12 , 0x3210, 1,  coe, 4, 0x0000, 1);
acc  = mac4(acc, buff, 14 , 0x3210, 1,  coe, 6, 0x0000, 1);

acc  = mac4(acc, buff, 12 , 0x3210, 1,  coe, 0, 0x0000, 1);
acc  = mac4(acc, buff, 14 , 0x3210, 1,  coe, 2, 0x0000, 1);
acc  = mac4(acc, buff, 0  , 0x3210, 1,  coe, 4, 0x0000, 1);
acc  = mac4(acc, buff, 2  , 0x3210, 1,  coe, 6, 0x0000, 1);
}
delay_line = buff;
}

void fir_32tap_core1_init()
{
// Drop samples if not first block
int const Delay = 8;
for (int i = 0; i < Delay; ++i)
{
get_ss(0);
}
};``````

The third kernel is similar to the second one. The last kernel is as follows.

``````#include <adf.h>
#include "fir_32tap.h"
// buffer to keep state
static v16cint16 delay_line;

void fir_32tap_core3(
input_stream_cint16 * sig_in,
output_stream_cint16 * data_out)
{
const cint16_t * __restrict coeff = eq_coef3;
const v8cint16 *coef_  =  (v8cint16 const*)coeff;
const v8cint16 coe = *coef_;

v16cint16 buff = delay_line;

v4cacc48 acc;

set_rnd(rnd_pos_inf);
set_sat();
const unsigned LSIZE = (samples/4/4); // assuming samples is integer power of 2 and greater than 16

for (unsigned int i = 0; i < LSIZE; ++i)
chess_prepare_for_pipelining
chess_loop_range(4,)
{
acc  = mac4(acc, buff, 0 , 0x3210, 1,  coe, 0, 0x0000, 1);
acc  = mac4(acc, buff, 2 , 0x3210, 1,  coe, 2, 0x0000, 1);
acc  = mac4(acc, buff, 4 , 0x3210, 1,  coe, 4, 0x0000, 1);
acc  = mac4(acc, buff, 6 , 0x3210, 1,  coe, 6, 0x0000, 1);
writeincr_v4(data_out,srs(acc,shift));

acc  = mac4(acc, buff, 4 , 0x3210, 1,  coe, 0, 0x0000, 1);
acc  = mac4(acc, buff, 6 , 0x3210, 1,  coe, 2, 0x0000, 1);
acc  = mac4(acc, buff, 8 , 0x3210, 1,  coe, 4, 0x0000, 1);
acc  = mac4(acc, buff, 10, 0x3210, 1,  coe, 6, 0x0000, 1);
writeincr_v4(data_out,srs(acc,shift));

acc  = mac4(acc, buff, 8  , 0x3210, 1,  coe, 0, 0x0000, 1);
acc  = mac4(acc, buff, 10 , 0x3210, 1,  coe, 2, 0x0000, 1);
acc  = mac4(acc, buff, 12 , 0x3210, 1,  coe, 4, 0x0000, 1);
acc  = mac4(acc, buff, 14 , 0x3210, 1,  coe, 6, 0x0000, 1);
writeincr_v4(data_out,srs(acc,shift));

acc  = mac4(acc, buff, 12 , 0x3210, 1,  coe, 0, 0x0000, 1);
acc  = mac4(acc, buff, 14 , 0x3210, 1,  coe, 2, 0x0000, 1);
acc  = mac4(acc, buff, 0  , 0x3210, 1,  coe, 4, 0x0000, 1);
acc  = mac4(acc, buff, 2  , 0x3210, 1,  coe, 6, 0x0000, 1);
writeincr_v4(data_out,srs(acc,shift));
}
delay_line = buff;
}

void fir_32tap_core3_init()
{
// Drop samples if not first block
int const Delay = 24;
for (int i = 0; i < Delay; ++i)
{
get_ss(0);
}
};``````

The graph code is as follows.

``````#include <adf.h>
#include "kernels.h"
class firGraph : public graph {
public:
kernel k0,k1,k2,k3;
input_port in0123;
output_port out;
firGraph()
{
k0 = kernel::create(fir_32tap_core0);
runtime<ratio>(k0) = 0.9;
source(k0) = "fir_32tap_core0.cpp";
connect<stream> n0(in0123,k0.in[0]);

k1 = kernel::create(fir_32tap_core1);
runtime<ratio>(k1) = 0.9;
source(k1) = "fir_32tap_core1.cpp";
connect<stream> n1(in0123,k1.in[0]);

k2 = kernel::create(fir_32tap_core2);
runtime<ratio>(k2) = 0.9;
source(k2) = "fir_32tap_core2.cpp";
connect<stream> n2(in0123,k2.in[0]);

k3 = kernel::create(fir_32tap_core3);
runtime<ratio>(k3) = 0.9;
source(k3) = "fir_32tap_core3.cpp";
connect<stream> n3(in0123,k3.in[0]);
connect<stream> (k3.out[0],out);

initialization_function(k0) = "fir_32tap_core0_init";
initialization_function(k1) = "fir_32tap_core1_init";
initialization_function(k2) = "fir_32tap_core2_init";
initialization_function(k3) = "fir_32tap_core3_init";
};
};``````

The kernels connected through cascade streams are expected to operate synchronously. Conflicts in cascade streams can stall the kernels. Loops in the kernels must have input data available to run smoothly. Hence it is important that the input stream arrives at the appropriate time for each kernel. The input stream stall (if any) can be resolved by adding a large enough FIFO to the net connecting to the AI Engine kernels. For example:

``````fifo_depth(n0)=175;
fifo_depth(n1)=150;
fifo_depth(n2)=125;
fifo_depth(n3)=100;``````

Note that different FIFO depths are specified in the previous example to prevent auto FIFO merge which can occur when a common FIFO depth is used for all nets.

For the purpose of saving FIFO resources, individual FIFO depths can be set by looking at when the event `CORE_INSTREAM_WIDE` occurs for each kernel. The earlier the event occurs, the deeper the FIFO needs to be. For example:

``````fifo_depth(n0)=45;
fifo_depth(n1)=33;
fifo_depth(n2)=23;
fifo_depth(n3)=10;``````

For additional details about coding on graph, refer to the Versal ACAP AI Engine Programming Environment User Guide (UG1076).