如“综合汇总”中所述,Vitis HLS 会发出报告,汇总突发活动并指出突发推断失败之处。如果推断出的是可变长度突发,报告会注明已推断出可变长度突发。编译器还会提供突发消息,可在编译器日志文件 vitis_hls.log 中找到这些消息。这些消息是在调度步骤之前发出的。
简单读写突发推断
以下示例是读取和写入 DDR 并推断读取和写入突发的标准方法。Vitis HLS 编译器将为以下示例报告以下突发推断:
INFO: [HLS 214-115] Burst read of variable length and bit width 32 has been inferred on port 'gmem'
INFO: [HLS 214-115] Burst write of variable length and bit width 32 has been inferred on port 'gmem' (./src/vadd.cpp:75:9).
此示例的代码如下所示:
/****** BEGIN EXAMPLE *******/
#define DATA_SIZE 2048
// Maximum number of words held in the local burst buffer
#define BURSTBUFFERSIZE 256

// Constants referenced by the LOOP_TRIPCOUNT pragmas below
const unsigned int c_min = 1;
const unsigned int c_max = BURSTBUFFERSIZE;
const unsigned int c_chunk_sz = DATA_SIZE;

extern "C" {
// Adds inc_value to each of the first `size` elements of `a`, in place.
//
// The array is processed in chunks of at most BURSTBUFFERSIZE words: each
// chunk is copied into a local buffer with a sequential read loop, then
// incremented and written back with a sequential write loop.
//
//   a         - data to update in place; must hold at least `size` ints
//   size      - number of elements to process; values <= 0 do nothing
//   inc_value - amount added to every processed element
void vadd(int *a, int size, int inc_value) {
    // Map pointer a to an AXI4 master interface for global memory access
    #pragma HLS INTERFACE mode=m_axi port=a offset=slave bundle=gmem max_read_burst_length=256 max_write_burst_length=256
    // Map the arguments and the return/control signals to one AXI4-Lite bundle
    #pragma HLS INTERFACE mode=s_axilite port=a bundle=control
    #pragma HLS INTERFACE mode=s_axilite port=size bundle=control
    #pragma HLS INTERFACE mode=s_axilite port=inc_value bundle=control
    #pragma HLS INTERFACE mode=s_axilite port=return bundle=control

    int burstbuffer[BURSTBUFFERSIZE];

    // Each iteration of this loop handles one chunk of up to BURSTBUFFERSIZE
    // elements starting at offset i.
    for (int i = 0; i < size; i += BURSTBUFFERSIZE) {
        #pragma HLS LOOP_TRIPCOUNT min=c_min*c_min max=c_chunk_sz*c_chunk_sz/(c_max*c_max)
        int chunk_size = BURSTBUFFERSIZE;
        // Boundary check: the last chunk may be shorter than the buffer
        if ((i + BURSTBUFFERSIZE) > size)
            chunk_size = size - i;

        // Sequential read of the current chunk into the local buffer.
        // A plain for loop is used instead of memcpy. The loop needs its own
        // index: reusing i here would overwrite the chunk offset of the outer
        // loop and always read from the start of a.
    read_buf:
        for (int j = 0; j < chunk_size; j++) {
            #pragma HLS LOOP_TRIPCOUNT min=c_min max=c_max
            burstbuffer[j] = a[i + j];
        }

        // Compute the results and write them back to the same chunk of a
        // with a sequential write loop.
    calc_write:
        for (int j = 0; j < chunk_size; j++) {
            #pragma HLS LOOP_TRIPCOUNT min=c_min max=c_max
            #pragma HLS PIPELINE II=1
            burstbuffer[j] = burstbuffer[j] + inc_value;
            a[i + j] = burstbuffer[j];
        }
    }
}
} // extern "C"
突发之间的流水打拍
以下示例将推断长度为 N 的突发:
// Pseudo-code: for each x in [0, k), read N consecutive elements of gmem
// starting at the offset returned by f(x). The "..." stands for whatever
// consumes each value that is read.
for(int x=0; x < k; ++x) {
int off = f(x);
for(int i = 0; i < N; ++i) {
// The PIPELINE pragma is applied to this inner loop only; the outer loop
// over x carries no pipeline pragma.
#pragma HLS PIPELINE II=1
... = gmem[off + i];
}
}
但请注意,外层循环并不采用流水打拍。这意味着虽然突发内部存在流水打拍,但是突发之间不存在任何流水打拍。
为了做出补救,您可以将内层循环展开,并将外层循环流水打拍,使突发之间也采用流水打拍。以下示例仍将推断长度为 N 的突发,但现在突发之间同样采用流水打拍,因而吞吐量更高:
// Pseudo-code: for each x in [0, k), read N consecutive elements of gmem
// starting at the offset returned by f(x). The "..." stands for whatever
// consumes each value that is read.
for(int x=0; x < k; ++x) {
// Here the PIPELINE pragma sits on the outer loop, with II=N.
#pragma HLS PIPELINE II=N
int off = f(x);
for(int i = 0; i < N; ++i) {
// The inner loop is unrolled rather than pipelined.
#pragma HLS UNROLL
... = gmem[off + i];
}
}
从二维阵列访问行数据
以下是读写二维阵列的示例。Vitis HLS 会推断读取突发和写入突发,并发出以下消息:
INFO: [HLS 214-115] Burst read of length 256 and bit width 512 has been inferred on port 'gmem' (./src/row_array_2d.cpp:43:5)
INFO: [HLS 214-115] Burst write of length 256 and bit width 512 has been inferred on port 'gmem' (./src/row_array_2d.cpp:56:5)
请注意,在此示例内达成的位宽为 512。这比上述简单示例中达成的 32 位宽更高效。增大突发位宽是最优化突发的另一种方法,如 端口宽度自动调整 中所述。
此示例的代码如下所示:
/****** BEGIN EXAMPLE *******/
// Compile-time dimensions of the 2D array:
//   NUM_ROWS     - number of rows (matrix height)
//   WORD_PER_ROW - number of words in each row
//   BLOCK_SIZE   - total number of words (WORD_PER_ROW * NUM_ROWS)
#define NUM_ROWS 64
#define WORD_PER_ROW 64
#define BLOCK_SIZE (WORD_PER_ROW*NUM_ROWS)
// Element type of the array; int by default
using DTYPE = int;
// Stream type used to pass DTYPE elements between the functions below
using my_data_fifo = hls::stream<DTYPE>;
// Reads the NUM_ROWS x WORD_PER_ROW words of inx in row-major order and
// pushes each word into inFifo.
//   inx    - source array, indexed from 0 to NUM_ROWS * WORD_PER_ROW - 1
//   inFifo - stream that receives the words in the order they are read
void read_data(DTYPE *inx, my_data_fifo &inFifo) {
read_loop_i:
    for (int row = 0; row < NUM_ROWS; ++row) {
    read_loop_jj:
        for (int col = 0; col < WORD_PER_ROW; ++col) {
            #pragma HLS PIPELINE II=1
            // Row-major position of element (row, col)
            const int idx = WORD_PER_ROW * row + col;
            inFifo << inx[idx];
        }
    }
}
// Pops NUM_ROWS x WORD_PER_ROW words from outFifo and stores them into outx
// in row-major order.
//   outx    - destination array, indexed from 0 to NUM_ROWS * WORD_PER_ROW - 1
//   outFifo - stream that supplies the words in the order they are stored
void write_data(DTYPE *outx, my_data_fifo &outFifo) {
write_loop_i:
    for (int row = 0; row < NUM_ROWS; ++row) {
    write_loop_jj:
        for (int col = 0; col < WORD_PER_ROW; ++col) {
            #pragma HLS PIPELINE II=1
            // Row-major position of element (row, col)
            const int idx = WORD_PER_ROW * row + col;
            outFifo >> outx[idx];
        }
    }
}
// Multiplies every word taken from inFifo by alpha and pushes the product
// into outFifo, for NUM_ROWS x WORD_PER_ROW words in total. The arithmetic is
// deliberately trivial: the example is about the memory access pattern.
//   inFifo  - stream of input words
//   outFifo - stream that receives the scaled words
//   alpha   - scale factor applied to every word
void compute(my_data_fifo &inFifo, my_data_fifo &outFifo, int alpha) {
compute_loop_i:
    for (int row = 0; row < NUM_ROWS; ++row) {
    compute_loop_jj:
        for (int col = 0; col < WORD_PER_ROW; ++col) {
            #pragma HLS PIPELINE II=1
            DTYPE in_val;
            inFifo >> in_val;
            const DTYPE scaled = in_val * alpha;
            outFifo << scaled;
        }
    }
}
extern "C" {
// Top-level function: streams the words of inx through compute(), which
// scales each one by alpha, and stores the results into outx.
//   inx   - input array of NUM_ROWS * WORD_PER_ROW words
//   outx  - output array of NUM_ROWS * WORD_PER_ROW words
//   alpha - scale factor forwarded to compute()
void row_array_2d(DTYPE *inx, DTYPE *outx, int alpha) {
    // Both arrays are accessed through AXI master interfaces in bundle gmem
    #pragma HLS INTERFACE mode=m_axi port = inx offset = slave bundle = gmem
    #pragma HLS INTERFACE mode=m_axi port = outx offset = slave bundle = gmem
    // Arguments and return/control signals share the AXI4-Lite bundle control
    #pragma HLS INTERFACE mode=s_axilite port = inx bundle = control
    #pragma HLS INTERFACE mode=s_axilite port = outx bundle = control
    #pragma HLS INTERFACE mode=s_axilite port = alpha bundle = control
    #pragma HLS INTERFACE mode=s_axilite port = return bundle = control

    // Streams connecting the three stages. The default FIFO depth is 2; it
    // can be changed with, for example:
    //   #pragma HLS stream variable=inFifo depth=256
    my_data_fifo inFifo;
    my_data_fifo outFifo;

    // DATAFLOW enables task-level pipelining, so the functions called below
    // can execute concurrently. See the Vitis HLS documentation of the
    // DATAFLOW pragma for details.
    #pragma HLS DATAFLOW
    // Stage 1: read each row of the 2D array into inFifo
    read_data(inx, inFifo);
    // Stage 2: process the words from inFifo into outFifo
    compute(inFifo, outFifo, alpha);
    // Stage 3: write each row of results from outFifo to the 2D array
    write_data(outx, outFifo);
}
} // extern "C"