Example Usage - 2023.2 English

Vitis Libraries

Release Date
2023-12-20
Version
2023.2 English

The L2 API can be found at Vitis_Libraries/graph/L2/include. Typical code for calling an L2 API may look like this:

/**
 * Top-level HLS kernel wrapper around xf::graph::singleSourceShortestPath.
 *
 * The function body only declares the AXI master interface of every pointer
 * argument and then forwards all arguments, unchanged and in the same order,
 * to the library function.
 *
 * Port to m_axi bundle mapping, as declared by the pragmas in the body:
 *   - config, offset            -> gmem0
 *   - column                    -> gmem1
 *   - weight                    -> gmem2
 *   - ddrQue512, ddrQue         -> gmem3
 *   - result512, result, info   -> gmem4
 *   - pred512, pred             -> gmem5
 *
 * NOTE(review): the parameter names suggest a CSR graph (offset/column/weight),
 * a work queue (ddrQue*), distances (result*) and predecessors (pred*); confirm
 * the exact meaning of each port against the singleSourceShortestPath
 * documentation. E, V and MAXOUTDEGREE are defined outside this excerpt.
 */
extern "C" void shortestPath_top(ap_uint<32>* config,
                                 ap_uint<512>* offset,
                                 ap_uint<512>* column,
                                 ap_uint<512>* weight,

                                 ap_uint<512>* ddrQue512,
                                 ap_uint<32>* ddrQue,

                                 ap_uint<512>* result512,
                                 ap_uint<32>* result,
                                 ap_uint<512>* pred512,
                                 ap_uint<32>* pred,
                                 ap_uint<8>* info) {
   // Named constants used by the `depth = ...` option of the interface pragmas below.
   const int depth_E = E;
   const int depth_V = V;

// Every pointer argument is an m_axi port with `offset = slave`. Ports that name the
// same `bundle` are declared on the same gmemN interface.

// gmem0: config (depth 4) and offset (depth depth_V).
#pragma HLS INTERFACE m_axi offset = slave latency = 32 num_write_outstanding = 1 num_read_outstanding = \
   32 max_write_burst_length = 2 max_read_burst_length = 8 bundle = gmem0 port = config depth = 4
#pragma HLS INTERFACE m_axi offset = slave latency = 32 num_write_outstanding = 1 num_read_outstanding = \
   32 max_write_burst_length = 2 max_read_burst_length = 8 bundle = gmem0 port = offset depth = depth_V
// gmem1 / gmem2: column and weight, each on its own bundle with read bursts of up to 32.
#pragma HLS INTERFACE m_axi offset = slave latency = 32 num_write_outstanding = 1 num_read_outstanding = \
   32 max_write_burst_length = 2 max_read_burst_length = 32 bundle = gmem1 port = column depth = depth_E
#pragma HLS INTERFACE m_axi offset = slave latency = 32 num_write_outstanding = 1 num_read_outstanding = \
   32 max_write_burst_length = 2 max_read_burst_length = 32 bundle = gmem2 port = weight depth = depth_E
// gmem3: the 32-bit and 512-bit ddrQue ports; the 32-bit port uses 16x the depth of the 512-bit one.
#pragma HLS INTERFACE m_axi offset = slave latency = 32 num_write_outstanding = 32 num_read_outstanding = \
   1 max_write_burst_length = 2 max_read_burst_length = 2 bundle = gmem3 port = ddrQue depth = depth_E*16
#pragma HLS INTERFACE m_axi offset = slave latency = 32 num_write_outstanding = 32 num_read_outstanding = \
   1 max_write_burst_length = 2 max_read_burst_length = 2 bundle = gmem3 port = ddrQue512 depth = depth_E
// gmem4: result512, info (depth 8) and result; write bursts of up to 64.
#pragma HLS INTERFACE m_axi offset = slave latency = 32 num_write_outstanding = 32 num_read_outstanding = \
   32 max_write_burst_length = 64 max_read_burst_length = 2 bundle = gmem4 port = result512 depth = depth_V
#pragma HLS INTERFACE m_axi offset = slave latency = 32 num_write_outstanding = 32 num_read_outstanding = \
   32 max_write_burst_length = 64 max_read_burst_length = 2 bundle = gmem4 port = info depth = 8
#pragma HLS INTERFACE m_axi offset = slave latency = 32 num_write_outstanding = 32 num_read_outstanding = \
   32 max_write_burst_length = 64 max_read_burst_length = 2 bundle = gmem4 port = result depth = depth_V*16
// gmem5: pred512 and pred; the 32-bit port uses 16x the depth of the 512-bit one.
#pragma HLS INTERFACE m_axi offset = slave latency = 32 num_write_outstanding = 32 num_read_outstanding = \
   1 max_write_burst_length = 64 max_read_burst_length = 2 bundle = gmem5 port = pred512 depth = depth_V
#pragma HLS INTERFACE m_axi offset = slave latency = 32 num_write_outstanding = 32 num_read_outstanding = \
   1 max_write_burst_length = 64 max_read_burst_length = 2 bundle = gmem5 port = pred depth = depth_V*16

   // Forward every port, in declaration order, to the library implementation.
   xf::graph::singleSourceShortestPath<32, MAXOUTDEGREE>(config, offset, column, weight, ddrQue512, ddrQue, result512,
                                                         result, pred512, pred, info);
}

It is usually a wrapper function of APIs in Vitis_Libraries/graph/L3/lib. The part worth a closer look is the following code:

#pragma HLS INTERFACE m_axi offset = slave latency = 32 num_write_outstanding = 1 num_read_outstanding = \
   32 max_write_burst_length = 2 max_read_burst_length = 8 bundle = gmem0 port = config depth = 4
#pragma HLS INTERFACE m_axi offset = slave latency = 32 num_write_outstanding = 1 num_read_outstanding = \
   32 max_write_burst_length = 2 max_read_burst_length = 8 bundle = gmem0 port = offset depth = depth_V
#pragma HLS INTERFACE m_axi offset = slave latency = 32 num_write_outstanding = 1 num_read_outstanding = \
   32 max_write_burst_length = 2 max_read_burst_length = 32 bundle = gmem1 port = column depth = depth_E
#pragma HLS INTERFACE m_axi offset = slave latency = 32 num_write_outstanding = 1 num_read_outstanding = \
   32 max_write_burst_length = 2 max_read_burst_length = 32 bundle = gmem2 port = weight depth = depth_E
#pragma HLS INTERFACE m_axi offset = slave latency = 32 num_write_outstanding = 32 num_read_outstanding = \
   1 max_write_burst_length = 2 max_read_burst_length = 2 bundle = gmem3 port = ddrQue depth = depth_E*16
#pragma HLS INTERFACE m_axi offset = slave latency = 32 num_write_outstanding = 32 num_read_outstanding = \
   1 max_write_burst_length = 2 max_read_burst_length = 2 bundle = gmem3 port = ddrQue512 depth = depth_E
#pragma HLS INTERFACE m_axi offset = slave latency = 32 num_write_outstanding = 32 num_read_outstanding = \
   32 max_write_burst_length = 64 max_read_burst_length = 2 bundle = gmem4 port = result512 depth = depth_V
#pragma HLS INTERFACE m_axi offset = slave latency = 32 num_write_outstanding = 32 num_read_outstanding = \
   32 max_write_burst_length = 64 max_read_burst_length = 2 bundle = gmem4 port = info depth = 8
#pragma HLS INTERFACE m_axi offset = slave latency = 32 num_write_outstanding = 32 num_read_outstanding = \
   32 max_write_burst_length = 64 max_read_burst_length = 2 bundle = gmem4 port = result depth = depth_V*16
#pragma HLS INTERFACE m_axi offset = slave latency = 32 num_write_outstanding = 32 num_read_outstanding = \
   1 max_write_burst_length = 64 max_read_burst_length = 2 bundle = gmem5 port = pred512 depth = depth_V
#pragma HLS INTERFACE m_axi offset = slave latency = 32 num_write_outstanding = 32 num_read_outstanding = \
   1 max_write_burst_length = 64 max_read_burst_length = 2 bundle = gmem5 port = pred depth = depth_V*16

These are the HLS pragmas of the interface. They are responsible for configuring the interface of the FPGA binaries and may vary with the Alveo board. For more information about these pragmas, please see the Vitis HLS interface pragma documentation.

The steps to compile the C/C++ code into FPGA binaries are in the Makefile of each testcase. The process generally has the following two steps:

  1. v++ --compile to compile the C/C++ code into RTL code. A .xo file is generated in this step.
  2. v++ --link to link the .xo file into FPGA binaries. A .xclbin file is generated in this step.

For more information about compiling the HLS code, please see the Vitis compiler (v++) documentation.

The code to make use of the FPGA binaries is usually C/C++ code with OpenCL APIs and typically contains the following steps:

  1. Create the entire platform and OpenCL kernels
// Use the first device returned by xcl::get_xil_devices(). `fail` is declared outside
// this excerpt; its address is passed as the last argument of each constructor below.
std::vector<cl::Device> devices = xcl::get_xil_devices();
cl::Device device = devices[0];
cl::Context context(device, NULL, NULL, NULL, &fail);
// Command queue created with profiling and out-of-order execution enabled.
cl::CommandQueue q(context, device, CL_QUEUE_PROFILING_ENABLE | CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, &fail);
// Import the binary at xclbin_path, then shrink the device list to one entry so the
// program is created for the first device only.
cl::Program::Binaries xclBins = xcl::import_binary_file(xclbin_path);
devices.resize(1);
cl::Program program(context, devices, xclBins, NULL, &fail);
// The kernel is looked up by name; the string matches the extern "C" function shortestPath_top.
cl::Kernel shortestPath;
shortestPath = cl::Kernel(program, "shortestPath_top", &fail);
  1. Create cl::Buffer objects and decide which data needs to be transferred to the FPGA device and back to the host machine.
// Buffers collected in ob_in are the ones migrated from host to device before the kernel runs.
// Only offset_buf is shown; the other buffers passed to setArg are not part of this excerpt.
std::vector<cl::Memory> ob_in;
// Sized for (numVertices + 1) 32-bit entries. mext_o is declared outside this excerpt
// (presumably an array of cl_mem_ext_ptr_t describing the host pointers; confirm).
cl::Buffer offset_buf = cl::Buffer(context, CL_MEM_EXT_PTR_XILINX | CL_MEM_USE_HOST_PTR | CL_MEM_READ_WRITE,
                        sizeof(ap_uint<32>) * (numVertices + 1), &mext_o[0]);
ob_in.push_back(offset_buf);

// Buffers collected in ob_out are the ones migrated back from device to host after the kernel.
std::vector<cl::Memory> ob_out;
// Element count is numVertices rounded up to the next multiple of 1024.
cl::Buffer result_buf = cl::Buffer(context, CL_MEM_EXT_PTR_XILINX | CL_MEM_USE_HOST_PTR | CL_MEM_READ_WRITE,
                        sizeof(float) * ((numVertices + 1023) / 1024) * 1024, &mext_o[6]);
ob_out.push_back(result_buf);
  1. Set arguments for FPGA OpenCL kernels
// Arguments are bound in the order of shortestPath_top's parameter list; j is the running
// argument index and is declared outside this excerpt.
// Each 512-bit/32-bit port pair (ddrQue512/ddrQue, result512/result, pred512/pred) is given
// the same buffer, which is why ddrQue_buf, result_buf and pred_buf each appear twice.
shortestPath.setArg(j++, config_buf);
shortestPath.setArg(j++, offset_buf);
shortestPath.setArg(j++, column_buf);
shortestPath.setArg(j++, weight_buf);
shortestPath.setArg(j++, ddrQue_buf);
shortestPath.setArg(j++, ddrQue_buf);
shortestPath.setArg(j++, result_buf);
shortestPath.setArg(j++, result_buf);
shortestPath.setArg(j++, pred_buf);
shortestPath.setArg(j++, pred_buf);
shortestPath.setArg(j++, info_buf);
  1. Set up event dependencies
// One event per stage. Each enqueue below waits on the previous stage's event vector and
// signals its own, giving the chain write -> kernel -> read.
std::vector<cl::Event> events_write(1);
std::vector<cl::Event> events_kernel(1);
std::vector<cl::Event> events_read(1);

// Second argument of enqueueMigrateMemObjects is the migration flag: 0 for ob_in, 1 for ob_out.
q.enqueueMigrateMemObjects(ob_in, 0, nullptr, &events_write[0]);  // Transfer Host data to Device
q.enqueueTask(shortestPath, &events_write, &events_kernel[0]); // execution of the OpenCL kernels (FPGA binaries)
q.enqueueMigrateMemObjects(ob_out, 1, &events_kernel, &events_read[0]); // Transfer Device data to Host
  1. Run OpenCL tasks and execute FPGA binaries
// Block until every command enqueued on q (write, kernel, read) has completed.
q.finish();