The PS code for the hardware emulation and hardware flows is in sw/host.cpp. You can review the code. It opens the XCLBIN as follows:
// Open xclbin
auto dhdl = xrtDeviceOpen(0);//device index=0
ret=xrtDeviceLoadXclbinFile(dhdl,xclbinFilename);
xuid_t uuid;
xrtDeviceGetXclbinUUID(dhdl, uuid);
It allocates buffers for the mm2s kernels and the s2mm kernels:
// output memory
xrtBufferHandle out_bo1 = xrtBOAlloc(dhdl, mem_size, 0, /*BANK=*/0);
...
int *host_out1 = (int*)xrtBOMap(out_bo1);
...
// input memory
xrtBufferHandle in_bo1 = xrtBOAlloc(dhdl, mem_size, 0, /*BANK=*/0);
...
int *host_in1 = (int*)xrtBOMap(in_bo1);
...
It initializes the input memory and then syncs the input memory:
// initialize input memory
for(int i=0;i<mem_size/sizeof(int);i++){
*(host_in1+i)=i;
*(host_in2+i)=2*i;
*(host_in3+i)=3*i;
*(host_in4+i)=4*i;
}
// sync input memory
xrtBOSync(in_bo1, XCL_BO_SYNC_BO_TO_DEVICE , mem_size,/*OFFSET=*/ 0);
...
Then it starts the output kernels and input kernels:
// start output kernels
xrtKernelHandle s2mm_k1 = xrtPLKernelOpen(dhdl, uuid, "s2mm:{s2mm_1}");
xrtRunHandle s2mm_r1 = xrtRunOpen(s2mm_k1);
xrtRunSetArg(s2mm_r1, 0, out_bo1);
xrtRunSetArg(s2mm_r1, 2, mem_size/sizeof(int));
xrtRunStart(s2mm_r1);
...
xrtKernelHandle hls_packet_receiver_k = xrtPLKernelOpen(dhdl, uuid, "hls_packet_receiver");
xrtRunHandle hls_packet_receiver_r = xrtRunOpen(hls_packet_receiver_k);
xrtRunSetArg(hls_packet_receiver_r, 5, total_packet_num);
xrtRunStart(hls_packet_receiver_r);
// start input kernels
xrtKernelHandle mm2s_k1 = xrtPLKernelOpen(dhdl, uuid, "mm2s:{mm2s_1}");
xrtRunHandle mm2s_r1 = xrtRunOpen(mm2s_k1);
xrtRunSetArg(mm2s_r1, 0, in_bo1);
xrtRunSetArg(mm2s_r1, 2, mem_size/sizeof(int));
xrtRunStart(mm2s_r1);
...
xrtKernelHandle hls_packet_sender_k = xrtPLKernelOpen(dhdl, uuid, "hls_packet_sender");
xrtRunHandle hls_packet_sender_r = xrtRunOpen(hls_packet_sender_k);
xrtRunSetArg(hls_packet_sender_r, 5, packet_num);
xrtRunStart(hls_packet_sender_r);
Then it starts the graph:
// start graph
adf::registerXRT(dhdl, uuid);
gr.run(2); //Iteration number=2. The amount of data matches for PL kernels and graph
Then it waits for the s2mm kernels to complete, and syncs the output memory:
// wait for s2mm to complete
xrtRunWait(s2mm_r1);
...
// sync output memory
xrtBOSync(out_bo1, XCL_BO_SYNC_BO_FROM_DEVICE , mem_size,/*OFFSET=*/ 0);
...
Then, finally, it performs post-processing and releases objects.
Note that the PS code contains no special handling for packet switching; that handling is already done on the AI Engine and PL sides.