Code Modifications - 2023.2 English

Vitis Tutorials: Hardware Acceleration (XD099)

Document ID
XD099
Release Date
2023-11-13
Version
2023.2 English
  1. Navigate to $LAB_WORK_DIR/reference_files, and with a file editor, open run_sw_overlap_multiDDR.cpp.

  2. From the host code, you will need to send the words to both DDR banks alternatively. The DDR bank assignment in the host code is supported by a AMD vendor extension to the OpenCL™ API. Two AMD extension pointer objects (cl_mem_ext_ptr_t) are created, buffer_words_ext[0] and buffer_words_ext[1]. Theflags will determine which DDR bank the buffer will be send to, so that kernel can access it.

     cl_mem_ext_ptr_t buffer_words_ext[2];
    
     buffer_words_ext[0].flags = 1 | XCL_MEM_TOPOLOGY; // DDR[1]
     buffer_words_ext[0].param = 0;
     buffer_words_ext[0].obj   = input_doc_words;
     buffer_words_ext[1].flags = 2 | XCL_MEM_TOPOLOGY; // DDR[2]
     buffer_words_ext[1].param = 0;
     buffer_words_ext[1].obj   = input_doc_words;
    
  3. The next two buffers, buffer_doc_words[0] and buffer_doc_words[1], are created in DDR[1] and DDR[2] as follows.

    buffer_doc_words[0] = cl::Buffer(context, CL_MEM_EXT_PTR_XILINX | CL_MEM_USE_HOST_PTR | CL_MEM_READ_ONLY, total_size*sizeof(uint), &buffer_words_ext[0]);
    buffer_doc_words[1] = cl::Buffer(context, CL_MEM_EXT_PTR_XILINX | CL_MEM_USE_HOST_PTR | CL_MEM_READ_ONLY, total_size*sizeof(uint), &buffer_words_ext[1]);
    buffer_inh_flags    = cl::Buffer(context, CL_MEM_USE_HOST_PTR | CL_MEM_WRITE_ONLY, total_size*sizeof(char),output_inh_flags);
    buffer_bloom_filter = cl::Buffer(context, CL_MEM_USE_HOST_PTR | CL_MEM_READ_ONLY, bloom_filter_size*sizeof(uint),bloom_filter);
    
    // Set buffer kernel arguments (needed to migrate the buffers in the correct memory)
    kernel.setArg(0, buffer_inh_flags);
    kernel.setArg(1, buffer_doc_words[0]);
    kernel.setArg(2, buffer_bloom_filter);
    
    // Make buffers resident in the device
    q.enqueueMigrateMemObjects({buffer_bloom_filter, buffer_doc_words[0], buffer_doc_words[1], buffer_inh_flags}, CL_MIGRATE_MEM_OBJECT_CONTENT_UNDEFINED);
    
    // Create sub-buffers, one for each transaction
    unsigned subbuf_doc_sz = total_doc_size/num_iter;
    unsigned subbuf_inh_sz = total_doc_size/num_iter;
    
    cl_buffer_region subbuf_inh_info[num_iter];
    cl_buffer_region subbuf_doc_info[num_iter];
    cl::Buffer subbuf_inh_flags[num_iter];
    cl::Buffer subbuf_doc_words[num_iter];
    
    for (int i=0; i<num_iter; i++) {
        subbuf_inh_info[i]={i*subbuf_inh_sz*sizeof(char), subbuf_inh_sz*sizeof(char)};
        subbuf_doc_info[i]={i*subbuf_doc_sz*sizeof(uint), subbuf_doc_sz*sizeof(uint)};
        subbuf_inh_flags[i] = buffer_inh_flags.createSubBuffer(CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION, &subbuf_inh_info[i]);
        // The doc words sub-buffers will be alternating in DDR[1] and DDR[2]
        subbuf_doc_words[i] = buffer_doc_words[i%2].createSubBuffer (CL_MEM_READ_ONLY,  CL_BUFFER_CREATE_TYPE_REGION, &subbuf_doc_info[i]);
    }
    
  4. The kernel argument, input word is set to the array of sub-buffers created from buffer_doc_words[0] and buffer_doc_words[1] alternatively; hence, data is sent to DDR bank 1 and 2 alternatively in each kernel execution.

    for (int i=0; i<num_iter; i++)
    {
      cl::Event buffDone, krnlDone, flagDone;
      total_size = subbuf_doc_info[i].size / sizeof(uint);
      load_filter = false;
      kernel.setArg(0, subbuf_inh_flags[i]);
      kernel.setArg(1, subbuf_doc_words[i]);
      kernel.setArg(3, total_size);
      kernel.setArg(4, load_filter);
      q.enqueueMigrateMemObjects({subbuf_doc_words[i]}, 0, &wordWait, &buffDone);
      wordWait.push_back(buffDone);
      q.enqueueTask(kernel, &wordWait, &krnlDone);
      krnlWait.push_back(krnlDone);
      q.enqueueMigrateMemObjects({subbuf_inh_flags[i]}, CL_MIGRATE_MEM_OBJECT_HOST, &krnlWait, &flagDone);
      flagWait.push_back(flagDone);
    }