Software Design Details - 2023.2 English

Vitis Tutorials: AI Engine

Document ID
XD100
Release Date
2023-11-29
Version
2023.2 English

The software design in the Matrix Multiplication tutorial consists of the following sections:

Methodology

Methodology

Frequency Selection

The gemm_large_ocm kernel operates at 800 MHz.

Timing Closure

For timing closure of the whole design, different implementation properties are used, as mentioned in the make xsa step above. These strategies are required because timing is not met with the default implementation settings. Routing congestion limits the operating frequency to 800 MHz.

For more information about implementation strategies, see the Vivado Implementation User Guide (UG904).

Data Flow

The host application (ps_app) writes the Matrix A and Matrix B data and enables the DUT. It then polls for the Done signal from the DUT. When the DUT is done, the host application reads the output URAM and compares the data read back with the golden data. The golden input data for Matrix A and Matrix B and the golden expected output data are stored in arrays that are read by the host application.

Top Function

The PS host application (main.cpp) is cross-compiled to get the executable. The flow in main.cpp is as follows:

  1. Include the required headers and define the required macros:

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <fstream>
#include <iostream>
#include <string>
#include "experimental/xrt_aie.h"
#include "experimental/xrt_kernel.h"
#include "experimental/xrt_bo.h"

2. Include input and output arrays

#include "matrix_A_data.h"
#include "matrix_B_data.h"
#include "output_data.h"

...
  3. Check the command line argument. The main function is the entry point of the A72 application. It takes one command line argument: an XCLBIN file.

    • Open the device and load the XCLBIN:

    auto dhdl = xrtDeviceOpen(0);
    auto xclbin = load_xclbin(dhdl, xclbinFilename);
    auto top = reinterpret_cast<const axlf*>(xclbin.data());
    
    • Open the GEMM DSP58 kernel and obtain handles to start the kernel.

    ...
    xrtKernelHandle gemm_top_khdl;
    xrtRunHandle gemm_top_rhdl;
    ...
    gemm_top_khdl = xrtPLKernelOpen(dhdl, top->m_header.uuid, gemm_top_obj);
    gemm_top_rhdl = xrtRunOpen(gemm_top_khdl);
    ...
    
4. main Function
int main(int argc, char** argv)

Sub-Function Details

test_gemm - This function programs matrix A and B URAMs from the array data and sets other control registers and then enables the gemm kernel.

check_done - This function polls for Done signal to be set from DUT.

read_perf - This function reads the performance counter value counted by the DUT. The GEMM kernel counts the number of clock cycles required for the matrix multiplication operation. Note: this count does not include the time required for input and output data movement.

golden_check - This function compares the data read from the output URAM with the golden data. It maintains an error counter, which is used to decide whether the test passed or failed.

gemm_soft_reset_pulse - This function generates soft reset to DUT.

PS Host Application

PS Host Application

// Loads the Matrix A and Matrix B input data into the GEMM kernel's URAMs
// through its indirect-access registers, then sets the DUT enable bit.
//
// All register accesses go through the kernel handle gemm_top_khdl, and the
// input words come from the matrix_A_data / matrix_B_data arrays; both are
// declared outside this function.
//
// Register offsets used (per the write sequence below):
//   0x10 - control register (0x2 = address auto-increment, 0x3 = + DUT enable)
//   0x18 - indirect address control register (Valid / R/W# bits)
//   0x1C - indirect address register
//   0x20 - indirect write-data register
void gemm_bring_up(void)
{
    unsigned int i, j;
    uint32_t uram_data;
    unsigned int waddr;

    // Number of 32-bit words held by each populated URAM.
    const unsigned int a_words_per_uram = MATRIX_A_SIZE / NUM_ROW_URAM;
    const unsigned int b_words_per_uram = MATRIX_B_SIZE / NUM_COL_URAM;

    printf("Writing into registers\n");
    // 1. Write to Control register with Address autoincrement bit set to 1
    //    Write to address 0x10, data = 0x2
    xrtKernelWriteRegister(gemm_top_khdl, 0x10, 0x2);
    // 2. Write to Indirect address register value of 0
    //    Write to address 0x1C, data = 0
    xrtKernelWriteRegister(gemm_top_khdl, 0x1C, 0x0);

    // 3. Write to indirect address control register, Valid bit = 1, R/W# = 0
    //    Write to address 0x18, data = 0x1
    xrtKernelWriteRegister(gemm_top_khdl, 0x18, 0x1);

    // Write 16 32x32 A Matrices into Row URAMs through the data register at
    // address 0x20.
    // Size of each Matrix is 2KB, total size = 32KB
    // Data is arranged in 32-bit wide entries (4 Bytes)
    // So total lines = 8K

    printf("Writing Matrix A\n");
    waddr = 0;
    for (i = 0; i < NUM_ROW_URAM; i = i + 1) {      // Only 8 URAMs are populated
       for (j = 0; j < a_words_per_uram; j = j + 1) { // 1024 locations written per URAM
          // The source stride must match the per-URAM word count of the row
          // URAMs; the earlier code used NUM_COL_URAM here, which reads the
          // wrong words whenever NUM_ROW_URAM != NUM_COL_URAM.
          uram_data = matrix_A_data[a_words_per_uram * i + j];
          xrtKernelWriteRegister(gemm_top_khdl, 0x20, uram_data);
       }
       // Move the indirect address to the start of the next URAM
       // (URAM base addresses are 0x8000 apart).
       waddr += 0x8000;
       xrtKernelWriteRegister(gemm_top_khdl, 0x1c, waddr);
    }

    // Matrix B is written starting at indirect address 0x200000.
    waddr = 0x200000;
    xrtKernelWriteRegister(gemm_top_khdl, 0x1c, waddr);
    printf("Writing Matrix B\n");
    for (i = 0; i < NUM_COL_URAM; i = i + 1) {      // Only 8 URAMs are populated
       for (j = 0; j < b_words_per_uram; j = j + 1) { // 1024 locations written per URAM
          uram_data = matrix_B_data[b_words_per_uram * i + j];
          xrtKernelWriteRegister(gemm_top_khdl, 0x20, uram_data);
       }
       // Move the indirect address to the start of the next URAM
       waddr += 0x8000;
       xrtKernelWriteRegister(gemm_top_khdl, 0x1c, waddr);
    }

    // Set DUT Enable bit
    // Write to address 0x10, data = 0x3
    xrtKernelWriteRegister(gemm_top_khdl, 0x10, 0x3);

}
  1. Data Integrity Check is performed by running golden_check function

// Compares the data read back from the output URAM against the golden
// output_data array and reports the number of mismatching 16-bit entries.
//
// @param mismatch_count  Out-parameter that receives the number of 16-bit
//                        entries that differ from the golden data. May be
//                        null, in which case the count is only printed.
//
// Each 32-bit read of register 0x24 carries two 16-bit results: the lower
// half is compared with output_data[i], the upper half with output_data[i+1].
//
// This function does not poll the Done bit and does not write the indirect
// address control register (0x18) itself; both steps were commented out in
// the earlier version of this function.
// NOTE(review): presumably the caller has already waited for Done and put
// the indirect interface in read mode - confirm against the call site.
void golden_check(unsigned int *mismatch_count)
{
    // Number of 16-bit golden entries compared (two per 32-bit register read).
    // NOTE(review): the earlier comment spoke of 32KB / 16K reads, which
    // does not match this bound; the bound is kept unchanged - confirm the
    // intended size.
    const unsigned int kNumEntries = 8192;

    unsigned int match_count = 0;
    // Count locally and publish at the end. The earlier code assigned 0 to
    // the pointer itself and then incremented the pointer, so the caller
    // never saw the count and the final dereference was invalid.
    unsigned int mismatches = 0;

    for (unsigned int i = 0; i < kNumEntries; i = i + 2) {
      const uint16_t golden_data_lower = output_data[i];
      const uint16_t golden_data_upper = output_data[i + 1];

      uint32_t read_data = 0;
      xrtKernelReadRegister(gemm_top_khdl, 0x24, &read_data);
      const uint16_t read_data_lower = read_data & 0xFFFF;
      const uint16_t read_data_upper = read_data >> 16;

      if (golden_data_lower != read_data_lower) {
         printf ("Data mismatch Addr : 0x%x, Golden Data : 0x%x, Read Data : 0x%x\n", i, golden_data_lower, read_data_lower);
         mismatches++;
      } else {
         match_count++;
      }
      if (golden_data_upper != read_data_upper) {
         printf ("Data mismatch Addr : 0x%x, Golden Data : 0x%x, Read Data : 0x%x\n", i+1, golden_data_upper, read_data_upper);
         mismatches++;
      } else {
         match_count++;
      }
    }

    if (mismatch_count != nullptr) {
      *mismatch_count = mismatches;
    }
    printf ("Match Count : %u, Mismatch Count : %u\n", match_count, mismatches);
}