Data Shuffling Kernel - 2022.2 English

AI Engine Kernel and Graph Programming Guide (UG1079)

Document ID
UG1079
Release Date
2022-10-19
Version
2022.2 English

Because aie::mmul accepts row-based vector data laid out in the shape of the matrix multiplication, the raw data may need to be shuffled, either in the PL or in the AI Engine, to achieve good performance. This section assumes that the original data is row based across the whole matrices. It shuffles the data to match the 4*16*8 shape used in the matrix multiplication.

The following kernel code shuffles the data for matrix A, with a target shape of 4*16:
//Shape of one element matrix: M rows by N columns
constexpr int M{4};
constexpr int N{16};

//Shape of the complete matrix: rowA rows by colA columns
constexpr int rowA{64};
constexpr int colA{64};

//Shuffles matrix A from row-based order into consecutive M*N (4*16) element
//matrices, writing one complete element matrix per output vector.
//
//matA    : input window holding the rowA*colA int8 matrix, row based.
//matAout : output window receiving the element matrices. They are emitted
//          left to right within a band of M rows, then band by band.
//
//All pointer steps are counted in vectors and are derived from M, N and colA
//instead of being hard-coded, so the kernel stays correct when the total
//matrix size changes. The static_asserts reject unsupported configurations.
void shuffle_4x16(input_window<int8> * __restrict matA, output_window<int8> * __restrict matAout){
  constexpr int lanes=16;             //int8 elements read per input vector
  constexpr int sizeA=M*N;            //elements in one element matrix
  constexpr int vecPerRow=colA/lanes; //input vectors in one full matrix row

  static_assert(N==lanes,"one input vector must hold exactly one row of an element matrix");
  static_assert(rowA%M==0 && colA%N==0,"the matrix must be a whole number of element matrices");

  auto pV=aie::begin_vector<lanes>((int8*)matA->ptr);
  auto pOut=aie::begin_vector<sizeA>((int8*)matAout->ptr);
  aie::vector<int8,sizeA> mm;

  for(int i=0;i<rowA/M;i++){//output row number of element matrix
    for(int j=0;j<colA/N;j++){//output col number of element matrix
      for(int k=0;k<M;k++){//generate one M*N element matrix
        mm.insert(k,*pV);   //row k of the element matrix becomes sub-vector k
        pV=pV+vecPerRow;    //same columns, next matrix row
      }
      *pOut++=mm;
      //The k loop moved down M rows; go back to the first of those rows,
      //one vector (one element matrix) further to the right.
      pV=pV-(M*vecPerRow-1);
    }
    //The j loop consumed one full row of vectors; skip the remaining M-1
    //rows of this band to reach the start of the next band.
    pV=pV+(M-1)*vecPerRow;
  }
}
The following is an example of code used to shuffle the data for matrix B, with a target shape of 16*8:
//Shape of one element matrix: M rows by N columns
constexpr int M{16};
constexpr int N{8};

//Shape of the complete matrix: rowA rows by colA columns
constexpr int rowA{64};
constexpr int colA{64};

//Shuffles matrix B from row-based order into consecutive M*N (16*8) element
//matrices.
//
//matA    : input window holding the rowA*colA int8 matrix, row based.
//matAout : output window receiving the element matrices. They are emitted
//          left to right within a band of M rows, then band by band.
//
//One 16-lane input vector spans two horizontally adjacent element matrices,
//so every inner iteration produces two rows for each of those two element
//matrices. All pointer steps are counted in vectors and are derived from
//M, N and colA instead of being hard-coded; the static_asserts reject
//configurations the access pattern does not support.
void shuffle_16x8(input_window<int8> * __restrict matA, output_window<int8> * __restrict matAout){
  constexpr int lanes=16;               //int8 elements per input/output vector
  constexpr int sizeA=M*N;              //elements in one element matrix
  constexpr int vecPerRow=colA/lanes;   //input vectors in one full matrix row
  constexpr int vecPerTile=sizeA/lanes; //output vectors in one element matrix

  static_assert(lanes==2*N,"one input vector must span exactly two element matrices");
  static_assert(M%2==0,"two rows are consumed per inner iteration");
  static_assert(rowA%M==0 && colA%lanes==0,"the matrix must be a whole number of element-matrix pairs");

  auto pV=aie::begin_vector<lanes>((int8*)matA->ptr);
  auto pOut=aie::begin_vector<lanes>((int8*)matAout->ptr);

  aie::vector<int8,lanes> sv1,sv2;
  for(int i=0;i<rowA/M;i++){
    for(int j=0;j<colA/N/2;j++){//generate two M*N matrices an iteration
      for(int k=0;k<M/2;k++){//generate two rows of two M*N matrices an iteration
        sv1=*pV;
        pV=pV+vecPerRow;  //same columns, next matrix row
        sv2=*pV;
        pV=pV+vecPerRow;
        //Zip in chunks of N: first = left halves of both rows (left element
        //matrix), second = right halves (right element matrix).
        auto mm=aie::interleave_zip(sv1,sv2,N);
        *pOut=mm.first;
        pOut+=vecPerTile;   //same position in the right element matrix
        *pOut=mm.second;
        pOut-=vecPerTile-1; //back to the left element matrix, next vector
      }
      pOut+=vecPerTile; //left matrix is complete; skip over the right one
      //The k loop moved down M rows; go back to the first of those rows,
      //one vector (two element matrices) further to the right.
      pV-=M*vecPerRow-1;
    }
    //The j loop consumed one full row of vectors; skip the remaining M-1
    //rows of this band to reach the start of the next band.
    pV+=(M-1)*vecPerRow;
  }
}
The following is an example of code used to shuffle the data for matrix C, with an input shape of 4*8 (the kernel converts the 4*8 element matrices back to row-based data):
//Shape of one element matrix: M rows by N columns
constexpr int M{4};
constexpr int N{8};

//Shape of the complete matrix: rowA rows by colA columns
constexpr int rowA{64};
constexpr int colA{64};

//Shuffles matrix C from consecutive M*N (4*8) element matrices back into
//row-based order.
//
//matA    : input buffer holding the element matrices, one per sizeA-lane
//          vector, ordered left to right within a band of M rows, then band
//          by band.
//matAout : output buffer receiving the rowA*colA int8 matrix, row based.
//
//Each inner iteration reads four horizontally adjacent element matrices and
//writes four output vectors, one per matrix row, each covering 4*N columns.
//All pointer steps are counted in vectors and are derived from M, N and colA
//instead of being hard-coded; the static_asserts reject configurations the
//two-stage interleave does not support.
void shuffle_4x8(input_buffer_1d<int8> & __restrict matA, output_buffer_1d<int8> & __restrict matAout){
  constexpr int sizeA=M*N;               //elements in one element matrix
  constexpr int tilesPerIter=4;          //element matrices merged per inner iteration
  constexpr int outVecPerRow=colA/sizeA; //output vectors in one full matrix row

  static_assert(M==4,"the two-stage interleave produces exactly four output rows");
  static_assert(rowA%M==0 && colA%(tilesPerIter*N)==0,"the matrix must be a whole number of four-element-matrix groups");

  auto pV=aie::begin_vector<sizeA>((int8*)matA.data());
  auto pOut=aie::begin_vector<sizeA>((int8*)matAout.data());

  aie::vector<int8,sizeA> mm1,mm2,mm3,mm4;
  for(int i=0;i<rowA/M;i++){
    for(int j=0;j<colA/N/tilesPerIter;j++){
      mm1=*pV++;
      mm2=*pV++;
      mm3=*pV++;
      mm4=*pV++;
      //Stage 1, chunks of N: pair up matching rows of two element matrices
      //(.first holds rows 0 and 1, .second holds rows 2 and 3).
      auto mm12=aie::interleave_zip(mm1,mm2,N);
      auto mm34=aie::interleave_zip(mm3,mm4,N);
      //Stage 2, chunks of 2*N: join the pairs so that each result holds one
      //row across all four element matrices.
      auto mm1234_low=aie::interleave_zip(mm12.first,mm34.first,2*N);
      auto mm1234_high=aie::interleave_zip(mm12.second,mm34.second,2*N);
      *pOut=mm1234_low.first;    //row 0
      pOut=pOut+outVecPerRow;    //same columns, next matrix row
      *pOut=mm1234_low.second;   //row 1
      pOut=pOut+outVecPerRow;
      *pOut=mm1234_high.first;   //row 2
      pOut=pOut+outVecPerRow;
      *pOut=mm1234_high.second;  //row 3
      //Went down M-1 rows; go back to the first row, one vector to the right.
      pOut=pOut-((M-1)*outVecPerRow-1);
    }
    //The j loop filled one full row of vectors; skip the remaining M-1 rows
    //of this band to reach the start of the next band.
    pOut=pOut+(M-1)*outVecPerRow;
  }
}