GPU LDPC译码

bigbing · 发表于 2023-6-22 17:19:35

LDPC译码部分在 aerial/cuPHY/examples/error_correction/cuphy_ex_ldpc.cpp中。新建的文件（你自己建立）需要放在aerial/cuPHY/examples/error_correction，文件的名称后缀名需要以.cu结尾，否则不能运行CUDA编写的函数。以下是程序具体内容：

\#include <stdio.h>

\#include <limits>

\#include "ldpc_decode_test_vec_gen.hpp"

\#include "cuphy.h"

\#include <stdlib.h>

\#include <string>

\#include <iostream>

\#include <vector>

\#include <getopt.h>

\#include <bitset>

\#include "cuphy.hpp"

\#include "cuphy_hdf5.hpp"

\#include "hdf5hpp.hpp"

\#include <immintrin.h>

\#include <emmintrin.h>

\#include <cuda_fp16.h>

using namespace cuphy;

\#define REVERSE_BIT(dat) \

  (((dat & 0x01) << 7) | ((dat & 0x02) << 5) | ((dat & 0x04) << 3) | ((dat & 0x08) << 1) | \

  ((dat & 0x10) >> 1) | ((dat & 0x20) >> 3) | ((dat & 0x40) >> 5) | ((dat & 0x80) >> 7))

__global__ void reverse_bit_8block_D(int8_t *input, int8_t *output, int size)

{

  int tid = blockIdx.x * blockDim.x + threadIdx.x;

  if(tid < size)

  {

• output[tid] = REVERSE_BIT(input[tid]);

  }

}

__global__ void float_to_half(half *H, int8_t* SI, int size)

{

  int tid = blockIdx.x * blockDim.x + threadIdx.x;

  if(tid < size)

  {

• float b = (float)SI[tid];

• H[tid] = __float2half(b);

  }

}

__global__ void convertToBinary_8block_D(unsigned char *input, unsigned char *output, int size)

{

  int tid = blockIdx.x * blockDim.x + threadIdx.x;

  if(tid < size)

  {

• unsigned char value = input[tid];

• for(int i=0; i<=7; i++)

• {

•    output[tid*8 + i] = (value & (1<<i)) ? 1 : 0;

•

• }

  }

}

// get_QAM_log2_D()

int get_QAM_log2_D(const char* str)

{

  if(0 == strcmp(str, "QAM256"))

  {

• return CUPHY_QAM_256;

  }

  else if(0 == strcmp(str, "QAM64"))

  {

• return CUPHY_QAM_64;

  }

  else if(0 == strcmp(str, "QAM16"))

  {

• return CUPHY_QAM_16;

  }

  else if(0 == strcmp(str, "QPSK"))

  {

• return CUPHY_QAM_4;

  }

  else if(0 == strcmp(str, "BPSK"))

  {

• return CUPHY_QAM_2;

  }

  else

  {

• std::runtime_error(std::string("Invalid modulation: ") + str);

• return 0;

  }

}

void gpu_LDPCDecode_single(int8_t* input, int8_t* output, int BG, int Kb, int Z, float code_rate, int numMaxIter, int K, const char* str)

{

  int    numIterations    = numMaxIter;

  int    algoIndex    = 0;

  float    minSumNorm    = 0.0f;

  int    log2QAM       = get_QAM_log2_D(str);

  bool    chooseHighThroughput = false;

  int N = std::lroundf(K / code_rate);

  // tv.config().num_cw = C;

  int K_padding = (K % 32 != 0) * (32 - (K % 32));

  // N: Number of modulated bits

  // B: Input block size

  // mb: number of parity nodes

  // P: punctured parity bits (0 <= P < Z)

  //

  // N = B + Z(mb - 2) - P

  // N - B    P

  // ----- + 2 = mb - ---

  //  Z       Z

  //

  //

  // 0 <= P/Z < 1

  //

  // mb = ceil(2 + (N - B)/Z) = ceil((2Z + N - B) / Z)

  int mb = static_cast<int>(std::ceil((N - K + (2 * Z)) / static_cast<float>(Z)));



  const int    V       = mb + ((1 == BG) ? 22 : 10);

  const int    NUM_SYMBOLS = (Z*V + log2QAM - 1) / log2QAM;

  const int    NUM_LLR = NUM_SYMBOLS * log2QAM;

  int NUM_padding = (NUM_LLR % 32 != 0) * (32 - (NUM_LLR % 32));

  cuphy::tensor_device tLLR_16(CUPHY_R_16F, NUM_LLR);

  tensor_device tDecode(CUPHY_BIT, K);

  int8_t* gpuOutput;

  int8_t* gpuInput;

  cudaMalloc((void**)&gpuInput, (NUM_LLR+NUM_padding) * sizeof(int8_t));

cudaMalloc((void**)&gpuOutput, (K+K_padding)/8 * sizeof(int8_t));

  CUDA_CHECK(cudaMemcpy(gpuInput, input, NUM_LLR, cudaMemcpyHostToDevice));

  int block = 1024;

  int size_h = NUM_LLR+NUM_padding;

  int grid = (size_h + block - 1) / block;

  float_to_half<<<grid, block>>>((half*)tLLR_16.addr(), gpuInput, size_h);

  // Create an LDPC decoder instance

  static cuphy::context ctx;

  static cuphy:

DPC_decoder dec(ctx);

  //--------------------------------------------------------------

  // Initialize an LDPC decode configuration. This is used for

  // both the tensor and transport block interfaces.

  uint32_t decode_flags = chooseHighThroughput ? CUPHY_LDPC_DECODE_CHOOSE_THROUGHPUT : 0;

  cuphy:

DPC_decode_config dec_cfg(CUPHY_R_16F, // LLR type (fp16 or fp32)

•                   mb, // num parity nodes

•                   Z, // lifting size

•                   numIterations, // max num iterations

•                   Kb, // info nodes

•                   minSumNorm, // normalization value

•                   decode_flags,  // flags

•                   BG, // base graph

•                   algoIndex, // algorithm index

•                   nullptr); // workspace address

  //--------------------------------------------------------------

  // If no normalization value was provided, query the library for

  // an appropriate value.

  if(minSumNorm <= 0.0f)

  {

dec.set_normalization(dec_cfg);

  }

  //--------------------------------------------------------------

  // Initialize an LDPC decode descriptor structure. (This is only

  // used when the transport block interface is selected.)

  LDPC_decode_desc dec_desc(dec_cfg);

  //--------------------------------------------------------------

  // Initialize an LDPC decode tensor params structure. (This is

  // only used when the tensor-based decoder interface is selected.)

  LDPC_decode_tensor_params dec_tensor(dec_cfg,       // LDPC configuration

•                   tDecode.desc().handle(), // output descriptor

•                   tDecode.addr(),    // output address

•                   tLLR_16.desc().handle(),  // LLR descriptor

•                   tLLR_16.addr());    // LLR address

  dec.decode(dec_tensor);

  int block_size = 1024;

  int size = (K+K_padding)/8;

  int grid_size = (size + block_size - 1) / block_size;

  reverse_bit_8block_D<<<grid_size, block_size>>>((int8_t*)tDecode.addr(), gpuOutput, size);

  CUDA_CHECK(cudaMemcpy(output, gpuOutput, K* sizeof(int8_t)/8, cudaMemcpyDeviceToHost));

  cudaFree(gpuOutput);

  cudaFree(gpuInput);

}

这里只讲解与译码相关部分，其余与编码部分类似，不再赘述。

首先是输入参数，其中code_rate是码率，据此与block_length（即B）计算输入的总比特数。numMaxIter是译码迭代的最大次数，F是填充比特（由于有时B不是恰好等于KbZc，因此就需要填充一部分零使其等于KbZc），str代表的是调制方式（不同调制方式导致不同的符号数）。

下面直到tensor_device声明都是一些变量设置，默认即可。

先将输入二维数组变为一维并通过cudaMemcpy函数传递给GPU变量gpuInput。

CUDA_CHECK(cudaMemcpy(gpuInput, input, NUM_LLR, cudaMemcpyHostToDevice));

随后直接使用CUDA将输入的int8_t数据类型转变为half类型并赋给d_in_tensor（需要说明，在GPU译码中，输入译码器的格式是半精度浮点数，只有16位比特，相比于单精度的32位，这样做可以大大节省存储空间）。即

int block = 1024;

  int size_h = NUM_LLR+NUM_padding;

  int grid = (size_h + block - 1) / block;

  float_to_half<<<grid, block>>>((half*)tLLR_16.addr(), gpuInput, size_h);

此函数结构也很简单，如下：

__global__ void float_to_half(half *H, int8_t* SI, int size)

{

  int tid = blockIdx.x * blockDim.x + threadIdx.x;

  if(tid < size)

  {

• float b = (float)SI[tid];

• H[tid] = __float2half(b);

  }

}

先将int8_t变为float，再将float使用__float2half函数变为half类型，这样使用CUDA来进行转变速度非常快。

随后就是一些GPU译码的参数设置，同样保持不变就行。

最后将其从GPU中拷出，译码就完成了。

		自动登录	找回密码
密码			立即注册