LDPC译码部分在 aerial/cuPHY/examples/error_correction/cuphy_ex_ldpc.cpp中。新建的文件(你自己建立)需要放在aerial/cuPHY/examples/error_correction,文件的名称后缀名需要以.cu结尾,否则不能运行CUDA编写的函数。以下是程序具体内容: \#include <stdio.h>
\#include <limits>
\#include "ldpc_decode_test_vec_gen.hpp"
\#include "cuphy.h"
\#include <stdlib.h>
\#include <string>
\#include <iostream>
\#include <vector>
\#include <getopt.h>
\#include <bitset>
\#include "cuphy.hpp"
\#include "cuphy_hdf5.hpp"
\#include "hdf5hpp.hpp"
\#include <immintrin.h>
\#include <emmintrin.h>
\#include <cuda_fp16.h>
using namespace cuphy;
\#define REVERSE_BIT(dat) \
(((dat & 0x01) << 7) | ((dat & 0x02) << 5) | ((dat & 0x04) << 3) | ((dat & 0x08) << 1) | \
((dat & 0x10) >> 1) | ((dat & 0x20) >> 3) | ((dat & 0x40) >> 5) | ((dat & 0x80) >> 7))
__global__ void reverse_bit_8block_D(int8_t *input, int8_t *output, int size)
{
int tid = blockIdx.x * blockDim.x + threadIdx.x;
if(tid < size)
{
• output[tid] = REVERSE_BIT(input[tid]);
}
}
__global__ void float_to_half(half *H, int8_t* SI, int size)
{
int tid = blockIdx.x * blockDim.x + threadIdx.x;
if(tid < size)
{
• float b = (float)SI[tid];
• H[tid] = __float2half(b);
}
}
__global__ void convertToBinary_8block_D(unsigned char *input, unsigned char *output, int size)
{
int tid = blockIdx.x * blockDim.x + threadIdx.x;
if(tid < size)
{
• unsigned char value = input[tid];
• for(int i=0; i<=7; i++)
• {
• output[tid*8 + i] = (value & (1<<i)) ? 1 : 0;
•
• }
}
}
// get_QAM_log2_D()
int get_QAM_log2_D(const char* str)
{
if(0 == strcmp(str, "QAM256"))
{
• return CUPHY_QAM_256;
}
else if(0 == strcmp(str, "QAM64"))
{
• return CUPHY_QAM_64;
}
else if(0 == strcmp(str, "QAM16"))
{
• return CUPHY_QAM_16;
}
else if(0 == strcmp(str, "QPSK"))
{
• return CUPHY_QAM_4;
}
else if(0 == strcmp(str, "BPSK"))
{
• return CUPHY_QAM_2;
}
else
{
• std::runtime_error(std::string("Invalid modulation: ") + str);
• return 0;
}
}
void gpu_LDPCDecode_single(int8_t* input, int8_t* output, int BG, int Kb, int Z, float code_rate, int numMaxIter, int K, const char* str)
{
int numIterations = numMaxIter;
int algoIndex = 0;
float minSumNorm = 0.0f;
int log2QAM = get_QAM_log2_D(str);
bool chooseHighThroughput = false;
int N = std::lroundf(K / code_rate);
// tv.config().num_cw = C;
int K_padding = (K % 32 != 0) * (32 - (K % 32));
// N: Number of modulated bits
// B: Input block size
// mb: number of parity nodes
// P: punctured parity bits (0 <= P < Z)
//
// N = B + Z(mb - 2) - P
// N - B P
// ----- + 2 = mb - ---
// Z Z
//
//
// 0 <= P/Z < 1
//
// mb = ceil(2 + (N - B)/Z) = ceil((2Z + N - B) / Z)
int mb = static_cast<int>(std::ceil((N - K + (2 * Z)) / static_cast<float>(Z)));
const int V = mb + ((1 == BG) ? 22 : 10);
const int NUM_SYMBOLS = (Z*V + log2QAM - 1) / log2QAM;
const int NUM_LLR = NUM_SYMBOLS * log2QAM;
int NUM_padding = (NUM_LLR % 32 != 0) * (32 - (NUM_LLR % 32));
cuphy::tensor_device tLLR_16(CUPHY_R_16F, NUM_LLR);
tensor_device tDecode(CUPHY_BIT, K);
int8_t* gpuOutput;
int8_t* gpuInput;
cudaMalloc((void**)&gpuInput, (NUM_LLR+NUM_padding) * sizeof(int8_t));
cudaMalloc((void**)&gpuOutput, (K+K_padding)/8 * sizeof(int8_t));
CUDA_CHECK(cudaMemcpy(gpuInput, input, NUM_LLR, cudaMemcpyHostToDevice));
int block = 1024;
int size_h = NUM_LLR+NUM_padding;
int grid = (size_h + block - 1) / block;
float_to_half<<<grid, block>>>((half*)tLLR_16.addr(), gpuInput, size_h);
// Create an LDPC decoder instance
static cuphy::context ctx;
static cuphy:DPC_decoder dec(ctx);
//--------------------------------------------------------------
// Initialize an LDPC decode configuration. This is used for
// both the tensor and transport block interfaces.
uint32_t decode_flags = chooseHighThroughput ? CUPHY_LDPC_DECODE_CHOOSE_THROUGHPUT : 0;
cuphy:DPC_decode_config dec_cfg(CUPHY_R_16F, // LLR type (fp16 or fp32)
• mb, // num parity nodes
• Z, // lifting size
• numIterations, // max num iterations
• Kb, // info nodes
• minSumNorm, // normalization value
• decode_flags, // flags
• BG, // base graph
• algoIndex, // algorithm index
• nullptr); // workspace address
//--------------------------------------------------------------
// If no normalization value was provided, query the library for
// an appropriate value.
if(minSumNorm <= 0.0f)
{
dec.set_normalization(dec_cfg);
}
//--------------------------------------------------------------
// Initialize an LDPC decode descriptor structure. (This is only
// used when the transport block interface is selected.)
LDPC_decode_desc dec_desc(dec_cfg);
//--------------------------------------------------------------
// Initialize an LDPC decode tensor params structure. (This is
// only used when the tensor-based decoder interface is selected.)
LDPC_decode_tensor_params dec_tensor(dec_cfg, // LDPC configuration
• tDecode.desc().handle(), // output descriptor
• tDecode.addr(), // output address
• tLLR_16.desc().handle(), // LLR descriptor
• tLLR_16.addr()); // LLR address
dec.decode(dec_tensor);
int block_size = 1024;
int size = (K+K_padding)/8;
int grid_size = (size + block_size - 1) / block_size;
reverse_bit_8block_D<<<grid_size, block_size>>>((int8_t*)tDecode.addr(), gpuOutput, size);
CUDA_CHECK(cudaMemcpy(output, gpuOutput, K* sizeof(int8_t)/8, cudaMemcpyDeviceToHost));
cudaFree(gpuOutput);
cudaFree(gpuInput);
}
这里只讲解与译码相关部分,其余与编码部分类似,不再赘述。 首先是输入参数,其中code_rate是码率,据此与block_length(即B)计算输入的总比特数。numMaxIter是译码迭代的最大次数,F是填充比特(由于有时B不是恰好等于KbZc,因此就需要填充一部分零使其等于KbZc),str代表的是调制方式(不同调制方式导致不同的符号数)。 下面直到tensor_device声明都是一些变量设置,默认即可。 先将输入二维数组变为一维并通过cudaMemcpy函数传递给GPU变量gpuInput。 CUDA_CHECK(cudaMemcpy(gpuInput, input, NUM_LLR, cudaMemcpyHostToDevice));
随后直接使用CUDA将输入的int8_t数据类型转变为half类型并赋给d_in_tensor(需要说明,在GPU译码中,输入译码器的格式是半精度浮点数,只有16位比特,相比于单精度的32位,这样做可以大大节省存储空间)。即 int block = 1024;
int size_h = NUM_LLR+NUM_padding;
int grid = (size_h + block - 1) / block;
float_to_half<<<grid, block>>>((half*)tLLR_16.addr(), gpuInput, size_h);
此函数结构也很简单,如下: __global__ void float_to_half(half *H, int8_t* SI, int size)
{
int tid = blockIdx.x * blockDim.x + threadIdx.x;
if(tid < size)
{
• float b = (float)SI[tid];
• H[tid] = __float2half(b);
}
}
先将int8_t变为float,再将float使用__float2half函数变为half类型,这样使用CUDA来进行转变速度非常快。 随后就是一些GPU译码的参数设置,同样保持不变就行。 最后将其从GPU中拷出,译码就完成了。
|