Code

#include <cuda_runtime.h>

#include <cufft.h>

#include <iostream>

#include <vector>

#include <random>

#include <chrono>

#define NUM_VALUES 10000

#define MIN_SIZE 256

#define MAX_SIZE 16384

// Function to generate random powers of 2 between MIN_SIZE and MAX_SIZE

std::vector<int> generate_fft_sizes() {

std::vector<int> sizes;

for (int size = MIN_SIZE; size <= MAX_SIZE; size *= 2) {

sizes.push_back(size);

}

return sizes;

}

__global__ void correlate(cufftComplex* data, cufftComplex* known_sequence, int size) {

int idx = threadIdx.x + blockIdx.x * blockDim.x;

if (idx < size) {

cufftComplex a = data[idx];

cufftComplex b = known_sequence[idx];

// Perform element-wise complex multiplication

data[idx].x = a.x * b.x - a.y * b.y;

data[idx].y = a.x * b.y + a.y * b.x;

}

int main() {

// Generate FFT sizes

std::vector<int> fft_sizes = generate_fft_sizes();

// Allocate known sequence (for simplicity, use all ones)

cufftComplex* known_sequence;

cudaMallocManaged(&known_sequence, MAX_SIZE * sizeof(cufftComplex));

for (int i = 0; i < MAX_SIZE; ++i) {

known_sequence[i].x = 1.0f;

known_sequence[i].y = 0.0f;

}

std::cout << "FFT Size\tAverage Time (s)" << std::endl;

for (int size : fft_sizes) {

auto start = std::chrono::high_resolution_clock::now();

for (int i = 0; i < NUM_VALUES; ++i) {

// Allocate memory for data

cufftComplex* data;

cudaMallocManaged(&data, size * sizeof(cufftComplex));

// Generate random data

for (int j = 0; j < size; ++j) {

data[j].x = static_cast<float>(rand()) / RAND_MAX;

data[j].y = static_cast<float>(rand()) / RAND_MAX;

}

// Create FFT plan

cufftHandle plan;

cufftPlan1d(&plan, size, CUFFT_C2C, 1);

// Execute forward FFT

cufftExecC2C(plan, data, data, CUFFT_FORWARD);

cudaDeviceSynchronize();

// Correlate with the known sequence

int threads = 256;

int blocks = (size + threads - 1) / threads;

correlate<<<blocks, threads>>>(data, known_sequence, size);

cudaDeviceSynchronize();

// Execute inverse FFT

cufftExecC2C(plan, data, data, CUFFT_INVERSE);

cudaDeviceSynchronize();

// Clean up

cufftDestroy(plan);

cudaFree(data);

}

auto end = std::chrono::high_resolution_clock::now();

std::chrono::duration<double> elapsed = end - start;

// Output benchmark results for the current FFT size

std::cout << size << "\t" << (elapsed.count() / NUM_VALUES) << std::endl;

}

// Cleanup

cudaFree(known_sequence);

return 0;

}

Page updated

Google Sites

Report abuse