Code
#include <cuda_runtime.h>
#include <cufft.h>
#include <iostream>
#include <vector>
#include <random>
#include <chrono>
#define NUM_VALUES 10000
#define MIN_SIZE 256
#define MAX_SIZE 16384
// Function to generate random powers of 2 between MIN_SIZE and MAX_SIZE
std::vector<int> generate_fft_sizes() {
std::vector<int> sizes;
for (int size = MIN_SIZE; size <= MAX_SIZE; size *= 2) {
sizes.push_back(size);
}
return sizes;
}
__global__ void correlate(cufftComplex* data, cufftComplex* known_sequence, int size) {
int idx = threadIdx.x + blockIdx.x * blockDim.x;
if (idx < size) {
cufftComplex a = data[idx];
cufftComplex b = known_sequence[idx];
// Perform element-wise complex multiplication
data[idx].x = a.x * b.x - a.y * b.y;
data[idx].y = a.x * b.y + a.y * b.x;
}
}
int main() {
// Generate FFT sizes
std::vector<int> fft_sizes = generate_fft_sizes();
// Allocate known sequence (for simplicity, use all ones)
cufftComplex* known_sequence;
cudaMallocManaged(&known_sequence, MAX_SIZE * sizeof(cufftComplex));
for (int i = 0; i < MAX_SIZE; ++i) {
known_sequence[i].x = 1.0f;
known_sequence[i].y = 0.0f;
}
std::cout << "FFT Size\tAverage Time (s)" << std::endl;
for (int size : fft_sizes) {
auto start = std::chrono::high_resolution_clock::now();
for (int i = 0; i < NUM_VALUES; ++i) {
// Allocate memory for data
cufftComplex* data;
cudaMallocManaged(&data, size * sizeof(cufftComplex));
// Generate random data
for (int j = 0; j < size; ++j) {
data[j].x = static_cast<float>(rand()) / RAND_MAX;
data[j].y = static_cast<float>(rand()) / RAND_MAX;
}
// Create FFT plan
cufftHandle plan;
cufftPlan1d(&plan, size, CUFFT_C2C, 1);
// Execute forward FFT
cufftExecC2C(plan, data, data, CUFFT_FORWARD);
cudaDeviceSynchronize();
// Correlate with the known sequence
int threads = 256;
int blocks = (size + threads - 1) / threads;
correlate<<<blocks, threads>>>(data, known_sequence, size);
cudaDeviceSynchronize();
// Execute inverse FFT
cufftExecC2C(plan, data, data, CUFFT_INVERSE);
cudaDeviceSynchronize();
// Clean up
cufftDestroy(plan);
cudaFree(data);
}
auto end = std::chrono::high_resolution_clock::now();
std::chrono::duration<double> elapsed = end - start;
// Output benchmark results for the current FFT size
std::cout << size << "\t" << (elapsed.count() / NUM_VALUES) << std::endl;
}
// Cleanup
cudaFree(known_sequence);
return 0;
}