高速フーリエ変換 OpenMPで最適化版 | 16.78MHz

高速フーリエ変換 OpenMPで最適化版


#include <stdio.h>
#include <math.h>
#include <stdlib.h>
#include <string.h>
#include <sys/time.h>
#include <unistd.h>
#include <omp.h>

typedef struct {
float real;
float imaginary;
} Complex;

void initComplex( Complex *_arg_this, float _arg_real, float _arg_imaginary ) {
_arg_this->real = _arg_real;
_arg_this->imaginary = _arg_imaginary;
}

void addComplex( Complex *_arg_this, const Complex *_arg_left, const Complex *_arg_right ) {
_arg_this->real = _arg_left->real + _arg_right->real;
_arg_this->imaginary = _arg_left->imaginary + _arg_right->imaginary;
}

void multComplex( Complex *_arg_this, const Complex *_arg_left, const Complex *_arg_right ) {
_arg_this->real = _arg_left->real * _arg_right->real -
_arg_left->imaginary * _arg_right->imaginary;
_arg_this->imaginary = _arg_left->real * _arg_right->imaginary +
_arg_left->imaginary * _arg_right->real;
}

void divComplexByScalar( Complex *_arg_this, const Complex *_arg_left, float _arg_right ) {
_arg_this->real = _arg_left->real / _arg_right;
_arg_this->imaginary = _arg_left->imaginary / _arg_right;
}


typedef struct {
unsigned int size;
int inverse;
Complex *w;
Complex *w_gpu;
} FFT;

int initFFT( FFT *_arg_this, unsigned int _arg_size, int _arg_inverse ) {
_arg_this->size = _arg_size;
_arg_this->inverse = _arg_inverse;
_arg_this->w = ( Complex* )malloc( sizeof( Complex ) * _arg_this->size );
if( !_arg_this->w )
return -1;

float direction = -1.0f;
if( _arg_this->inverse )
direction = 1.0f;

int counter;
for( counter = 0; counter != _arg_this->size; counter++ ) {
float angle = direction * 2.0 * M_PI * counter / _arg_this->size;
initComplex( _arg_this->w + counter, cosf( angle ), sinf( angle ) );
}

return 0;
}

void finalFFT( FFT *_arg_this ) {
free( _arg_this->w );
}

unsigned int flipBitFFT( FFT *_arg_this, unsigned int _arg_value ) {
int counter;
unsigned int temp = 0;
unsigned int log_size = (unsigned int)log2f( (float)_arg_this->size );
for( counter = 0; counter != log_size; counter++ ) {
temp <<= 1;
temp |= ( _arg_value >> counter ) & 0x01;
}
return temp;
}

void swapFFT( FFT *_arg_this, Complex *_arg_buffer ) {
unsigned int counter;
for( counter = 0; counter != _arg_this->size; counter++ ) {
unsigned int flipped = flipBitFFT( _arg_this, counter );
if( counter < flipped ) {
Complex temp = _arg_buffer[ counter ];
_arg_buffer[ counter ] = _arg_buffer[ flipped ];
_arg_buffer[ flipped ] = temp;
}
}
}

void processFFT( const FFT *_arg_this, Complex *_arg_dest, const Complex *_arg_src, unsigned int _arg_block_size ) {
unsigned int counter;
#pragma omp parallel private( counter )
for( counter = omp_get_thread_num(); counter < _arg_this->size; counter+= omp_get_num_threads() ) {
unsigned int left = counter;
unsigned int right = ( counter & ~( _arg_block_size >> 1 ) ) | ( ~counter & ( _arg_block_size >> 1 ) );
if( left > right ) {
unsigned int temp = left;
left = right;
right = temp;
}
unsigned int w_step = _arg_this->size / _arg_block_size;
Complex m;
multComplex( &m, _arg_this->w + ( ( counter % _arg_block_size ) * w_step ), _arg_src + right );
addComplex( _arg_dest + counter, _arg_src + left, &m );
}
}

void divFFT( FFT *_arg_this, Complex *_arg_buffer ) {
unsigned int counter;
#pragma omp parallel private( counter )
for( counter = omp_get_thread_num(); counter < _arg_this->size; counter+= omp_get_num_threads() )
divComplexByScalar( _arg_buffer + counter, _arg_buffer + counter, _arg_this->size );
}

int runFFT( FFT *_arg_this, Complex *_arg_buffer ) {
swapFFT( _arg_this, _arg_buffer );
Complex *swap = ( Complex* )malloc( sizeof( Complex ) * _arg_this->size );
if( !swap )
return -1;
Complex *original_swap = swap;
int counter;

for( counter = 2; counter <= _arg_this->size; counter <<= 1 ) {
processFFT( _arg_this, swap, _arg_buffer, counter );
Complex *temp = swap;
swap = _arg_buffer;
_arg_buffer = temp;
}
if( _arg_this->inverse )
divFFT( _arg_this, _arg_buffer );

if( swap != original_swap ) {
Complex *temp = swap;
swap = _arg_buffer;
_arg_buffer = temp;
memcpy( _arg_buffer, swap, sizeof( Complex ) * _arg_this->size );
}

free( original_swap );
return 0;
}

int main( int argc, char *argv[] ) {
int size = 65536 * 64;
FFT test_fft;
initFFT( &test_fft, size, 0 );
FFT test_ifft;
initFFT( &test_ifft, size, 1 );

struct timeval tv[ 2 ];

Complex * buffer = ( Complex* )calloc( sizeof( Complex ), size );
buffer[ size / 2 ].real = 1000.0f;


gettimeofday(tv, NULL);
runFFT( &test_fft, buffer );
gettimeofday(tv + 1, NULL);
printf( "FFT time ( CPU ): %lf\n", ( (double)tv[ 1 ].tv_sec + (double)tv[ 1 ].tv_usec * 0.000001 ) -
( (double)tv[ 0 ].tv_sec + (double)tv[ 0 ].tv_usec * 0.000001 ) );

int counter;
for( counter = size * 0.8f; counter != size; counter++ ) {
buffer[ counter ].real = 0.0f;
buffer[ counter ].imaginary = 0.0f;
}

gettimeofday(tv, NULL);
runFFT( &test_ifft, buffer );
gettimeofday(tv + 1, NULL);
printf( "IFFT time ( CPU ): %lf\n", ( (double)tv[ 1 ].tv_sec + (double)tv[ 1 ].tv_usec * 0.000001 ) -
( (double)tv[ 0 ].tv_sec + (double)tv[ 0 ].tv_usec * 0.000001 ) );


return EXIT_SUCCESS;
}


$ gcc fft.c -o fft -lm -O2 -march=core2 -msse4 -mfpmath=sse -fno-math-errno -ffast-math -fforce-addr -fomit-frame-pointer -pipe -fstrength-reduce -fno-strict-aliasing -fstack-protector -fopenmp
$ $ ./fft
FFT time ( CPU ): 2.270238
IFFT time ( CPU ): 2.610293