/*************************************************************************** * Copyright (C) 1994 Charles P. Peterson * * 4007 Enchanted Sun, San Antonio, Texas 78244-1254 * * Email: Charles_P_Peterson@fcircus.sat.tx.us * * * * This is free software with NO WARRANTY. * * See gfft.c, or run program itself, for details. * * Support is available for a fee. * *************************************************************************** * * Program: gfft--General FFT analysis * File: cfft.c * Purpose: fast Fourier analysis function with complex i/o array * Author: Charles Peterson (CPP) * History: 4-June-1993 CPP; Created. * 9-March-1994 CPP; added SAVE_TRIG_OPT sections. * 6-Feb-95 CPP (1.31); Progress Requester * Comment: * Inspired by the algorithm expressed in four1.c, page 411-412 in * Numerical Recipes in C (William H. Press, Brian P. Flannery, Saul A. * Teukolsky, William T. Vetterling. Cambridge University Press, 1988.) * In turn, inspiration for their work came from N. Brenner of Lincoln * Laboratories, Danielson, Lanczos, J. W. Cooley, J. W. Tukey, and many * others, including Fourier, for whom the underlying analysis is named. * This, however, is an original implementation without any copied code. * I hope it is much easier to read and understand than its predecessors. * If nothing else, it is very verbose, and doesn't require an intuitive * understanding of all trigonometric identities. Things got a little * worse when I added an option to save the previously used trig values, * but most people would consider that (in principle) worthwhile. * * Single precision floating point, except for trigonometric values * which are computed in double precision. */ #define PROGRESS_INDICATOR #define SAVE_TRIG_OPT /* * Comment the above lines out for easiest use outside GFFT. * * You will lose the optional saving of trig values, but (see note below) * that really doesn't make a big difference in performance. * * Otherwise, when using this code outside GFFT you will have to duplicate * the GFFT memory allocation function, or modify this file accordingly * (e.g. with an error return if the memory demanded cannot be allocated). * * Note: When SAVE_TRIG_OPT is defined, as in GFFT, there is a BOOLEAN * global variable SaveMemory which disables the saving of trig values * when set. */ #include #ifdef _AMIGA #ifdef _M68881 #include #endif #endif #include "complex.h" #ifdef SAVE_TRIG_OPT #include "gfft.h" /* gmalloc safe allocation (longjmps on error) */ #include "settings.h" /* SaveMemory option */ #endif #ifdef PROGRESS_INDICATOR #include "wbench.h" #define PROGRESS_INCREMENT 1024 long Inner_Loop_Count = 0; extern double Percent_Per_Loop; extern double Initial_Progress; static long next_progress_count = 0; #endif void cfft (Complex_float datac[], long n, int isign) { /* ARGUMENTS: * * datac input/output Complex_float data array * transformed according to isign (see below) * * n number of complex data elements * *** n MUST BE A POWER OF 2 *** * *** THIS IS NOT CHECKED FOR *** * * isign if 1, datac is replaced with its discrete Fourier transform * if -1, datac is replaced with n times its INVERSE * discrete Fourier transform */ #ifdef SAVE_TRIG_OPT static Complex_float *wkvector = NULL; static last_n = 0; #endif #ifdef PROGRESS_INDICATOR next_progress_count = 0; #endif /* * This function is divided into 2 or 3 main blocks to minimize the scope * of variables involved and hopefully to encourage better register use. */ /* In the first block we sort the input array of complex numbers into * bit reversal order (e.g. the value at binary index ...001 is swapped * with the value at binary index 100...) * * While i counts forwards from 1, j counts "inversely" from binary 100... * * Skip i==0 and i==n-1 (They never get swapped anyway) */ { long const halfn = n / 2; long const nminus1 = n - 1; long register j = halfn; long register i = 1; for (; i < nminus1; i++) { if (j > i) /* If move required, and not done before */ { CF_SWAP (datac[i], datac[j]); } /* * The following sub-block is for inverse counting. * To count inversely, we clear highest set bits until reaching * the first clear bit, then set it. */ { long register scanbit = halfn; while (j >= scanbit && scanbit > 0) { j -= scanbit; scanbit >>= 1; } j += scanbit; } /* End inverse counting sub-block */ } /* End for (i = 1, 2, ..., n-1) */ } /* End sorting block */ /* * The following block(s) are the Danielson-Lanczos section of the routine. * Here we combine the transforms, initially of one value each, by two * at a time giving us transforms twice as large, until everything has * been combined into one big transform. * * Thanks to clever sorting, each transform of 'even' elements is * followed by a transform of 'odd' elements during each combination. * * mathematical summary (in which = denotes equality or definition): * (A transform of length one simply copies input to output) * F[k] = F[k][even] + (w ** k) * F[k][odd] * (w ** k) = - (w ** (k + N/2)) * w = e ** (2 * PI * i / N) * e ** (i * theta) = cos (theta) + i * sin (theta) * i = sqr (-1) * e and PI are the named transcendental numbers * cos and sin are the named trigonometric functions * N is the number of complex samples; k is a number 0..N-1 */ #ifdef SAVE_TRIG_OPT if (SaveMemory) #endif /* * This is the unaccelerated version * The trig constants are computed as they are needed * each time cfft is called. */ { long osize; /* Size of old transforms being combined */ long nsize; /* Size of new transforms being produced */ Complex_double w; Complex_double wtemp; Complex_double wk; /* (w ** k) */ Complex_float wk_times_f_k_odd; Complex_float wkfloat; /* Floating copy for innermost loop */ double theta; long ieven, iodd, k; for (osize = 1,nsize = 2; osize < n; osize = nsize,nsize *= 2) { theta = 2.0 * PI / (isign * nsize); /* PI is from math.h */ w.real = cos (theta); w.imaginary = sin (theta); wk.real = 1.0; wk.imaginary = 0.0; wkfloat.real = 1.0; wkfloat.imaginary = 0.0; for (k = 0; k < osize; k++) { for (ieven = k; ieven < n; ieven += nsize) { iodd = ieven + osize; /* * Note that the the upper and lower parts of each new * transform being produced use the same components-- * taken through a 'second cycle' since they are periodic * in their original N (nsize/2). Only W**k differs, * and, fortuitously, W**k = -W**(k+nsize/2). Also * fortuitously, the input slots for F_k_even and F_k_odd * are separated by nsize/2, as are the output slots for * F_k and F_(k+nsize/2), so we can use and write over the * same slots during each pass in the inner loop, using the * same value of W**k (we subtract instead of negating). */ C_MULT (wkfloat, datac[iodd], wk_times_f_k_odd); C_SUB (datac[ieven], wk_times_f_k_odd, datac[iodd]); C_ADD (datac[ieven], wk_times_f_k_odd, datac[ieven]); } /* end for (sums using same wk factor) */ /* C_MULT (wk, w, wk) but must use temp or i/o coincide */ C_MULT (wk, w, wtemp); wkfloat.real = (float) (wk.real = wtemp.real); wkfloat.imaginary = (float) (wk.imaginary = wtemp.imaginary); } /* end for (k = 1, 2, ..., osize) */ } /* end for (osize, nsize...) */ } /* end of Danielson-Lanczos section (SaveMemory slow version) */ #ifdef SAVE_TRIG_OPT else /* if (!SaveMemory) */ { /* To speed things up under most cases (where the same N is used for * more than one transform, we are going to first create the vector of * trigonometrically derived wk constants. If N is unchanged, and the * the vector has already been created, we simply re-use the previous * vector. If it weren't for this section (which uses GFFT allocation * routines) there would be no memory allocated in cfft, and no need for * the gfft.h header file). Note that this also increases the dynamic * memory requirements of gfft significantly because the vector necessary * must be large enough for 2*n complex floating point numbers, making it * one of the biggest single dynamic memory uses--particularly for large N * which some people might be interested in. So, I have decided * to make this feature optional, if included at all. As currently * implemented, it also requires the 'safe' gmalloc, which longjmps * on error (so as not to confuse the interface here and elsewhere). * * 11-March-94 CPP; I'm somewhat disappointed with the 5-7% overall * performance improvement attributable to this modification, in spite of * the fact that lstat shows cfft as using 67% (58% accelerated) of the * total time for a 30 sec batch spectrum analysis (run on a vanilla 68000 * with no FFP even). But, I should have anticipated this minor change; * the trigs become a relatively small part of the overall calculations. * Don't let anyone tell you any different. (Meanwhile, the dreaded * ok_spectrum and ok_read, which look so horribly inefficient, were only * taking around 2% of the total time each.) The number one line is the * complex subtraction following the main fft complex multiplication; it * probably waits there for the complex multiplication to finish. That, * not surprisingly, it the chief bottleneck. As it should be. * */ if (n > 1 && (!wkvector || n != last_n)) { long osize; /* Size of old transforms being combined */ long nsize; /* Size of new transforms being produced */ Complex_double w; Complex_double wk; /* (w ** k) */ Complex_double wtemp; double theta; long k; long vsize; /* vector size */ long ntemp; long wki = 0; /* * The vector size is N + log2(N) - 1 where log2 is 'log base 2' * we calculate this using integer arithmetic */ vsize = -1; for (ntemp = n; ntemp > 1; ntemp /= 2) vsize++; /* log base 2 */ vsize += n; if (wkvector) { gfree (wkvector); } wkvector = gmalloc (vsize * sizeof(Complex_float), NOTHING_SPECIAL); for (osize = 1,nsize = 2; osize < n; osize = nsize,nsize *= 2) { theta = 2.0 * PI / (isign * nsize); /* PI is from math.h */ w.real = cos (theta); w.imaginary = sin (theta); wk.real = 1.0; wk.imaginary = 0.0; wkvector[wki].real = 1.0; wkvector[wki++].imaginary = 0.0; for (k = 0; k < osize; k++) { /* C_MULT (wk, w, wk) but must use temp or i/o coincide */ C_MULT (wk, w, wtemp); wkvector[wki].real = (float) (wk.real = wtemp.real); wkvector[wki++].imaginary = (float) (wk.imaginary = wtemp.imaginary); } /* end for (k = 1, 2, ..., osize) */ } /* end for (osize, nsize...) */ last_n = n; } /* end of trig constants computation block */ /* * Now, we compute this fft */ { long osize; /* Size of old transforms being combined */ long nsize; /* Size of new transforms being produced */ Complex_float wk_times_f_k_odd; long ieven, iodd, k; register Complex_float *wkp = wkvector; for (osize = 1,nsize = 2; osize < n; osize = nsize,nsize *= 2) { for (k = 0; k < osize; k++) { for (ieven = k; ieven < n; ieven += nsize) { iodd = ieven + osize; /* * Note that the the upper and lower parts of each new * transform being produced use the same components-- * taken through a 'second cycle' since they are periodic * in their original N (nsize/2). Only W**k differs, * and, fortuitously, W**k = -W**(k+nsize/2). Also * fortuitously, the input slots for F_k_even and F_k_odd * are separated by nsize/2, as are the output slots for * F_k and F_(k+nsize/2), so we can use and write over the * same slots during each pass in the inner loop, using the * same value of W**k (we subtract instead of negating). */ C_MULT (*wkp, datac[iodd], wk_times_f_k_odd); C_SUB (datac[ieven],wk_times_f_k_odd, datac[iodd]); C_ADD (datac[ieven],wk_times_f_k_odd, datac[ieven]); #ifdef PROGRESS_INDICATOR if (++Inner_Loop_Count >= next_progress_count) { next_progress_count += PROGRESS_INCREMENT; progress_requester__update ((int) (Initial_Progress + (Inner_Loop_Count * Percent_Per_Loop))); } #endif } /* end for (sums using same wk factor) */ wkp++; } /* end for (k = 1, 2, ..., osize) */ wkp++; } /* end for (osize, nsize...) */ } /* end of transform computation */ } /* end of !SaveMemory (fast memory hog) version */ #endif /* whether SAVE_TRIG_OPT defined or not */ } /* end of cfft */