/***************************************************************************
 *		  Copyright (C) 1994  Charles P. Peterson                  *
 *	     4007 Enchanted Sun, San Antonio, Texas 78244-1254             *
 *              Email: Charles_P_Peterson@fcircus.sat.tx.us                *
 *                                                                         *
 *		  This is free software with NO WARRANTY.                  *
 *	      See gfft.c, or run program itself, for details.              *
 *		      Support is available for a fee.                      *
 ***************************************************************************
 *
 * Program:     gfft--General FFT analysis
 * File:        cfft.c
 * Purpose:     fast Fourier analysis function with complex i/o array
 * Author:      Charles Peterson (CPP)
 * History:     4-June-1993 CPP; Created.
 *              9-March-1994 CPP; added SAVE_TRIG_OPT sections.
 *              6-Feb-95 CPP (1.31); Progress Requester
 * Comment:
 *   Inspired by the algorithm expressed in four1.c, page 411-412 in
 *   Numerical Recipes in C  (William H. Press, Brian P. Flannery, Saul A.
 *   Teukolsky, William T. Vetterling.  Cambridge University Press, 1988.)
 *   In turn, inspiration for their work came from N. Brenner of Lincoln
 *   Laboratories, Danielson, Lanczos, J. W. Cooley, J. W. Tukey, and many
 *   others, including Fourier, for whom the underlying analysis is named.
 *   This, however, is an original implementation without any copied code.
 *   I hope it is much easier to read and understand than its predecessors.
 *   If nothing else, it is very verbose, and doesn't require an intuitive
 *   understanding of all trigonometric identities.  Things got a little
 *   worse when I added an option to save the previously used trig values,
 *   but most people would consider that (in principle) worthwhile.
 *
 *   Single precision floating point, except for trigonometric values
 *   which are computed in double precision.
 */

#define PROGRESS_INDICATOR
#define SAVE_TRIG_OPT
/*
 * Comment the above lines out for easiest use outside GFFT.
 *
 * You will lose the optional saving of trig values, but (see note below)
 * that really doesn't make a big difference in performance.
 *
 * Otherwise, when using this code outside GFFT you will have to duplicate
 * the GFFT memory allocation function, or modify this file accordingly
 * (e.g. with an error return if the memory demanded cannot be allocated).
 *
 * Note: When SAVE_TRIG_OPT is defined, as in GFFT, there is a BOOLEAN
 * global variable SaveMemory which disables the saving of trig values
 * when set.
 */

#include <math.h>

#ifdef _AMIGA
#ifdef _M68881
#include <m68881.h>
#endif
#endif

#include "complex.h"

#ifdef SAVE_TRIG_OPT
#include "gfft.h"       /* gmalloc safe allocation (longjmps on error) */
#include "settings.h"   /* SaveMemory option */
#endif

#ifdef PROGRESS_INDICATOR
#include "wbench.h"
#define PROGRESS_INCREMENT 1024
long Inner_Loop_Count = 0;
extern double Percent_Per_Loop;
extern double Initial_Progress;
static long next_progress_count = 0;
#endif

void cfft (Complex_float datac[], long n, int isign)
{
/* ARGUMENTS:
 *
 * datac       input/output Complex_float data array
 *             transformed according to isign (see below)
 *
 * n            number of complex data elements
 *              *** n MUST BE A POWER OF 2 ***
 *              *** THIS IS NOT CHECKED FOR ***
 *
 * isign	if 1, datac is replaced with its discrete Fourier transform
 *              if -1, datac is replaced with n times its INVERSE
 *                discrete Fourier transform
 */

#ifdef SAVE_TRIG_OPT
    static Complex_float *wkvector = NULL;
    static last_n = 0;
#endif

#ifdef PROGRESS_INDICATOR
    next_progress_count = 0;
#endif


/* 
 * This function is divided into 2 or 3 main blocks to minimize the scope
 * of variables involved and hopefully to encourage better register use.
 */

/* In the first block we sort the input array of complex numbers into
 * bit reversal order (e.g. the value at binary index ...001 is swapped
 * with the value at binary index 100...)  
 *
 * While i counts forwards from 1, j counts "inversely" from binary 100...
 *
 * Skip i==0 and i==n-1 (They never get swapped anyway)
 */

    {
	long const halfn = n / 2;
	long const nminus1 = n - 1;
	long register j = halfn;
	long register i = 1;

	for (; i < nminus1; i++)
	{
	    if (j > i)    /* If move required, and not done before */
	    {
		CF_SWAP (datac[i], datac[j]);
	    }
	/* 
         * The following sub-block is for inverse counting.
         * To count inversely, we clear highest set bits until reaching
         * the first clear bit, then set it.
         */
	    {
		long register scanbit = halfn;
		while (j >= scanbit && scanbit > 0)
		{
		    j -= scanbit;
		    scanbit >>= 1;
		}
	        j += scanbit;

	    }   /* End inverse counting sub-block */

	}   /* End for (i = 1, 2, ..., n-1) */

    }   /* End sorting block */
/*
 * The following block(s) are the Danielson-Lanczos section of the routine.
 * Here we combine the transforms, initially of one value each, by two
 * at a time giving us transforms twice as large, until everything has
 * been combined into one big transform.
 *
 * Thanks to clever sorting, each transform of 'even' elements is
 * followed by a transform of 'odd' elements during each combination.
 *
 * mathematical summary (in which = denotes equality or definition):
 * (A transform of length one simply copies input to output)
 * F[k] = F[k][even] + (w ** k) * F[k][odd]
 * (w ** k) = - (w ** (k + N/2))
 * w = e ** (2 * PI * i / N)
 * e ** (i * theta) = cos (theta) + i * sin (theta)
 * i = sqr (-1)
 * e and PI are the named transcendental numbers
 * cos and sin are the named trigonometric functions
 * N is the number of complex samples; k is a number 0..N-1
 */

#ifdef SAVE_TRIG_OPT
    if (SaveMemory)
#endif

   /*
    * This is the unaccelerated version
    * The trig constants are computed as they are needed
    * each time cfft is called.
    */

    {
	long osize;    /* Size of old transforms being combined */
	long nsize;    /* Size of new transforms being produced */
	Complex_double w;
	Complex_double wtemp;
	Complex_double wk;        /* (w ** k) */
	Complex_float wk_times_f_k_odd;
	Complex_float wkfloat;    /* Floating copy for innermost loop */

	double theta;
	long ieven, iodd, k;

	for (osize = 1,nsize = 2;  osize < n;  osize = nsize,nsize *= 2)
	{
	    theta = 2.0 * PI / (isign * nsize);  /* PI is from math.h */
	    w.real = cos (theta);
	    w.imaginary = sin (theta);
	    wk.real = 1.0;
	    wk.imaginary = 0.0;
	    wkfloat.real = 1.0;
	    wkfloat.imaginary = 0.0;

	    for (k = 0; k < osize; k++)
	    {
		for (ieven = k; ieven < n; ieven += nsize)
		{
		    iodd = ieven + osize;
		/*
                 * Note that the the upper and lower parts of each new
                 * transform being produced use the same components--
                 * taken through a 'second cycle' since they are periodic
		 * in their original N (nsize/2).  Only W**k differs,
		 * and, fortuitously, W**k = -W**(k+nsize/2).  Also
		 * fortuitously, the input slots for F_k_even and F_k_odd
		 * are separated by nsize/2, as are the output slots for 
                 * F_k and F_(k+nsize/2), so we can use and write over the 
                 * same slots during each pass in the inner loop, using the
                 * same value of W**k (we subtract instead of negating).
		 */
		    C_MULT (wkfloat, datac[iodd], wk_times_f_k_odd);
		    C_SUB (datac[ieven], wk_times_f_k_odd, datac[iodd]);
		    C_ADD (datac[ieven], wk_times_f_k_odd, datac[ieven]);

		} /* end for (sums using same wk factor) */

	    /*  C_MULT (wk, w, wk) but must use temp or i/o coincide */

		C_MULT (wk, w, wtemp);
		wkfloat.real = (float) (wk.real = wtemp.real);
		wkfloat.imaginary = (float) (wk.imaginary = 
					     wtemp.imaginary);

	    } /* end for (k = 1, 2, ..., osize) */

	} /* end for (osize, nsize...) */

    } /* end of Danielson-Lanczos section (SaveMemory slow version) */

#ifdef SAVE_TRIG_OPT
    else /* if (!SaveMemory) */
    {

/* To speed things up under most cases (where the same N is used for
 * more than one transform, we are going to first create the vector of
 * trigonometrically derived wk constants.  If N is unchanged, and the
 * the vector has already been created, we simply re-use the previous
 * vector.  If it weren't for this section (which uses GFFT allocation
 * routines) there would be no memory allocated in cfft, and no need for
 * the gfft.h header file).  Note that this also increases the dynamic
 * memory requirements of gfft significantly because the vector necessary
 * must be large enough for 2*n complex floating point numbers, making it
 * one of the biggest single dynamic memory uses--particularly for large N
 * which some people might be interested in.  So, I have decided
 * to make this feature optional, if included at all.  As currently
 * implemented, it also requires the 'safe' gmalloc, which longjmps
 * on error (so as not to confuse the interface here and elsewhere).
 *
 * 11-March-94 CPP; I'm somewhat disappointed with the 5-7% overall
 * performance improvement attributable to this modification, in spite of
 * the fact that lstat shows cfft as using 67% (58% accelerated) of the
 * total time for a 30 sec batch spectrum analysis (run on a vanilla 68000
 * with no FFP even).  But, I should have anticipated this minor change;
 * the trigs become a relatively small part of the overall calculations.
 * Don't let anyone tell you any different.  (Meanwhile, the dreaded
 * ok_spectrum and ok_read, which look so horribly inefficient, were only
 * taking around 2% of the total time each.)  The number one line is the
 * complex subtraction following the main fft complex multiplication; it
 * probably waits there for the complex multiplication to finish.  That,
 * not surprisingly, it the chief bottleneck.  As it should be.
 *
 */

	if (n > 1 && (!wkvector || n != last_n))
	{
	    long osize;    /* Size of old transforms being combined */
	    long nsize;    /* Size of new transforms being produced */
	    Complex_double w;
	    Complex_double wk;          /* (w ** k) */
	    Complex_double wtemp;

	    double theta;
	    long k;
	    long vsize;  /* vector size */
	    long ntemp;
	    long wki = 0;
	/*
	 * The vector size is N + log2(N) - 1 where log2 is 'log base 2'
	 * we calculate this using integer arithmetic
	 */
	    vsize = -1;
	    for (ntemp = n; ntemp > 1; ntemp /= 2) vsize++; /* log base 2 */
	    vsize += n;

	    if (wkvector)
	    {
		gfree (wkvector);
	    }
	    wkvector = gmalloc (vsize * sizeof(Complex_float), 
				NOTHING_SPECIAL);

	    for (osize = 1,nsize = 2;  osize < n;  osize = nsize,nsize *= 2)
	    {
		theta = 2.0 * PI / (isign * nsize);  /* PI is from math.h */
		w.real = cos (theta);
		w.imaginary = sin (theta);

		wk.real = 1.0;
		wk.imaginary = 0.0;

		wkvector[wki].real = 1.0;
		wkvector[wki++].imaginary = 0.0;

		for (k = 0; k < osize; k++)
		{
		/*  C_MULT (wk, w, wk) but must use temp or i/o coincide */

		    C_MULT (wk, w, wtemp);
		    wkvector[wki].real = (float) (wk.real = wtemp.real);
		    wkvector[wki++].imaginary = (float) (wk.imaginary = 
							 wtemp.imaginary);

		} /* end for (k = 1, 2, ..., osize) */

	    } /* end for (osize, nsize...) */

	    last_n = n;

	} /* end of trig constants computation block */

    /*
     * Now, we compute this fft
     */
        {
	    long osize;    /* Size of old transforms being combined */
	    long nsize;    /* Size of new transforms being produced */
	    Complex_float wk_times_f_k_odd;

	    long ieven, iodd, k;
	    register Complex_float *wkp = wkvector;

	    for (osize = 1,nsize = 2; osize < n;  osize = nsize,nsize *= 2)
	    {
		for (k = 0; k < osize; k++)
		{
		    for (ieven = k; ieven < n; ieven += nsize)
		    {
			iodd = ieven + osize;
		/*
		 * Note that the the upper and lower parts of each new
		 * transform being produced use the same components--
		 * taken through a 'second cycle' since they are periodic
		 * in their original N (nsize/2).  Only W**k differs,
		 * and, fortuitously, W**k = -W**(k+nsize/2).  Also
		 * fortuitously, the input slots for F_k_even and F_k_odd
		 * are separated by nsize/2, as are the output slots for 
		 * F_k and F_(k+nsize/2), so we can use and write over the 
		 * same slots during each pass in the inner loop, using the
		 * same value of W**k (we subtract instead of negating).
		 */
			C_MULT (*wkp, datac[iodd], wk_times_f_k_odd);
			C_SUB (datac[ieven],wk_times_f_k_odd, datac[iodd]);
			C_ADD (datac[ieven],wk_times_f_k_odd, datac[ieven]);

#ifdef PROGRESS_INDICATOR
			if (++Inner_Loop_Count >= next_progress_count)
			{
			    next_progress_count += PROGRESS_INCREMENT;
			    progress_requester__update 
			      ((int) (Initial_Progress + (Inner_Loop_Count *
			       Percent_Per_Loop)));
			}
#endif
		    } /* end for (sums using same wk factor) */

		    wkp++;

		} /* end for (k = 1, 2, ..., osize) */

		wkp++;

	    } /* end for (osize, nsize...) */

	} /* end of transform computation */

    } /* end of !SaveMemory (fast memory hog) version */

#endif /* whether SAVE_TRIG_OPT defined or not */

} /* end of cfft */